Azrail commited on
Commit
b3a82a1
·
verified ·
1 Parent(s): 2dfb166

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4ceb1eb66e59b2be3b7b56c92ceeee0dff3d3afb66f4e798a3c7f7a973769da
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48c8ccaea45630ddf9ea1b88f0156cd1fedd61905cf18898819db2276984fcb7
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:300510ba8259c34ff91864564df369f7ef5da88e12e952b5fc750f0733f75299
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7b50e0d348776f6bfde587ad7885adcff251381a86d03c221fe493591258125
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dec81976f45ae46b24f8606fae652d67e12c656a1fb7001a6621b0275456c831
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd99e2663b559b91167c1d3714d334dd46c05cbb133b69d7b52f7b883548de2
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ebbd97a5f85f91a146bd613ec7c9e8b4ff3314b259cf17fa7c45467f21162760
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7286e34301379ab62676eaae8fceab28248cb123c9029e5c4370161a2e81133d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.24141104757306456,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -226,11 +226,229 @@
226
  "eval_steps_per_second": 21.33,
227
  "num_input_tokens_seen": 482701824,
228
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  }
230
  ],
231
  "logging_steps": 50,
232
  "max_steps": 16568,
233
- "num_input_tokens_seen": 482701824,
234
  "num_train_epochs": 4,
235
  "save_steps": 1000,
236
  "stateful_callbacks": {
@@ -245,7 +463,7 @@
245
  "attributes": {}
246
  }
247
  },
248
- "total_flos": 1.2912744908980224e+17,
249
  "train_batch_size": 16,
250
  "trial_name": null,
251
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4828220951461291,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
226
  "eval_steps_per_second": 21.33,
227
  "num_input_tokens_seen": 482701824,
228
  "step": 1000
229
+ },
230
+ {
231
+ "epoch": 0.2534815999517178,
232
+ "grad_norm": 0.62890625,
233
+ "learning_rate": 1.584188292094146e-05,
234
+ "loss": 2.557,
235
+ "mean_token_accuracy": 0.5126943441852927,
236
+ "num_input_tokens_seen": 506798000,
237
+ "num_tokens": 213604239.0,
238
+ "step": 1050
239
+ },
240
+ {
241
+ "epoch": 0.265552152330371,
242
+ "grad_norm": 0.578125,
243
+ "learning_rate": 1.659625829812915e-05,
244
+ "loss": 2.5533,
245
+ "mean_token_accuracy": 0.5126326360553503,
246
+ "num_input_tokens_seen": 530786496,
247
+ "num_tokens": 223786306.0,
248
+ "step": 1100
249
+ },
250
+ {
251
+ "epoch": 0.2776227047090242,
252
+ "grad_norm": 0.56640625,
253
+ "learning_rate": 1.7350633675316838e-05,
254
+ "loss": 2.5528,
255
+ "mean_token_accuracy": 0.5130741761997342,
256
+ "num_input_tokens_seen": 554798768,
257
+ "num_tokens": 233857737.0,
258
+ "step": 1150
259
+ },
260
+ {
261
+ "epoch": 0.28969325708767746,
262
+ "grad_norm": 0.5234375,
263
+ "learning_rate": 1.8105009052504525e-05,
264
+ "loss": 2.5433,
265
+ "mean_token_accuracy": 0.5140750538557768,
266
+ "num_input_tokens_seen": 578877696,
267
+ "num_tokens": 243994162.0,
268
+ "step": 1200
269
+ },
270
+ {
271
+ "epoch": 0.3017638094663307,
272
+ "grad_norm": 0.671875,
273
+ "learning_rate": 1.8859384429692215e-05,
274
+ "loss": 2.5414,
275
+ "mean_token_accuracy": 0.5151798555627465,
276
+ "num_input_tokens_seen": 603141184,
277
+ "num_tokens": 254196491.0,
278
+ "step": 1250
279
+ },
280
+ {
281
+ "epoch": 0.3138343618449839,
282
+ "grad_norm": 0.59375,
283
+ "learning_rate": 1.9613759806879906e-05,
284
+ "loss": 2.5383,
285
+ "mean_token_accuracy": 0.5148227337375283,
286
+ "num_input_tokens_seen": 627221936,
287
+ "num_tokens": 264309463.0,
288
+ "step": 1300
289
+ },
290
+ {
291
+ "epoch": 0.32590491422363715,
292
+ "grad_norm": 0.498046875,
293
+ "learning_rate": 2.0368135184067593e-05,
294
+ "loss": 2.5292,
295
+ "mean_token_accuracy": 0.5156710411980748,
296
+ "num_input_tokens_seen": 651386848,
297
+ "num_tokens": 274462595.0,
298
+ "step": 1350
299
+ },
300
+ {
301
+ "epoch": 0.3379754666022904,
302
+ "grad_norm": 0.51171875,
303
+ "learning_rate": 2.1122510561255283e-05,
304
+ "loss": 2.523,
305
+ "mean_token_accuracy": 0.5161588852107525,
306
+ "num_input_tokens_seen": 675579984,
307
+ "num_tokens": 284636254.0,
308
+ "step": 1400
309
+ },
310
+ {
311
+ "epoch": 0.3500460189809436,
312
+ "grad_norm": 6.375,
313
+ "learning_rate": 2.187688593844297e-05,
314
+ "loss": 2.528,
315
+ "mean_token_accuracy": 0.5156370849534869,
316
+ "num_input_tokens_seen": 699751424,
317
+ "num_tokens": 294907617.0,
318
+ "step": 1450
319
+ },
320
+ {
321
+ "epoch": 0.36211657135959685,
322
+ "grad_norm": 0.68359375,
323
+ "learning_rate": 2.2631261315630658e-05,
324
+ "loss": 2.5232,
325
+ "num_input_tokens_seen": 723883888,
326
+ "step": 1500
327
+ },
328
+ {
329
+ "epoch": 0.36211657135959685,
330
+ "eval_loss": 2.42679500579834,
331
+ "eval_mean_token_accuracy": 0.5353444569074368,
332
+ "eval_num_tokens": 305087605.0,
333
+ "eval_runtime": 125.481,
334
+ "eval_samples_per_second": 85.367,
335
+ "eval_steps_per_second": 21.342,
336
+ "num_input_tokens_seen": 723883888,
337
+ "step": 1500
338
+ },
339
+ {
340
+ "epoch": 0.3741871237382501,
341
+ "grad_norm": 0.455078125,
342
+ "learning_rate": 2.3385636692818348e-05,
343
+ "loss": 2.5178,
344
+ "mean_token_accuracy": 0.5161234551295638,
345
+ "num_input_tokens_seen": 748106064,
346
+ "num_tokens": 315240015.0,
347
+ "step": 1550
348
+ },
349
+ {
350
+ "epoch": 0.3862576761169033,
351
+ "grad_norm": 0.5078125,
352
+ "learning_rate": 2.4140012070006035e-05,
353
+ "loss": 2.5157,
354
+ "mean_token_accuracy": 0.5167792574688792,
355
+ "num_input_tokens_seen": 772242320,
356
+ "num_tokens": 325393577.0,
357
+ "step": 1600
358
+ },
359
+ {
360
+ "epoch": 0.39832822849555655,
361
+ "grad_norm": 0.54296875,
362
+ "learning_rate": 2.4894387447193726e-05,
363
+ "loss": 2.505,
364
+ "mean_token_accuracy": 0.5181788290664554,
365
+ "num_input_tokens_seen": 796536976,
366
+ "num_tokens": 335636146.0,
367
+ "step": 1650
368
+ },
369
+ {
370
+ "epoch": 0.4103987808742098,
371
+ "grad_norm": 0.50390625,
372
+ "learning_rate": 2.5648762824381413e-05,
373
+ "loss": 2.5034,
374
+ "mean_token_accuracy": 0.5191374982148409,
375
+ "num_input_tokens_seen": 820727792,
376
+ "num_tokens": 345721073.0,
377
+ "step": 1700
378
+ },
379
+ {
380
+ "epoch": 0.422469333252863,
381
+ "grad_norm": 0.40234375,
382
+ "learning_rate": 2.64031382015691e-05,
383
+ "loss": 2.5049,
384
+ "mean_token_accuracy": 0.5170740441232919,
385
+ "num_input_tokens_seen": 844900704,
386
+ "num_tokens": 355941787.0,
387
+ "step": 1750
388
+ },
389
+ {
390
+ "epoch": 0.4345398856315162,
391
+ "grad_norm": 0.46484375,
392
+ "learning_rate": 2.7157513578756787e-05,
393
+ "loss": 2.4944,
394
+ "mean_token_accuracy": 0.5191365649551153,
395
+ "num_input_tokens_seen": 868882304,
396
+ "num_tokens": 366140276.0,
397
+ "step": 1800
398
+ },
399
+ {
400
+ "epoch": 0.4466104380101694,
401
+ "grad_norm": 0.4609375,
402
+ "learning_rate": 2.791188895594448e-05,
403
+ "loss": 2.4852,
404
+ "mean_token_accuracy": 0.519771606773138,
405
+ "num_input_tokens_seen": 893201120,
406
+ "num_tokens": 376397263.0,
407
+ "step": 1850
408
+ },
409
+ {
410
+ "epoch": 0.45868099038882265,
411
+ "grad_norm": 0.42578125,
412
+ "learning_rate": 2.866626433313217e-05,
413
+ "loss": 2.4242,
414
+ "mean_token_accuracy": 0.5218987537547946,
415
+ "num_input_tokens_seen": 917474880,
416
+ "num_tokens": 386613084.0,
417
+ "step": 1900
418
+ },
419
+ {
420
+ "epoch": 0.4707515427674759,
421
+ "grad_norm": 0.486328125,
422
+ "learning_rate": 2.942063971031986e-05,
423
+ "loss": 2.4118,
424
+ "mean_token_accuracy": 0.5205260647833347,
425
+ "num_input_tokens_seen": 941694304,
426
+ "num_tokens": 396813412.0,
427
+ "step": 1950
428
+ },
429
+ {
430
+ "epoch": 0.4828220951461291,
431
+ "grad_norm": 0.421875,
432
+ "learning_rate": 3.0175015087507546e-05,
433
+ "loss": 2.3856,
434
+ "num_input_tokens_seen": 965894272,
435
+ "step": 2000
436
+ },
437
+ {
438
+ "epoch": 0.4828220951461291,
439
+ "eval_loss": 2.281926155090332,
440
+ "eval_mean_token_accuracy": 0.5452906315098778,
441
+ "eval_num_tokens": 407043305.0,
442
+ "eval_runtime": 126.3646,
443
+ "eval_samples_per_second": 84.771,
444
+ "eval_steps_per_second": 21.193,
445
+ "num_input_tokens_seen": 965894272,
446
+ "step": 2000
447
  }
448
  ],
449
  "logging_steps": 50,
450
  "max_steps": 16568,
451
+ "num_input_tokens_seen": 965894272,
452
  "num_train_epochs": 4,
453
  "save_steps": 1000,
454
  "stateful_callbacks": {
 
463
  "attributes": {}
464
  }
465
  },
466
+ "total_flos": 2.5838614488809472e+17,
467
  "train_batch_size": 16,
468
  "trial_name": null,
469
  "trial_params": null