ljcamargo commited on
Commit
4470dc1
·
verified ·
1 Parent(s): b8829bd

Training in progress, step 750, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e062c064398a956fac974a79f09e1c9659956a9fdf96df5c70aa72db86396863
3
  size 3809184360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56df8a1ec29b8cbb0c42a5264c1932a6249f0599e86f80aff848eb9853130cc0
3
  size 3809184360
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45f3f458d13720e9a7d4cd7e4225dcd1cc7c188cc14bfe1f5cdf1c81c33315ba
3
  size 2457459557
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25722e7bf5fd0346d0d888862ebc90ce085ec4c8463a3f71bd2676c7fe82bedb
3
  size 2457459557
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81d4f4b1fec8227486261e0ca0332075e5277c747f156631e8baf30d09642001
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdb8b30fd18ca6a24d25d627bd2e13a1e8f8cc7de78183781c4d89f29175eee0
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b05714b3f7fdb6eaa769e652ab97d810715e0b9a1f62855693cf5929568c9e83
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b125c082de6f20d827ac9ce3a7228054a763972dd6779dfe18031391e49829
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2,
6
  "eval_steps": 500,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -358,6 +358,181 @@
358
  "learning_rate": 4.042338709677419e-05,
359
  "loss": 0.9574,
360
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  }
362
  ],
363
  "logging_steps": 10,
@@ -377,7 +552,7 @@
377
  "attributes": {}
378
  }
379
  },
380
- "total_flos": 9036323224934400.0,
381
  "train_batch_size": 2,
382
  "trial_name": null,
383
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3,
6
  "eval_steps": 500,
7
+ "global_step": 750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
358
  "learning_rate": 4.042338709677419e-05,
359
  "loss": 0.9574,
360
  "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.204,
364
+ "grad_norm": 21.463293075561523,
365
+ "learning_rate": 4.022177419354839e-05,
366
+ "loss": 0.9361,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.208,
371
+ "grad_norm": 18.652297973632812,
372
+ "learning_rate": 4.002016129032258e-05,
373
+ "loss": 0.8447,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.212,
378
+ "grad_norm": 20.645477294921875,
379
+ "learning_rate": 3.981854838709678e-05,
380
+ "loss": 1.1835,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.216,
385
+ "grad_norm": 34.288246154785156,
386
+ "learning_rate": 3.961693548387097e-05,
387
+ "loss": 0.9498,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.22,
392
+ "grad_norm": 20.84208106994629,
393
+ "learning_rate": 3.941532258064517e-05,
394
+ "loss": 0.815,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.224,
399
+ "grad_norm": 20.82636070251465,
400
+ "learning_rate": 3.9213709677419355e-05,
401
+ "loss": 0.9922,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.228,
406
+ "grad_norm": 22.959444046020508,
407
+ "learning_rate": 3.901209677419355e-05,
408
+ "loss": 0.8921,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.232,
413
+ "grad_norm": 17.115203857421875,
414
+ "learning_rate": 3.8810483870967744e-05,
415
+ "loss": 0.992,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.236,
420
+ "grad_norm": 46.51063919067383,
421
+ "learning_rate": 3.860887096774194e-05,
422
+ "loss": 1.0348,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.24,
427
+ "grad_norm": 11.92985725402832,
428
+ "learning_rate": 3.8407258064516134e-05,
429
+ "loss": 0.8958,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.244,
434
+ "grad_norm": 21.362239837646484,
435
+ "learning_rate": 3.820564516129033e-05,
436
+ "loss": 0.902,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.248,
441
+ "grad_norm": 23.70836639404297,
442
+ "learning_rate": 3.800403225806452e-05,
443
+ "loss": 1.1867,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.252,
448
+ "grad_norm": 13.512877464294434,
449
+ "learning_rate": 3.780241935483871e-05,
450
+ "loss": 1.1404,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.256,
455
+ "grad_norm": 26.752376556396484,
456
+ "learning_rate": 3.7600806451612906e-05,
457
+ "loss": 0.8339,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.26,
462
+ "grad_norm": 21.153675079345703,
463
+ "learning_rate": 3.7399193548387094e-05,
464
+ "loss": 0.9203,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.264,
469
+ "grad_norm": 19.008054733276367,
470
+ "learning_rate": 3.719758064516129e-05,
471
+ "loss": 1.029,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.268,
476
+ "grad_norm": 16.564945220947266,
477
+ "learning_rate": 3.6995967741935484e-05,
478
+ "loss": 0.9101,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.272,
483
+ "grad_norm": 17.936182022094727,
484
+ "learning_rate": 3.679435483870968e-05,
485
+ "loss": 0.8259,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.276,
490
+ "grad_norm": 28.142871856689453,
491
+ "learning_rate": 3.659274193548387e-05,
492
+ "loss": 0.8789,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.28,
497
+ "grad_norm": 22.677753448486328,
498
+ "learning_rate": 3.639112903225806e-05,
499
+ "loss": 0.7686,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.284,
504
+ "grad_norm": 24.73027801513672,
505
+ "learning_rate": 3.6189516129032256e-05,
506
+ "loss": 0.8722,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.288,
511
+ "grad_norm": 17.17127227783203,
512
+ "learning_rate": 3.598790322580645e-05,
513
+ "loss": 0.9406,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.292,
518
+ "grad_norm": 21.88510513305664,
519
+ "learning_rate": 3.5786290322580645e-05,
520
+ "loss": 0.9067,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.296,
525
+ "grad_norm": 16.742895126342773,
526
+ "learning_rate": 3.558467741935484e-05,
527
+ "loss": 0.756,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.3,
532
+ "grad_norm": 18.23801040649414,
533
+ "learning_rate": 3.5383064516129035e-05,
534
+ "loss": 0.8895,
535
+ "step": 750
536
  }
537
  ],
538
  "logging_steps": 10,
 
552
  "attributes": {}
553
  }
554
  },
555
+ "total_flos": 1.3535660950272e+16,
556
  "train_batch_size": 2,
557
  "trial_name": null,
558
  "trial_params": null