irishprancer commited on
Commit
610c578
·
verified ·
1 Parent(s): 1861e5d

Training in progress, step 750, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:372891febfadfccbabac9570878fa86511c85965c83b7adbeef55c8d100f4f2d
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e70fb031762fcfd8c2a1f24a8bd93eb87a81e3c17768955c081c28620420fc2
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:620d4f511180616a4534055c09585878071e140c18fdd5dc3beb5a71366c356b
3
  size 1054136250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1fe1a49d9560e0ca1782d1d6f8e87b1fe1f66e212ac402fe9c344df8a527655
3
  size 1054136250
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cf065c84ff75b4c8bc24f08fcd1880a75e81b5b99444434709d4c17d68aad0f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2273afa8515c993e20ab8b02a38f3946423176fb53d9323aa6e0510256a9c8
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb248e7cc2fe7b509c9e866be7b72af3b33225d8b86373c1a62393cc3a24f4da
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f47df519d3e34f85833ffe9513be298918979811657719c019fec7ab68351e14
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7259252071380615,
3
- "best_model_checkpoint": "./output/checkpoint-450",
4
- "epoch": 10.0,
5
  "eval_steps": 150,
6
- "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -402,6 +402,296 @@
402
  "EMA_steps_per_second": 22.503,
403
  "epoch": 10.0,
404
  "step": 450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  }
406
  ],
407
  "logging_steps": 10,
@@ -421,7 +711,7 @@
421
  "attributes": {}
422
  }
423
  },
424
- "total_flos": 4801636770840576.0,
425
  "train_batch_size": 2,
426
  "trial_name": null,
427
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7214915752410889,
3
+ "best_model_checkpoint": "./output/checkpoint-750",
4
+ "epoch": 16.666666666666668,
5
  "eval_steps": 150,
6
+ "global_step": 750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
402
  "EMA_steps_per_second": 22.503,
403
  "epoch": 10.0,
404
  "step": 450
405
+ },
406
+ {
407
+ "epoch": 10.222222222222221,
408
+ "grad_norm": 2.220747470855713,
409
+ "learning_rate": 2.9996147467351836e-05,
410
+ "loss": 0.5056,
411
+ "step": 460
412
+ },
413
+ {
414
+ "epoch": 10.444444444444445,
415
+ "grad_norm": 1.4205608367919922,
416
+ "learning_rate": 2.9995930474939753e-05,
417
+ "loss": 0.4901,
418
+ "step": 470
419
+ },
420
+ {
421
+ "epoch": 10.666666666666666,
422
+ "grad_norm": 1.9306081533432007,
423
+ "learning_rate": 2.9995707538619954e-05,
424
+ "loss": 0.6361,
425
+ "step": 480
426
+ },
427
+ {
428
+ "epoch": 10.88888888888889,
429
+ "grad_norm": 2.1457133293151855,
430
+ "learning_rate": 2.9995478658480802e-05,
431
+ "loss": 0.5528,
432
+ "step": 490
433
+ },
434
+ {
435
+ "epoch": 11.11111111111111,
436
+ "grad_norm": 1.8677959442138672,
437
+ "learning_rate": 2.9995243834613023e-05,
438
+ "loss": 0.5233,
439
+ "step": 500
440
+ },
441
+ {
442
+ "epoch": 11.333333333333334,
443
+ "grad_norm": 1.6708972454071045,
444
+ "learning_rate": 2.9995003067109687e-05,
445
+ "loss": 0.5387,
446
+ "step": 510
447
+ },
448
+ {
449
+ "epoch": 11.555555555555555,
450
+ "grad_norm": 2.6434991359710693,
451
+ "learning_rate": 2.9994756356066226e-05,
452
+ "loss": 0.5847,
453
+ "step": 520
454
+ },
455
+ {
456
+ "epoch": 11.777777777777779,
457
+ "grad_norm": 2.2601070404052734,
458
+ "learning_rate": 2.999450370158044e-05,
459
+ "loss": 0.5341,
460
+ "step": 530
461
+ },
462
+ {
463
+ "epoch": 12.0,
464
+ "grad_norm": 1.5335863828659058,
465
+ "learning_rate": 2.9994245103752457e-05,
466
+ "loss": 0.5242,
467
+ "step": 540
468
+ },
469
+ {
470
+ "epoch": 12.222222222222221,
471
+ "grad_norm": 1.2394074201583862,
472
+ "learning_rate": 2.999398056268479e-05,
473
+ "loss": 0.5356,
474
+ "step": 550
475
+ },
476
+ {
477
+ "epoch": 12.444444444444445,
478
+ "grad_norm": 1.472650170326233,
479
+ "learning_rate": 2.9993710078482286e-05,
480
+ "loss": 0.415,
481
+ "step": 560
482
+ },
483
+ {
484
+ "epoch": 12.666666666666666,
485
+ "grad_norm": 3.3844995498657227,
486
+ "learning_rate": 2.9993433651252164e-05,
487
+ "loss": 0.6192,
488
+ "step": 570
489
+ },
490
+ {
491
+ "epoch": 12.88888888888889,
492
+ "grad_norm": 1.4811444282531738,
493
+ "learning_rate": 2.9993151281103986e-05,
494
+ "loss": 0.5351,
495
+ "step": 580
496
+ },
497
+ {
498
+ "epoch": 13.11111111111111,
499
+ "grad_norm": 2.4430384635925293,
500
+ "learning_rate": 2.9992862968149675e-05,
501
+ "loss": 0.4177,
502
+ "step": 590
503
+ },
504
+ {
505
+ "epoch": 13.333333333333334,
506
+ "grad_norm": 2.456298351287842,
507
+ "learning_rate": 2.9992568712503513e-05,
508
+ "loss": 0.5317,
509
+ "step": 600
510
+ },
511
+ {
512
+ "epoch": 13.333333333333334,
513
+ "eval_loss": 0.7220126986503601,
514
+ "eval_runtime": 0.5023,
515
+ "eval_samples_per_second": 19.91,
516
+ "eval_steps_per_second": 19.91,
517
+ "step": 600
518
+ },
519
+ {
520
+ "Start_State_loss": 0.7391407489776611,
521
+ "Start_State_runtime": 0.4727,
522
+ "Start_State_samples_per_second": 21.154,
523
+ "Start_State_steps_per_second": 21.154,
524
+ "epoch": 13.333333333333334,
525
+ "step": 600
526
+ },
527
+ {
528
+ "Raw_Model_loss": 0.7220126986503601,
529
+ "Raw_Model_runtime": 0.5347,
530
+ "Raw_Model_samples_per_second": 18.703,
531
+ "Raw_Model_steps_per_second": 18.703,
532
+ "epoch": 13.333333333333334,
533
+ "step": 600
534
+ },
535
+ {
536
+ "SWA_loss": 0.7282296419143677,
537
+ "SWA_runtime": 0.5752,
538
+ "SWA_samples_per_second": 17.384,
539
+ "SWA_steps_per_second": 17.384,
540
+ "epoch": 13.333333333333334,
541
+ "step": 600
542
+ },
543
+ {
544
+ "EMA_loss": 0.7385488748550415,
545
+ "EMA_runtime": 0.5662,
546
+ "EMA_samples_per_second": 17.661,
547
+ "EMA_steps_per_second": 17.661,
548
+ "epoch": 13.333333333333334,
549
+ "step": 600
550
+ },
551
+ {
552
+ "epoch": 13.555555555555555,
553
+ "grad_norm": 2.3377010822296143,
554
+ "learning_rate": 2.9992268514282122e-05,
555
+ "loss": 0.565,
556
+ "step": 610
557
+ },
558
+ {
559
+ "epoch": 13.777777777777779,
560
+ "grad_norm": 2.2196319103240967,
561
+ "learning_rate": 2.99919623736045e-05,
562
+ "loss": 0.441,
563
+ "step": 620
564
+ },
565
+ {
566
+ "epoch": 14.0,
567
+ "grad_norm": 2.2767350673675537,
568
+ "learning_rate": 2.9991650290591996e-05,
569
+ "loss": 0.6033,
570
+ "step": 630
571
+ },
572
+ {
573
+ "epoch": 14.222222222222221,
574
+ "grad_norm": 2.253643035888672,
575
+ "learning_rate": 2.99913322653683e-05,
576
+ "loss": 0.4925,
577
+ "step": 640
578
+ },
579
+ {
580
+ "epoch": 14.444444444444445,
581
+ "grad_norm": 1.8424692153930664,
582
+ "learning_rate": 2.9991008298059473e-05,
583
+ "loss": 0.5007,
584
+ "step": 650
585
+ },
586
+ {
587
+ "epoch": 14.666666666666666,
588
+ "grad_norm": 1.5401960611343384,
589
+ "learning_rate": 2.9990678388793924e-05,
590
+ "loss": 0.5318,
591
+ "step": 660
592
+ },
593
+ {
594
+ "epoch": 14.88888888888889,
595
+ "grad_norm": 1.2824598550796509,
596
+ "learning_rate": 2.999034253770242e-05,
597
+ "loss": 0.4575,
598
+ "step": 670
599
+ },
600
+ {
601
+ "epoch": 15.11111111111111,
602
+ "grad_norm": 2.5211098194122314,
603
+ "learning_rate": 2.9990000744918076e-05,
604
+ "loss": 0.449,
605
+ "step": 680
606
+ },
607
+ {
608
+ "epoch": 15.333333333333334,
609
+ "grad_norm": 1.6035919189453125,
610
+ "learning_rate": 2.9989653010576372e-05,
611
+ "loss": 0.4529,
612
+ "step": 690
613
+ },
614
+ {
615
+ "epoch": 15.555555555555555,
616
+ "grad_norm": 1.4720438718795776,
617
+ "learning_rate": 2.9989299334815138e-05,
618
+ "loss": 0.4804,
619
+ "step": 700
620
+ },
621
+ {
622
+ "epoch": 15.777777777777779,
623
+ "grad_norm": 2.241570472717285,
624
+ "learning_rate": 2.9988939717774558e-05,
625
+ "loss": 0.524,
626
+ "step": 710
627
+ },
628
+ {
629
+ "epoch": 16.0,
630
+ "grad_norm": 1.3463960886001587,
631
+ "learning_rate": 2.9988574159597174e-05,
632
+ "loss": 0.5105,
633
+ "step": 720
634
+ },
635
+ {
636
+ "epoch": 16.22222222222222,
637
+ "grad_norm": 2.1436588764190674,
638
+ "learning_rate": 2.9988202660427887e-05,
639
+ "loss": 0.4644,
640
+ "step": 730
641
+ },
642
+ {
643
+ "epoch": 16.444444444444443,
644
+ "grad_norm": 3.0679702758789062,
645
+ "learning_rate": 2.9987825220413937e-05,
646
+ "loss": 0.5349,
647
+ "step": 740
648
+ },
649
+ {
650
+ "epoch": 16.666666666666668,
651
+ "grad_norm": 1.908082127571106,
652
+ "learning_rate": 2.998744183970494e-05,
653
+ "loss": 0.4263,
654
+ "step": 750
655
+ },
656
+ {
657
+ "epoch": 16.666666666666668,
658
+ "eval_loss": 0.7214915752410889,
659
+ "eval_runtime": 0.4155,
660
+ "eval_samples_per_second": 24.069,
661
+ "eval_steps_per_second": 24.069,
662
+ "step": 750
663
+ },
664
+ {
665
+ "Start_State_loss": 0.7391407489776611,
666
+ "Start_State_runtime": 0.4325,
667
+ "Start_State_samples_per_second": 23.123,
668
+ "Start_State_steps_per_second": 23.123,
669
+ "epoch": 16.666666666666668,
670
+ "step": 750
671
+ },
672
+ {
673
+ "Raw_Model_loss": 0.7214915752410889,
674
+ "Raw_Model_runtime": 0.4148,
675
+ "Raw_Model_samples_per_second": 24.108,
676
+ "Raw_Model_steps_per_second": 24.108,
677
+ "epoch": 16.666666666666668,
678
+ "step": 750
679
+ },
680
+ {
681
+ "SWA_loss": 0.7257974147796631,
682
+ "SWA_runtime": 0.4246,
683
+ "SWA_samples_per_second": 23.553,
684
+ "SWA_steps_per_second": 23.553,
685
+ "epoch": 16.666666666666668,
686
+ "step": 750
687
+ },
688
+ {
689
+ "EMA_loss": 0.7391572594642639,
690
+ "EMA_runtime": 0.4233,
691
+ "EMA_samples_per_second": 23.622,
692
+ "EMA_steps_per_second": 23.622,
693
+ "epoch": 16.666666666666668,
694
+ "step": 750
695
  }
696
  ],
697
  "logging_steps": 10,
 
711
  "attributes": {}
712
  }
713
  },
714
+ "total_flos": 7981049240027136.0,
715
  "train_batch_size": 2,
716
  "trial_name": null,
717
  "trial_params": null