ljcamargo commited on
Commit
beef0bf
·
verified ·
1 Parent(s): 5ef5fec

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f773dc94967b7ee6e551db696f34227eb983340cfd6ce1fc1ae2d7d9ba5d943
3
  size 3826461296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb7d10311d3e39a51d905753f968302b84088808379e2021ad4fb96b9d17f533
3
  size 3826461296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8632716595be573dc0ed32f03651903484a98a76acfa6f6710d2c042e6a3c5ea
3
  size 2479123301
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3f8fbb04fd4187bd9f3419a33f7ebc13f51ccd8e6c98e79de1bb006485e1840
3
  size 2479123301
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:100000ea5d81ef450688ed224677c94deb5fa0928415e9497ab5b09006179386
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:025efedff4c7b611b2aee1ebff4b8949b561e4fc6b52396ed3a28018d052e541
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8b125c082de6f20d827ac9ce3a7228054a763972dd6779dfe18031391e49829
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eff7a0a9bded1a7bd3fdba602c9613b8d890d63962a1be5e1c426de3b212f74
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3,
6
  "eval_steps": 500,
7
- "global_step": 750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -533,6 +533,181 @@
533
  "learning_rate": 3.5383064516129035e-05,
534
  "loss": 0.8446,
535
  "step": 750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
  }
537
  ],
538
  "logging_steps": 10,
@@ -552,7 +727,7 @@
552
  "attributes": {}
553
  }
554
  },
555
- "total_flos": 1.3512970727276544e+16,
556
  "train_batch_size": 2,
557
  "trial_name": null,
558
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4,
6
  "eval_steps": 500,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
533
  "learning_rate": 3.5383064516129035e-05,
534
  "loss": 0.8446,
535
  "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.304,
539
+ "grad_norm": 21.616487503051758,
540
+ "learning_rate": 3.518145161290323e-05,
541
+ "loss": 0.8152,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.308,
546
+ "grad_norm": 13.02557373046875,
547
+ "learning_rate": 3.497983870967742e-05,
548
+ "loss": 0.6836,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.312,
553
+ "grad_norm": 22.531129837036133,
554
+ "learning_rate": 3.477822580645161e-05,
555
+ "loss": 0.8337,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.316,
560
+ "grad_norm": 24.401342391967773,
561
+ "learning_rate": 3.457661290322581e-05,
562
+ "loss": 0.7016,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.32,
567
+ "grad_norm": 15.145552635192871,
568
+ "learning_rate": 3.4375e-05,
569
+ "loss": 0.7273,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.324,
574
+ "grad_norm": 20.092849731445312,
575
+ "learning_rate": 3.41733870967742e-05,
576
+ "loss": 0.7287,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.328,
581
+ "grad_norm": 15.03227424621582,
582
+ "learning_rate": 3.397177419354839e-05,
583
+ "loss": 0.6846,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.332,
588
+ "grad_norm": 13.607186317443848,
589
+ "learning_rate": 3.377016129032258e-05,
590
+ "loss": 0.724,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.336,
595
+ "grad_norm": 24.089006423950195,
596
+ "learning_rate": 3.3568548387096774e-05,
597
+ "loss": 0.7993,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.34,
602
+ "grad_norm": 18.13868522644043,
603
+ "learning_rate": 3.336693548387097e-05,
604
+ "loss": 0.6757,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.344,
609
+ "grad_norm": 17.819578170776367,
610
+ "learning_rate": 3.3165322580645164e-05,
611
+ "loss": 0.6762,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.348,
616
+ "grad_norm": 29.358142852783203,
617
+ "learning_rate": 3.296370967741936e-05,
618
+ "loss": 0.6936,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.352,
623
+ "grad_norm": 27.773387908935547,
624
+ "learning_rate": 3.2762096774193553e-05,
625
+ "loss": 0.6531,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.356,
630
+ "grad_norm": 10.760952949523926,
631
+ "learning_rate": 3.256048387096775e-05,
632
+ "loss": 0.7669,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.36,
637
+ "grad_norm": 20.802019119262695,
638
+ "learning_rate": 3.2358870967741936e-05,
639
+ "loss": 0.6365,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.364,
644
+ "grad_norm": 18.4460391998291,
645
+ "learning_rate": 3.215725806451613e-05,
646
+ "loss": 0.9778,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.368,
651
+ "grad_norm": 23.085039138793945,
652
+ "learning_rate": 3.1955645161290326e-05,
653
+ "loss": 0.7247,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.372,
658
+ "grad_norm": 13.907185554504395,
659
+ "learning_rate": 3.175403225806452e-05,
660
+ "loss": 0.6822,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.376,
665
+ "grad_norm": 13.967331886291504,
666
+ "learning_rate": 3.1552419354838715e-05,
667
+ "loss": 0.7839,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.38,
672
+ "grad_norm": 14.392730712890625,
673
+ "learning_rate": 3.135080645161291e-05,
674
+ "loss": 0.7518,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.384,
679
+ "grad_norm": 12.910331726074219,
680
+ "learning_rate": 3.11491935483871e-05,
681
+ "loss": 0.6257,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.388,
686
+ "grad_norm": 17.412134170532227,
687
+ "learning_rate": 3.0947580645161286e-05,
688
+ "loss": 0.8162,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.392,
693
+ "grad_norm": 16.036808013916016,
694
+ "learning_rate": 3.074596774193548e-05,
695
+ "loss": 0.8296,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.396,
700
+ "grad_norm": 14.738393783569336,
701
+ "learning_rate": 3.0544354838709676e-05,
702
+ "loss": 0.5135,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.4,
707
+ "grad_norm": 13.25367546081543,
708
+ "learning_rate": 3.034274193548387e-05,
709
+ "loss": 0.7414,
710
+ "step": 1000
711
  }
712
  ],
713
  "logging_steps": 10,
 
727
  "attributes": {}
728
  }
729
  },
730
+ "total_flos": 1.7984652389369856e+16,
731
  "train_batch_size": 2,
732
  "trial_name": null,
733
  "trial_params": null