Azrail commited on
Commit
857f50c
·
verified ·
1 Parent(s): 4c981e0

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f81f1606cbb4066658322a9b01b024ebe1fe01d7f9c79d6a2b4af556fe6aa975
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a313d509db7def5f49214f9d05b89c42300ce0ca3fd0d7a1b4c56154cf0a72db
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2bdc54e623a858f4b04c457346b0f903dc827e2ac006197959be017f0bd1f45
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ad0ef37ec0c5bff68abf0acafb2e524cd857e55490e94ef61cc44d1f7b08679
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11ff07d587c5a9307740887f980afedff8f43c8da2bd4cbf45f5f3cf546cf38d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b2c7bb39719c8f039a2a4dd5473921c41a834e3390491b2b93e9a2772ee802f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a3d374142fb5a9a375b1a828a38137498daacdc810ac93109a9de1e8639e3a1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bff7808903acfc88c8f83b83043a92f900db8f72ffc7d87d61c8ee1abceef7bc
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.06589812972870564,
6
  "eval_steps": 500,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -542,11 +542,189 @@
542
  "eval_steps_per_second": 18.952,
543
  "num_input_tokens_seen": 3145728000,
544
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
  }
546
  ],
547
  "logging_steps": 50,
548
  "max_steps": 200000,
549
- "num_input_tokens_seen": 3145728000,
550
  "num_train_epochs": 5,
551
  "save_steps": 1000,
552
  "stateful_callbacks": {
@@ -561,7 +739,7 @@
561
  "attributes": {}
562
  }
563
  },
564
- "total_flos": 1.791515147894784e+18,
565
  "train_batch_size": 64,
566
  "trial_name": null,
567
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.08786417297160752,
6
  "eval_steps": 500,
7
+ "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
542
  "eval_steps_per_second": 18.952,
543
  "num_input_tokens_seen": 3145728000,
544
  "step": 3000
545
+ },
546
+ {
547
+ "epoch": 0.06699643189085074,
548
+ "grad_norm": 0.21884848177433014,
549
+ "learning_rate": 0.00061,
550
+ "loss": 3.2437,
551
+ "num_input_tokens_seen": 3198156800,
552
+ "step": 3050
553
+ },
554
+ {
555
+ "epoch": 0.06809473405299582,
556
+ "grad_norm": 0.2534893751144409,
557
+ "learning_rate": 0.00062,
558
+ "loss": 3.2366,
559
+ "num_input_tokens_seen": 3250585600,
560
+ "step": 3100
561
+ },
562
+ {
563
+ "epoch": 0.06919303621514092,
564
+ "grad_norm": 0.2408875823020935,
565
+ "learning_rate": 0.00063,
566
+ "loss": 3.2264,
567
+ "num_input_tokens_seen": 3303014400,
568
+ "step": 3150
569
+ },
570
+ {
571
+ "epoch": 0.07029133837728602,
572
+ "grad_norm": 0.22240856289863586,
573
+ "learning_rate": 0.00064,
574
+ "loss": 3.2102,
575
+ "num_input_tokens_seen": 3355443200,
576
+ "step": 3200
577
+ },
578
+ {
579
+ "epoch": 0.0713896405394311,
580
+ "grad_norm": 0.21527299284934998,
581
+ "learning_rate": 0.0006500000000000001,
582
+ "loss": 3.1985,
583
+ "num_input_tokens_seen": 3407872000,
584
+ "step": 3250
585
+ },
586
+ {
587
+ "epoch": 0.0724879427015762,
588
+ "grad_norm": 0.26642242074012756,
589
+ "learning_rate": 0.00066,
590
+ "loss": 3.1923,
591
+ "num_input_tokens_seen": 3460300800,
592
+ "step": 3300
593
+ },
594
+ {
595
+ "epoch": 0.0735862448637213,
596
+ "grad_norm": 0.22164040803909302,
597
+ "learning_rate": 0.00067,
598
+ "loss": 3.1848,
599
+ "num_input_tokens_seen": 3512729600,
600
+ "step": 3350
601
+ },
602
+ {
603
+ "epoch": 0.07468454702586638,
604
+ "grad_norm": 0.21594341099262238,
605
+ "learning_rate": 0.00068,
606
+ "loss": 3.1764,
607
+ "num_input_tokens_seen": 3565158400,
608
+ "step": 3400
609
+ },
610
+ {
611
+ "epoch": 0.07578284918801148,
612
+ "grad_norm": 0.1921539604663849,
613
+ "learning_rate": 0.00069,
614
+ "loss": 3.1643,
615
+ "num_input_tokens_seen": 3617587200,
616
+ "step": 3450
617
+ },
618
+ {
619
+ "epoch": 0.07688115135015658,
620
+ "grad_norm": 0.2266080528497696,
621
+ "learning_rate": 0.0007,
622
+ "loss": 3.1647,
623
+ "num_input_tokens_seen": 3670016000,
624
+ "step": 3500
625
+ },
626
+ {
627
+ "epoch": 0.07688115135015658,
628
+ "eval_loss": 3.061373472213745,
629
+ "eval_runtime": 63.388,
630
+ "eval_samples_per_second": 78.879,
631
+ "eval_steps_per_second": 19.72,
632
+ "num_input_tokens_seen": 3670016000,
633
+ "step": 3500
634
+ },
635
+ {
636
+ "epoch": 0.07797945351230168,
637
+ "grad_norm": 0.19900226593017578,
638
+ "learning_rate": 0.00071,
639
+ "loss": 3.1557,
640
+ "num_input_tokens_seen": 3722444800,
641
+ "step": 3550
642
+ },
643
+ {
644
+ "epoch": 0.07907775567444676,
645
+ "grad_norm": 0.20299012959003448,
646
+ "learning_rate": 0.0007199999999999999,
647
+ "loss": 3.1503,
648
+ "num_input_tokens_seen": 3774873600,
649
+ "step": 3600
650
+ },
651
+ {
652
+ "epoch": 0.08017605783659186,
653
+ "grad_norm": 0.232399120926857,
654
+ "learning_rate": 0.00073,
655
+ "loss": 3.1387,
656
+ "num_input_tokens_seen": 3827302400,
657
+ "step": 3650
658
+ },
659
+ {
660
+ "epoch": 0.08127435999873696,
661
+ "grad_norm": 0.2127719670534134,
662
+ "learning_rate": 0.00074,
663
+ "loss": 3.1388,
664
+ "num_input_tokens_seen": 3879731200,
665
+ "step": 3700
666
+ },
667
+ {
668
+ "epoch": 0.08237266216088204,
669
+ "grad_norm": 0.22336533665657043,
670
+ "learning_rate": 0.00075,
671
+ "loss": 3.1247,
672
+ "num_input_tokens_seen": 3932160000,
673
+ "step": 3750
674
+ },
675
+ {
676
+ "epoch": 0.08347096432302714,
677
+ "grad_norm": 0.18270662426948547,
678
+ "learning_rate": 0.00076,
679
+ "loss": 3.1192,
680
+ "num_input_tokens_seen": 3984588800,
681
+ "step": 3800
682
+ },
683
+ {
684
+ "epoch": 0.08456926648517224,
685
+ "grad_norm": 0.16843897104263306,
686
+ "learning_rate": 0.0007700000000000001,
687
+ "loss": 3.1153,
688
+ "num_input_tokens_seen": 4037017600,
689
+ "step": 3850
690
+ },
691
+ {
692
+ "epoch": 0.08566756864731732,
693
+ "grad_norm": 0.19947747886180878,
694
+ "learning_rate": 0.0007800000000000001,
695
+ "loss": 3.1048,
696
+ "num_input_tokens_seen": 4089446400,
697
+ "step": 3900
698
+ },
699
+ {
700
+ "epoch": 0.08676587080946242,
701
+ "grad_norm": 0.17078733444213867,
702
+ "learning_rate": 0.00079,
703
+ "loss": 3.1014,
704
+ "num_input_tokens_seen": 4141875200,
705
+ "step": 3950
706
+ },
707
+ {
708
+ "epoch": 0.08786417297160752,
709
+ "grad_norm": 0.22091113030910492,
710
+ "learning_rate": 0.0008,
711
+ "loss": 3.0982,
712
+ "num_input_tokens_seen": 4194304000,
713
+ "step": 4000
714
+ },
715
+ {
716
+ "epoch": 0.08786417297160752,
717
+ "eval_loss": 2.9978296756744385,
718
+ "eval_runtime": 65.6064,
719
+ "eval_samples_per_second": 76.212,
720
+ "eval_steps_per_second": 19.053,
721
+ "num_input_tokens_seen": 4194304000,
722
+ "step": 4000
723
  }
724
  ],
725
  "logging_steps": 50,
726
  "max_steps": 200000,
727
+ "num_input_tokens_seen": 4194304000,
728
  "num_train_epochs": 5,
729
  "save_steps": 1000,
730
  "stateful_callbacks": {
 
739
  "attributes": {}
740
  }
741
  },
742
+ "total_flos": 2.388686863859712e+18,
743
  "train_batch_size": 64,
744
  "trial_name": null,
745
  "trial_params": null