Azrail committed on
Commit
ee18e01
·
verified ·
1 Parent(s): cd85b47

Training in progress, step 16000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdf1b0f1f31678e02e392db01936d097de602c17608d494f22362854ea1faea3
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:314ace06359dc0a1588628e331e678c1756658b1e5b55ebfc40ace3ca7f19975
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e719c4342dca47d074b491692ded6689afd8a826c27c44b36fe769c38219ad92
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:145b83ea857b82f412f448a97a8f9f5c98bbb7dffd2fbc6b5c490420cb05fd6d
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed76fec8b31c184dac30ebd8181dfe95aa10c557692428e198df8bc24024a3d1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37104b59ca9cf24df14fa2064ba3c6de266e9640e43d473d5abb1378b6567288
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2f83db9058ff0e3a2778afbea4452d3483a420d1f349a8a276a60ee0edb90fc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fdb7ab6d17f71d52f24a53832c179d2b096c87d3e016344e7361ae053671325
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3294906486435282,
6
  "eval_steps": 500,
7
- "global_step": 15000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2678,11 +2678,189 @@
2678
  "eval_steps_per_second": 18.968,
2679
  "num_input_tokens_seen": 15728640000,
2680
  "step": 15000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2681
  }
2682
  ],
2683
  "logging_steps": 50,
2684
  "max_steps": 200000,
2685
- "num_input_tokens_seen": 15728640000,
2686
  "num_train_epochs": 5,
2687
  "save_steps": 1000,
2688
  "stateful_callbacks": {
@@ -2697,7 +2875,7 @@
2697
  "attributes": {}
2698
  }
2699
  },
2700
- "total_flos": 8.95757573947392e+18,
2701
  "train_batch_size": 64,
2702
  "trial_name": null,
2703
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3514566918864301,
6
  "eval_steps": 500,
7
+ "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2678
  "eval_steps_per_second": 18.968,
2679
  "num_input_tokens_seen": 15728640000,
2680
  "step": 15000
2681
+ },
2682
+ {
2683
+ "epoch": 0.3305889508056733,
2684
+ "grad_norm": 0.11965218186378479,
2685
+ "learning_rate": 0.001,
2686
+ "loss": 2.7443,
2687
+ "num_input_tokens_seen": 15781068800,
2688
+ "step": 15050
2689
+ },
2690
+ {
2691
+ "epoch": 0.33168725296781837,
2692
+ "grad_norm": 0.14668309688568115,
2693
+ "learning_rate": 0.001,
2694
+ "loss": 2.7496,
2695
+ "num_input_tokens_seen": 15833497600,
2696
+ "step": 15100
2697
+ },
2698
+ {
2699
+ "epoch": 0.3327855551299635,
2700
+ "grad_norm": 0.12492749840021133,
2701
+ "learning_rate": 0.001,
2702
+ "loss": 2.7485,
2703
+ "num_input_tokens_seen": 15885926400,
2704
+ "step": 15150
2705
+ },
2706
+ {
2707
+ "epoch": 0.33388385729210857,
2708
+ "grad_norm": 0.1333470493555069,
2709
+ "learning_rate": 0.001,
2710
+ "loss": 2.7511,
2711
+ "num_input_tokens_seen": 15938355200,
2712
+ "step": 15200
2713
+ },
2714
+ {
2715
+ "epoch": 0.33498215945425364,
2716
+ "grad_norm": 0.14136457443237305,
2717
+ "learning_rate": 0.001,
2718
+ "loss": 2.74,
2719
+ "num_input_tokens_seen": 15990784000,
2720
+ "step": 15250
2721
+ },
2722
+ {
2723
+ "epoch": 0.33608046161639876,
2724
+ "grad_norm": 0.14975622296333313,
2725
+ "learning_rate": 0.001,
2726
+ "loss": 2.7543,
2727
+ "num_input_tokens_seen": 16043212800,
2728
+ "step": 15300
2729
+ },
2730
+ {
2731
+ "epoch": 0.33717876377854383,
2732
+ "grad_norm": 0.1193549856543541,
2733
+ "learning_rate": 0.001,
2734
+ "loss": 2.7497,
2735
+ "num_input_tokens_seen": 16095641600,
2736
+ "step": 15350
2737
+ },
2738
+ {
2739
+ "epoch": 0.33827706594068896,
2740
+ "grad_norm": 0.1429223120212555,
2741
+ "learning_rate": 0.001,
2742
+ "loss": 2.7463,
2743
+ "num_input_tokens_seen": 16148070400,
2744
+ "step": 15400
2745
+ },
2746
+ {
2747
+ "epoch": 0.33937536810283403,
2748
+ "grad_norm": 0.16827304661273956,
2749
+ "learning_rate": 0.001,
2750
+ "loss": 2.7415,
2751
+ "num_input_tokens_seen": 16200499200,
2752
+ "step": 15450
2753
+ },
2754
+ {
2755
+ "epoch": 0.3404736702649791,
2756
+ "grad_norm": 0.13952937722206116,
2757
+ "learning_rate": 0.001,
2758
+ "loss": 2.7388,
2759
+ "num_input_tokens_seen": 16252928000,
2760
+ "step": 15500
2761
+ },
2762
+ {
2763
+ "epoch": 0.3404736702649791,
2764
+ "eval_loss": 2.6472089290618896,
2765
+ "eval_runtime": 65.4943,
2766
+ "eval_samples_per_second": 76.343,
2767
+ "eval_steps_per_second": 19.086,
2768
+ "num_input_tokens_seen": 16252928000,
2769
+ "step": 15500
2770
+ },
2771
+ {
2772
+ "epoch": 0.3415719724271242,
2773
+ "grad_norm": 0.13359376788139343,
2774
+ "learning_rate": 0.001,
2775
+ "loss": 2.7522,
2776
+ "num_input_tokens_seen": 16305356800,
2777
+ "step": 15550
2778
+ },
2779
+ {
2780
+ "epoch": 0.3426702745892693,
2781
+ "grad_norm": 0.13101224601268768,
2782
+ "learning_rate": 0.001,
2783
+ "loss": 2.7483,
2784
+ "num_input_tokens_seen": 16357785600,
2785
+ "step": 15600
2786
+ },
2787
+ {
2788
+ "epoch": 0.3437685767514144,
2789
+ "grad_norm": 0.14006133377552032,
2790
+ "learning_rate": 0.001,
2791
+ "loss": 2.7439,
2792
+ "num_input_tokens_seen": 16410214400,
2793
+ "step": 15650
2794
+ },
2795
+ {
2796
+ "epoch": 0.3448668789135595,
2797
+ "grad_norm": 0.15062059462070465,
2798
+ "learning_rate": 0.001,
2799
+ "loss": 2.7454,
2800
+ "num_input_tokens_seen": 16462643200,
2801
+ "step": 15700
2802
+ },
2803
+ {
2804
+ "epoch": 0.3459651810757046,
2805
+ "grad_norm": 0.13822610676288605,
2806
+ "learning_rate": 0.001,
2807
+ "loss": 2.74,
2808
+ "num_input_tokens_seen": 16515072000,
2809
+ "step": 15750
2810
+ },
2811
+ {
2812
+ "epoch": 0.3470634832378497,
2813
+ "grad_norm": 0.1368207335472107,
2814
+ "learning_rate": 0.001,
2815
+ "loss": 2.745,
2816
+ "num_input_tokens_seen": 16567500800,
2817
+ "step": 15800
2818
+ },
2819
+ {
2820
+ "epoch": 0.34816178539999476,
2821
+ "grad_norm": 0.14573991298675537,
2822
+ "learning_rate": 0.001,
2823
+ "loss": 2.742,
2824
+ "num_input_tokens_seen": 16619929600,
2825
+ "step": 15850
2826
+ },
2827
+ {
2828
+ "epoch": 0.3492600875621399,
2829
+ "grad_norm": 12.025542259216309,
2830
+ "learning_rate": 0.001,
2831
+ "loss": 3.3278,
2832
+ "num_input_tokens_seen": 16672358400,
2833
+ "step": 15900
2834
+ },
2835
+ {
2836
+ "epoch": 0.35035838972428496,
2837
+ "grad_norm": 0.15699023008346558,
2838
+ "learning_rate": 0.001,
2839
+ "loss": 4.04,
2840
+ "num_input_tokens_seen": 16724787200,
2841
+ "step": 15950
2842
+ },
2843
+ {
2844
+ "epoch": 0.3514566918864301,
2845
+ "grad_norm": 0.13041897118091583,
2846
+ "learning_rate": 0.001,
2847
+ "loss": 2.8233,
2848
+ "num_input_tokens_seen": 16777216000,
2849
+ "step": 16000
2850
+ },
2851
+ {
2852
+ "epoch": 0.3514566918864301,
2853
+ "eval_loss": 2.689638614654541,
2854
+ "eval_runtime": 66.0949,
2855
+ "eval_samples_per_second": 75.649,
2856
+ "eval_steps_per_second": 18.912,
2857
+ "num_input_tokens_seen": 16777216000,
2858
+ "step": 16000
2859
  }
2860
  ],
2861
  "logging_steps": 50,
2862
  "max_steps": 200000,
2863
+ "num_input_tokens_seen": 16777216000,
2864
  "num_train_epochs": 5,
2865
  "save_steps": 1000,
2866
  "stateful_callbacks": {
 
2875
  "attributes": {}
2876
  }
2877
  },
2878
+ "total_flos": 9.554747455438848e+18,
2879
  "train_batch_size": 64,
2880
  "trial_name": null,
2881
  "trial_params": null