shulijia committed on
Commit
60ac9f5
·
verified ·
1 Parent(s): d0960f8

Training in progress, step 3270, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31d9122a84b2d3c5cf5893ddcf3d410c40a6e910883ed09aa758af7d3d918b91
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b681f70180e1b9b225d43794577d9735c1e90ae1f568ab2e1fb38b668291955
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67b348575e8f1b08569c8702876caefe242bd5ec9e432ba23de8cf313eac7d95
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4baa6e79b984a80f604cda311b89dfe7d2e9a825e68647fc5d3797a8b813e2ea
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a46667fdbbd166561e9277fe7ced0cc3234da12ccb11f38025f6a5cb754a1493
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:860913dca1e25255803c968661dca63cd03ec08cdee939cf5f78b0d42cbe6907
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.7528979685527375,
6
  "eval_steps": 100,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2708,6 +2708,249 @@
2708
  "mean_token_accuracy": 0.8454745601862669,
2709
  "num_tokens": 24560640.0,
2710
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2711
  }
2712
  ],
2713
  "logging_steps": 10,
@@ -2722,12 +2965,12 @@
2722
  "should_evaluate": false,
2723
  "should_log": false,
2724
  "should_save": true,
2725
- "should_training_stop": false
2726
  },
2727
  "attributes": {}
2728
  }
2729
  },
2730
- "total_flos": 6.490897571119104e+16,
2731
  "train_batch_size": 2,
2732
  "trial_name": null,
2733
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 100,
7
+ "global_step": 3270,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2708
  "mean_token_accuracy": 0.8454745601862669,
2709
  "num_tokens": 24560640.0,
2710
  "step": 3000
2711
+ },
2712
+ {
2713
+ "epoch": 2.7620796510960632,
2714
+ "grad_norm": 1.08968985080719,
2715
+ "learning_rate": 8.868501529051989e-07,
2716
+ "loss": 0.1619,
2717
+ "mean_token_accuracy": 0.8165728993713856,
2718
+ "num_tokens": 24642560.0,
2719
+ "step": 3010
2720
+ },
2721
+ {
2722
+ "epoch": 2.7712613336393894,
2723
+ "grad_norm": 0.9702316522598267,
2724
+ "learning_rate": 8.528712198436969e-07,
2725
+ "loss": 0.1506,
2726
+ "mean_token_accuracy": 0.8149584170430899,
2727
+ "num_tokens": 24724480.0,
2728
+ "step": 3020
2729
+ },
2730
+ {
2731
+ "epoch": 2.7804430161827156,
2732
+ "grad_norm": 1.215406060218811,
2733
+ "learning_rate": 8.188922867821951e-07,
2734
+ "loss": 0.1273,
2735
+ "mean_token_accuracy": 0.8383683957159519,
2736
+ "num_tokens": 24806400.0,
2737
+ "step": 3030
2738
+ },
2739
+ {
2740
+ "epoch": 2.7896246987260414,
2741
+ "grad_norm": 1.3644214868545532,
2742
+ "learning_rate": 7.849133537206933e-07,
2743
+ "loss": 0.1361,
2744
+ "mean_token_accuracy": 0.823642372712493,
2745
+ "num_tokens": 24888320.0,
2746
+ "step": 3040
2747
+ },
2748
+ {
2749
+ "epoch": 2.7988063812693675,
2750
+ "grad_norm": 1.827764630317688,
2751
+ "learning_rate": 7.509344206591913e-07,
2752
+ "loss": 0.1394,
2753
+ "mean_token_accuracy": 0.8243884552270174,
2754
+ "num_tokens": 24970240.0,
2755
+ "step": 3050
2756
+ },
2757
+ {
2758
+ "epoch": 2.8079880638126937,
2759
+ "grad_norm": 1.2215831279754639,
2760
+ "learning_rate": 7.169554875976895e-07,
2761
+ "loss": 0.1345,
2762
+ "mean_token_accuracy": 0.8343688864260912,
2763
+ "num_tokens": 25052160.0,
2764
+ "step": 3060
2765
+ },
2766
+ {
2767
+ "epoch": 2.8171697463560195,
2768
+ "grad_norm": 1.2051235437393188,
2769
+ "learning_rate": 6.829765545361876e-07,
2770
+ "loss": 0.1448,
2771
+ "mean_token_accuracy": 0.8268346361815929,
2772
+ "num_tokens": 25134080.0,
2773
+ "step": 3070
2774
+ },
2775
+ {
2776
+ "epoch": 2.8263514288993457,
2777
+ "grad_norm": 1.358314037322998,
2778
+ "learning_rate": 6.489976214746857e-07,
2779
+ "loss": 0.1401,
2780
+ "mean_token_accuracy": 0.8394080217927694,
2781
+ "num_tokens": 25216000.0,
2782
+ "step": 3080
2783
+ },
2784
+ {
2785
+ "epoch": 2.835533111442672,
2786
+ "grad_norm": 1.5445815324783325,
2787
+ "learning_rate": 6.150186884131839e-07,
2788
+ "loss": 0.1444,
2789
+ "mean_token_accuracy": 0.8344789650291204,
2790
+ "num_tokens": 25297920.0,
2791
+ "step": 3090
2792
+ },
2793
+ {
2794
+ "epoch": 2.844714793985998,
2795
+ "grad_norm": 0.9232423305511475,
2796
+ "learning_rate": 5.81039755351682e-07,
2797
+ "loss": 0.1197,
2798
+ "mean_token_accuracy": 0.8397871796041727,
2799
+ "num_tokens": 25379840.0,
2800
+ "step": 3100
2801
+ },
2802
+ {
2803
+ "epoch": 2.853896476529324,
2804
+ "grad_norm": 1.2474477291107178,
2805
+ "learning_rate": 5.470608222901801e-07,
2806
+ "loss": 0.1358,
2807
+ "mean_token_accuracy": 0.830565071478486,
2808
+ "num_tokens": 25461760.0,
2809
+ "step": 3110
2810
+ },
2811
+ {
2812
+ "epoch": 2.86307815907265,
2813
+ "grad_norm": 1.3741815090179443,
2814
+ "learning_rate": 5.130818892286782e-07,
2815
+ "loss": 0.1367,
2816
+ "mean_token_accuracy": 0.822761744633317,
2817
+ "num_tokens": 25543680.0,
2818
+ "step": 3120
2819
+ },
2820
+ {
2821
+ "epoch": 2.872259841615976,
2822
+ "grad_norm": 0.8645684719085693,
2823
+ "learning_rate": 4.791029561671764e-07,
2824
+ "loss": 0.1414,
2825
+ "mean_token_accuracy": 0.8360322870314121,
2826
+ "num_tokens": 25625600.0,
2827
+ "step": 3130
2828
+ },
2829
+ {
2830
+ "epoch": 2.8814415241593023,
2831
+ "grad_norm": 1.3521939516067505,
2832
+ "learning_rate": 4.451240231056745e-07,
2833
+ "loss": 0.1368,
2834
+ "mean_token_accuracy": 0.8312010750174522,
2835
+ "num_tokens": 25707520.0,
2836
+ "step": 3140
2837
+ },
2838
+ {
2839
+ "epoch": 2.890623206702628,
2840
+ "grad_norm": 1.3353580236434937,
2841
+ "learning_rate": 4.111450900441726e-07,
2842
+ "loss": 0.1192,
2843
+ "mean_token_accuracy": 0.8368639908730984,
2844
+ "num_tokens": 25789440.0,
2845
+ "step": 3150
2846
+ },
2847
+ {
2848
+ "epoch": 2.8998048892459543,
2849
+ "grad_norm": 1.0142643451690674,
2850
+ "learning_rate": 3.7716615698267073e-07,
2851
+ "loss": 0.1391,
2852
+ "mean_token_accuracy": 0.8295865952968597,
2853
+ "num_tokens": 25871360.0,
2854
+ "step": 3160
2855
+ },
2856
+ {
2857
+ "epoch": 2.9089865717892804,
2858
+ "grad_norm": 1.3363066911697388,
2859
+ "learning_rate": 3.4318722392116895e-07,
2860
+ "loss": 0.15,
2861
+ "mean_token_accuracy": 0.8194104671478272,
2862
+ "num_tokens": 25953280.0,
2863
+ "step": 3170
2864
+ },
2865
+ {
2866
+ "epoch": 2.918168254332606,
2867
+ "grad_norm": 1.1663857698440552,
2868
+ "learning_rate": 3.09208290859667e-07,
2869
+ "loss": 0.1391,
2870
+ "mean_token_accuracy": 0.8333170261234045,
2871
+ "num_tokens": 26035200.0,
2872
+ "step": 3180
2873
+ },
2874
+ {
2875
+ "epoch": 2.9273499368759324,
2876
+ "grad_norm": 1.1857463121414185,
2877
+ "learning_rate": 2.752293577981652e-07,
2878
+ "loss": 0.1508,
2879
+ "mean_token_accuracy": 0.8284246563911438,
2880
+ "num_tokens": 26117120.0,
2881
+ "step": 3190
2882
+ },
2883
+ {
2884
+ "epoch": 2.9365316194192586,
2885
+ "grad_norm": 1.3892704248428345,
2886
+ "learning_rate": 2.412504247366633e-07,
2887
+ "loss": 0.1399,
2888
+ "mean_token_accuracy": 0.8355797432363034,
2889
+ "num_tokens": 26199040.0,
2890
+ "step": 3200
2891
+ },
2892
+ {
2893
+ "epoch": 2.9457133019625847,
2894
+ "grad_norm": 1.4766535758972168,
2895
+ "learning_rate": 2.0727149167516142e-07,
2896
+ "loss": 0.1713,
2897
+ "mean_token_accuracy": 0.8024461850523948,
2898
+ "num_tokens": 26280960.0,
2899
+ "step": 3210
2900
+ },
2901
+ {
2902
+ "epoch": 2.954894984505911,
2903
+ "grad_norm": 1.0841213464736938,
2904
+ "learning_rate": 1.7329255861365954e-07,
2905
+ "loss": 0.1464,
2906
+ "mean_token_accuracy": 0.822847356274724,
2907
+ "num_tokens": 26362880.0,
2908
+ "step": 3220
2909
+ },
2910
+ {
2911
+ "epoch": 2.9640766670492367,
2912
+ "grad_norm": 1.0091631412506104,
2913
+ "learning_rate": 1.3931362555215769e-07,
2914
+ "loss": 0.1402,
2915
+ "mean_token_accuracy": 0.8205234851688147,
2916
+ "num_tokens": 26444800.0,
2917
+ "step": 3230
2918
+ },
2919
+ {
2920
+ "epoch": 2.973258349592563,
2921
+ "grad_norm": 1.2437331676483154,
2922
+ "learning_rate": 1.053346924906558e-07,
2923
+ "loss": 0.1487,
2924
+ "mean_token_accuracy": 0.8193126212805509,
2925
+ "num_tokens": 26526720.0,
2926
+ "step": 3240
2927
+ },
2928
+ {
2929
+ "epoch": 2.982440032135889,
2930
+ "grad_norm": 1.2033171653747559,
2931
+ "learning_rate": 7.135575942915393e-08,
2932
+ "loss": 0.1081,
2933
+ "mean_token_accuracy": 0.8560909986495971,
2934
+ "num_tokens": 26608640.0,
2935
+ "step": 3250
2936
+ },
2937
+ {
2938
+ "epoch": 2.991621714679215,
2939
+ "grad_norm": 1.167435884475708,
2940
+ "learning_rate": 3.737682636765206e-08,
2941
+ "loss": 0.1496,
2942
+ "mean_token_accuracy": 0.8093933459371329,
2943
+ "num_tokens": 26690560.0,
2944
+ "step": 3260
2945
+ },
2946
+ {
2947
+ "epoch": 3.0,
2948
+ "grad_norm": 7.105273246765137,
2949
+ "learning_rate": 3.3978933061501875e-09,
2950
+ "loss": 0.1229,
2951
+ "mean_token_accuracy": 0.8537249038480732,
2952
+ "num_tokens": 26764800.0,
2953
+ "step": 3270
2954
  }
2955
  ],
2956
  "logging_steps": 10,
 
2965
  "should_evaluate": false,
2966
  "should_log": false,
2967
  "should_save": true,
2968
+ "should_training_stop": true
2969
  },
2970
  "attributes": {}
2971
  }
2972
  },
2973
+ "total_flos": 7.07341401980928e+16,
2974
  "train_batch_size": 2,
2975
  "trial_name": null,
2976
  "trial_params": null