Azrail commited on
Commit
cf50f2e
·
verified ·
1 Parent(s): c585da5

Training in progress, step 17000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:314ace06359dc0a1588628e331e678c1756658b1e5b55ebfc40ace3ca7f19975
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebeffb2c037b50b65a6c0dee470a06e80bc04cb18fc25d36c6c23ebbfb1bfdb7
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:145b83ea857b82f412f448a97a8f9f5c98bbb7dffd2fbc6b5c490420cb05fd6d
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56670a00c0a6655472a5df0bad61f805ef42230ce33d41f768bab9a708635a97
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37104b59ca9cf24df14fa2064ba3c6de266e9640e43d473d5abb1378b6567288
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b383a5a228123d48b81ff62301f8c357c6f3a9cd7484f11e193f37bbe5162530
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fdb7ab6d17f71d52f24a53832c179d2b096c87d3e016344e7361ae053671325
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2af187ec456db07cec83217e48a58a7d4609355155eba34a029dc1dd312e2a7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3514566918864301,
6
  "eval_steps": 500,
7
- "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2856,11 +2856,189 @@
2856
  "eval_steps_per_second": 18.912,
2857
  "num_input_tokens_seen": 16777216000,
2858
  "step": 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2859
  }
2860
  ],
2861
  "logging_steps": 50,
2862
  "max_steps": 200000,
2863
- "num_input_tokens_seen": 16777216000,
2864
  "num_train_epochs": 5,
2865
  "save_steps": 1000,
2866
  "stateful_callbacks": {
@@ -2875,7 +3053,7 @@
2875
  "attributes": {}
2876
  }
2877
  },
2878
- "total_flos": 9.554747455438848e+18,
2879
  "train_batch_size": 64,
2880
  "trial_name": null,
2881
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.37342273512933194,
6
  "eval_steps": 500,
7
+ "global_step": 17000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2856
  "eval_steps_per_second": 18.912,
2857
  "num_input_tokens_seen": 16777216000,
2858
  "step": 16000
2859
+ },
2860
+ {
2861
+ "epoch": 0.35255499404857515,
2862
+ "grad_norm": 0.1446143537759781,
2863
+ "learning_rate": 0.001,
2864
+ "loss": 2.7837,
2865
+ "num_input_tokens_seen": 16829644800,
2866
+ "step": 16050
2867
+ },
2868
+ {
2869
+ "epoch": 0.3536532962107203,
2870
+ "grad_norm": 0.12466421723365784,
2871
+ "learning_rate": 0.001,
2872
+ "loss": 2.7808,
2873
+ "num_input_tokens_seen": 16882073600,
2874
+ "step": 16100
2875
+ },
2876
+ {
2877
+ "epoch": 0.35475159837286535,
2878
+ "grad_norm": 0.13154324889183044,
2879
+ "learning_rate": 0.001,
2880
+ "loss": 2.7608,
2881
+ "num_input_tokens_seen": 16934502400,
2882
+ "step": 16150
2883
+ },
2884
+ {
2885
+ "epoch": 0.3558499005350104,
2886
+ "grad_norm": 0.12929347157478333,
2887
+ "learning_rate": 0.001,
2888
+ "loss": 2.7599,
2889
+ "num_input_tokens_seen": 16986931200,
2890
+ "step": 16200
2891
+ },
2892
+ {
2893
+ "epoch": 0.35694820269715555,
2894
+ "grad_norm": 0.12805528938770294,
2895
+ "learning_rate": 0.001,
2896
+ "loss": 2.7562,
2897
+ "num_input_tokens_seen": 17039360000,
2898
+ "step": 16250
2899
+ },
2900
+ {
2901
+ "epoch": 0.3580465048593006,
2902
+ "grad_norm": 0.12885579466819763,
2903
+ "learning_rate": 0.001,
2904
+ "loss": 2.7498,
2905
+ "num_input_tokens_seen": 17091788800,
2906
+ "step": 16300
2907
+ },
2908
+ {
2909
+ "epoch": 0.35914480702144574,
2910
+ "grad_norm": 0.14422497153282166,
2911
+ "learning_rate": 0.001,
2912
+ "loss": 2.7518,
2913
+ "num_input_tokens_seen": 17144217600,
2914
+ "step": 16350
2915
+ },
2916
+ {
2917
+ "epoch": 0.3602431091835908,
2918
+ "grad_norm": 0.13284224271774292,
2919
+ "learning_rate": 0.001,
2920
+ "loss": 2.7453,
2921
+ "num_input_tokens_seen": 17196646400,
2922
+ "step": 16400
2923
+ },
2924
+ {
2925
+ "epoch": 0.3613414113457359,
2926
+ "grad_norm": 0.1408185362815857,
2927
+ "learning_rate": 0.001,
2928
+ "loss": 2.7422,
2929
+ "num_input_tokens_seen": 17249075200,
2930
+ "step": 16450
2931
+ },
2932
+ {
2933
+ "epoch": 0.362439713507881,
2934
+ "grad_norm": 0.1295713484287262,
2935
+ "learning_rate": 0.001,
2936
+ "loss": 2.7394,
2937
+ "num_input_tokens_seen": 17301504000,
2938
+ "step": 16500
2939
+ },
2940
+ {
2941
+ "epoch": 0.362439713507881,
2942
+ "eval_loss": 2.6431446075439453,
2943
+ "eval_runtime": 65.9239,
2944
+ "eval_samples_per_second": 75.845,
2945
+ "eval_steps_per_second": 18.961,
2946
+ "num_input_tokens_seen": 17301504000,
2947
+ "step": 16500
2948
+ },
2949
+ {
2950
+ "epoch": 0.3635380156700261,
2951
+ "grad_norm": 0.1245918869972229,
2952
+ "learning_rate": 0.001,
2953
+ "loss": 2.7434,
2954
+ "num_input_tokens_seen": 17353932800,
2955
+ "step": 16550
2956
+ },
2957
+ {
2958
+ "epoch": 0.3646363178321712,
2959
+ "grad_norm": 0.15865615010261536,
2960
+ "learning_rate": 0.001,
2961
+ "loss": 2.7378,
2962
+ "num_input_tokens_seen": 17406361600,
2963
+ "step": 16600
2964
+ },
2965
+ {
2966
+ "epoch": 0.3657346199943163,
2967
+ "grad_norm": 0.1391313523054123,
2968
+ "learning_rate": 0.001,
2969
+ "loss": 2.7415,
2970
+ "num_input_tokens_seen": 17458790400,
2971
+ "step": 16650
2972
+ },
2973
+ {
2974
+ "epoch": 0.3668329221564614,
2975
+ "grad_norm": 0.13604389131069183,
2976
+ "learning_rate": 0.001,
2977
+ "loss": 2.7394,
2978
+ "num_input_tokens_seen": 17511219200,
2979
+ "step": 16700
2980
+ },
2981
+ {
2982
+ "epoch": 0.3679312243186065,
2983
+ "grad_norm": 0.14926299452781677,
2984
+ "learning_rate": 0.001,
2985
+ "loss": 2.732,
2986
+ "num_input_tokens_seen": 17563648000,
2987
+ "step": 16750
2988
+ },
2989
+ {
2990
+ "epoch": 0.36902952648075155,
2991
+ "grad_norm": 0.12619628012180328,
2992
+ "learning_rate": 0.001,
2993
+ "loss": 2.7275,
2994
+ "num_input_tokens_seen": 17616076800,
2995
+ "step": 16800
2996
+ },
2997
+ {
2998
+ "epoch": 0.3701278286428967,
2999
+ "grad_norm": 0.1268402636051178,
3000
+ "learning_rate": 0.001,
3001
+ "loss": 2.7309,
3002
+ "num_input_tokens_seen": 17668505600,
3003
+ "step": 16850
3004
+ },
3005
+ {
3006
+ "epoch": 0.37122613080504174,
3007
+ "grad_norm": 0.1379624754190445,
3008
+ "learning_rate": 0.001,
3009
+ "loss": 2.7266,
3010
+ "num_input_tokens_seen": 17720934400,
3011
+ "step": 16900
3012
+ },
3013
+ {
3014
+ "epoch": 0.37232443296718687,
3015
+ "grad_norm": 0.1443478763103485,
3016
+ "learning_rate": 0.001,
3017
+ "loss": 2.7321,
3018
+ "num_input_tokens_seen": 17773363200,
3019
+ "step": 16950
3020
+ },
3021
+ {
3022
+ "epoch": 0.37342273512933194,
3023
+ "grad_norm": 0.15214091539382935,
3024
+ "learning_rate": 0.001,
3025
+ "loss": 2.7284,
3026
+ "num_input_tokens_seen": 17825792000,
3027
+ "step": 17000
3028
+ },
3029
+ {
3030
+ "epoch": 0.37342273512933194,
3031
+ "eval_loss": 2.63478946685791,
3032
+ "eval_runtime": 65.141,
3033
+ "eval_samples_per_second": 76.757,
3034
+ "eval_steps_per_second": 19.189,
3035
+ "num_input_tokens_seen": 17825792000,
3036
+ "step": 17000
3037
  }
3038
  ],
3039
  "logging_steps": 50,
3040
  "max_steps": 200000,
3041
+ "num_input_tokens_seen": 17825792000,
3042
  "num_train_epochs": 5,
3043
  "save_steps": 1000,
3044
  "stateful_callbacks": {
 
3053
  "attributes": {}
3054
  }
3055
  },
3056
+ "total_flos": 1.0151919171403776e+19,
3057
  "train_batch_size": 64,
3058
  "trial_name": null,
3059
  "trial_params": null