Azrail commited on
Commit
2ff638e
·
verified ·
1 Parent(s): a9d468f

Training in progress, step 14000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23be2c11c244c72601ea6f47dd507781736231ff1da2289fe5f8ba433277cb99
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0579f8b01bd92a4b6d4d9542187f9f6be5d525493ee4cacf89313462b0d4fc29
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04943bdcad0923c88796f61e80a911b94cde9c121a1bb27006e82c8a584a0c44
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1ac4e5f1a091d05231fad9fd4f9941afbf6737a4f9256414d7439dd21637791
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff84b2998c9ce4e6e3eaf03e775fc93a7c11be8195c0bb3abb7a8b9a1cec86e5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dac96a69b6625532fa7a1849a782b63a79e8d1b28e764bc8297e354d748f16c9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71a524f67e79e2b512d6d818f94e2b528e5b7f4447259f3966ae44cdba439db5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:081dc59c3c452b8ce89bfce5eae0952bf765aeed7903bbba40be0fb195d20006
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.1378457081642197,
6
  "eval_steps": 500,
7
- "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2842,11 +2842,229 @@
2842
  "eval_steps_per_second": 20.539,
2843
  "num_input_tokens_seen": 6280158129,
2844
  "step": 13000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2845
  }
2846
  ],
2847
  "logging_steps": 50,
2848
  "max_steps": 16568,
2849
- "num_input_tokens_seen": 6280158129,
2850
  "num_train_epochs": 4,
2851
  "save_steps": 1000,
2852
  "stateful_callbacks": {
@@ -2861,7 +3079,7 @@
2861
  "attributes": {}
2862
  }
2863
  },
2864
- "total_flos": 1.680003593850839e+18,
2865
  "train_batch_size": 16,
2866
  "trial_name": null,
2867
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.3792567557372846,
6
  "eval_steps": 500,
7
+ "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2842
  "eval_steps_per_second": 20.539,
2843
  "num_input_tokens_seen": 6280158129,
2844
  "step": 13000
2845
+ },
2846
+ {
2847
+ "epoch": 3.149916260542873,
2848
+ "grad_norm": 0.25390625,
2849
+ "learning_rate": 1.3271465218047383e-05,
2850
+ "loss": 2.0974,
2851
+ "mean_token_accuracy": 0.5548031070828437,
2852
+ "num_input_tokens_seen": 6304365713,
2853
+ "num_tokens": 2656912031.0,
2854
+ "step": 13050
2855
+ },
2856
+ {
2857
+ "epoch": 3.1619868129215263,
2858
+ "grad_norm": 0.24609375,
2859
+ "learning_rate": 1.3082842915346311e-05,
2860
+ "loss": 2.0981,
2861
+ "mean_token_accuracy": 0.5543636172637343,
2862
+ "num_input_tokens_seen": 6328561217,
2863
+ "num_tokens": 2667181848.0,
2864
+ "step": 13100
2865
+ },
2866
+ {
2867
+ "epoch": 3.1740573653001793,
2868
+ "grad_norm": 0.236328125,
2869
+ "learning_rate": 1.2894220612645238e-05,
2870
+ "loss": 2.093,
2871
+ "mean_token_accuracy": 0.5551713344082236,
2872
+ "num_input_tokens_seen": 6352657569,
2873
+ "num_tokens": 2677374089.0,
2874
+ "step": 13150
2875
+ },
2876
+ {
2877
+ "epoch": 3.186127917678833,
2878
+ "grad_norm": 0.267578125,
2879
+ "learning_rate": 1.2705598309944169e-05,
2880
+ "loss": 2.084,
2881
+ "mean_token_accuracy": 0.5568741805478931,
2882
+ "num_input_tokens_seen": 6376750801,
2883
+ "num_tokens": 2687517529.0,
2884
+ "step": 13200
2885
+ },
2886
+ {
2887
+ "epoch": 3.198198470057486,
2888
+ "grad_norm": 0.2578125,
2889
+ "learning_rate": 1.2516976007243097e-05,
2890
+ "loss": 2.0985,
2891
+ "mean_token_accuracy": 0.5545465455949307,
2892
+ "num_input_tokens_seen": 6400738145,
2893
+ "num_tokens": 2697615714.0,
2894
+ "step": 13250
2895
+ },
2896
+ {
2897
+ "epoch": 3.2102690224361394,
2898
+ "grad_norm": 0.2451171875,
2899
+ "learning_rate": 1.2328353704542026e-05,
2900
+ "loss": 2.0969,
2901
+ "mean_token_accuracy": 0.5544571406021714,
2902
+ "num_input_tokens_seen": 6424909057,
2903
+ "num_tokens": 2707784293.0,
2904
+ "step": 13300
2905
+ },
2906
+ {
2907
+ "epoch": 3.2223395748147925,
2908
+ "grad_norm": 0.302734375,
2909
+ "learning_rate": 1.2139731401840953e-05,
2910
+ "loss": 2.0932,
2911
+ "mean_token_accuracy": 0.5548350306227803,
2912
+ "num_input_tokens_seen": 6449111825,
2913
+ "num_tokens": 2717984302.0,
2914
+ "step": 13350
2915
+ },
2916
+ {
2917
+ "epoch": 3.2344101271934456,
2918
+ "grad_norm": 0.228515625,
2919
+ "learning_rate": 1.1951109099139883e-05,
2920
+ "loss": 2.1012,
2921
+ "mean_token_accuracy": 0.5535725425183773,
2922
+ "num_input_tokens_seen": 6473257953,
2923
+ "num_tokens": 2728233467.0,
2924
+ "step": 13400
2925
+ },
2926
+ {
2927
+ "epoch": 3.246480679572099,
2928
+ "grad_norm": 0.2578125,
2929
+ "learning_rate": 1.1762486796438812e-05,
2930
+ "loss": 2.0985,
2931
+ "mean_token_accuracy": 0.5541856496781111,
2932
+ "num_input_tokens_seen": 6497464865,
2933
+ "num_tokens": 2738326366.0,
2934
+ "step": 13450
2935
+ },
2936
+ {
2937
+ "epoch": 3.258551231950752,
2938
+ "grad_norm": 0.2412109375,
2939
+ "learning_rate": 1.157386449373774e-05,
2940
+ "loss": 2.0911,
2941
+ "num_input_tokens_seen": 6521634753,
2942
+ "step": 13500
2943
+ },
2944
+ {
2945
+ "epoch": 3.258551231950752,
2946
+ "eval_loss": 1.9680596590042114,
2947
+ "eval_mean_token_accuracy": 0.5785238554199033,
2948
+ "eval_num_tokens": 2748403907.0,
2949
+ "eval_runtime": 130.2372,
2950
+ "eval_samples_per_second": 82.25,
2951
+ "eval_steps_per_second": 20.562,
2952
+ "num_input_tokens_seen": 6521634753,
2953
+ "step": 13500
2954
+ },
2955
+ {
2956
+ "epoch": 3.270621784329405,
2957
+ "grad_norm": 0.251953125,
2958
+ "learning_rate": 1.1385242191036669e-05,
2959
+ "loss": 2.0844,
2960
+ "mean_token_accuracy": 0.5562084444984794,
2961
+ "num_input_tokens_seen": 6545823777,
2962
+ "num_tokens": 2758638062.0,
2963
+ "step": 13550
2964
+ },
2965
+ {
2966
+ "epoch": 3.2826923367080587,
2967
+ "grad_norm": 0.24609375,
2968
+ "learning_rate": 1.1196619888335598e-05,
2969
+ "loss": 2.089,
2970
+ "mean_token_accuracy": 0.5565486250445246,
2971
+ "num_input_tokens_seen": 6569949777,
2972
+ "num_tokens": 2768698376.0,
2973
+ "step": 13600
2974
+ },
2975
+ {
2976
+ "epoch": 3.2947628890867118,
2977
+ "grad_norm": 0.2431640625,
2978
+ "learning_rate": 1.1007997585634526e-05,
2979
+ "loss": 2.0915,
2980
+ "mean_token_accuracy": 0.5548499751463533,
2981
+ "num_input_tokens_seen": 6593997425,
2982
+ "num_tokens": 2778806953.0,
2983
+ "step": 13650
2984
+ },
2985
+ {
2986
+ "epoch": 3.306833441465365,
2987
+ "grad_norm": 0.330078125,
2988
+ "learning_rate": 1.0819375282933455e-05,
2989
+ "loss": 2.0875,
2990
+ "mean_token_accuracy": 0.5560770154371858,
2991
+ "num_input_tokens_seen": 6618153121,
2992
+ "num_tokens": 2789046249.0,
2993
+ "step": 13700
2994
+ },
2995
+ {
2996
+ "epoch": 3.3189039938440184,
2997
+ "grad_norm": 0.26171875,
2998
+ "learning_rate": 1.0630752980232384e-05,
2999
+ "loss": 2.0974,
3000
+ "mean_token_accuracy": 0.5540758088976144,
3001
+ "num_input_tokens_seen": 6642228561,
3002
+ "num_tokens": 2799134100.0,
3003
+ "step": 13750
3004
+ },
3005
+ {
3006
+ "epoch": 3.3309745462226714,
3007
+ "grad_norm": 0.2578125,
3008
+ "learning_rate": 1.0442130677531312e-05,
3009
+ "loss": 2.0837,
3010
+ "mean_token_accuracy": 0.5564264697581529,
3011
+ "num_input_tokens_seen": 6666487089,
3012
+ "num_tokens": 2809333203.0,
3013
+ "step": 13800
3014
+ },
3015
+ {
3016
+ "epoch": 3.343045098601325,
3017
+ "grad_norm": 0.271484375,
3018
+ "learning_rate": 1.025350837483024e-05,
3019
+ "loss": 2.0804,
3020
+ "mean_token_accuracy": 0.5564664682373405,
3021
+ "num_input_tokens_seen": 6690592209,
3022
+ "num_tokens": 2819507621.0,
3023
+ "step": 13850
3024
+ },
3025
+ {
3026
+ "epoch": 3.355115650979978,
3027
+ "grad_norm": 0.2578125,
3028
+ "learning_rate": 1.006488607212917e-05,
3029
+ "loss": 2.0875,
3030
+ "mean_token_accuracy": 0.5563617146387696,
3031
+ "num_input_tokens_seen": 6714782033,
3032
+ "num_tokens": 2829715451.0,
3033
+ "step": 13900
3034
+ },
3035
+ {
3036
+ "epoch": 3.367186203358631,
3037
+ "grad_norm": 0.26171875,
3038
+ "learning_rate": 9.876263769428096e-06,
3039
+ "loss": 2.1015,
3040
+ "mean_token_accuracy": 0.5533242063969374,
3041
+ "num_input_tokens_seen": 6738954721,
3042
+ "num_tokens": 2839876349.0,
3043
+ "step": 13950
3044
+ },
3045
+ {
3046
+ "epoch": 3.3792567557372846,
3047
+ "grad_norm": 0.2578125,
3048
+ "learning_rate": 9.687641466727027e-06,
3049
+ "loss": 2.1018,
3050
+ "num_input_tokens_seen": 6763271617,
3051
+ "step": 14000
3052
+ },
3053
+ {
3054
+ "epoch": 3.3792567557372846,
3055
+ "eval_loss": 1.9681081771850586,
3056
+ "eval_mean_token_accuracy": 0.5785279828634967,
3057
+ "eval_num_tokens": 2850147053.0,
3058
+ "eval_runtime": 131.6179,
3059
+ "eval_samples_per_second": 81.387,
3060
+ "eval_steps_per_second": 20.347,
3061
+ "num_input_tokens_seen": 6763271617,
3062
+ "step": 14000
3063
  }
3064
  ],
3065
  "logging_steps": 50,
3066
  "max_steps": 16568,
3067
+ "num_input_tokens_seen": 6763271617,
3068
  "num_train_epochs": 4,
3069
  "save_steps": 1000,
3070
  "stateful_callbacks": {
 
3079
  "attributes": {}
3080
  }
3081
  },
3082
+ "total_flos": 1.809241167078482e+18,
3083
  "train_batch_size": 16,
3084
  "trial_name": null,
3085
  "trial_params": null