CocoRoF commited on
Commit
0adba3f
·
verified ·
1 Parent(s): 8255486

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ddca534d10503034b593de387e205daca04a072f5ccbe17faac957a202b96d5
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93c68d4f57aecd5e349b5c7323df8de547b94fc82347524cba7280d76e0e7875
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cb1c32a2fe7e4ba0a80cc2f7de739a5497222f3987f01b2d711b570de3fbe5c
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee24b7c1f0faf41adb006e45d6263282d13f5f99cdfbeb8ec28e041f2947ac7d
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06fea830cf5ad73ec00d500ea6fb952740ac936f18e93fa2d32abde1ea3ead92
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be561d1df19be227394d8ea607c54262a06c9bf880af0aa5e04a52596a2a6cb0
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03f3e24417a59435f5a8450a4aeb0f09cc92734b5c3b45a0701b2c043c415c05
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bea02744c29f30024590ab1629a0e7b7dabbf1e8476456c2e7c5ce46dc35c28
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:041be966454b60c86af576fc1eb7f34189114689abff8f9622b947110f7334c8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b85766f6596d15a810177d77dd259d9b50588cf100ec5f8ebff5fed881d57957
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8be75d04b1ebe614241b88fd010a5dda1b7bf703c00c6ebe310ca07975830fe7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4699833a7ab4cb692996ef7567f934c0bac79d6a067963a873f89a38e412bd48
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06d017eeef42e0127e56fc73579acf75949a31de7dbc0f95bf4428c95dd75f92
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7219278083dbc646fe72946ab4301102da8206fb4979ac90c21b38ce89792e7
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4623509098921422,
5
  "eval_steps": 500,
6
- "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2839,6 +2839,714 @@
2839
  "eval_samples_per_second": 609.667,
2840
  "eval_steps_per_second": 38.105,
2841
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2842
  }
2843
  ],
2844
  "logging_steps": 5,
@@ -2858,7 +3566,7 @@
2858
  "attributes": {}
2859
  }
2860
  },
2861
- "total_flos": 8.664715985577574e+18,
2862
  "train_batch_size": 4,
2863
  "trial_name": null,
2864
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5779386373651777,
5
  "eval_steps": 500,
6
+ "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2839
  "eval_samples_per_second": 609.667,
2840
  "eval_steps_per_second": 38.105,
2841
  "step": 2000
2842
+ },
2843
+ {
2844
+ "epoch": 0.46350678716687255,
2845
+ "grad_norm": 176.5,
2846
+ "learning_rate": 5.960945529290854e-06,
2847
+ "loss": 71.4161,
2848
+ "step": 2005
2849
+ },
2850
+ {
2851
+ "epoch": 0.4646626644416029,
2852
+ "grad_norm": 179.375,
2853
+ "learning_rate": 5.948098663926003e-06,
2854
+ "loss": 70.9138,
2855
+ "step": 2010
2856
+ },
2857
+ {
2858
+ "epoch": 0.46581854171633325,
2859
+ "grad_norm": 160.375,
2860
+ "learning_rate": 5.935251798561151e-06,
2861
+ "loss": 72.0052,
2862
+ "step": 2015
2863
+ },
2864
+ {
2865
+ "epoch": 0.4669744189910636,
2866
+ "grad_norm": 164.5,
2867
+ "learning_rate": 5.922404933196301e-06,
2868
+ "loss": 70.5761,
2869
+ "step": 2020
2870
+ },
2871
+ {
2872
+ "epoch": 0.468130296265794,
2873
+ "grad_norm": 164.5,
2874
+ "learning_rate": 5.90955806783145e-06,
2875
+ "loss": 69.3542,
2876
+ "step": 2025
2877
+ },
2878
+ {
2879
+ "epoch": 0.4692861735405243,
2880
+ "grad_norm": 159.5,
2881
+ "learning_rate": 5.896711202466599e-06,
2882
+ "loss": 70.1756,
2883
+ "step": 2030
2884
+ },
2885
+ {
2886
+ "epoch": 0.4704420508152547,
2887
+ "grad_norm": 205.125,
2888
+ "learning_rate": 5.8838643371017475e-06,
2889
+ "loss": 70.4389,
2890
+ "step": 2035
2891
+ },
2892
+ {
2893
+ "epoch": 0.471597928089985,
2894
+ "grad_norm": 176.375,
2895
+ "learning_rate": 5.871017471736896e-06,
2896
+ "loss": 70.7157,
2897
+ "step": 2040
2898
+ },
2899
+ {
2900
+ "epoch": 0.4727538053647154,
2901
+ "grad_norm": 164.25,
2902
+ "learning_rate": 5.858170606372045e-06,
2903
+ "loss": 71.9851,
2904
+ "step": 2045
2905
+ },
2906
+ {
2907
+ "epoch": 0.4739096826394458,
2908
+ "grad_norm": 183.875,
2909
+ "learning_rate": 5.845323741007195e-06,
2910
+ "loss": 71.0546,
2911
+ "step": 2050
2912
+ },
2913
+ {
2914
+ "epoch": 0.4750655599141761,
2915
+ "grad_norm": 211.375,
2916
+ "learning_rate": 5.832476875642343e-06,
2917
+ "loss": 71.8851,
2918
+ "step": 2055
2919
+ },
2920
+ {
2921
+ "epoch": 0.47622143718890647,
2922
+ "grad_norm": 182.75,
2923
+ "learning_rate": 5.819630010277493e-06,
2924
+ "loss": 71.1175,
2925
+ "step": 2060
2926
+ },
2927
+ {
2928
+ "epoch": 0.47737731446363685,
2929
+ "grad_norm": 167.25,
2930
+ "learning_rate": 5.806783144912642e-06,
2931
+ "loss": 71.7044,
2932
+ "step": 2065
2933
+ },
2934
+ {
2935
+ "epoch": 0.47853319173836717,
2936
+ "grad_norm": 158.375,
2937
+ "learning_rate": 5.79393627954779e-06,
2938
+ "loss": 69.464,
2939
+ "step": 2070
2940
+ },
2941
+ {
2942
+ "epoch": 0.47968906901309755,
2943
+ "grad_norm": 169.625,
2944
+ "learning_rate": 5.78108941418294e-06,
2945
+ "loss": 71.0951,
2946
+ "step": 2075
2947
+ },
2948
+ {
2949
+ "epoch": 0.48084494628782787,
2950
+ "grad_norm": 186.75,
2951
+ "learning_rate": 5.76824254881809e-06,
2952
+ "loss": 69.9219,
2953
+ "step": 2080
2954
+ },
2955
+ {
2956
+ "epoch": 0.48200082356255824,
2957
+ "grad_norm": 168.5,
2958
+ "learning_rate": 5.755395683453238e-06,
2959
+ "loss": 71.1934,
2960
+ "step": 2085
2961
+ },
2962
+ {
2963
+ "epoch": 0.4831567008372886,
2964
+ "grad_norm": 155.125,
2965
+ "learning_rate": 5.742548818088387e-06,
2966
+ "loss": 71.4351,
2967
+ "step": 2090
2968
+ },
2969
+ {
2970
+ "epoch": 0.48431257811201894,
2971
+ "grad_norm": 154.0,
2972
+ "learning_rate": 5.729701952723536e-06,
2973
+ "loss": 70.3735,
2974
+ "step": 2095
2975
+ },
2976
+ {
2977
+ "epoch": 0.4854684553867493,
2978
+ "grad_norm": 158.0,
2979
+ "learning_rate": 5.716855087358685e-06,
2980
+ "loss": 69.8907,
2981
+ "step": 2100
2982
+ },
2983
+ {
2984
+ "epoch": 0.48662433266147964,
2985
+ "grad_norm": 160.375,
2986
+ "learning_rate": 5.704008221993834e-06,
2987
+ "loss": 71.9331,
2988
+ "step": 2105
2989
+ },
2990
+ {
2991
+ "epoch": 0.48778020993621,
2992
+ "grad_norm": 159.875,
2993
+ "learning_rate": 5.691161356628983e-06,
2994
+ "loss": 70.6931,
2995
+ "step": 2110
2996
+ },
2997
+ {
2998
+ "epoch": 0.4889360872109404,
2999
+ "grad_norm": 208.25,
3000
+ "learning_rate": 5.678314491264132e-06,
3001
+ "loss": 70.8729,
3002
+ "step": 2115
3003
+ },
3004
+ {
3005
+ "epoch": 0.4900919644856707,
3006
+ "grad_norm": 169.875,
3007
+ "learning_rate": 5.665467625899281e-06,
3008
+ "loss": 71.4016,
3009
+ "step": 2120
3010
+ },
3011
+ {
3012
+ "epoch": 0.4912478417604011,
3013
+ "grad_norm": 172.875,
3014
+ "learning_rate": 5.652620760534429e-06,
3015
+ "loss": 70.1855,
3016
+ "step": 2125
3017
+ },
3018
+ {
3019
+ "epoch": 0.49240371903513147,
3020
+ "grad_norm": 161.25,
3021
+ "learning_rate": 5.639773895169579e-06,
3022
+ "loss": 69.836,
3023
+ "step": 2130
3024
+ },
3025
+ {
3026
+ "epoch": 0.4935595963098618,
3027
+ "grad_norm": 154.625,
3028
+ "learning_rate": 5.626927029804729e-06,
3029
+ "loss": 70.073,
3030
+ "step": 2135
3031
+ },
3032
+ {
3033
+ "epoch": 0.49471547358459217,
3034
+ "grad_norm": 157.25,
3035
+ "learning_rate": 5.614080164439877e-06,
3036
+ "loss": 69.7913,
3037
+ "step": 2140
3038
+ },
3039
+ {
3040
+ "epoch": 0.4958713508593225,
3041
+ "grad_norm": 176.25,
3042
+ "learning_rate": 5.601233299075026e-06,
3043
+ "loss": 69.8387,
3044
+ "step": 2145
3045
+ },
3046
+ {
3047
+ "epoch": 0.49702722813405287,
3048
+ "grad_norm": 161.125,
3049
+ "learning_rate": 5.588386433710175e-06,
3050
+ "loss": 69.4486,
3051
+ "step": 2150
3052
+ },
3053
+ {
3054
+ "epoch": 0.49818310540878324,
3055
+ "grad_norm": 180.875,
3056
+ "learning_rate": 5.575539568345324e-06,
3057
+ "loss": 69.8849,
3058
+ "step": 2155
3059
+ },
3060
+ {
3061
+ "epoch": 0.49933898268351357,
3062
+ "grad_norm": 179.25,
3063
+ "learning_rate": 5.562692702980474e-06,
3064
+ "loss": 70.4132,
3065
+ "step": 2160
3066
+ },
3067
+ {
3068
+ "epoch": 0.5004948599582439,
3069
+ "grad_norm": 151.375,
3070
+ "learning_rate": 5.549845837615623e-06,
3071
+ "loss": 70.3948,
3072
+ "step": 2165
3073
+ },
3074
+ {
3075
+ "epoch": 0.5016507372329743,
3076
+ "grad_norm": 163.0,
3077
+ "learning_rate": 5.5369989722507715e-06,
3078
+ "loss": 70.1088,
3079
+ "step": 2170
3080
+ },
3081
+ {
3082
+ "epoch": 0.5028066145077047,
3083
+ "grad_norm": 150.0,
3084
+ "learning_rate": 5.52415210688592e-06,
3085
+ "loss": 70.178,
3086
+ "step": 2175
3087
+ },
3088
+ {
3089
+ "epoch": 0.503962491782435,
3090
+ "grad_norm": 159.625,
3091
+ "learning_rate": 5.511305241521069e-06,
3092
+ "loss": 71.454,
3093
+ "step": 2180
3094
+ },
3095
+ {
3096
+ "epoch": 0.5051183690571653,
3097
+ "grad_norm": 192.25,
3098
+ "learning_rate": 5.498458376156218e-06,
3099
+ "loss": 70.7199,
3100
+ "step": 2185
3101
+ },
3102
+ {
3103
+ "epoch": 0.5062742463318957,
3104
+ "grad_norm": 190.375,
3105
+ "learning_rate": 5.485611510791368e-06,
3106
+ "loss": 70.7635,
3107
+ "step": 2190
3108
+ },
3109
+ {
3110
+ "epoch": 0.5074301236066261,
3111
+ "grad_norm": 202.5,
3112
+ "learning_rate": 5.472764645426516e-06,
3113
+ "loss": 70.3444,
3114
+ "step": 2195
3115
+ },
3116
+ {
3117
+ "epoch": 0.5085860008813564,
3118
+ "grad_norm": 169.875,
3119
+ "learning_rate": 5.459917780061665e-06,
3120
+ "loss": 70.8077,
3121
+ "step": 2200
3122
+ },
3123
+ {
3124
+ "epoch": 0.5097418781560867,
3125
+ "grad_norm": 164.125,
3126
+ "learning_rate": 5.447070914696815e-06,
3127
+ "loss": 70.1035,
3128
+ "step": 2205
3129
+ },
3130
+ {
3131
+ "epoch": 0.5108977554308172,
3132
+ "grad_norm": 156.75,
3133
+ "learning_rate": 5.434224049331963e-06,
3134
+ "loss": 70.4719,
3135
+ "step": 2210
3136
+ },
3137
+ {
3138
+ "epoch": 0.5120536327055475,
3139
+ "grad_norm": 185.625,
3140
+ "learning_rate": 5.421377183967113e-06,
3141
+ "loss": 71.3653,
3142
+ "step": 2215
3143
+ },
3144
+ {
3145
+ "epoch": 0.5132095099802778,
3146
+ "grad_norm": 161.125,
3147
+ "learning_rate": 5.408530318602262e-06,
3148
+ "loss": 69.8378,
3149
+ "step": 2220
3150
+ },
3151
+ {
3152
+ "epoch": 0.5143653872550082,
3153
+ "grad_norm": 159.5,
3154
+ "learning_rate": 5.3956834532374105e-06,
3155
+ "loss": 70.8915,
3156
+ "step": 2225
3157
+ },
3158
+ {
3159
+ "epoch": 0.5155212645297386,
3160
+ "grad_norm": 157.375,
3161
+ "learning_rate": 5.382836587872559e-06,
3162
+ "loss": 69.4553,
3163
+ "step": 2230
3164
+ },
3165
+ {
3166
+ "epoch": 0.5166771418044689,
3167
+ "grad_norm": 157.625,
3168
+ "learning_rate": 5.369989722507709e-06,
3169
+ "loss": 69.6028,
3170
+ "step": 2235
3171
+ },
3172
+ {
3173
+ "epoch": 0.5178330190791993,
3174
+ "grad_norm": 174.0,
3175
+ "learning_rate": 5.357142857142857e-06,
3176
+ "loss": 69.8295,
3177
+ "step": 2240
3178
+ },
3179
+ {
3180
+ "epoch": 0.5189888963539296,
3181
+ "grad_norm": 172.0,
3182
+ "learning_rate": 5.344295991778007e-06,
3183
+ "loss": 70.2746,
3184
+ "step": 2245
3185
+ },
3186
+ {
3187
+ "epoch": 0.52014477362866,
3188
+ "grad_norm": 156.625,
3189
+ "learning_rate": 5.331449126413155e-06,
3190
+ "loss": 70.6405,
3191
+ "step": 2250
3192
+ },
3193
+ {
3194
+ "epoch": 0.5213006509033903,
3195
+ "grad_norm": 186.0,
3196
+ "learning_rate": 5.3186022610483044e-06,
3197
+ "loss": 69.7524,
3198
+ "step": 2255
3199
+ },
3200
+ {
3201
+ "epoch": 0.5224565281781207,
3202
+ "grad_norm": 177.375,
3203
+ "learning_rate": 5.305755395683454e-06,
3204
+ "loss": 69.9427,
3205
+ "step": 2260
3206
+ },
3207
+ {
3208
+ "epoch": 0.523612405452851,
3209
+ "grad_norm": 162.0,
3210
+ "learning_rate": 5.292908530318602e-06,
3211
+ "loss": 69.921,
3212
+ "step": 2265
3213
+ },
3214
+ {
3215
+ "epoch": 0.5247682827275814,
3216
+ "grad_norm": 149.25,
3217
+ "learning_rate": 5.280061664953752e-06,
3218
+ "loss": 69.2333,
3219
+ "step": 2270
3220
+ },
3221
+ {
3222
+ "epoch": 0.5259241600023118,
3223
+ "grad_norm": 156.125,
3224
+ "learning_rate": 5.2672147995889015e-06,
3225
+ "loss": 69.6589,
3226
+ "step": 2275
3227
+ },
3228
+ {
3229
+ "epoch": 0.5270800372770421,
3230
+ "grad_norm": 165.25,
3231
+ "learning_rate": 5.2543679342240495e-06,
3232
+ "loss": 71.545,
3233
+ "step": 2280
3234
+ },
3235
+ {
3236
+ "epoch": 0.5282359145517724,
3237
+ "grad_norm": 159.75,
3238
+ "learning_rate": 5.241521068859199e-06,
3239
+ "loss": 69.826,
3240
+ "step": 2285
3241
+ },
3242
+ {
3243
+ "epoch": 0.5293917918265029,
3244
+ "grad_norm": 166.25,
3245
+ "learning_rate": 5.228674203494348e-06,
3246
+ "loss": 69.6276,
3247
+ "step": 2290
3248
+ },
3249
+ {
3250
+ "epoch": 0.5305476691012332,
3251
+ "grad_norm": 170.125,
3252
+ "learning_rate": 5.215827338129497e-06,
3253
+ "loss": 69.4714,
3254
+ "step": 2295
3255
+ },
3256
+ {
3257
+ "epoch": 0.5317035463759635,
3258
+ "grad_norm": 153.5,
3259
+ "learning_rate": 5.202980472764646e-06,
3260
+ "loss": 70.1795,
3261
+ "step": 2300
3262
+ },
3263
+ {
3264
+ "epoch": 0.5328594236506939,
3265
+ "grad_norm": 157.5,
3266
+ "learning_rate": 5.190133607399795e-06,
3267
+ "loss": 69.2672,
3268
+ "step": 2305
3269
+ },
3270
+ {
3271
+ "epoch": 0.5340153009254243,
3272
+ "grad_norm": 161.375,
3273
+ "learning_rate": 5.1772867420349434e-06,
3274
+ "loss": 69.4437,
3275
+ "step": 2310
3276
+ },
3277
+ {
3278
+ "epoch": 0.5351711782001546,
3279
+ "grad_norm": 167.25,
3280
+ "learning_rate": 5.164439876670093e-06,
3281
+ "loss": 70.2732,
3282
+ "step": 2315
3283
+ },
3284
+ {
3285
+ "epoch": 0.5363270554748849,
3286
+ "grad_norm": 158.0,
3287
+ "learning_rate": 5.151593011305241e-06,
3288
+ "loss": 70.8601,
3289
+ "step": 2320
3290
+ },
3291
+ {
3292
+ "epoch": 0.5374829327496153,
3293
+ "grad_norm": 189.625,
3294
+ "learning_rate": 5.138746145940391e-06,
3295
+ "loss": 70.938,
3296
+ "step": 2325
3297
+ },
3298
+ {
3299
+ "epoch": 0.5386388100243457,
3300
+ "grad_norm": 174.75,
3301
+ "learning_rate": 5.1258992805755405e-06,
3302
+ "loss": 69.4389,
3303
+ "step": 2330
3304
+ },
3305
+ {
3306
+ "epoch": 0.539794687299076,
3307
+ "grad_norm": 193.875,
3308
+ "learning_rate": 5.1130524152106885e-06,
3309
+ "loss": 70.4369,
3310
+ "step": 2335
3311
+ },
3312
+ {
3313
+ "epoch": 0.5409505645738064,
3314
+ "grad_norm": 164.0,
3315
+ "learning_rate": 5.100205549845838e-06,
3316
+ "loss": 69.9866,
3317
+ "step": 2340
3318
+ },
3319
+ {
3320
+ "epoch": 0.5421064418485367,
3321
+ "grad_norm": 148.125,
3322
+ "learning_rate": 5.087358684480987e-06,
3323
+ "loss": 69.0706,
3324
+ "step": 2345
3325
+ },
3326
+ {
3327
+ "epoch": 0.5432623191232671,
3328
+ "grad_norm": 167.5,
3329
+ "learning_rate": 5.074511819116136e-06,
3330
+ "loss": 70.4756,
3331
+ "step": 2350
3332
+ },
3333
+ {
3334
+ "epoch": 0.5444181963979975,
3335
+ "grad_norm": 161.75,
3336
+ "learning_rate": 5.061664953751286e-06,
3337
+ "loss": 69.9491,
3338
+ "step": 2355
3339
+ },
3340
+ {
3341
+ "epoch": 0.5455740736727278,
3342
+ "grad_norm": 174.0,
3343
+ "learning_rate": 5.0488180883864345e-06,
3344
+ "loss": 69.262,
3345
+ "step": 2360
3346
+ },
3347
+ {
3348
+ "epoch": 0.5467299509474581,
3349
+ "grad_norm": 172.0,
3350
+ "learning_rate": 5.035971223021583e-06,
3351
+ "loss": 70.5688,
3352
+ "step": 2365
3353
+ },
3354
+ {
3355
+ "epoch": 0.5478858282221886,
3356
+ "grad_norm": 181.5,
3357
+ "learning_rate": 5.023124357656732e-06,
3358
+ "loss": 69.6968,
3359
+ "step": 2370
3360
+ },
3361
+ {
3362
+ "epoch": 0.5490417054969189,
3363
+ "grad_norm": 177.25,
3364
+ "learning_rate": 5.010277492291881e-06,
3365
+ "loss": 71.718,
3366
+ "step": 2375
3367
+ },
3368
+ {
3369
+ "epoch": 0.5501975827716492,
3370
+ "grad_norm": 166.625,
3371
+ "learning_rate": 4.99743062692703e-06,
3372
+ "loss": 70.3106,
3373
+ "step": 2380
3374
+ },
3375
+ {
3376
+ "epoch": 0.5513534600463795,
3377
+ "grad_norm": 152.25,
3378
+ "learning_rate": 4.9845837615621795e-06,
3379
+ "loss": 69.8569,
3380
+ "step": 2385
3381
+ },
3382
+ {
3383
+ "epoch": 0.55250933732111,
3384
+ "grad_norm": 170.625,
3385
+ "learning_rate": 4.971736896197328e-06,
3386
+ "loss": 69.7388,
3387
+ "step": 2390
3388
+ },
3389
+ {
3390
+ "epoch": 0.5536652145958403,
3391
+ "grad_norm": 151.5,
3392
+ "learning_rate": 4.958890030832477e-06,
3393
+ "loss": 69.3465,
3394
+ "step": 2395
3395
+ },
3396
+ {
3397
+ "epoch": 0.5548210918705706,
3398
+ "grad_norm": 162.0,
3399
+ "learning_rate": 4.946043165467626e-06,
3400
+ "loss": 69.7658,
3401
+ "step": 2400
3402
+ },
3403
+ {
3404
+ "epoch": 0.555976969145301,
3405
+ "grad_norm": 170.25,
3406
+ "learning_rate": 4.933196300102776e-06,
3407
+ "loss": 68.5618,
3408
+ "step": 2405
3409
+ },
3410
+ {
3411
+ "epoch": 0.5571328464200314,
3412
+ "grad_norm": 168.0,
3413
+ "learning_rate": 4.920349434737925e-06,
3414
+ "loss": 69.0478,
3415
+ "step": 2410
3416
+ },
3417
+ {
3418
+ "epoch": 0.5582887236947617,
3419
+ "grad_norm": 163.75,
3420
+ "learning_rate": 4.9075025693730735e-06,
3421
+ "loss": 70.553,
3422
+ "step": 2415
3423
+ },
3424
+ {
3425
+ "epoch": 0.5594446009694921,
3426
+ "grad_norm": 187.25,
3427
+ "learning_rate": 4.894655704008222e-06,
3428
+ "loss": 70.2036,
3429
+ "step": 2420
3430
+ },
3431
+ {
3432
+ "epoch": 0.5606004782442224,
3433
+ "grad_norm": 169.125,
3434
+ "learning_rate": 4.881808838643371e-06,
3435
+ "loss": 69.2802,
3436
+ "step": 2425
3437
+ },
3438
+ {
3439
+ "epoch": 0.5617563555189528,
3440
+ "grad_norm": 156.75,
3441
+ "learning_rate": 4.86896197327852e-06,
3442
+ "loss": 69.5526,
3443
+ "step": 2430
3444
+ },
3445
+ {
3446
+ "epoch": 0.5629122327936831,
3447
+ "grad_norm": 161.75,
3448
+ "learning_rate": 4.856115107913669e-06,
3449
+ "loss": 70.8698,
3450
+ "step": 2435
3451
+ },
3452
+ {
3453
+ "epoch": 0.5640681100684135,
3454
+ "grad_norm": 162.75,
3455
+ "learning_rate": 4.8432682425488185e-06,
3456
+ "loss": 68.8072,
3457
+ "step": 2440
3458
+ },
3459
+ {
3460
+ "epoch": 0.5652239873431438,
3461
+ "grad_norm": 179.25,
3462
+ "learning_rate": 4.830421377183967e-06,
3463
+ "loss": 70.1462,
3464
+ "step": 2445
3465
+ },
3466
+ {
3467
+ "epoch": 0.5663798646178742,
3468
+ "grad_norm": 162.625,
3469
+ "learning_rate": 4.817574511819116e-06,
3470
+ "loss": 70.458,
3471
+ "step": 2450
3472
+ },
3473
+ {
3474
+ "epoch": 0.5675357418926046,
3475
+ "grad_norm": 151.125,
3476
+ "learning_rate": 4.804727646454266e-06,
3477
+ "loss": 70.8089,
3478
+ "step": 2455
3479
+ },
3480
+ {
3481
+ "epoch": 0.5686916191673349,
3482
+ "grad_norm": 166.0,
3483
+ "learning_rate": 4.791880781089415e-06,
3484
+ "loss": 69.5729,
3485
+ "step": 2460
3486
+ },
3487
+ {
3488
+ "epoch": 0.5698474964420652,
3489
+ "grad_norm": 153.125,
3490
+ "learning_rate": 4.779033915724564e-06,
3491
+ "loss": 69.4479,
3492
+ "step": 2465
3493
+ },
3494
+ {
3495
+ "epoch": 0.5710033737167957,
3496
+ "grad_norm": 153.0,
3497
+ "learning_rate": 4.7661870503597125e-06,
3498
+ "loss": 67.9093,
3499
+ "step": 2470
3500
+ },
3501
+ {
3502
+ "epoch": 0.572159250991526,
3503
+ "grad_norm": 155.125,
3504
+ "learning_rate": 4.753340184994862e-06,
3505
+ "loss": 70.8675,
3506
+ "step": 2475
3507
+ },
3508
+ {
3509
+ "epoch": 0.5733151282662563,
3510
+ "grad_norm": 173.375,
3511
+ "learning_rate": 4.740493319630011e-06,
3512
+ "loss": 69.6522,
3513
+ "step": 2480
3514
+ },
3515
+ {
3516
+ "epoch": 0.5744710055409867,
3517
+ "grad_norm": 167.125,
3518
+ "learning_rate": 4.72764645426516e-06,
3519
+ "loss": 69.2384,
3520
+ "step": 2485
3521
+ },
3522
+ {
3523
+ "epoch": 0.5756268828157171,
3524
+ "grad_norm": 163.25,
3525
+ "learning_rate": 4.714799588900309e-06,
3526
+ "loss": 69.2969,
3527
+ "step": 2490
3528
+ },
3529
+ {
3530
+ "epoch": 0.5767827600904474,
3531
+ "grad_norm": 154.875,
3532
+ "learning_rate": 4.7019527235354576e-06,
3533
+ "loss": 69.5717,
3534
+ "step": 2495
3535
+ },
3536
+ {
3537
+ "epoch": 0.5779386373651777,
3538
+ "grad_norm": 162.375,
3539
+ "learning_rate": 4.689105858170606e-06,
3540
+ "loss": 69.8666,
3541
+ "step": 2500
3542
+ },
3543
+ {
3544
+ "epoch": 0.5779386373651777,
3545
+ "eval_loss": NaN,
3546
+ "eval_runtime": 381.701,
3547
+ "eval_samples_per_second": 610.776,
3548
+ "eval_steps_per_second": 38.174,
3549
+ "step": 2500
3550
  }
3551
  ],
3552
  "logging_steps": 5,
 
3566
  "attributes": {}
3567
  }
3568
  },
3569
+ "total_flos": 1.0830894981971968e+19,
3570
  "train_batch_size": 4,
3571
  "trial_name": null,
3572
  "trial_params": null