ZurabDz commited on
Commit
a5c9236
·
verified ·
1 Parent(s): 3287168

Training in progress, step 10000

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f363676186105730c1d25197e2c0305ebfbf6706de03ba2c07404ec86bf25c03
3
  size 44644496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90e0cf7ce1f314bfb364f44b1b9395ac7510ae076e5664e54d0b81cb80e0f2d4
3
  size 44644496
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abda9e7a12534ef2affd8d0c860673e26661a5152bce292672896e64d2a0cdaf
3
  size 11230198
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf57752b7fe8b0f27d2c11bd437ea0d0546c101a5edf6c2b8ef58464ff8128e9
3
  size 11230198
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
3
  size 14244
runs/Jun07_12-33-16_DESKTOP-69FPKCK/events.out.tfevents.1717788805.DESKTOP-69FPKCK CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edcf8641734c1d64588936cae32831ed874d304e627a7c2312709108b8fd418e
3
- size 110072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be506d81ada1b7dc41d27acbdb06762dd92f15fc140e5efbef58d1d2d1e179db
3
+ size 111338
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b35aaeefae1777c5b0cc2a6a699a6e86dbf10049e0c78d4a59c18dcf3571dfd
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c903d70ba831f7bc91d767743519849df9eeb11f7c11a55a187111672ce37e65
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.03619598315076984,
5
  "eval_steps": 2000,
6
- "global_step": 8000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2843,6 +2843,715 @@
2843
  "eval_samples_per_second": 2769.782,
2844
  "eval_steps_per_second": 10.822,
2845
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2846
  }
2847
  ],
2848
  "logging_steps": 20,
@@ -2850,7 +3559,7 @@
2850
  "num_input_tokens_seen": 0,
2851
  "num_train_epochs": 3,
2852
  "save_steps": 100,
2853
- "total_flos": 2876724215808000.0,
2854
  "train_batch_size": 256,
2855
  "trial_name": null,
2856
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.045244978938462306,
5
  "eval_steps": 2000,
6
+ "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2843
  "eval_samples_per_second": 2769.782,
2844
  "eval_steps_per_second": 10.822,
2845
  "step": 8000
2846
+ },
2847
+ {
2848
+ "epoch": 0.036286473108646765,
2849
+ "grad_norm": 12.62540054321289,
2850
+ "learning_rate": 0.000108804633064881,
2851
+ "loss": 8.1561,
2852
+ "step": 8020
2853
+ },
2854
+ {
2855
+ "epoch": 0.036376963066523695,
2856
+ "grad_norm": 12.97541332244873,
2857
+ "learning_rate": 0.00010907610171025247,
2858
+ "loss": 8.1708,
2859
+ "step": 8040
2860
+ },
2861
+ {
2862
+ "epoch": 0.03646745302440062,
2863
+ "grad_norm": 8.305766105651855,
2864
+ "learning_rate": 0.00010934757035562392,
2865
+ "loss": 8.1671,
2866
+ "step": 8060
2867
+ },
2868
+ {
2869
+ "epoch": 0.03655794298227754,
2870
+ "grad_norm": 14.076859474182129,
2871
+ "learning_rate": 0.00010961903900099538,
2872
+ "loss": 8.1659,
2873
+ "step": 8080
2874
+ },
2875
+ {
2876
+ "epoch": 0.03664843294015447,
2877
+ "grad_norm": 11.951278686523438,
2878
+ "learning_rate": 0.00010989050764636684,
2879
+ "loss": 8.1893,
2880
+ "step": 8100
2881
+ },
2882
+ {
2883
+ "epoch": 0.03673892289803139,
2884
+ "grad_norm": 10.796624183654785,
2885
+ "learning_rate": 0.00011016197629173831,
2886
+ "loss": 8.1942,
2887
+ "step": 8120
2888
+ },
2889
+ {
2890
+ "epoch": 0.036829412855908315,
2891
+ "grad_norm": 10.49177074432373,
2892
+ "learning_rate": 0.00011043344493710976,
2893
+ "loss": 8.1589,
2894
+ "step": 8140
2895
+ },
2896
+ {
2897
+ "epoch": 0.03691990281378524,
2898
+ "grad_norm": 12.82060432434082,
2899
+ "learning_rate": 0.00011070491358248122,
2900
+ "loss": 8.1957,
2901
+ "step": 8160
2902
+ },
2903
+ {
2904
+ "epoch": 0.03701039277166217,
2905
+ "grad_norm": 11.00941276550293,
2906
+ "learning_rate": 0.00011097638222785267,
2907
+ "loss": 8.1609,
2908
+ "step": 8180
2909
+ },
2910
+ {
2911
+ "epoch": 0.03710088272953909,
2912
+ "grad_norm": 10.24111270904541,
2913
+ "learning_rate": 0.00011124785087322413,
2914
+ "loss": 8.1769,
2915
+ "step": 8200
2916
+ },
2917
+ {
2918
+ "epoch": 0.03719137268741601,
2919
+ "grad_norm": 11.292909622192383,
2920
+ "learning_rate": 0.0001115193195185956,
2921
+ "loss": 8.1628,
2922
+ "step": 8220
2923
+ },
2924
+ {
2925
+ "epoch": 0.037281862645292936,
2926
+ "grad_norm": 9.362674713134766,
2927
+ "learning_rate": 0.00011179078816396706,
2928
+ "loss": 8.1638,
2929
+ "step": 8240
2930
+ },
2931
+ {
2932
+ "epoch": 0.037372352603169866,
2933
+ "grad_norm": 12.9249906539917,
2934
+ "learning_rate": 0.00011206225680933852,
2935
+ "loss": 8.1957,
2936
+ "step": 8260
2937
+ },
2938
+ {
2939
+ "epoch": 0.03746284256104679,
2940
+ "grad_norm": 10.386489868164062,
2941
+ "learning_rate": 0.00011233372545470999,
2942
+ "loss": 8.1525,
2943
+ "step": 8280
2944
+ },
2945
+ {
2946
+ "epoch": 0.03755333251892371,
2947
+ "grad_norm": 12.65300464630127,
2948
+ "learning_rate": 0.00011260519410008144,
2949
+ "loss": 8.1558,
2950
+ "step": 8300
2951
+ },
2952
+ {
2953
+ "epoch": 0.037643822476800634,
2954
+ "grad_norm": 11.562602996826172,
2955
+ "learning_rate": 0.0001128766627454529,
2956
+ "loss": 8.148,
2957
+ "step": 8320
2958
+ },
2959
+ {
2960
+ "epoch": 0.037734312434677564,
2961
+ "grad_norm": 14.783183097839355,
2962
+ "learning_rate": 0.00011314813139082436,
2963
+ "loss": 8.1448,
2964
+ "step": 8340
2965
+ },
2966
+ {
2967
+ "epoch": 0.03782480239255449,
2968
+ "grad_norm": 15.469168663024902,
2969
+ "learning_rate": 0.00011341960003619583,
2970
+ "loss": 8.1801,
2971
+ "step": 8360
2972
+ },
2973
+ {
2974
+ "epoch": 0.03791529235043141,
2975
+ "grad_norm": 11.361299514770508,
2976
+ "learning_rate": 0.00011369106868156726,
2977
+ "loss": 8.1549,
2978
+ "step": 8380
2979
+ },
2980
+ {
2981
+ "epoch": 0.03800578230830833,
2982
+ "grad_norm": 9.814708709716797,
2983
+ "learning_rate": 0.00011396253732693873,
2984
+ "loss": 8.1663,
2985
+ "step": 8400
2986
+ },
2987
+ {
2988
+ "epoch": 0.03809627226618526,
2989
+ "grad_norm": 10.522832870483398,
2990
+ "learning_rate": 0.00011423400597231019,
2991
+ "loss": 8.1459,
2992
+ "step": 8420
2993
+ },
2994
+ {
2995
+ "epoch": 0.038186762224062185,
2996
+ "grad_norm": 10.637961387634277,
2997
+ "learning_rate": 0.00011450547461768165,
2998
+ "loss": 8.1554,
2999
+ "step": 8440
3000
+ },
3001
+ {
3002
+ "epoch": 0.03827725218193911,
3003
+ "grad_norm": 14.578750610351562,
3004
+ "learning_rate": 0.00011477694326305312,
3005
+ "loss": 8.1758,
3006
+ "step": 8460
3007
+ },
3008
+ {
3009
+ "epoch": 0.03836774213981604,
3010
+ "grad_norm": 12.179791450500488,
3011
+ "learning_rate": 0.00011504841190842457,
3012
+ "loss": 8.1117,
3013
+ "step": 8480
3014
+ },
3015
+ {
3016
+ "epoch": 0.03845823209769296,
3017
+ "grad_norm": 11.189960479736328,
3018
+ "learning_rate": 0.00011531988055379603,
3019
+ "loss": 8.1517,
3020
+ "step": 8500
3021
+ },
3022
+ {
3023
+ "epoch": 0.03854872205556988,
3024
+ "grad_norm": 11.662614822387695,
3025
+ "learning_rate": 0.00011559134919916749,
3026
+ "loss": 8.129,
3027
+ "step": 8520
3028
+ },
3029
+ {
3030
+ "epoch": 0.038639212013446805,
3031
+ "grad_norm": 9.089029312133789,
3032
+ "learning_rate": 0.00011584924441227038,
3033
+ "loss": 8.1452,
3034
+ "step": 8540
3035
+ },
3036
+ {
3037
+ "epoch": 0.038729701971323735,
3038
+ "grad_norm": 15.1500825881958,
3039
+ "learning_rate": 0.00011612071305764184,
3040
+ "loss": 8.1623,
3041
+ "step": 8560
3042
+ },
3043
+ {
3044
+ "epoch": 0.03882019192920066,
3045
+ "grad_norm": 15.177955627441406,
3046
+ "learning_rate": 0.0001163921817030133,
3047
+ "loss": 8.1138,
3048
+ "step": 8580
3049
+ },
3050
+ {
3051
+ "epoch": 0.03891068188707758,
3052
+ "grad_norm": 9.620798110961914,
3053
+ "learning_rate": 0.00011666365034838476,
3054
+ "loss": 8.1472,
3055
+ "step": 8600
3056
+ },
3057
+ {
3058
+ "epoch": 0.0390011718449545,
3059
+ "grad_norm": 13.227412223815918,
3060
+ "learning_rate": 0.00011693511899375622,
3061
+ "loss": 8.1436,
3062
+ "step": 8620
3063
+ },
3064
+ {
3065
+ "epoch": 0.03909166180283143,
3066
+ "grad_norm": 12.561627388000488,
3067
+ "learning_rate": 0.00011720658763912768,
3068
+ "loss": 8.1478,
3069
+ "step": 8640
3070
+ },
3071
+ {
3072
+ "epoch": 0.039182151760708356,
3073
+ "grad_norm": 12.864951133728027,
3074
+ "learning_rate": 0.00011747805628449915,
3075
+ "loss": 8.1727,
3076
+ "step": 8660
3077
+ },
3078
+ {
3079
+ "epoch": 0.03927264171858528,
3080
+ "grad_norm": 12.883962631225586,
3081
+ "learning_rate": 0.00011774952492987061,
3082
+ "loss": 8.1396,
3083
+ "step": 8680
3084
+ },
3085
+ {
3086
+ "epoch": 0.0393631316764622,
3087
+ "grad_norm": 7.435621738433838,
3088
+ "learning_rate": 0.00011802099357524204,
3089
+ "loss": 8.1774,
3090
+ "step": 8700
3091
+ },
3092
+ {
3093
+ "epoch": 0.03945362163433913,
3094
+ "grad_norm": 12.7384672164917,
3095
+ "learning_rate": 0.00011829246222061351,
3096
+ "loss": 8.1297,
3097
+ "step": 8720
3098
+ },
3099
+ {
3100
+ "epoch": 0.039544111592216054,
3101
+ "grad_norm": 14.0343017578125,
3102
+ "learning_rate": 0.00011856393086598497,
3103
+ "loss": 8.1406,
3104
+ "step": 8740
3105
+ },
3106
+ {
3107
+ "epoch": 0.03963460155009298,
3108
+ "grad_norm": 15.325870513916016,
3109
+ "learning_rate": 0.00011883539951135643,
3110
+ "loss": 8.1619,
3111
+ "step": 8760
3112
+ },
3113
+ {
3114
+ "epoch": 0.039725091507969906,
3115
+ "grad_norm": 21.650548934936523,
3116
+ "learning_rate": 0.00011910686815672788,
3117
+ "loss": 8.193,
3118
+ "step": 8780
3119
+ },
3120
+ {
3121
+ "epoch": 0.03981558146584683,
3122
+ "grad_norm": 15.605712890625,
3123
+ "learning_rate": 0.00011937833680209935,
3124
+ "loss": 8.1709,
3125
+ "step": 8800
3126
+ },
3127
+ {
3128
+ "epoch": 0.03990607142372375,
3129
+ "grad_norm": 10.788895606994629,
3130
+ "learning_rate": 0.00011964980544747081,
3131
+ "loss": 8.1451,
3132
+ "step": 8820
3133
+ },
3134
+ {
3135
+ "epoch": 0.039996561381600675,
3136
+ "grad_norm": 16.377477645874023,
3137
+ "learning_rate": 0.00011992127409284227,
3138
+ "loss": 8.134,
3139
+ "step": 8840
3140
+ },
3141
+ {
3142
+ "epoch": 0.040087051339477604,
3143
+ "grad_norm": 13.106194496154785,
3144
+ "learning_rate": 0.00012019274273821374,
3145
+ "loss": 8.1352,
3146
+ "step": 8860
3147
+ },
3148
+ {
3149
+ "epoch": 0.04017754129735453,
3150
+ "grad_norm": 11.152835845947266,
3151
+ "learning_rate": 0.0001204642113835852,
3152
+ "loss": 8.1138,
3153
+ "step": 8880
3154
+ },
3155
+ {
3156
+ "epoch": 0.04026803125523145,
3157
+ "grad_norm": 9.210712432861328,
3158
+ "learning_rate": 0.00012073568002895666,
3159
+ "loss": 8.1769,
3160
+ "step": 8900
3161
+ },
3162
+ {
3163
+ "epoch": 0.04035852121310837,
3164
+ "grad_norm": 12.555234909057617,
3165
+ "learning_rate": 0.00012100714867432813,
3166
+ "loss": 8.1383,
3167
+ "step": 8920
3168
+ },
3169
+ {
3170
+ "epoch": 0.0404490111709853,
3171
+ "grad_norm": 12.013688087463379,
3172
+ "learning_rate": 0.00012127861731969958,
3173
+ "loss": 8.1564,
3174
+ "step": 8940
3175
+ },
3176
+ {
3177
+ "epoch": 0.040539501128862225,
3178
+ "grad_norm": 9.827411651611328,
3179
+ "learning_rate": 0.00012155008596507101,
3180
+ "loss": 8.1348,
3181
+ "step": 8960
3182
+ },
3183
+ {
3184
+ "epoch": 0.04062999108673915,
3185
+ "grad_norm": 11.609356880187988,
3186
+ "learning_rate": 0.00012182155461044248,
3187
+ "loss": 8.1646,
3188
+ "step": 8980
3189
+ },
3190
+ {
3191
+ "epoch": 0.04072048104461607,
3192
+ "grad_norm": 13.045088768005371,
3193
+ "learning_rate": 0.00012209302325581395,
3194
+ "loss": 8.1628,
3195
+ "step": 9000
3196
+ },
3197
+ {
3198
+ "epoch": 0.040810971002493,
3199
+ "grad_norm": 12.780691146850586,
3200
+ "learning_rate": 0.00012236449190118542,
3201
+ "loss": 8.1487,
3202
+ "step": 9020
3203
+ },
3204
+ {
3205
+ "epoch": 0.04090146096036992,
3206
+ "grad_norm": 10.65334701538086,
3207
+ "learning_rate": 0.00012263596054655685,
3208
+ "loss": 8.1275,
3209
+ "step": 9040
3210
+ },
3211
+ {
3212
+ "epoch": 0.040991950918246846,
3213
+ "grad_norm": 8.080134391784668,
3214
+ "learning_rate": 0.00012290742919192832,
3215
+ "loss": 8.1356,
3216
+ "step": 9060
3217
+ },
3218
+ {
3219
+ "epoch": 0.041082440876123776,
3220
+ "grad_norm": 12.708916664123535,
3221
+ "learning_rate": 0.00012317889783729978,
3222
+ "loss": 8.1606,
3223
+ "step": 9080
3224
+ },
3225
+ {
3226
+ "epoch": 0.0411729308340007,
3227
+ "grad_norm": 13.570298194885254,
3228
+ "learning_rate": 0.00012345036648267124,
3229
+ "loss": 8.1389,
3230
+ "step": 9100
3231
+ },
3232
+ {
3233
+ "epoch": 0.04126342079187762,
3234
+ "grad_norm": 13.237983703613281,
3235
+ "learning_rate": 0.0001237218351280427,
3236
+ "loss": 8.1243,
3237
+ "step": 9120
3238
+ },
3239
+ {
3240
+ "epoch": 0.041353910749754544,
3241
+ "grad_norm": 14.53023910522461,
3242
+ "learning_rate": 0.00012399330377341417,
3243
+ "loss": 8.1191,
3244
+ "step": 9140
3245
+ },
3246
+ {
3247
+ "epoch": 0.041444400707631474,
3248
+ "grad_norm": 11.765192031860352,
3249
+ "learning_rate": 0.00012426477241878563,
3250
+ "loss": 8.1031,
3251
+ "step": 9160
3252
+ },
3253
+ {
3254
+ "epoch": 0.041534890665508396,
3255
+ "grad_norm": 11.261069297790527,
3256
+ "learning_rate": 0.0001245362410641571,
3257
+ "loss": 8.1504,
3258
+ "step": 9180
3259
+ },
3260
+ {
3261
+ "epoch": 0.04162538062338532,
3262
+ "grad_norm": 13.039865493774414,
3263
+ "learning_rate": 0.00012480770970952856,
3264
+ "loss": 8.1186,
3265
+ "step": 9200
3266
+ },
3267
+ {
3268
+ "epoch": 0.04171587058126224,
3269
+ "grad_norm": 11.21242904663086,
3270
+ "learning_rate": 0.0001250791783549,
3271
+ "loss": 8.1244,
3272
+ "step": 9220
3273
+ },
3274
+ {
3275
+ "epoch": 0.04180636053913917,
3276
+ "grad_norm": 13.84521770477295,
3277
+ "learning_rate": 0.00012535064700027146,
3278
+ "loss": 8.1442,
3279
+ "step": 9240
3280
+ },
3281
+ {
3282
+ "epoch": 0.041896850497016094,
3283
+ "grad_norm": 14.333518981933594,
3284
+ "learning_rate": 0.00012562211564564292,
3285
+ "loss": 8.1628,
3286
+ "step": 9260
3287
+ },
3288
+ {
3289
+ "epoch": 0.04198734045489302,
3290
+ "grad_norm": 12.016851425170898,
3291
+ "learning_rate": 0.00012589358429101438,
3292
+ "loss": 8.1037,
3293
+ "step": 9280
3294
+ },
3295
+ {
3296
+ "epoch": 0.04207783041276994,
3297
+ "grad_norm": 9.183259010314941,
3298
+ "learning_rate": 0.00012616505293638585,
3299
+ "loss": 8.1429,
3300
+ "step": 9300
3301
+ },
3302
+ {
3303
+ "epoch": 0.04216832037064687,
3304
+ "grad_norm": 13.651033401489258,
3305
+ "learning_rate": 0.0001264365215817573,
3306
+ "loss": 8.1202,
3307
+ "step": 9320
3308
+ },
3309
+ {
3310
+ "epoch": 0.04225881032852379,
3311
+ "grad_norm": 11.869391441345215,
3312
+ "learning_rate": 0.00012670799022712877,
3313
+ "loss": 8.1125,
3314
+ "step": 9340
3315
+ },
3316
+ {
3317
+ "epoch": 0.042349300286400715,
3318
+ "grad_norm": 15.943286895751953,
3319
+ "learning_rate": 0.00012697945887250024,
3320
+ "loss": 8.1694,
3321
+ "step": 9360
3322
+ },
3323
+ {
3324
+ "epoch": 0.04243979024427764,
3325
+ "grad_norm": 13.450387001037598,
3326
+ "learning_rate": 0.00012725092751787167,
3327
+ "loss": 8.1379,
3328
+ "step": 9380
3329
+ },
3330
+ {
3331
+ "epoch": 0.04253028020215457,
3332
+ "grad_norm": 15.152196884155273,
3333
+ "learning_rate": 0.00012752239616324314,
3334
+ "loss": 8.1391,
3335
+ "step": 9400
3336
+ },
3337
+ {
3338
+ "epoch": 0.04262077016003149,
3339
+ "grad_norm": 15.109274864196777,
3340
+ "learning_rate": 0.0001277938648086146,
3341
+ "loss": 8.0963,
3342
+ "step": 9420
3343
+ },
3344
+ {
3345
+ "epoch": 0.04271126011790841,
3346
+ "grad_norm": 10.3173189163208,
3347
+ "learning_rate": 0.00012806533345398606,
3348
+ "loss": 8.1557,
3349
+ "step": 9440
3350
+ },
3351
+ {
3352
+ "epoch": 0.04280175007578534,
3353
+ "grad_norm": 11.38595962524414,
3354
+ "learning_rate": 0.00012833680209935753,
3355
+ "loss": 8.173,
3356
+ "step": 9460
3357
+ },
3358
+ {
3359
+ "epoch": 0.042892240033662266,
3360
+ "grad_norm": 11.458219528198242,
3361
+ "learning_rate": 0.00012859469731246043,
3362
+ "loss": 8.2542,
3363
+ "step": 9480
3364
+ },
3365
+ {
3366
+ "epoch": 0.04298272999153919,
3367
+ "grad_norm": 14.253256797790527,
3368
+ "learning_rate": 0.00012886616595783186,
3369
+ "loss": 8.1687,
3370
+ "step": 9500
3371
+ },
3372
+ {
3373
+ "epoch": 0.04307321994941611,
3374
+ "grad_norm": 14.074560165405273,
3375
+ "learning_rate": 0.00012913763460320333,
3376
+ "loss": 8.1175,
3377
+ "step": 9520
3378
+ },
3379
+ {
3380
+ "epoch": 0.04316370990729304,
3381
+ "grad_norm": 14.521282196044922,
3382
+ "learning_rate": 0.00012939552981630623,
3383
+ "loss": 8.1456,
3384
+ "step": 9540
3385
+ },
3386
+ {
3387
+ "epoch": 0.043254199865169964,
3388
+ "grad_norm": 12.537208557128906,
3389
+ "learning_rate": 0.0001296669984616777,
3390
+ "loss": 8.1432,
3391
+ "step": 9560
3392
+ },
3393
+ {
3394
+ "epoch": 0.043344689823046886,
3395
+ "grad_norm": 10.885902404785156,
3396
+ "learning_rate": 0.00012993846710704915,
3397
+ "loss": 8.1875,
3398
+ "step": 9580
3399
+ },
3400
+ {
3401
+ "epoch": 0.04343517978092381,
3402
+ "grad_norm": 10.156676292419434,
3403
+ "learning_rate": 0.0001302099357524206,
3404
+ "loss": 8.1728,
3405
+ "step": 9600
3406
+ },
3407
+ {
3408
+ "epoch": 0.04352566973880074,
3409
+ "grad_norm": 13.31322193145752,
3410
+ "learning_rate": 0.00013048140439779205,
3411
+ "loss": 8.1394,
3412
+ "step": 9620
3413
+ },
3414
+ {
3415
+ "epoch": 0.04361615969667766,
3416
+ "grad_norm": 7.779819488525391,
3417
+ "learning_rate": 0.0001307528730431635,
3418
+ "loss": 8.139,
3419
+ "step": 9640
3420
+ },
3421
+ {
3422
+ "epoch": 0.043706649654554584,
3423
+ "grad_norm": 12.208565711975098,
3424
+ "learning_rate": 0.00013102434168853495,
3425
+ "loss": 8.1346,
3426
+ "step": 9660
3427
+ },
3428
+ {
3429
+ "epoch": 0.04379713961243151,
3430
+ "grad_norm": 11.362008094787598,
3431
+ "learning_rate": 0.00013129581033390642,
3432
+ "loss": 8.1419,
3433
+ "step": 9680
3434
+ },
3435
+ {
3436
+ "epoch": 0.04388762957030844,
3437
+ "grad_norm": 11.86789321899414,
3438
+ "learning_rate": 0.00013156727897927788,
3439
+ "loss": 8.1475,
3440
+ "step": 9700
3441
+ },
3442
+ {
3443
+ "epoch": 0.04397811952818536,
3444
+ "grad_norm": 14.61185073852539,
3445
+ "learning_rate": 0.00013183874762464934,
3446
+ "loss": 8.1582,
3447
+ "step": 9720
3448
+ },
3449
+ {
3450
+ "epoch": 0.04406860948606228,
3451
+ "grad_norm": 11.60112190246582,
3452
+ "learning_rate": 0.0001321102162700208,
3453
+ "loss": 8.1073,
3454
+ "step": 9740
3455
+ },
3456
+ {
3457
+ "epoch": 0.04415909944393921,
3458
+ "grad_norm": 13.442856788635254,
3459
+ "learning_rate": 0.00013238168491539227,
3460
+ "loss": 8.1358,
3461
+ "step": 9760
3462
+ },
3463
+ {
3464
+ "epoch": 0.044249589401816135,
3465
+ "grad_norm": 11.524395942687988,
3466
+ "learning_rate": 0.00013265315356076373,
3467
+ "loss": 8.1083,
3468
+ "step": 9780
3469
+ },
3470
+ {
3471
+ "epoch": 0.04434007935969306,
3472
+ "grad_norm": 13.528814315795898,
3473
+ "learning_rate": 0.0001329246222061352,
3474
+ "loss": 8.1392,
3475
+ "step": 9800
3476
+ },
3477
+ {
3478
+ "epoch": 0.04443056931756998,
3479
+ "grad_norm": 18.11868667602539,
3480
+ "learning_rate": 0.00013319609085150666,
3481
+ "loss": 8.1784,
3482
+ "step": 9820
3483
+ },
3484
+ {
3485
+ "epoch": 0.04452105927544691,
3486
+ "grad_norm": 15.858280181884766,
3487
+ "learning_rate": 0.00013346755949687812,
3488
+ "loss": 8.1597,
3489
+ "step": 9840
3490
+ },
3491
+ {
3492
+ "epoch": 0.04461154923332383,
3493
+ "grad_norm": 14.466769218444824,
3494
+ "learning_rate": 0.00013373902814224956,
3495
+ "loss": 8.1632,
3496
+ "step": 9860
3497
+ },
3498
+ {
3499
+ "epoch": 0.044702039191200756,
3500
+ "grad_norm": 11.416616439819336,
3501
+ "learning_rate": 0.00013401049678762102,
3502
+ "loss": 8.1681,
3503
+ "step": 9880
3504
+ },
3505
+ {
3506
+ "epoch": 0.04479252914907768,
3507
+ "grad_norm": 39.87081527709961,
3508
+ "learning_rate": 0.00013428196543299249,
3509
+ "loss": 8.1384,
3510
+ "step": 9900
3511
+ },
3512
+ {
3513
+ "epoch": 0.04488301910695461,
3514
+ "grad_norm": 11.689374923706055,
3515
+ "learning_rate": 0.0001345398606460954,
3516
+ "loss": 8.5619,
3517
+ "step": 9920
3518
+ },
3519
+ {
3520
+ "epoch": 0.04497350906483153,
3521
+ "grad_norm": 10.53484058380127,
3522
+ "learning_rate": 0.00013481132929146682,
3523
+ "loss": 9.1495,
3524
+ "step": 9940
3525
+ },
3526
+ {
3527
+ "epoch": 0.045063999022708454,
3528
+ "grad_norm": 12.07006549835205,
3529
+ "learning_rate": 0.00013508279793683829,
3530
+ "loss": 9.1771,
3531
+ "step": 9960
3532
+ },
3533
+ {
3534
+ "epoch": 0.045154488980585376,
3535
+ "grad_norm": 9.795348167419434,
3536
+ "learning_rate": 0.00013535426658220975,
3537
+ "loss": 9.1545,
3538
+ "step": 9980
3539
+ },
3540
+ {
3541
+ "epoch": 0.045244978938462306,
3542
+ "grad_norm": 10.068339347839355,
3543
+ "learning_rate": 0.0001356257352275812,
3544
+ "loss": 9.1969,
3545
+ "step": 10000
3546
+ },
3547
+ {
3548
+ "epoch": 0.045244978938462306,
3549
+ "eval_accuracy": 0.022879129772772476,
3550
+ "eval_loss": 9.148832321166992,
3551
+ "eval_runtime": 212.7494,
3552
+ "eval_samples_per_second": 2857.071,
3553
+ "eval_steps_per_second": 11.163,
3554
+ "step": 10000
3555
  }
3556
  ],
3557
  "logging_steps": 20,
 
3559
  "num_input_tokens_seen": 0,
3560
  "num_train_epochs": 3,
3561
  "save_steps": 100,
3562
+ "total_flos": 3595905269760000.0,
3563
  "train_batch_size": 256,
3564
  "trial_name": null,
3565
  "trial_params": null