{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1892, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004228329809725159, "grad_norm": 6.605544090270996, "learning_rate": 5.263157894736842e-08, "loss": 1.8237226009368896, "step": 2 }, { "epoch": 0.008456659619450317, "grad_norm": 0.9109106659889221, "learning_rate": 1.5789473684210527e-07, "loss": 2.176421880722046, "step": 4 }, { "epoch": 0.012684989429175475, "grad_norm": 3.9910192489624023, "learning_rate": 2.6315789473684213e-07, "loss": 2.1531057357788086, "step": 6 }, { "epoch": 0.016913319238900635, "grad_norm": 0.9899866580963135, "learning_rate": 3.6842105263157896e-07, "loss": 1.9564805030822754, "step": 8 }, { "epoch": 0.021141649048625793, "grad_norm": 2.9574663639068604, "learning_rate": 4.7368421052631585e-07, "loss": 2.021973133087158, "step": 10 }, { "epoch": 0.02536997885835095, "grad_norm": 2.2470693588256836, "learning_rate": 5.789473684210526e-07, "loss": 1.692598581314087, "step": 12 }, { "epoch": 0.02959830866807611, "grad_norm": 1.5818345546722412, "learning_rate": 6.842105263157896e-07, "loss": 1.6616182327270508, "step": 14 }, { "epoch": 0.03382663847780127, "grad_norm": 1.9749239683151245, "learning_rate": 7.894736842105263e-07, "loss": 1.8213186264038086, "step": 16 }, { "epoch": 0.03805496828752643, "grad_norm": 1.0426429510116577, "learning_rate": 8.947368421052632e-07, "loss": 1.8437881469726562, "step": 18 }, { "epoch": 0.042283298097251586, "grad_norm": 15.372987747192383, "learning_rate": 1.0000000000000002e-06, "loss": 2.0638184547424316, "step": 20 }, { "epoch": 0.046511627906976744, "grad_norm": 4.7466301918029785, "learning_rate": 1.1052631578947369e-06, "loss": 1.948048710823059, "step": 22 }, { "epoch": 0.0507399577167019, "grad_norm": 1.4575281143188477, "learning_rate": 1.2105263157894738e-06, "loss": 1.8744062185287476, "step": 24 }, { "epoch": 0.05496828752642706, "grad_norm": 3.336325168609619, "learning_rate": 1.3157894736842106e-06, "loss": 1.715809941291809, "step": 26 }, { "epoch": 0.05919661733615222, "grad_norm": 0.6526132822036743, "learning_rate": 1.4210526315789475e-06, "loss": 1.750809907913208, "step": 28 }, { "epoch": 0.06342494714587738, "grad_norm": 0.8683815002441406, "learning_rate": 1.5263157894736844e-06, "loss": 1.5532337427139282, "step": 30 }, { "epoch": 0.06765327695560254, "grad_norm": 4.369999885559082, "learning_rate": 1.6315789473684212e-06, "loss": 1.2949138879776, "step": 32 }, { "epoch": 0.07188160676532769, "grad_norm": 0.8993115425109863, "learning_rate": 1.736842105263158e-06, "loss": 1.3249835968017578, "step": 34 }, { "epoch": 0.07610993657505286, "grad_norm": 1.2760223150253296, "learning_rate": 1.8421052631578948e-06, "loss": 1.4548490047454834, "step": 36 }, { "epoch": 0.080338266384778, "grad_norm": 5.320272445678711, "learning_rate": 1.9473684210526315e-06, "loss": 1.3651189804077148, "step": 38 }, { "epoch": 0.08456659619450317, "grad_norm": 4.702236175537109, "learning_rate": 2.0526315789473687e-06, "loss": 1.316279649734497, "step": 40 }, { "epoch": 0.08879492600422834, "grad_norm": 5.524050712585449, "learning_rate": 2.1578947368421054e-06, "loss": 1.7164320945739746, "step": 42 }, { "epoch": 0.09302325581395349, "grad_norm": 1.7671512365341187, "learning_rate": 2.2631578947368426e-06, "loss": 1.695072889328003, "step": 44 }, { "epoch": 0.09725158562367865, "grad_norm": 0.7419072985649109, "learning_rate": 2.368421052631579e-06, "loss": 1.3463151454925537, "step": 46 }, { "epoch": 0.1014799154334038, "grad_norm": 0.44772660732269287, "learning_rate": 2.473684210526316e-06, "loss": 1.5603983402252197, "step": 48 }, { "epoch": 0.10570824524312897, "grad_norm": 0.9128488302230835, "learning_rate": 2.578947368421053e-06, "loss": 1.5553311109542847, "step": 50 }, { "epoch": 0.10993657505285412, "grad_norm": 2.591745138168335, "learning_rate": 2.68421052631579e-06, "loss": 0.8912559747695923, "step": 52 }, { "epoch": 0.11416490486257928, "grad_norm": 0.4358270466327667, "learning_rate": 2.789473684210526e-06, "loss": 1.115212321281433, "step": 54 }, { "epoch": 0.11839323467230443, "grad_norm": 0.9643327593803406, "learning_rate": 2.8947368421052634e-06, "loss": 1.4279577732086182, "step": 56 }, { "epoch": 0.1226215644820296, "grad_norm": 0.7719278931617737, "learning_rate": 3e-06, "loss": 1.4487831592559814, "step": 58 }, { "epoch": 0.12684989429175475, "grad_norm": 0.7415221929550171, "learning_rate": 3.1052631578947372e-06, "loss": 1.4636942148208618, "step": 60 }, { "epoch": 0.13107822410147993, "grad_norm": 1.0166652202606201, "learning_rate": 3.210526315789474e-06, "loss": 0.8027479648590088, "step": 62 }, { "epoch": 0.13530655391120508, "grad_norm": 0.8704442381858826, "learning_rate": 3.3157894736842107e-06, "loss": 1.042100429534912, "step": 64 }, { "epoch": 0.13953488372093023, "grad_norm": 0.5612062811851501, "learning_rate": 3.421052631578948e-06, "loss": 1.3720247745513916, "step": 66 }, { "epoch": 0.14376321353065538, "grad_norm": 0.9619214534759521, "learning_rate": 3.5263157894736846e-06, "loss": 1.4121863842010498, "step": 68 }, { "epoch": 0.14799154334038056, "grad_norm": 0.7827504277229309, "learning_rate": 3.6315789473684217e-06, "loss": 1.409840703010559, "step": 70 }, { "epoch": 0.1522198731501057, "grad_norm": 0.7777084708213806, "learning_rate": 3.736842105263158e-06, "loss": 1.3885422945022583, "step": 72 }, { "epoch": 0.15644820295983086, "grad_norm": 0.7667552828788757, "learning_rate": 3.842105263157895e-06, "loss": 0.9778481125831604, "step": 74 }, { "epoch": 0.160676532769556, "grad_norm": 0.7666372060775757, "learning_rate": 3.947368421052632e-06, "loss": 0.8655365705490112, "step": 76 }, { "epoch": 0.1649048625792812, "grad_norm": 1.127411961555481, "learning_rate": 4.052631578947368e-06, "loss": 1.3328880071640015, "step": 78 }, { "epoch": 0.16913319238900634, "grad_norm": 1.8701919317245483, "learning_rate": 4.157894736842106e-06, "loss": 1.1148747205734253, "step": 80 }, { "epoch": 0.1733615221987315, "grad_norm": 0.7047215104103088, "learning_rate": 4.2631578947368425e-06, "loss": 1.1087937355041504, "step": 82 }, { "epoch": 0.17758985200845667, "grad_norm": 1.534998893737793, "learning_rate": 4.368421052631579e-06, "loss": 0.9685766696929932, "step": 84 }, { "epoch": 0.18181818181818182, "grad_norm": 1.2067896127700806, "learning_rate": 4.473684210526316e-06, "loss": 1.7643671035766602, "step": 86 }, { "epoch": 0.18604651162790697, "grad_norm": 1.5933588743209839, "learning_rate": 4.578947368421053e-06, "loss": 1.3110942840576172, "step": 88 }, { "epoch": 0.19027484143763213, "grad_norm": 4.7901225090026855, "learning_rate": 4.68421052631579e-06, "loss": 1.0165361166000366, "step": 90 }, { "epoch": 0.1945031712473573, "grad_norm": 0.7400741577148438, "learning_rate": 4.789473684210527e-06, "loss": 1.3473522663116455, "step": 92 }, { "epoch": 0.19873150105708245, "grad_norm": 1.6431527137756348, "learning_rate": 4.894736842105264e-06, "loss": 0.9921259880065918, "step": 94 }, { "epoch": 0.2029598308668076, "grad_norm": 1.1242965459823608, "learning_rate": 5e-06, "loss": 1.3252302408218384, "step": 96 }, { "epoch": 0.20718816067653276, "grad_norm": 1.4306470155715942, "learning_rate": 4.999986246423023e-06, "loss": 1.4119060039520264, "step": 98 }, { "epoch": 0.21141649048625794, "grad_norm": 0.6519229412078857, "learning_rate": 4.999944985860234e-06, "loss": 1.3824262619018555, "step": 100 }, { "epoch": 0.2156448202959831, "grad_norm": 0.9801005721092224, "learning_rate": 4.9998762188160604e-06, "loss": 1.2583341598510742, "step": 102 }, { "epoch": 0.21987315010570824, "grad_norm": 0.6960574388504028, "learning_rate": 4.999779946131206e-06, "loss": 1.3352376222610474, "step": 104 }, { "epoch": 0.22410147991543342, "grad_norm": 0.937359631061554, "learning_rate": 4.9996561689826455e-06, "loss": 1.3265419006347656, "step": 106 }, { "epoch": 0.22832980972515857, "grad_norm": 0.973751962184906, "learning_rate": 4.999504888883601e-06, "loss": 1.0313334465026855, "step": 108 }, { "epoch": 0.23255813953488372, "grad_norm": 0.8326078653335571, "learning_rate": 4.999326107683535e-06, "loss": 1.3112177848815918, "step": 110 }, { "epoch": 0.23678646934460887, "grad_norm": 1.3524088859558105, "learning_rate": 4.999119827568119e-06, "loss": 1.379159688949585, "step": 112 }, { "epoch": 0.24101479915433405, "grad_norm": 0.8450507521629333, "learning_rate": 4.9988860510592085e-06, "loss": 1.4243919849395752, "step": 114 }, { "epoch": 0.2452431289640592, "grad_norm": 1.4355442523956299, "learning_rate": 4.998624781014819e-06, "loss": 1.3441710472106934, "step": 116 }, { "epoch": 0.24947145877378435, "grad_norm": 1.2588502168655396, "learning_rate": 4.998336020629077e-06, "loss": 1.3617056608200073, "step": 118 }, { "epoch": 0.2536997885835095, "grad_norm": 1.3343617916107178, "learning_rate": 4.998019773432198e-06, "loss": 0.8026182055473328, "step": 120 }, { "epoch": 0.25792811839323465, "grad_norm": 0.6089492440223694, "learning_rate": 4.997676043290429e-06, "loss": 1.320847988128662, "step": 122 }, { "epoch": 0.26215644820295986, "grad_norm": 0.6601752042770386, "learning_rate": 4.997304834406011e-06, "loss": 1.1560726165771484, "step": 124 }, { "epoch": 0.266384778012685, "grad_norm": 0.9852200746536255, "learning_rate": 4.9969061513171185e-06, "loss": 1.3645535707473755, "step": 126 }, { "epoch": 0.27061310782241016, "grad_norm": 1.04742431640625, "learning_rate": 4.996479998897815e-06, "loss": 1.0370509624481201, "step": 128 }, { "epoch": 0.2748414376321353, "grad_norm": 0.768661379814148, "learning_rate": 4.996026382357985e-06, "loss": 0.976492166519165, "step": 130 }, { "epoch": 0.27906976744186046, "grad_norm": 1.244371771812439, "learning_rate": 4.995545307243273e-06, "loss": 1.290363073348999, "step": 132 }, { "epoch": 0.2832980972515856, "grad_norm": 1.2053449153900146, "learning_rate": 4.995036779435014e-06, "loss": 0.8751378655433655, "step": 134 }, { "epoch": 0.28752642706131076, "grad_norm": 2.1075494289398193, "learning_rate": 4.994500805150167e-06, "loss": 1.123706579208374, "step": 136 }, { "epoch": 0.2917547568710359, "grad_norm": 2.0092146396636963, "learning_rate": 4.993937390941231e-06, "loss": 1.4683767557144165, "step": 138 }, { "epoch": 0.2959830866807611, "grad_norm": 1.5879631042480469, "learning_rate": 4.9933465436961705e-06, "loss": 0.9096964597702026, "step": 140 }, { "epoch": 0.30021141649048627, "grad_norm": 1.2290595769882202, "learning_rate": 4.992728270638333e-06, "loss": 1.5735292434692383, "step": 142 }, { "epoch": 0.3044397463002114, "grad_norm": 16.471097946166992, "learning_rate": 4.992082579326354e-06, "loss": 1.1104016304016113, "step": 144 }, { "epoch": 0.3086680761099366, "grad_norm": 3.488492250442505, "learning_rate": 4.9914094776540676e-06, "loss": 0.6830090284347534, "step": 146 }, { "epoch": 0.3128964059196617, "grad_norm": 1.9017225503921509, "learning_rate": 4.990708973850415e-06, "loss": 1.2578611373901367, "step": 148 }, { "epoch": 0.3171247357293869, "grad_norm": 0.8831533789634705, "learning_rate": 4.989981076479334e-06, "loss": 1.2861902713775635, "step": 150 }, { "epoch": 0.321353065539112, "grad_norm": 1.4398751258850098, "learning_rate": 4.989225794439665e-06, "loss": 1.2161321640014648, "step": 152 }, { "epoch": 0.32558139534883723, "grad_norm": 2.2701990604400635, "learning_rate": 4.9884431369650316e-06, "loss": 0.7495906949043274, "step": 154 }, { "epoch": 0.3298097251585624, "grad_norm": 1.7071163654327393, "learning_rate": 4.987633113623737e-06, "loss": 1.2624861001968384, "step": 156 }, { "epoch": 0.33403805496828753, "grad_norm": 0.7767173647880554, "learning_rate": 4.986795734318643e-06, "loss": 1.2871098518371582, "step": 158 }, { "epoch": 0.3382663847780127, "grad_norm": 3.3821675777435303, "learning_rate": 4.985931009287047e-06, "loss": 1.279846429824829, "step": 160 }, { "epoch": 0.34249471458773784, "grad_norm": 1.1429699659347534, "learning_rate": 4.98503894910056e-06, "loss": 1.1057888269424438, "step": 162 }, { "epoch": 0.346723044397463, "grad_norm": 1.2170449495315552, "learning_rate": 4.9841195646649764e-06, "loss": 1.0053894519805908, "step": 164 }, { "epoch": 0.35095137420718814, "grad_norm": 0.7444745302200317, "learning_rate": 4.98317286722014e-06, "loss": 0.8725204467773438, "step": 166 }, { "epoch": 0.35517970401691334, "grad_norm": 1.1420894861221313, "learning_rate": 4.982198868339808e-06, "loss": 1.1899808645248413, "step": 168 }, { "epoch": 0.3594080338266385, "grad_norm": 0.6692090034484863, "learning_rate": 4.981197579931507e-06, "loss": 1.2234405279159546, "step": 170 }, { "epoch": 0.36363636363636365, "grad_norm": 0.6509742736816406, "learning_rate": 4.980169014236391e-06, "loss": 1.050593614578247, "step": 172 }, { "epoch": 0.3678646934460888, "grad_norm": 0.659625232219696, "learning_rate": 4.979113183829088e-06, "loss": 1.3807719945907593, "step": 174 }, { "epoch": 0.37209302325581395, "grad_norm": 0.8343127369880676, "learning_rate": 4.97803010161755e-06, "loss": 1.068203330039978, "step": 176 }, { "epoch": 0.3763213530655391, "grad_norm": 1.0895072221755981, "learning_rate": 4.976919780842892e-06, "loss": 0.8895647525787354, "step": 178 }, { "epoch": 0.38054968287526425, "grad_norm": 0.4808787703514099, "learning_rate": 4.97578223507923e-06, "loss": 1.3076379299163818, "step": 180 }, { "epoch": 0.38477801268498946, "grad_norm": 1.1349347829818726, "learning_rate": 4.97461747823352e-06, "loss": 1.547876238822937, "step": 182 }, { "epoch": 0.3890063424947146, "grad_norm": 1.6290419101715088, "learning_rate": 4.973425524545382e-06, "loss": 1.3527616262435913, "step": 184 }, { "epoch": 0.39323467230443976, "grad_norm": 1.678579568862915, "learning_rate": 4.972206388586927e-06, "loss": 1.102654218673706, "step": 186 }, { "epoch": 0.3974630021141649, "grad_norm": 1.367773175239563, "learning_rate": 4.970960085262584e-06, "loss": 0.9921371340751648, "step": 188 }, { "epoch": 0.40169133192389006, "grad_norm": 1.1447407007217407, "learning_rate": 4.969686629808911e-06, "loss": 1.0145394802093506, "step": 190 }, { "epoch": 0.4059196617336152, "grad_norm": 0.8305537104606628, "learning_rate": 4.9683860377944125e-06, "loss": 1.3157576322555542, "step": 192 }, { "epoch": 0.41014799154334036, "grad_norm": 0.30826497077941895, "learning_rate": 4.967058325119348e-06, "loss": 1.023323655128479, "step": 194 }, { "epoch": 0.4143763213530655, "grad_norm": 1.3932182788848877, "learning_rate": 4.965703508015539e-06, "loss": 1.2941372394561768, "step": 196 }, { "epoch": 0.4186046511627907, "grad_norm": 0.7291075587272644, "learning_rate": 4.964321603046169e-06, "loss": 1.0717015266418457, "step": 198 }, { "epoch": 0.42283298097251587, "grad_norm": 0.5842322707176208, "learning_rate": 4.962912627105581e-06, "loss": 1.1873562335968018, "step": 200 }, { "epoch": 0.427061310782241, "grad_norm": 0.6558592319488525, "learning_rate": 4.961476597419072e-06, "loss": 0.5549638867378235, "step": 202 }, { "epoch": 0.4312896405919662, "grad_norm": 0.5496861934661865, "learning_rate": 4.960013531542681e-06, "loss": 1.251203179359436, "step": 204 }, { "epoch": 0.4355179704016913, "grad_norm": 1.2829310894012451, "learning_rate": 4.958523447362978e-06, "loss": 1.3057016134262085, "step": 206 }, { "epoch": 0.4397463002114165, "grad_norm": 1.240744709968567, "learning_rate": 4.95700636309684e-06, "loss": 0.773059606552124, "step": 208 }, { "epoch": 0.4439746300211416, "grad_norm": 0.6856869459152222, "learning_rate": 4.955462297291231e-06, "loss": 1.3060951232910156, "step": 210 }, { "epoch": 0.44820295983086683, "grad_norm": 1.0836693048477173, "learning_rate": 4.953891268822977e-06, "loss": 1.2806111574172974, "step": 212 }, { "epoch": 0.452431289640592, "grad_norm": 0.8455312252044678, "learning_rate": 4.952293296898531e-06, "loss": 1.4302403926849365, "step": 214 }, { "epoch": 0.45665961945031713, "grad_norm": 0.5475296974182129, "learning_rate": 4.9506684010537425e-06, "loss": 0.6154606938362122, "step": 216 }, { "epoch": 0.4608879492600423, "grad_norm": 2.704235315322876, "learning_rate": 4.949016601153615e-06, "loss": 1.0028847455978394, "step": 218 }, { "epoch": 0.46511627906976744, "grad_norm": 0.5048341751098633, "learning_rate": 4.947337917392068e-06, "loss": 1.2792624235153198, "step": 220 }, { "epoch": 0.4693446088794926, "grad_norm": 1.052980899810791, "learning_rate": 4.9456323702916834e-06, "loss": 1.2493329048156738, "step": 222 }, { "epoch": 0.47357293868921774, "grad_norm": 0.7700513601303101, "learning_rate": 4.94389998070346e-06, "loss": 1.3523839712142944, "step": 224 }, { "epoch": 0.47780126849894294, "grad_norm": 0.4727860689163208, "learning_rate": 4.9421407698065546e-06, "loss": 1.262749433517456, "step": 226 }, { "epoch": 0.4820295983086681, "grad_norm": 0.7402790188789368, "learning_rate": 4.940354759108031e-06, "loss": 1.2572187185287476, "step": 228 }, { "epoch": 0.48625792811839325, "grad_norm": 0.5866014361381531, "learning_rate": 4.938541970442585e-06, "loss": 0.9033302068710327, "step": 230 }, { "epoch": 0.4904862579281184, "grad_norm": 0.6419193148612976, "learning_rate": 4.9367024259722866e-06, "loss": 1.2711232900619507, "step": 232 }, { "epoch": 0.49471458773784355, "grad_norm": 1.1681197881698608, "learning_rate": 4.934836148186306e-06, "loss": 0.9501933455467224, "step": 234 }, { "epoch": 0.4989429175475687, "grad_norm": 7.673317909240723, "learning_rate": 4.93294315990064e-06, "loss": 0.8862608075141907, "step": 236 }, { "epoch": 0.5031712473572939, "grad_norm": 1.019538402557373, "learning_rate": 4.93102348425783e-06, "loss": 0.8333485722541809, "step": 238 }, { "epoch": 0.507399577167019, "grad_norm": 0.6142158508300781, "learning_rate": 4.9290771447266815e-06, "loss": 0.8960846066474915, "step": 240 }, { "epoch": 0.5116279069767442, "grad_norm": 0.829379677772522, "learning_rate": 4.927104165101979e-06, "loss": 1.3168963193893433, "step": 242 }, { "epoch": 0.5158562367864693, "grad_norm": 1.2920678853988647, "learning_rate": 4.925104569504188e-06, "loss": 1.365329623222351, "step": 244 }, { "epoch": 0.5200845665961945, "grad_norm": 1.2702393531799316, "learning_rate": 4.923078382379172e-06, "loss": 1.2854634523391724, "step": 246 }, { "epoch": 0.5243128964059197, "grad_norm": 0.618291974067688, "learning_rate": 4.921025628497879e-06, "loss": 1.2556568384170532, "step": 248 }, { "epoch": 0.5285412262156448, "grad_norm": 1.7209643125534058, "learning_rate": 4.918946332956052e-06, "loss": 1.1718345880508423, "step": 250 }, { "epoch": 0.53276955602537, "grad_norm": 2.5865588188171387, "learning_rate": 4.916840521173914e-06, "loss": 1.1015582084655762, "step": 252 }, { "epoch": 0.5369978858350951, "grad_norm": 1.6121222972869873, "learning_rate": 4.914708218895861e-06, "loss": 1.742082118988037, "step": 254 }, { "epoch": 0.5412262156448203, "grad_norm": 0.8724984526634216, "learning_rate": 4.912549452190142e-06, "loss": 1.2257004976272583, "step": 256 }, { "epoch": 0.5454545454545454, "grad_norm": 0.8018029928207397, "learning_rate": 4.9103642474485506e-06, "loss": 1.404122233390808, "step": 258 }, { "epoch": 0.5496828752642706, "grad_norm": 0.7809526920318604, "learning_rate": 4.908152631386091e-06, "loss": 1.0011447668075562, "step": 260 }, { "epoch": 0.5539112050739958, "grad_norm": 0.903286874294281, "learning_rate": 4.905914631040658e-06, "loss": 1.2129504680633545, "step": 262 }, { "epoch": 0.5581395348837209, "grad_norm": 0.8726590871810913, "learning_rate": 4.9036502737727055e-06, "loss": 1.0686239004135132, "step": 264 }, { "epoch": 0.5623678646934461, "grad_norm": 1.0852851867675781, "learning_rate": 4.901359587264911e-06, "loss": 1.510224461555481, "step": 266 }, { "epoch": 0.5665961945031712, "grad_norm": 1.2106469869613647, "learning_rate": 4.899042599521836e-06, "loss": 0.5239309072494507, "step": 268 }, { "epoch": 0.5708245243128964, "grad_norm": 1.205407738685608, "learning_rate": 4.8966993388695886e-06, "loss": 1.0271662473678589, "step": 270 }, { "epoch": 0.5750528541226215, "grad_norm": 2.216895580291748, "learning_rate": 4.894329833955471e-06, "loss": 1.2076795101165771, "step": 272 }, { "epoch": 0.5792811839323467, "grad_norm": 1.1767100095748901, "learning_rate": 4.891934113747631e-06, "loss": 0.9579524993896484, "step": 274 }, { "epoch": 0.5835095137420718, "grad_norm": 1.0378605127334595, "learning_rate": 4.8895122075347135e-06, "loss": 0.9333509206771851, "step": 276 }, { "epoch": 0.587737843551797, "grad_norm": 0.2679741382598877, "learning_rate": 4.887064144925493e-06, "loss": 0.8527027368545532, "step": 278 }, { "epoch": 0.5919661733615222, "grad_norm": 0.6713143587112427, "learning_rate": 4.8845899558485185e-06, "loss": 1.2377649545669556, "step": 280 }, { "epoch": 0.5961945031712473, "grad_norm": 0.8605502247810364, "learning_rate": 4.8820896705517465e-06, "loss": 1.4566680192947388, "step": 282 }, { "epoch": 0.6004228329809725, "grad_norm": 0.1871010959148407, "learning_rate": 4.879563319602169e-06, "loss": 0.9204542636871338, "step": 284 }, { "epoch": 0.6046511627906976, "grad_norm": 1.0409096479415894, "learning_rate": 4.87701093388544e-06, "loss": 1.2875986099243164, "step": 286 }, { "epoch": 0.6088794926004228, "grad_norm": 1.0819401741027832, "learning_rate": 4.874432544605502e-06, "loss": 0.4104747176170349, "step": 288 }, { "epoch": 0.6131078224101479, "grad_norm": 1.2147349119186401, "learning_rate": 4.871828183284199e-06, "loss": 0.9401180744171143, "step": 290 }, { "epoch": 0.6173361522198731, "grad_norm": 0.9073833227157593, "learning_rate": 4.869197881760896e-06, "loss": 0.881571888923645, "step": 292 }, { "epoch": 0.6215644820295984, "grad_norm": 1.9964375495910645, "learning_rate": 4.866541672192082e-06, "loss": 0.7248478531837463, "step": 294 }, { "epoch": 0.6257928118393234, "grad_norm": 0.8532471656799316, "learning_rate": 4.863859587050991e-06, "loss": 0.7459216117858887, "step": 296 }, { "epoch": 0.6300211416490487, "grad_norm": 1.436072587966919, "learning_rate": 4.861151659127188e-06, "loss": 1.300452709197998, "step": 298 }, { "epoch": 0.6342494714587738, "grad_norm": 0.7239471673965454, "learning_rate": 4.85841792152618e-06, "loss": 1.2527419328689575, "step": 300 }, { "epoch": 0.638477801268499, "grad_norm": 0.6468306183815002, "learning_rate": 4.85565840766901e-06, "loss": 0.6989489197731018, "step": 302 }, { "epoch": 0.642706131078224, "grad_norm": 0.8453686237335205, "learning_rate": 4.852873151291841e-06, "loss": 0.8262038230895996, "step": 304 }, { "epoch": 0.6469344608879493, "grad_norm": 1.1085318326950073, "learning_rate": 4.850062186445552e-06, "loss": 0.9046404361724854, "step": 306 }, { "epoch": 0.6511627906976745, "grad_norm": 1.146599531173706, "learning_rate": 4.847225547495318e-06, "loss": 1.2455283403396606, "step": 308 }, { "epoch": 0.6553911205073996, "grad_norm": 1.3924742937088013, "learning_rate": 4.84436326912019e-06, "loss": 1.2206207513809204, "step": 310 }, { "epoch": 0.6596194503171248, "grad_norm": 0.65780109167099, "learning_rate": 4.84147538631267e-06, "loss": 1.2247376441955566, "step": 312 }, { "epoch": 0.6638477801268499, "grad_norm": 1.4019877910614014, "learning_rate": 4.8385619343782865e-06, "loss": 1.2421458959579468, "step": 314 }, { "epoch": 0.6680761099365751, "grad_norm": 0.5540094375610352, "learning_rate": 4.835622948935159e-06, "loss": 1.0704643726348877, "step": 316 }, { "epoch": 0.6723044397463002, "grad_norm": 4.01638126373291, "learning_rate": 4.832658465913566e-06, "loss": 0.7506370544433594, "step": 318 }, { "epoch": 0.6765327695560254, "grad_norm": 0.8524858355522156, "learning_rate": 4.829668521555503e-06, "loss": 1.2541189193725586, "step": 320 }, { "epoch": 0.6807610993657506, "grad_norm": 0.789856493473053, "learning_rate": 4.826653152414242e-06, "loss": 1.31632661819458, "step": 322 }, { "epoch": 0.6849894291754757, "grad_norm": 0.6171086430549622, "learning_rate": 4.823612395353881e-06, "loss": 1.0809494256973267, "step": 324 }, { "epoch": 0.6892177589852009, "grad_norm": 0.7493656873703003, "learning_rate": 4.820546287548897e-06, "loss": 1.2742823362350464, "step": 326 }, { "epoch": 0.693446088794926, "grad_norm": 1.6876893043518066, "learning_rate": 4.81745486648369e-06, "loss": 1.1811712980270386, "step": 328 }, { "epoch": 0.6976744186046512, "grad_norm": 3.2828762531280518, "learning_rate": 4.814338169952125e-06, "loss": 0.8377833366394043, "step": 330 }, { "epoch": 0.7019027484143763, "grad_norm": 1.0052536725997925, "learning_rate": 4.811196236057068e-06, "loss": 1.3030086755752563, "step": 332 }, { "epoch": 0.7061310782241015, "grad_norm": 1.0873143672943115, "learning_rate": 4.808029103209925e-06, "loss": 1.2012561559677124, "step": 334 }, { "epoch": 0.7103594080338267, "grad_norm": 0.6253702640533447, "learning_rate": 4.804836810130165e-06, "loss": 1.2230525016784668, "step": 336 }, { "epoch": 0.7145877378435518, "grad_norm": 0.9542647004127502, "learning_rate": 4.801619395844855e-06, "loss": 1.3592028617858887, "step": 338 }, { "epoch": 0.718816067653277, "grad_norm": 2.7538552284240723, "learning_rate": 4.798376899688178e-06, "loss": 1.2697663307189941, "step": 340 }, { "epoch": 0.7230443974630021, "grad_norm": 1.300477147102356, "learning_rate": 4.79510936130095e-06, "loss": 1.1157526969909668, "step": 342 }, { "epoch": 0.7272727272727273, "grad_norm": 1.0564080476760864, "learning_rate": 4.791816820630143e-06, "loss": 0.9000387191772461, "step": 344 }, { "epoch": 0.7315010570824524, "grad_norm": 6.434342861175537, "learning_rate": 4.788499317928387e-06, "loss": 0.8705897927284241, "step": 346 }, { "epoch": 0.7357293868921776, "grad_norm": 1.8239296674728394, "learning_rate": 4.785156893753487e-06, "loss": 0.9839805364608765, "step": 348 }, { "epoch": 0.7399577167019028, "grad_norm": 1.4335103034973145, "learning_rate": 4.781789588967922e-06, "loss": 1.3093687295913696, "step": 350 }, { "epoch": 0.7441860465116279, "grad_norm": 1.6742173433303833, "learning_rate": 4.778397444738344e-06, "loss": 1.1608158349990845, "step": 352 }, { "epoch": 0.7484143763213531, "grad_norm": 7.366899013519287, "learning_rate": 4.774980502535081e-06, "loss": 0.7054665088653564, "step": 354 }, { "epoch": 0.7526427061310782, "grad_norm": 1.3835808038711548, "learning_rate": 4.771538804131623e-06, "loss": 1.0112260580062866, "step": 356 }, { "epoch": 0.7568710359408034, "grad_norm": 0.7247501015663147, "learning_rate": 4.7680723916041145e-06, "loss": 1.21829092502594, "step": 358 }, { "epoch": 0.7610993657505285, "grad_norm": 1.7645400762557983, "learning_rate": 4.764581307330844e-06, "loss": 0.8012920618057251, "step": 360 }, { "epoch": 0.7653276955602537, "grad_norm": 0.9456394910812378, "learning_rate": 4.761065593991716e-06, "loss": 1.0871394872665405, "step": 362 }, { "epoch": 0.7695560253699789, "grad_norm": 1.7007086277008057, "learning_rate": 4.757525294567743e-06, "loss": 1.0711324214935303, "step": 364 }, { "epoch": 0.773784355179704, "grad_norm": 1.7648683786392212, "learning_rate": 4.753960452340503e-06, "loss": 1.2688275575637817, "step": 366 }, { "epoch": 0.7780126849894292, "grad_norm": 2.573831796646118, "learning_rate": 4.750371110891628e-06, "loss": 1.2218682765960693, "step": 368 }, { "epoch": 0.7822410147991543, "grad_norm": 1.4216328859329224, "learning_rate": 4.746757314102258e-06, "loss": 0.882118821144104, "step": 370 }, { "epoch": 0.7864693446088795, "grad_norm": 0.9801198840141296, "learning_rate": 4.74311910615251e-06, "loss": 1.1977894306182861, "step": 372 }, { "epoch": 0.7906976744186046, "grad_norm": 0.9408032894134521, "learning_rate": 4.739456531520939e-06, "loss": 1.218635082244873, "step": 374 }, { "epoch": 0.7949260042283298, "grad_norm": 0.8495731949806213, "learning_rate": 4.735769634983991e-06, "loss": 1.3023980855941772, "step": 376 }, { "epoch": 0.7991543340380549, "grad_norm": 0.7672894597053528, "learning_rate": 4.732058461615457e-06, "loss": 0.9807602763175964, "step": 378 }, { "epoch": 0.8033826638477801, "grad_norm": 1.1932101249694824, "learning_rate": 4.728323056785922e-06, "loss": 1.3166661262512207, "step": 380 }, { "epoch": 0.8076109936575053, "grad_norm": 0.8716092109680176, "learning_rate": 4.724563466162212e-06, "loss": 1.1811023950576782, "step": 382 }, { "epoch": 0.8118393234672304, "grad_norm": 0.7233268618583679, "learning_rate": 4.7207797357068325e-06, "loss": 0.9482329487800598, "step": 384 }, { "epoch": 0.8160676532769556, "grad_norm": 1.9323195219039917, "learning_rate": 4.716971911677408e-06, "loss": 0.9550711512565613, "step": 386 }, { "epoch": 0.8202959830866807, "grad_norm": 2.0255048274993896, "learning_rate": 4.713140040626116e-06, "loss": 1.4793070554733276, "step": 388 }, { "epoch": 0.8245243128964059, "grad_norm": 3.492385149002075, "learning_rate": 4.709284169399122e-06, "loss": 1.1643321514129639, "step": 390 }, { "epoch": 0.828752642706131, "grad_norm": 1.8576074838638306, "learning_rate": 4.7054043451359995e-06, "loss": 0.9359977841377258, "step": 392 }, { "epoch": 0.8329809725158562, "grad_norm": 2.6958370208740234, "learning_rate": 4.70150061526916e-06, "loss": 1.2630811929702759, "step": 394 }, { "epoch": 0.8372093023255814, "grad_norm": 0.2705208957195282, "learning_rate": 4.6975730275232675e-06, "loss": 0.7544412612915039, "step": 396 }, { "epoch": 0.8414376321353065, "grad_norm": 0.574475109577179, "learning_rate": 4.693621629914662e-06, "loss": 0.6635357737541199, "step": 398 }, { "epoch": 0.8456659619450317, "grad_norm": 3.4476590156555176, "learning_rate": 4.689646470750765e-06, "loss": 1.1272733211517334, "step": 400 }, { "epoch": 0.8498942917547568, "grad_norm": 1.7214937210083008, "learning_rate": 4.685647598629496e-06, "loss": 0.9259978532791138, "step": 402 }, { "epoch": 0.854122621564482, "grad_norm": 0.8425617218017578, "learning_rate": 4.681625062438672e-06, "loss": 0.8047032952308655, "step": 404 }, { "epoch": 0.8583509513742071, "grad_norm": 0.5278515219688416, "learning_rate": 4.677578911355415e-06, "loss": 0.9893290996551514, "step": 406 }, { "epoch": 0.8625792811839323, "grad_norm": 1.7615666389465332, "learning_rate": 4.673509194845547e-06, "loss": 1.0258289575576782, "step": 408 }, { "epoch": 0.8668076109936576, "grad_norm": 4.036815643310547, "learning_rate": 4.669415962662987e-06, "loss": 0.8945714235305786, "step": 410 }, { "epoch": 0.8710359408033826, "grad_norm": 1.1441407203674316, "learning_rate": 4.665299264849144e-06, "loss": 1.1385798454284668, "step": 412 }, { "epoch": 0.8752642706131079, "grad_norm": 0.9861418604850769, "learning_rate": 4.661159151732302e-06, "loss": 1.000221848487854, "step": 414 }, { "epoch": 0.879492600422833, "grad_norm": 1.6568901538848877, "learning_rate": 4.656995673927008e-06, "loss": 1.1056493520736694, "step": 416 }, { "epoch": 0.8837209302325582, "grad_norm": 1.9883769750595093, "learning_rate": 4.6528088823334485e-06, "loss": 1.2290613651275635, "step": 418 }, { "epoch": 0.8879492600422833, "grad_norm": 2.026019334793091, "learning_rate": 4.648598828136836e-06, "loss": 1.2732092142105103, "step": 420 }, { "epoch": 0.8921775898520085, "grad_norm": 2.332921266555786, "learning_rate": 4.644365562806772e-06, "loss": 0.9564085006713867, "step": 422 }, { "epoch": 0.8964059196617337, "grad_norm": 1.1014127731323242, "learning_rate": 4.6401091380966276e-06, "loss": 1.2294795513153076, "step": 424 }, { "epoch": 0.9006342494714588, "grad_norm": 0.6077444553375244, "learning_rate": 4.635829606042904e-06, "loss": 1.0533849000930786, "step": 426 }, { "epoch": 0.904862579281184, "grad_norm": 1.495557427406311, "learning_rate": 4.6315270189645994e-06, "loss": 0.870442807674408, "step": 428 }, { "epoch": 0.9090909090909091, "grad_norm": 0.9370867609977722, "learning_rate": 4.627201429462571e-06, "loss": 1.0831764936447144, "step": 430 }, { "epoch": 0.9133192389006343, "grad_norm": 1.3776339292526245, "learning_rate": 4.622852890418887e-06, "loss": 1.2492940425872803, "step": 432 }, { "epoch": 0.9175475687103594, "grad_norm": 1.6412044763565063, "learning_rate": 4.618481454996184e-06, "loss": 0.5277518033981323, "step": 434 }, { "epoch": 0.9217758985200846, "grad_norm": 1.8212065696716309, "learning_rate": 4.614087176637018e-06, "loss": 0.39484813809394836, "step": 436 }, { "epoch": 0.9260042283298098, "grad_norm": 0.5595113039016724, "learning_rate": 4.6096701090632064e-06, "loss": 0.9221642017364502, "step": 438 }, { "epoch": 0.9302325581395349, "grad_norm": 0.5767474174499512, "learning_rate": 4.605230306275174e-06, "loss": 1.1318392753601074, "step": 440 }, { "epoch": 0.9344608879492601, "grad_norm": 7.54542875289917, "learning_rate": 4.600767822551295e-06, "loss": 0.7188118100166321, "step": 442 }, { "epoch": 0.9386892177589852, "grad_norm": 1.233697772026062, "learning_rate": 4.596282712447225e-06, "loss": 1.243707299232483, "step": 444 }, { "epoch": 0.9429175475687104, "grad_norm": 0.9562466144561768, "learning_rate": 4.591775030795238e-06, "loss": 1.0868984460830688, "step": 446 }, { "epoch": 0.9471458773784355, "grad_norm": 1.5018267631530762, "learning_rate": 4.587244832703551e-06, "loss": 1.1005150079727173, "step": 448 }, { "epoch": 0.9513742071881607, "grad_norm": 1.9610888957977295, "learning_rate": 4.582692173555658e-06, "loss": 0.7627214193344116, "step": 450 }, { "epoch": 0.9556025369978859, "grad_norm": 0.661365807056427, "learning_rate": 4.5781171090096456e-06, "loss": 1.0607075691223145, "step": 452 }, { "epoch": 0.959830866807611, "grad_norm": 1.50721275806427, "learning_rate": 4.573519694997514e-06, "loss": 1.3157492876052856, "step": 454 }, { "epoch": 0.9640591966173362, "grad_norm": 5.258788585662842, "learning_rate": 4.568899987724499e-06, "loss": 0.6505974531173706, "step": 456 }, { "epoch": 0.9682875264270613, "grad_norm": 2.9857466220855713, "learning_rate": 4.564258043668378e-06, "loss": 0.8859183192253113, "step": 458 }, { "epoch": 0.9725158562367865, "grad_norm": 1.149194598197937, "learning_rate": 4.559593919578779e-06, "loss": 1.232746958732605, "step": 460 }, { "epoch": 0.9767441860465116, "grad_norm": 1.1455705165863037, "learning_rate": 4.554907672476498e-06, "loss": 1.2240073680877686, "step": 462 }, { "epoch": 0.9809725158562368, "grad_norm": 2.103093385696411, "learning_rate": 4.550199359652783e-06, "loss": 0.6853596568107605, "step": 464 }, { "epoch": 0.985200845665962, "grad_norm": 2.1419098377227783, "learning_rate": 4.5454690386686525e-06, "loss": 1.2064260244369507, "step": 466 }, { "epoch": 0.9894291754756871, "grad_norm": 2.1042675971984863, "learning_rate": 4.540716767354182e-06, "loss": 0.9678149819374084, "step": 468 }, { "epoch": 0.9936575052854123, "grad_norm": 1.194627046585083, "learning_rate": 4.5359426038077955e-06, "loss": 1.2596162557601929, "step": 470 }, { "epoch": 0.9978858350951374, "grad_norm": 0.7306416034698486, "learning_rate": 4.531146606395561e-06, "loss": 1.2588738203048706, "step": 472 }, { "epoch": 1.0021141649048626, "grad_norm": 1.3689743280410767, "learning_rate": 4.5263288337504755e-06, "loss": 0.9573943614959717, "step": 474 }, { "epoch": 1.0063424947145878, "grad_norm": 0.6775557994842529, "learning_rate": 4.521489344771744e-06, "loss": 1.2035043239593506, "step": 476 }, { "epoch": 1.0105708245243128, "grad_norm": 2.428388833999634, "learning_rate": 4.516628198624062e-06, "loss": 0.43922215700149536, "step": 478 }, { "epoch": 1.014799154334038, "grad_norm": 1.7859880924224854, "learning_rate": 4.511745454736895e-06, "loss": 0.8049924969673157, "step": 480 }, { "epoch": 1.0190274841437632, "grad_norm": 0.9620187878608704, "learning_rate": 4.506841172803751e-06, "loss": 0.7076442241668701, "step": 482 }, { "epoch": 1.0232558139534884, "grad_norm": 0.7065964341163635, "learning_rate": 4.501915412781443e-06, "loss": 1.0704156160354614, "step": 484 }, { "epoch": 1.0274841437632136, "grad_norm": 1.055808663368225, "learning_rate": 4.49696823488937e-06, "loss": 1.1018344163894653, "step": 486 }, { "epoch": 1.0317124735729386, "grad_norm": 1.521101474761963, "learning_rate": 4.491999699608768e-06, "loss": 0.8694652915000916, "step": 488 }, { "epoch": 1.0359408033826638, "grad_norm": 1.305381178855896, "learning_rate": 4.487009867681976e-06, "loss": 0.8501845002174377, "step": 490 }, { "epoch": 1.040169133192389, "grad_norm": 2.744489908218384, "learning_rate": 4.4819988001116935e-06, "loss": 0.7224630117416382, "step": 492 }, { "epoch": 1.0443974630021142, "grad_norm": 1.4060019254684448, "learning_rate": 4.476966558160237e-06, "loss": 1.1804600954055786, "step": 494 }, { "epoch": 1.0486257928118394, "grad_norm": 0.8754310011863708, "learning_rate": 4.4719132033487845e-06, "loss": 0.997734010219574, "step": 496 }, { "epoch": 1.0528541226215644, "grad_norm": 0.615909993648529, "learning_rate": 4.46683879745663e-06, "loss": 0.8599073886871338, "step": 498 }, { "epoch": 1.0570824524312896, "grad_norm": 0.9889487028121948, "learning_rate": 4.461743402520423e-06, "loss": 0.8792165517807007, "step": 500 }, { "epoch": 1.0613107822410148, "grad_norm": 0.7160633206367493, "learning_rate": 4.456627080833414e-06, "loss": 1.1756080389022827, "step": 502 }, { "epoch": 1.06553911205074, "grad_norm": 0.8279690742492676, "learning_rate": 4.451489894944691e-06, "loss": 1.1408627033233643, "step": 504 }, { "epoch": 1.069767441860465, "grad_norm": 1.4028804302215576, "learning_rate": 4.446331907658416e-06, "loss": 0.8269267678260803, "step": 506 }, { "epoch": 1.0739957716701902, "grad_norm": 0.851521909236908, "learning_rate": 4.441153182033057e-06, "loss": 0.8282674551010132, "step": 508 }, { "epoch": 1.0782241014799154, "grad_norm": 4.568781852722168, "learning_rate": 4.435953781380613e-06, "loss": 0.9908189177513123, "step": 510 }, { "epoch": 1.0824524312896406, "grad_norm": 1.2083165645599365, "learning_rate": 4.430733769265846e-06, "loss": 1.0665321350097656, "step": 512 }, { "epoch": 1.0866807610993658, "grad_norm": 0.9603208303451538, "learning_rate": 4.425493209505503e-06, "loss": 1.1846468448638916, "step": 514 }, { "epoch": 1.0909090909090908, "grad_norm": 2.0771138668060303, "learning_rate": 4.420232166167531e-06, "loss": 0.920912504196167, "step": 516 }, { "epoch": 1.095137420718816, "grad_norm": 0.8328940868377686, "learning_rate": 4.414950703570299e-06, "loss": 0.9249328374862671, "step": 518 }, { "epoch": 1.0993657505285412, "grad_norm": 1.3584532737731934, "learning_rate": 4.40964888628181e-06, "loss": 0.6281458139419556, "step": 520 }, { "epoch": 1.1035940803382664, "grad_norm": 0.7690941095352173, "learning_rate": 4.404326779118909e-06, "loss": 1.4201838970184326, "step": 522 }, { "epoch": 1.1078224101479917, "grad_norm": 1.4306553602218628, "learning_rate": 4.398984447146496e-06, "loss": 0.7664209604263306, "step": 524 }, { "epoch": 1.1120507399577166, "grad_norm": 0.9326035976409912, "learning_rate": 4.393621955676723e-06, "loss": 1.3979065418243408, "step": 526 }, { "epoch": 1.1162790697674418, "grad_norm": 0.5948351621627808, "learning_rate": 4.3882393702682046e-06, "loss": 0.8897819519042969, "step": 528 }, { "epoch": 1.120507399577167, "grad_norm": 18.87567138671875, "learning_rate": 4.38283675672521e-06, "loss": 0.4860161244869232, "step": 530 }, { "epoch": 1.1247357293868923, "grad_norm": 0.7055052518844604, "learning_rate": 4.377414181096859e-06, "loss": 1.1043274402618408, "step": 532 }, { "epoch": 1.1289640591966172, "grad_norm": 0.9923433065414429, "learning_rate": 4.371971709676319e-06, "loss": 0.8963814973831177, "step": 534 }, { "epoch": 1.1331923890063424, "grad_norm": 0.6690739393234253, "learning_rate": 4.366509408999988e-06, "loss": 0.8666636347770691, "step": 536 }, { "epoch": 1.1374207188160677, "grad_norm": 1.7911560535430908, "learning_rate": 4.361027345846687e-06, "loss": 0.7381163239479065, "step": 538 }, { "epoch": 1.1416490486257929, "grad_norm": 2.775190591812134, "learning_rate": 4.355525587236841e-06, "loss": 0.803221583366394, "step": 540 }, { "epoch": 1.145877378435518, "grad_norm": 0.5926763415336609, "learning_rate": 4.350004200431658e-06, "loss": 1.1303699016571045, "step": 542 }, { "epoch": 1.150105708245243, "grad_norm": 1.6176501512527466, "learning_rate": 4.344463252932312e-06, "loss": 0.7936561107635498, "step": 544 }, { "epoch": 1.1543340380549683, "grad_norm": 0.7994486689567566, "learning_rate": 4.33890281247911e-06, "loss": 1.1952998638153076, "step": 546 }, { "epoch": 1.1585623678646935, "grad_norm": 5.03057861328125, "learning_rate": 4.333322947050673e-06, "loss": 1.3116034269332886, "step": 548 }, { "epoch": 1.1627906976744187, "grad_norm": 1.1229743957519531, "learning_rate": 4.3277237248630946e-06, "loss": 0.8429150581359863, "step": 550 }, { "epoch": 1.1670190274841437, "grad_norm": 1.2708096504211426, "learning_rate": 4.3221052143691185e-06, "loss": 1.1472980976104736, "step": 552 }, { "epoch": 1.1712473572938689, "grad_norm": 1.303392767906189, "learning_rate": 4.316467484257291e-06, "loss": 1.1732940673828125, "step": 554 }, { "epoch": 1.175475687103594, "grad_norm": 1.490607738494873, "learning_rate": 4.310810603451128e-06, "loss": 1.079361915588379, "step": 556 }, { "epoch": 1.1797040169133193, "grad_norm": 0.6570778489112854, "learning_rate": 4.30513464110827e-06, "loss": 1.1126495599746704, "step": 558 }, { "epoch": 1.1839323467230445, "grad_norm": 0.64628005027771, "learning_rate": 4.299439666619637e-06, "loss": 1.085148811340332, "step": 560 }, { "epoch": 1.1881606765327695, "grad_norm": 0.881676435470581, "learning_rate": 4.293725749608581e-06, "loss": 0.8194442987442017, "step": 562 }, { "epoch": 1.1923890063424947, "grad_norm": 1.4025084972381592, "learning_rate": 4.287992959930033e-06, "loss": 1.130499005317688, "step": 564 }, { "epoch": 1.1966173361522199, "grad_norm": 1.591407060623169, "learning_rate": 4.282241367669648e-06, "loss": 1.0246634483337402, "step": 566 }, { "epoch": 1.200845665961945, "grad_norm": 3.8888139724731445, "learning_rate": 4.276471043142954e-06, "loss": 1.2712934017181396, "step": 568 }, { "epoch": 1.20507399577167, "grad_norm": 2.388948678970337, "learning_rate": 4.270682056894487e-06, "loss": 1.2416294813156128, "step": 570 }, { "epoch": 1.2093023255813953, "grad_norm": 2.251979351043701, "learning_rate": 4.264874479696928e-06, "loss": 1.0589932203292847, "step": 572 }, { "epoch": 1.2135306553911205, "grad_norm": 0.7591463923454285, "learning_rate": 4.2590483825502425e-06, "loss": 0.8409648537635803, "step": 574 }, { "epoch": 1.2177589852008457, "grad_norm": 1.53620183467865, "learning_rate": 4.25320383668081e-06, "loss": 0.8636283278465271, "step": 576 }, { "epoch": 1.221987315010571, "grad_norm": 1.1595256328582764, "learning_rate": 4.247340913540548e-06, "loss": 0.9905154705047607, "step": 578 }, { "epoch": 1.226215644820296, "grad_norm": 1.0831875801086426, "learning_rate": 4.241459684806052e-06, "loss": 0.8118501305580139, "step": 580 }, { "epoch": 1.230443974630021, "grad_norm": 0.7525449395179749, "learning_rate": 4.235560222377703e-06, "loss": 1.1561369895935059, "step": 582 }, { "epoch": 1.2346723044397463, "grad_norm": 3.329188585281372, "learning_rate": 4.2296425983788e-06, "loss": 1.0410387516021729, "step": 584 }, { "epoch": 1.2389006342494715, "grad_norm": 1.3844683170318604, "learning_rate": 4.223706885154674e-06, "loss": 0.877763569355011, "step": 586 }, { "epoch": 1.2431289640591967, "grad_norm": 1.171336054801941, "learning_rate": 4.217753155271804e-06, "loss": 0.9664973020553589, "step": 588 }, { "epoch": 1.2473572938689217, "grad_norm": 0.8962653279304504, "learning_rate": 4.21178148151693e-06, "loss": 0.6594605445861816, "step": 590 }, { "epoch": 1.251585623678647, "grad_norm": 0.4513523578643799, "learning_rate": 4.2057919368961626e-06, "loss": 0.9009559154510498, "step": 592 }, { "epoch": 1.255813953488372, "grad_norm": 1.2183146476745605, "learning_rate": 4.199784594634091e-06, "loss": 1.0488721132278442, "step": 594 }, { "epoch": 1.2600422832980973, "grad_norm": 0.703557014465332, "learning_rate": 4.193759528172889e-06, "loss": 0.760339617729187, "step": 596 }, { "epoch": 1.2642706131078225, "grad_norm": 1.2198814153671265, "learning_rate": 4.187716811171412e-06, "loss": 1.1317111253738403, "step": 598 }, { "epoch": 1.2684989429175475, "grad_norm": 1.99573814868927, "learning_rate": 4.181656517504306e-06, "loss": 1.3582342863082886, "step": 600 }, { "epoch": 1.2727272727272727, "grad_norm": 0.549845814704895, "learning_rate": 4.175578721261093e-06, "loss": 0.9524427056312561, "step": 602 }, { "epoch": 1.276955602536998, "grad_norm": 1.245996117591858, "learning_rate": 4.169483496745277e-06, "loss": 1.1659082174301147, "step": 604 }, { "epoch": 1.2811839323467231, "grad_norm": 2.977156162261963, "learning_rate": 4.163370918473426e-06, "loss": 0.790830135345459, "step": 606 }, { "epoch": 1.285412262156448, "grad_norm": 0.9513248801231384, "learning_rate": 4.157241061174261e-06, "loss": 1.151841640472412, "step": 608 }, { "epoch": 1.2896405919661733, "grad_norm": 0.8377892374992371, "learning_rate": 4.151093999787755e-06, "loss": 0.7630675435066223, "step": 610 }, { "epoch": 1.2938689217758985, "grad_norm": 0.74871826171875, "learning_rate": 4.144929809464202e-06, "loss": 0.6663084626197815, "step": 612 }, { "epoch": 1.2980972515856237, "grad_norm": 0.7946862578392029, "learning_rate": 4.138748565563304e-06, "loss": 0.9356685876846313, "step": 614 }, { "epoch": 1.302325581395349, "grad_norm": 1.464992880821228, "learning_rate": 4.132550343653251e-06, "loss": 0.49841123819351196, "step": 616 }, { "epoch": 1.306553911205074, "grad_norm": 2.7764222621917725, "learning_rate": 4.1263352195097975e-06, "loss": 0.921845018863678, "step": 618 }, { "epoch": 1.3107822410147991, "grad_norm": 2.9197912216186523, "learning_rate": 4.120103269115332e-06, "loss": 1.1502526998519897, "step": 620 }, { "epoch": 1.3150105708245243, "grad_norm": 0.5009030103683472, "learning_rate": 4.113854568657952e-06, "loss": 1.1119526624679565, "step": 622 }, { "epoch": 1.3192389006342495, "grad_norm": 4.491978168487549, "learning_rate": 4.107589194530532e-06, "loss": 0.7493167519569397, "step": 624 }, { "epoch": 1.3234672304439745, "grad_norm": 0.510420560836792, "learning_rate": 4.101307223329786e-06, "loss": 1.1615945100784302, "step": 626 }, { "epoch": 1.3276955602536997, "grad_norm": 1.2960412502288818, "learning_rate": 4.0950087318553375e-06, "loss": 1.3132972717285156, "step": 628 }, { "epoch": 1.331923890063425, "grad_norm": 0.8799285292625427, "learning_rate": 4.088693797108774e-06, "loss": 1.0321528911590576, "step": 630 }, { "epoch": 1.3361522198731501, "grad_norm": 1.2005505561828613, "learning_rate": 4.0823624962927104e-06, "loss": 0.616770327091217, "step": 632 }, { "epoch": 1.3403805496828753, "grad_norm": 0.6413878798484802, "learning_rate": 4.076014906809842e-06, "loss": 0.747455358505249, "step": 634 }, { "epoch": 1.3446088794926006, "grad_norm": 0.6914223432540894, "learning_rate": 4.069651106262003e-06, "loss": 0.8139711022377014, "step": 636 }, { "epoch": 1.3488372093023255, "grad_norm": 3.342055082321167, "learning_rate": 4.063271172449209e-06, "loss": 1.0335206985473633, "step": 638 }, { "epoch": 1.3530655391120507, "grad_norm": 2.376635789871216, "learning_rate": 4.0568751833687155e-06, "loss": 0.7637988328933716, "step": 640 }, { "epoch": 1.357293868921776, "grad_norm": 0.9393727779388428, "learning_rate": 4.050463217214058e-06, "loss": 1.218309760093689, "step": 642 }, { "epoch": 1.361522198731501, "grad_norm": 4.736209869384766, "learning_rate": 4.0440353523741e-06, "loss": 1.1682794094085693, "step": 644 }, { "epoch": 1.3657505285412261, "grad_norm": 1.6625150442123413, "learning_rate": 4.0375916674320694e-06, "loss": 0.7112323045730591, "step": 646 }, { "epoch": 1.3699788583509513, "grad_norm": 1.635366678237915, "learning_rate": 4.0311322411646045e-06, "loss": 0.7137230634689331, "step": 648 }, { "epoch": 1.3742071881606766, "grad_norm": 1.2800323963165283, "learning_rate": 4.0246571525407875e-06, "loss": 0.7801585793495178, "step": 650 }, { "epoch": 1.3784355179704018, "grad_norm": 1.4994843006134033, "learning_rate": 4.018166480721178e-06, "loss": 0.7897611856460571, "step": 652 }, { "epoch": 1.382663847780127, "grad_norm": 0.7120780348777771, "learning_rate": 4.011660305056846e-06, "loss": 1.1767425537109375, "step": 654 }, { "epoch": 1.386892177589852, "grad_norm": 0.7388160228729248, "learning_rate": 4.005138705088401e-06, "loss": 1.0873156785964966, "step": 656 }, { "epoch": 1.3911205073995772, "grad_norm": 1.0489729642868042, "learning_rate": 3.9986017605450265e-06, "loss": 0.8503063321113586, "step": 658 }, { "epoch": 1.3953488372093024, "grad_norm": 0.8119449019432068, "learning_rate": 3.992049551343493e-06, "loss": 0.9161325097084045, "step": 660 }, { "epoch": 1.3995771670190273, "grad_norm": 0.5929046869277954, "learning_rate": 3.985482157587192e-06, "loss": 1.1369270086288452, "step": 662 }, { "epoch": 1.4038054968287526, "grad_norm": 0.3672987222671509, "learning_rate": 3.97889965956515e-06, "loss": 1.0164310932159424, "step": 664 }, { "epoch": 1.4080338266384778, "grad_norm": 1.0386170148849487, "learning_rate": 3.972302137751051e-06, "loss": 1.374223232269287, "step": 666 }, { "epoch": 1.412262156448203, "grad_norm": 1.0722689628601074, "learning_rate": 3.9656896728022476e-06, "loss": 1.12968111038208, "step": 668 }, { "epoch": 1.4164904862579282, "grad_norm": 1.196387529373169, "learning_rate": 3.959062345558782e-06, "loss": 0.771783173084259, "step": 670 }, { "epoch": 1.4207188160676534, "grad_norm": 1.3007256984710693, "learning_rate": 3.9524202370423915e-06, "loss": 1.1213726997375488, "step": 672 }, { "epoch": 1.4249471458773784, "grad_norm": 0.9761534929275513, "learning_rate": 3.945763428455523e-06, "loss": 0.6354954242706299, "step": 674 }, { "epoch": 1.4291754756871036, "grad_norm": 1.2106300592422485, "learning_rate": 3.939092001180332e-06, "loss": 0.8169525861740112, "step": 676 }, { "epoch": 1.4334038054968288, "grad_norm": 0.6862068176269531, "learning_rate": 3.932406036777701e-06, "loss": 1.3615213632583618, "step": 678 }, { "epoch": 1.437632135306554, "grad_norm": 1.1061359643936157, "learning_rate": 3.9257056169862305e-06, "loss": 1.1848570108413696, "step": 680 }, { "epoch": 1.441860465116279, "grad_norm": 1.6158320903778076, "learning_rate": 3.918990823721243e-06, "loss": 1.1745814085006714, "step": 682 }, { "epoch": 1.4460887949260042, "grad_norm": 0.6842957735061646, "learning_rate": 3.912261739073785e-06, "loss": 1.106062650680542, "step": 684 }, { "epoch": 1.4503171247357294, "grad_norm": 1.5938684940338135, "learning_rate": 3.905518445309619e-06, "loss": 1.4594074487686157, "step": 686 }, { "epoch": 1.4545454545454546, "grad_norm": 0.7108921408653259, "learning_rate": 3.8987610248682205e-06, "loss": 1.0741581916809082, "step": 688 }, { "epoch": 1.4587737843551798, "grad_norm": 7.655938148498535, "learning_rate": 3.89198956036177e-06, "loss": 0.41335436701774597, "step": 690 }, { "epoch": 1.463002114164905, "grad_norm": 0.6948283910751343, "learning_rate": 3.885204134574141e-06, "loss": 1.146783709526062, "step": 692 }, { "epoch": 1.46723044397463, "grad_norm": 0.6634160876274109, "learning_rate": 3.878404830459889e-06, "loss": 0.65525221824646, "step": 694 }, { "epoch": 1.4714587737843552, "grad_norm": 0.9858572483062744, "learning_rate": 3.87159173114324e-06, "loss": 1.1054624319076538, "step": 696 }, { "epoch": 1.4756871035940804, "grad_norm": 1.3330109119415283, "learning_rate": 3.86476491991707e-06, "loss": 1.1466596126556396, "step": 698 }, { "epoch": 1.4799154334038054, "grad_norm": 3.4319090843200684, "learning_rate": 3.857924480241888e-06, "loss": 0.9684445261955261, "step": 700 }, { "epoch": 1.4841437632135306, "grad_norm": 0.5792906880378723, "learning_rate": 3.851070495744819e-06, "loss": 1.1101263761520386, "step": 702 }, { "epoch": 1.4883720930232558, "grad_norm": 0.584158718585968, "learning_rate": 3.8442030502185745e-06, "loss": 1.0356827974319458, "step": 704 }, { "epoch": 1.492600422832981, "grad_norm": 0.7270916700363159, "learning_rate": 3.837322227620439e-06, "loss": 0.8322772979736328, "step": 706 }, { "epoch": 1.4968287526427062, "grad_norm": 0.28889569640159607, "learning_rate": 3.830428112071228e-06, "loss": 0.2769829332828522, "step": 708 }, { "epoch": 1.5010570824524314, "grad_norm": 0.7377986907958984, "learning_rate": 3.823520787854278e-06, "loss": 0.6088220477104187, "step": 710 }, { "epoch": 1.5052854122621564, "grad_norm": 2.107346296310425, "learning_rate": 3.816600339414402e-06, "loss": 0.5735040903091431, "step": 712 }, { "epoch": 1.5095137420718816, "grad_norm": 0.6663100719451904, "learning_rate": 3.8096668513568608e-06, "loss": 0.9799573421478271, "step": 714 }, { "epoch": 1.5137420718816068, "grad_norm": 0.7188597917556763, "learning_rate": 3.8027204084463334e-06, "loss": 1.1207448244094849, "step": 716 }, { "epoch": 1.5179704016913318, "grad_norm": 0.694125771522522, "learning_rate": 3.795761095605873e-06, "loss": 1.0090175867080688, "step": 718 }, { "epoch": 1.522198731501057, "grad_norm": 0.3084549903869629, "learning_rate": 3.7887889979158775e-06, "loss": 0.9819098711013794, "step": 720 }, { "epoch": 1.5264270613107822, "grad_norm": 1.8949941396713257, "learning_rate": 3.7818042006130405e-06, "loss": 0.8384270071983337, "step": 722 }, { "epoch": 1.5306553911205074, "grad_norm": 1.5150532722473145, "learning_rate": 3.774806789089316e-06, "loss": 0.9709129929542542, "step": 724 }, { "epoch": 1.5348837209302326, "grad_norm": 1.0952752828598022, "learning_rate": 3.7677968488908705e-06, "loss": 0.9372836947441101, "step": 726 }, { "epoch": 1.5391120507399578, "grad_norm": 1.564868450164795, "learning_rate": 3.76077446571704e-06, "loss": 0.6753690242767334, "step": 728 }, { "epoch": 1.543340380549683, "grad_norm": 1.170804500579834, "learning_rate": 3.75373972541928e-06, "loss": 0.8191190361976624, "step": 730 }, { "epoch": 1.547568710359408, "grad_norm": 0.679467499256134, "learning_rate": 3.746692714000117e-06, "loss": 1.086642861366272, "step": 732 }, { "epoch": 1.5517970401691332, "grad_norm": 0.2902541756629944, "learning_rate": 3.7396335176120953e-06, "loss": 0.25046733021736145, "step": 734 }, { "epoch": 1.5560253699788582, "grad_norm": 2.038381576538086, "learning_rate": 3.7325622225567294e-06, "loss": 1.009968876838684, "step": 736 }, { "epoch": 1.5602536997885834, "grad_norm": 0.2496039867401123, "learning_rate": 3.725478915283439e-06, "loss": 0.84336918592453, "step": 738 }, { "epoch": 1.5644820295983086, "grad_norm": 0.559074878692627, "learning_rate": 3.7183836823885045e-06, "loss": 1.1601533889770508, "step": 740 }, { "epoch": 1.5687103594080338, "grad_norm": 1.2242622375488281, "learning_rate": 3.7112766106139964e-06, "loss": 0.8150052428245544, "step": 742 }, { "epoch": 1.572938689217759, "grad_norm": 1.0551347732543945, "learning_rate": 3.7041577868467242e-06, "loss": 1.1540948152542114, "step": 744 }, { "epoch": 1.5771670190274842, "grad_norm": 2.7716071605682373, "learning_rate": 3.697027298117168e-06, "loss": 1.1788626909255981, "step": 746 }, { "epoch": 1.5813953488372094, "grad_norm": 1.1499396562576294, "learning_rate": 3.6898852315984156e-06, "loss": 1.057762861251831, "step": 748 }, { "epoch": 1.5856236786469344, "grad_norm": 0.3814210295677185, "learning_rate": 3.6827316746051015e-06, "loss": 0.04337337985634804, "step": 750 }, { "epoch": 1.5898520084566596, "grad_norm": 1.3480174541473389, "learning_rate": 3.675566714592333e-06, "loss": 0.9101552367210388, "step": 752 }, { "epoch": 1.5940803382663846, "grad_norm": 1.1889445781707764, "learning_rate": 3.6683904391546255e-06, "loss": 1.2129230499267578, "step": 754 }, { "epoch": 1.5983086680761098, "grad_norm": 0.5748162269592285, "learning_rate": 3.6612029360248285e-06, "loss": 1.1286925077438354, "step": 756 }, { "epoch": 1.602536997885835, "grad_norm": 0.724022626876831, "learning_rate": 3.6540042930730556e-06, "loss": 1.1947628259658813, "step": 758 }, { "epoch": 1.6067653276955602, "grad_norm": 0.677099347114563, "learning_rate": 3.6467945983056104e-06, "loss": 1.1410974264144897, "step": 760 }, { "epoch": 1.6109936575052854, "grad_norm": 0.6079980731010437, "learning_rate": 3.6395739398639057e-06, "loss": 1.1570736169815063, "step": 762 }, { "epoch": 1.6152219873150107, "grad_norm": 0.9599738121032715, "learning_rate": 3.6323424060233936e-06, "loss": 1.035282015800476, "step": 764 }, { "epoch": 1.6194503171247359, "grad_norm": 1.0322376489639282, "learning_rate": 3.6251000851924806e-06, "loss": 0.8392003774642944, "step": 766 }, { "epoch": 1.6236786469344608, "grad_norm": 0.708662211894989, "learning_rate": 3.617847065911447e-06, "loss": 1.1536966562271118, "step": 768 }, { "epoch": 1.627906976744186, "grad_norm": 1.8593244552612305, "learning_rate": 3.610583436851369e-06, "loss": 1.0729390382766724, "step": 770 }, { "epoch": 1.6321353065539113, "grad_norm": 0.5333645343780518, "learning_rate": 3.603309286813029e-06, "loss": 1.1488738059997559, "step": 772 }, { "epoch": 1.6363636363636362, "grad_norm": 1.6851012706756592, "learning_rate": 3.596024704725835e-06, "loss": 0.9281710386276245, "step": 774 }, { "epoch": 1.6405919661733614, "grad_norm": 1.7228329181671143, "learning_rate": 3.588729779646728e-06, "loss": 1.158841609954834, "step": 776 }, { "epoch": 1.6448202959830867, "grad_norm": 0.9394569396972656, "learning_rate": 3.581424600759099e-06, "loss": 0.7341264486312866, "step": 778 }, { "epoch": 1.6490486257928119, "grad_norm": 0.6430965065956116, "learning_rate": 3.5741092573716952e-06, "loss": 1.096555233001709, "step": 780 }, { "epoch": 1.653276955602537, "grad_norm": 1.5148671865463257, "learning_rate": 3.5667838389175276e-06, "loss": 0.8284240961074829, "step": 782 }, { "epoch": 1.6575052854122623, "grad_norm": 0.6028370261192322, "learning_rate": 3.55944843495278e-06, "loss": 1.1990805864334106, "step": 784 }, { "epoch": 1.6617336152219875, "grad_norm": 2.5651183128356934, "learning_rate": 3.5521031351557116e-06, "loss": 0.4815433621406555, "step": 786 }, { "epoch": 1.6659619450317125, "grad_norm": 0.8050721287727356, "learning_rate": 3.5447480293255666e-06, "loss": 1.1529608964920044, "step": 788 }, { "epoch": 1.6701902748414377, "grad_norm": 0.9118593335151672, "learning_rate": 3.5373832073814668e-06, "loss": 0.7648034691810608, "step": 790 }, { "epoch": 1.6744186046511627, "grad_norm": 0.8314517736434937, "learning_rate": 3.5300087593613186e-06, "loss": 0.5136529207229614, "step": 792 }, { "epoch": 1.6786469344608879, "grad_norm": 0.7918019890785217, "learning_rate": 3.5226247754207138e-06, "loss": 1.1230441331863403, "step": 794 }, { "epoch": 1.682875264270613, "grad_norm": 0.4042631685733795, "learning_rate": 3.5152313458318206e-06, "loss": 0.6846147775650024, "step": 796 }, { "epoch": 1.6871035940803383, "grad_norm": 1.0725696086883545, "learning_rate": 3.5078285609822875e-06, "loss": 1.2035937309265137, "step": 798 }, { "epoch": 1.6913319238900635, "grad_norm": 0.5610724687576294, "learning_rate": 3.5004165113741334e-06, "loss": 1.1461760997772217, "step": 800 }, { "epoch": 1.6955602536997887, "grad_norm": 1.0127768516540527, "learning_rate": 3.4929952876226414e-06, "loss": 0.6147741675376892, "step": 802 }, { "epoch": 1.699788583509514, "grad_norm": 0.6945735216140747, "learning_rate": 3.485564980455255e-06, "loss": 1.1363788843154907, "step": 804 }, { "epoch": 1.7040169133192389, "grad_norm": 1.351635217666626, "learning_rate": 3.478125680710463e-06, "loss": 0.8326917886734009, "step": 806 }, { "epoch": 1.708245243128964, "grad_norm": 1.1634646654129028, "learning_rate": 3.470677479336695e-06, "loss": 0.7223104238510132, "step": 808 }, { "epoch": 1.712473572938689, "grad_norm": 0.9786092042922974, "learning_rate": 3.4632204673912034e-06, "loss": 1.1191296577453613, "step": 810 }, { "epoch": 1.7167019027484143, "grad_norm": 2.46586275100708, "learning_rate": 3.4557547360389577e-06, "loss": 1.3536570072174072, "step": 812 }, { "epoch": 1.7209302325581395, "grad_norm": 0.8146648406982422, "learning_rate": 3.4482803765515206e-06, "loss": 1.100825309753418, "step": 814 }, { "epoch": 1.7251585623678647, "grad_norm": 0.8478085994720459, "learning_rate": 3.4407974803059406e-06, "loss": 1.1602932214736938, "step": 816 }, { "epoch": 1.72938689217759, "grad_norm": 0.9965582489967346, "learning_rate": 3.4333061387836307e-06, "loss": 0.9386340379714966, "step": 818 }, { "epoch": 1.733615221987315, "grad_norm": 2.556925058364868, "learning_rate": 3.4258064435692507e-06, "loss": 1.0207256078720093, "step": 820 }, { "epoch": 1.7378435517970403, "grad_norm": 1.3679172992706299, "learning_rate": 3.4182984863495876e-06, "loss": 0.6849140524864197, "step": 822 }, { "epoch": 1.7420718816067653, "grad_norm": 0.4469180405139923, "learning_rate": 3.410782358912435e-06, "loss": 0.8242835998535156, "step": 824 }, { "epoch": 1.7463002114164905, "grad_norm": 1.4416385889053345, "learning_rate": 3.403258153145471e-06, "loss": 0.9483500719070435, "step": 826 }, { "epoch": 1.7505285412262155, "grad_norm": 0.6498605608940125, "learning_rate": 3.3957259610351324e-06, "loss": 0.9845226407051086, "step": 828 }, { "epoch": 1.7547568710359407, "grad_norm": 2.385218620300293, "learning_rate": 3.388185874665495e-06, "loss": 0.8091049790382385, "step": 830 }, { "epoch": 1.758985200845666, "grad_norm": 0.9289647936820984, "learning_rate": 3.3806379862171448e-06, "loss": 1.1820333003997803, "step": 832 }, { "epoch": 1.763213530655391, "grad_norm": 1.0489366054534912, "learning_rate": 3.373082387966048e-06, "loss": 0.833751916885376, "step": 834 }, { "epoch": 1.7674418604651163, "grad_norm": 0.571071982383728, "learning_rate": 3.365519172282431e-06, "loss": 0.8406846523284912, "step": 836 }, { "epoch": 1.7716701902748415, "grad_norm": 0.5149204730987549, "learning_rate": 3.357948431629643e-06, "loss": 1.1610711812973022, "step": 838 }, { "epoch": 1.7758985200845667, "grad_norm": 0.5685960054397583, "learning_rate": 3.3503702585630305e-06, "loss": 0.8929948806762695, "step": 840 }, { "epoch": 1.7801268498942917, "grad_norm": 2.4608681201934814, "learning_rate": 3.342784745728804e-06, "loss": 0.4209887683391571, "step": 842 }, { "epoch": 1.784355179704017, "grad_norm": 0.6386370062828064, "learning_rate": 3.3351919858629045e-06, "loss": 0.7464441061019897, "step": 844 }, { "epoch": 1.7885835095137421, "grad_norm": 0.8585013747215271, "learning_rate": 3.327592071789873e-06, "loss": 0.9707925319671631, "step": 846 }, { "epoch": 1.792811839323467, "grad_norm": 1.5596438646316528, "learning_rate": 3.3199850964217116e-06, "loss": 0.7446164488792419, "step": 848 }, { "epoch": 1.7970401691331923, "grad_norm": 2.6806228160858154, "learning_rate": 3.312371152756751e-06, "loss": 0.7679558396339417, "step": 850 }, { "epoch": 1.8012684989429175, "grad_norm": 0.7470750212669373, "learning_rate": 3.304750333878511e-06, "loss": 1.0020787715911865, "step": 852 }, { "epoch": 1.8054968287526427, "grad_norm": 0.5787929892539978, "learning_rate": 3.2971227329545634e-06, "loss": 0.8919803500175476, "step": 854 }, { "epoch": 1.809725158562368, "grad_norm": 0.50643390417099, "learning_rate": 3.2894884432353957e-06, "loss": 1.0985815525054932, "step": 856 }, { "epoch": 1.8139534883720931, "grad_norm": 1.5751054286956787, "learning_rate": 3.281847558053265e-06, "loss": 0.6541829109191895, "step": 858 }, { "epoch": 1.8181818181818183, "grad_norm": 0.6766167283058167, "learning_rate": 3.274200170821064e-06, "loss": 1.0379619598388672, "step": 860 }, { "epoch": 1.8224101479915433, "grad_norm": 1.3516942262649536, "learning_rate": 3.2665463750311727e-06, "loss": 1.1044809818267822, "step": 862 }, { "epoch": 1.8266384778012685, "grad_norm": 1.573708415031433, "learning_rate": 3.2588862642543208e-06, "loss": 0.5707927942276001, "step": 864 }, { "epoch": 1.8308668076109935, "grad_norm": 0.4704338312149048, "learning_rate": 3.2512199321384393e-06, "loss": 0.7981724143028259, "step": 866 }, { "epoch": 1.8350951374207187, "grad_norm": 0.5769053101539612, "learning_rate": 3.243547472407518e-06, "loss": 1.1399530172348022, "step": 868 }, { "epoch": 1.839323467230444, "grad_norm": 0.8416613340377808, "learning_rate": 3.23586897886046e-06, "loss": 1.1359026432037354, "step": 870 }, { "epoch": 1.8435517970401691, "grad_norm": 1.3577508926391602, "learning_rate": 3.2281845453699345e-06, "loss": 0.8569067716598511, "step": 872 }, { "epoch": 1.8477801268498943, "grad_norm": 2.056459665298462, "learning_rate": 3.220494265881227e-06, "loss": 1.1351348161697388, "step": 874 }, { "epoch": 1.8520084566596196, "grad_norm": 2.2590129375457764, "learning_rate": 3.212798234411095e-06, "loss": 0.9369499087333679, "step": 876 }, { "epoch": 1.8562367864693448, "grad_norm": 1.6008362770080566, "learning_rate": 3.2050965450466136e-06, "loss": 0.2906026244163513, "step": 878 }, { "epoch": 1.8604651162790697, "grad_norm": 1.1616226434707642, "learning_rate": 3.197389291944032e-06, "loss": 0.7301267385482788, "step": 880 }, { "epoch": 1.864693446088795, "grad_norm": 0.7197896242141724, "learning_rate": 3.1896765693276135e-06, "loss": 1.1232812404632568, "step": 882 }, { "epoch": 1.86892177589852, "grad_norm": 0.6383930444717407, "learning_rate": 3.1819584714884903e-06, "loss": 0.7655252814292908, "step": 884 }, { "epoch": 1.8731501057082451, "grad_norm": 1.0251339673995972, "learning_rate": 3.1742350927835125e-06, "loss": 1.121950387954712, "step": 886 }, { "epoch": 1.8773784355179703, "grad_norm": 0.6124594807624817, "learning_rate": 3.1665065276340844e-06, "loss": 1.1401907205581665, "step": 888 }, { "epoch": 1.8816067653276956, "grad_norm": 1.397496223449707, "learning_rate": 3.158772870525022e-06, "loss": 0.6092522144317627, "step": 890 }, { "epoch": 1.8858350951374208, "grad_norm": 2.1092441082000732, "learning_rate": 3.1510342160033903e-06, "loss": 0.8399344086647034, "step": 892 }, { "epoch": 1.890063424947146, "grad_norm": 1.1821517944335938, "learning_rate": 3.1432906586773488e-06, "loss": 1.114659070968628, "step": 894 }, { "epoch": 1.8942917547568712, "grad_norm": 1.5643501281738281, "learning_rate": 3.135542293214997e-06, "loss": 1.410881519317627, "step": 896 }, { "epoch": 1.8985200845665962, "grad_norm": 0.7187080383300781, "learning_rate": 3.1277892143432165e-06, "loss": 1.065239429473877, "step": 898 }, { "epoch": 1.9027484143763214, "grad_norm": 1.0004364252090454, "learning_rate": 3.1200315168465113e-06, "loss": 0.5023792386054993, "step": 900 }, { "epoch": 1.9069767441860463, "grad_norm": 0.5592870116233826, "learning_rate": 3.1122692955658497e-06, "loss": 1.107616901397705, "step": 902 }, { "epoch": 1.9112050739957716, "grad_norm": 1.719496726989746, "learning_rate": 3.1045026453975048e-06, "loss": 0.5772966146469116, "step": 904 }, { "epoch": 1.9154334038054968, "grad_norm": 1.4579967260360718, "learning_rate": 3.096731661291896e-06, "loss": 0.8818938136100769, "step": 906 }, { "epoch": 1.919661733615222, "grad_norm": 0.8083340525627136, "learning_rate": 3.0889564382524257e-06, "loss": 1.2467647790908813, "step": 908 }, { "epoch": 1.9238900634249472, "grad_norm": 0.5722190737724304, "learning_rate": 3.08117707133432e-06, "loss": 0.812706708908081, "step": 910 }, { "epoch": 1.9281183932346724, "grad_norm": 0.6764684319496155, "learning_rate": 3.0733936556434634e-06, "loss": 1.1728202104568481, "step": 912 }, { "epoch": 1.9323467230443976, "grad_norm": 0.9146943688392639, "learning_rate": 3.0656062863352413e-06, "loss": 0.7626368999481201, "step": 914 }, { "epoch": 1.9365750528541226, "grad_norm": 0.5261140465736389, "learning_rate": 3.0578150586133704e-06, "loss": 1.1478456258773804, "step": 916 }, { "epoch": 1.9408033826638478, "grad_norm": 0.6775986552238464, "learning_rate": 3.0500200677287428e-06, "loss": 0.6973150968551636, "step": 918 }, { "epoch": 1.945031712473573, "grad_norm": 1.3343801498413086, "learning_rate": 3.042221408978251e-06, "loss": 0.9482506513595581, "step": 920 }, { "epoch": 1.949260042283298, "grad_norm": 1.5522212982177734, "learning_rate": 3.0344191777036312e-06, "loss": 0.9986613392829895, "step": 922 }, { "epoch": 1.9534883720930232, "grad_norm": 0.4814535677433014, "learning_rate": 3.026613469290298e-06, "loss": 1.0413583517074585, "step": 924 }, { "epoch": 1.9577167019027484, "grad_norm": 0.3166026473045349, "learning_rate": 3.01880437916617e-06, "loss": 0.9614431262016296, "step": 926 }, { "epoch": 1.9619450317124736, "grad_norm": 1.094480037689209, "learning_rate": 3.0109920028005135e-06, "loss": 1.2636445760726929, "step": 928 }, { "epoch": 1.9661733615221988, "grad_norm": 0.7118551135063171, "learning_rate": 3.003176435702767e-06, "loss": 0.9028820395469666, "step": 930 }, { "epoch": 1.970401691331924, "grad_norm": 0.5945522785186768, "learning_rate": 2.9953577734213775e-06, "loss": 1.2327357530593872, "step": 932 }, { "epoch": 1.9746300211416492, "grad_norm": 0.7517629265785217, "learning_rate": 2.9875361115426347e-06, "loss": 0.8936224579811096, "step": 934 }, { "epoch": 1.9788583509513742, "grad_norm": 0.9688192009925842, "learning_rate": 2.979711545689496e-06, "loss": 0.7812487483024597, "step": 936 }, { "epoch": 1.9830866807610994, "grad_norm": 8.303119659423828, "learning_rate": 2.9718841715204227e-06, "loss": 0.873395562171936, "step": 938 }, { "epoch": 1.9873150105708244, "grad_norm": 0.6607802510261536, "learning_rate": 2.9640540847282095e-06, "loss": 1.0979681015014648, "step": 940 }, { "epoch": 1.9915433403805496, "grad_norm": 0.6019576787948608, "learning_rate": 2.956221381038812e-06, "loss": 1.1199960708618164, "step": 942 }, { "epoch": 1.9957716701902748, "grad_norm": 0.7929393649101257, "learning_rate": 2.94838615621018e-06, "loss": 1.1161065101623535, "step": 944 }, { "epoch": 2.0, "grad_norm": 0.6814373135566711, "learning_rate": 2.9405485060310857e-06, "loss": 0.48783794045448303, "step": 946 }, { "epoch": 2.004228329809725, "grad_norm": 1.4736313819885254, "learning_rate": 2.9327085263199507e-06, "loss": 0.7957913279533386, "step": 948 }, { "epoch": 2.0084566596194504, "grad_norm": 0.4455970823764801, "learning_rate": 2.924866312923677e-06, "loss": 1.0547270774841309, "step": 950 }, { "epoch": 2.0126849894291756, "grad_norm": 0.773058295249939, "learning_rate": 2.9170219617164735e-06, "loss": 1.0442657470703125, "step": 952 }, { "epoch": 2.016913319238901, "grad_norm": 0.9597894549369812, "learning_rate": 2.9091755685986866e-06, "loss": 1.1685289144515991, "step": 954 }, { "epoch": 2.0211416490486256, "grad_norm": 0.6969325542449951, "learning_rate": 2.9013272294956223e-06, "loss": 1.1930384635925293, "step": 956 }, { "epoch": 2.025369978858351, "grad_norm": 0.8082700967788696, "learning_rate": 2.8934770403563815e-06, "loss": 0.776046872138977, "step": 958 }, { "epoch": 2.029598308668076, "grad_norm": 0.7422521710395813, "learning_rate": 2.8856250971526788e-06, "loss": 1.0249298810958862, "step": 960 }, { "epoch": 2.033826638477801, "grad_norm": 1.6249040365219116, "learning_rate": 2.877771495877676e-06, "loss": 0.9289775490760803, "step": 962 }, { "epoch": 2.0380549682875264, "grad_norm": 3.067833185195923, "learning_rate": 2.869916332544802e-06, "loss": 0.8100100159645081, "step": 964 }, { "epoch": 2.0422832980972516, "grad_norm": 0.724915087223053, "learning_rate": 2.8620597031865854e-06, "loss": 0.7401767373085022, "step": 966 }, { "epoch": 2.046511627906977, "grad_norm": 2.0869836807250977, "learning_rate": 2.854201703853477e-06, "loss": 0.8137513399124146, "step": 968 }, { "epoch": 2.050739957716702, "grad_norm": 0.6877044439315796, "learning_rate": 2.8463424306126743e-06, "loss": 1.10543692111969, "step": 970 }, { "epoch": 2.0549682875264272, "grad_norm": 1.3014296293258667, "learning_rate": 2.838481979546952e-06, "loss": 0.5617172122001648, "step": 972 }, { "epoch": 2.059196617336152, "grad_norm": 0.9769271016120911, "learning_rate": 2.83062044675348e-06, "loss": 1.060500144958496, "step": 974 }, { "epoch": 2.063424947145877, "grad_norm": 2.4497523307800293, "learning_rate": 2.822757928342658e-06, "loss": 1.075200080871582, "step": 976 }, { "epoch": 2.0676532769556024, "grad_norm": 0.8020917177200317, "learning_rate": 2.814894520436931e-06, "loss": 1.0989971160888672, "step": 978 }, { "epoch": 2.0718816067653276, "grad_norm": 1.6352614164352417, "learning_rate": 2.807030319169619e-06, "loss": 0.699384868144989, "step": 980 }, { "epoch": 2.076109936575053, "grad_norm": 6.557322978973389, "learning_rate": 2.7991654206837434e-06, "loss": 0.7373824119567871, "step": 982 }, { "epoch": 2.080338266384778, "grad_norm": 0.7050887942314148, "learning_rate": 2.7912999211308466e-06, "loss": 0.8136764168739319, "step": 984 }, { "epoch": 2.0845665961945032, "grad_norm": 1.3208609819412231, "learning_rate": 2.783433916669822e-06, "loss": 0.9552209973335266, "step": 986 }, { "epoch": 2.0887949260042284, "grad_norm": 0.6587861180305481, "learning_rate": 2.7755675034657336e-06, "loss": 1.0741578340530396, "step": 988 }, { "epoch": 2.0930232558139537, "grad_norm": 1.1716125011444092, "learning_rate": 2.7677007776886437e-06, "loss": 1.0747499465942383, "step": 990 }, { "epoch": 2.097251585623679, "grad_norm": 1.5702075958251953, "learning_rate": 2.759833835512435e-06, "loss": 0.670864999294281, "step": 992 }, { "epoch": 2.1014799154334036, "grad_norm": 22.38727569580078, "learning_rate": 2.7519667731136364e-06, "loss": 0.7279332280158997, "step": 994 }, { "epoch": 2.105708245243129, "grad_norm": 0.8443185091018677, "learning_rate": 2.7440996866702458e-06, "loss": 0.8103309869766235, "step": 996 }, { "epoch": 2.109936575052854, "grad_norm": 0.8229217529296875, "learning_rate": 2.7362326723605566e-06, "loss": 1.036565899848938, "step": 998 }, { "epoch": 2.1141649048625792, "grad_norm": 0.7176088094711304, "learning_rate": 2.7283658263619794e-06, "loss": 1.0687159299850464, "step": 1000 }, { "epoch": 2.1183932346723044, "grad_norm": 0.6158708333969116, "learning_rate": 2.7204992448498657e-06, "loss": 0.24933312833309174, "step": 1002 }, { "epoch": 2.1226215644820297, "grad_norm": 1.7368133068084717, "learning_rate": 2.712633023996336e-06, "loss": 0.7682783007621765, "step": 1004 }, { "epoch": 2.126849894291755, "grad_norm": 0.6421481966972351, "learning_rate": 2.7047672599691e-06, "loss": 1.0240600109100342, "step": 1006 }, { "epoch": 2.13107822410148, "grad_norm": 1.3722180128097534, "learning_rate": 2.696902048930284e-06, "loss": 0.9667700529098511, "step": 1008 }, { "epoch": 2.1353065539112053, "grad_norm": 0.6478885412216187, "learning_rate": 2.6890374870352532e-06, "loss": 0.8398556113243103, "step": 1010 }, { "epoch": 2.13953488372093, "grad_norm": 1.5403518676757812, "learning_rate": 2.6811736704314344e-06, "loss": 0.57329922914505, "step": 1012 }, { "epoch": 2.1437632135306552, "grad_norm": 0.8799501061439514, "learning_rate": 2.6733106952571467e-06, "loss": 0.6521193981170654, "step": 1014 }, { "epoch": 2.1479915433403804, "grad_norm": 0.9985294938087463, "learning_rate": 2.6654486576404197e-06, "loss": 0.8588607311248779, "step": 1016 }, { "epoch": 2.1522198731501057, "grad_norm": 0.9864040017127991, "learning_rate": 2.657587653697822e-06, "loss": 1.0104336738586426, "step": 1018 }, { "epoch": 2.156448202959831, "grad_norm": 6.648144245147705, "learning_rate": 2.6497277795332855e-06, "loss": 0.8407163619995117, "step": 1020 }, { "epoch": 2.160676532769556, "grad_norm": 0.837396502494812, "learning_rate": 2.6418691312369295e-06, "loss": 0.7050214409828186, "step": 1022 }, { "epoch": 2.1649048625792813, "grad_norm": 0.6134306192398071, "learning_rate": 2.634011804883886e-06, "loss": 1.0578330755233765, "step": 1024 }, { "epoch": 2.1691331923890065, "grad_norm": 0.7147375345230103, "learning_rate": 2.6261558965331272e-06, "loss": 1.0594534873962402, "step": 1026 }, { "epoch": 2.1733615221987317, "grad_norm": 0.6058641672134399, "learning_rate": 2.6183015022262892e-06, "loss": 1.0534790754318237, "step": 1028 }, { "epoch": 2.177589852008457, "grad_norm": 0.6654782891273499, "learning_rate": 2.610448717986496e-06, "loss": 1.067839503288269, "step": 1030 }, { "epoch": 2.1818181818181817, "grad_norm": 1.3797681331634521, "learning_rate": 2.6025976398171927e-06, "loss": 1.0668026208877563, "step": 1032 }, { "epoch": 2.186046511627907, "grad_norm": 0.6694332361221313, "learning_rate": 2.5947483637009622e-06, "loss": 1.1499404907226562, "step": 1034 }, { "epoch": 2.190274841437632, "grad_norm": 0.46979257464408875, "learning_rate": 2.586900985598358e-06, "loss": 0.8229663372039795, "step": 1036 }, { "epoch": 2.1945031712473573, "grad_norm": 2.6087801456451416, "learning_rate": 2.579055601446732e-06, "loss": 0.4731891453266144, "step": 1038 }, { "epoch": 2.1987315010570825, "grad_norm": 1.0109667778015137, "learning_rate": 2.571212307159056e-06, "loss": 1.0908327102661133, "step": 1040 }, { "epoch": 2.2029598308668077, "grad_norm": 2.8863284587860107, "learning_rate": 2.563371198622755e-06, "loss": 0.36066552996635437, "step": 1042 }, { "epoch": 2.207188160676533, "grad_norm": 1.16829514503479, "learning_rate": 2.5555323716985304e-06, "loss": 1.053403615951538, "step": 1044 }, { "epoch": 2.211416490486258, "grad_norm": 0.7351399064064026, "learning_rate": 2.54769592221919e-06, "loss": 0.6402167677879333, "step": 1046 }, { "epoch": 2.2156448202959833, "grad_norm": 3.0547754764556885, "learning_rate": 2.539861945988478e-06, "loss": 0.8964632749557495, "step": 1048 }, { "epoch": 2.219873150105708, "grad_norm": 1.3434550762176514, "learning_rate": 2.5320305387799014e-06, "loss": 0.6596440076828003, "step": 1050 }, { "epoch": 2.2241014799154333, "grad_norm": 0.6450607776641846, "learning_rate": 2.524201796335558e-06, "loss": 0.9056267142295837, "step": 1052 }, { "epoch": 2.2283298097251585, "grad_norm": 0.6330105066299438, "learning_rate": 2.5163758143649716e-06, "loss": 1.0713391304016113, "step": 1054 }, { "epoch": 2.2325581395348837, "grad_norm": 0.6766862273216248, "learning_rate": 2.5085526885439145e-06, "loss": 1.0640653371810913, "step": 1056 }, { "epoch": 2.236786469344609, "grad_norm": 0.3488926291465759, "learning_rate": 2.5007325145132427e-06, "loss": 0.8341073393821716, "step": 1058 }, { "epoch": 2.241014799154334, "grad_norm": 2.001237154006958, "learning_rate": 2.4929153878777268e-06, "loss": 0.9115666747093201, "step": 1060 }, { "epoch": 2.2452431289640593, "grad_norm": 0.7693464756011963, "learning_rate": 2.48510140420488e-06, "loss": 1.0226731300354004, "step": 1062 }, { "epoch": 2.2494714587737845, "grad_norm": 1.4121301174163818, "learning_rate": 2.477290659023791e-06, "loss": 1.0118439197540283, "step": 1064 }, { "epoch": 2.2536997885835097, "grad_norm": 2.2806310653686523, "learning_rate": 2.469483247823959e-06, "loss": 0.632957398891449, "step": 1066 }, { "epoch": 2.2579281183932345, "grad_norm": 0.8324834704399109, "learning_rate": 2.461679266054122e-06, "loss": 0.8787606954574585, "step": 1068 }, { "epoch": 2.2621564482029597, "grad_norm": 1.5810331106185913, "learning_rate": 2.453878809121093e-06, "loss": 0.8886688351631165, "step": 1070 }, { "epoch": 2.266384778012685, "grad_norm": 0.6590220332145691, "learning_rate": 2.4460819723885903e-06, "loss": 1.0459415912628174, "step": 1072 }, { "epoch": 2.27061310782241, "grad_norm": 0.26749613881111145, "learning_rate": 2.4382888511760773e-06, "loss": 0.7614855170249939, "step": 1074 }, { "epoch": 2.2748414376321353, "grad_norm": 1.3493986129760742, "learning_rate": 2.4304995407575917e-06, "loss": 0.900128185749054, "step": 1076 }, { "epoch": 2.2790697674418605, "grad_norm": 8.0263090133667, "learning_rate": 2.4227141363605804e-06, "loss": 0.22701826691627502, "step": 1078 }, { "epoch": 2.2832980972515857, "grad_norm": 0.5107969641685486, "learning_rate": 2.4149327331647432e-06, "loss": 0.16721072793006897, "step": 1080 }, { "epoch": 2.287526427061311, "grad_norm": 0.9236059188842773, "learning_rate": 2.4071554263008584e-06, "loss": 0.5462712645530701, "step": 1082 }, { "epoch": 2.291754756871036, "grad_norm": 1.4398772716522217, "learning_rate": 2.3993823108496272e-06, "loss": 0.43305540084838867, "step": 1084 }, { "epoch": 2.295983086680761, "grad_norm": 0.5344212055206299, "learning_rate": 2.391613481840509e-06, "loss": 0.25760167837142944, "step": 1086 }, { "epoch": 2.300211416490486, "grad_norm": 5.494821071624756, "learning_rate": 2.38384903425056e-06, "loss": 0.7133547067642212, "step": 1088 }, { "epoch": 2.3044397463002113, "grad_norm": 0.9530798196792603, "learning_rate": 2.376089063003272e-06, "loss": 0.9048901200294495, "step": 1090 }, { "epoch": 2.3086680761099365, "grad_norm": 0.7235156893730164, "learning_rate": 2.3683336629674096e-06, "loss": 0.6983910202980042, "step": 1092 }, { "epoch": 2.3128964059196617, "grad_norm": 0.6613774299621582, "learning_rate": 2.3605829289558545e-06, "loss": 1.0634891986846924, "step": 1094 }, { "epoch": 2.317124735729387, "grad_norm": 0.7909154891967773, "learning_rate": 2.3528369557244453e-06, "loss": 1.035917043685913, "step": 1096 }, { "epoch": 2.321353065539112, "grad_norm": 0.8521804213523865, "learning_rate": 2.3450958379708156e-06, "loss": 1.009893774986267, "step": 1098 }, { "epoch": 2.3255813953488373, "grad_norm": 2.444586753845215, "learning_rate": 2.3373596703332383e-06, "loss": 0.6026294827461243, "step": 1100 }, { "epoch": 2.3298097251585626, "grad_norm": 0.8242626786231995, "learning_rate": 2.3296285473894746e-06, "loss": 0.7475822567939758, "step": 1102 }, { "epoch": 2.3340380549682873, "grad_norm": 0.684226930141449, "learning_rate": 2.321902563655606e-06, "loss": 1.0707495212554932, "step": 1104 }, { "epoch": 2.3382663847780125, "grad_norm": 0.8783945441246033, "learning_rate": 2.314181813584887e-06, "loss": 1.013008952140808, "step": 1106 }, { "epoch": 2.3424947145877377, "grad_norm": 0.9921977519989014, "learning_rate": 2.306466391566591e-06, "loss": 0.9479020833969116, "step": 1108 }, { "epoch": 2.346723044397463, "grad_norm": 0.7830618619918823, "learning_rate": 2.2987563919248518e-06, "loss": 1.1364282369613647, "step": 1110 }, { "epoch": 2.350951374207188, "grad_norm": 0.26116877794265747, "learning_rate": 2.2910519089175103e-06, "loss": 0.6622422933578491, "step": 1112 }, { "epoch": 2.3551797040169133, "grad_norm": 4.712930202484131, "learning_rate": 2.283353036734969e-06, "loss": 0.94716477394104, "step": 1114 }, { "epoch": 2.3594080338266386, "grad_norm": 0.9706722497940063, "learning_rate": 2.2756598694990334e-06, "loss": 0.6431679725646973, "step": 1116 }, { "epoch": 2.3636363636363638, "grad_norm": 1.9938366413116455, "learning_rate": 2.267972501261762e-06, "loss": 1.308355450630188, "step": 1118 }, { "epoch": 2.367864693446089, "grad_norm": 0.7777484059333801, "learning_rate": 2.2602910260043208e-06, "loss": 1.0695171356201172, "step": 1120 }, { "epoch": 2.3720930232558137, "grad_norm": 0.7761583924293518, "learning_rate": 2.252615537635831e-06, "loss": 0.9347115755081177, "step": 1122 }, { "epoch": 2.376321353065539, "grad_norm": 0.7822389006614685, "learning_rate": 2.244946129992223e-06, "loss": 0.7232018113136292, "step": 1124 }, { "epoch": 2.380549682875264, "grad_norm": 2.1133530139923096, "learning_rate": 2.2372828968350834e-06, "loss": 1.0389723777770996, "step": 1126 }, { "epoch": 2.3847780126849893, "grad_norm": 1.3042513132095337, "learning_rate": 2.229625931850519e-06, "loss": 0.7246500849723816, "step": 1128 }, { "epoch": 2.3890063424947146, "grad_norm": 0.8496916890144348, "learning_rate": 2.221975328648002e-06, "loss": 0.8411369323730469, "step": 1130 }, { "epoch": 2.3932346723044398, "grad_norm": 1.2774096727371216, "learning_rate": 2.2143311807592292e-06, "loss": 0.7468405961990356, "step": 1132 }, { "epoch": 2.397463002114165, "grad_norm": 0.6452171206474304, "learning_rate": 2.206693581636982e-06, "loss": 1.111289620399475, "step": 1134 }, { "epoch": 2.40169133192389, "grad_norm": 5.754592418670654, "learning_rate": 2.1990626246539753e-06, "loss": 0.6915456056594849, "step": 1136 }, { "epoch": 2.4059196617336154, "grad_norm": 1.6072407960891724, "learning_rate": 2.1914384031017265e-06, "loss": 0.8382232189178467, "step": 1138 }, { "epoch": 2.41014799154334, "grad_norm": 0.4873308837413788, "learning_rate": 2.1838210101894062e-06, "loss": 1.0329222679138184, "step": 1140 }, { "epoch": 2.4143763213530653, "grad_norm": 0.7448446154594421, "learning_rate": 2.1762105390427026e-06, "loss": 1.19656503200531, "step": 1142 }, { "epoch": 2.4186046511627906, "grad_norm": 2.470224618911743, "learning_rate": 2.168607082702684e-06, "loss": 0.6114988923072815, "step": 1144 }, { "epoch": 2.4228329809725158, "grad_norm": 4.100384712219238, "learning_rate": 2.161010734124658e-06, "loss": 0.7755101323127747, "step": 1146 }, { "epoch": 2.427061310782241, "grad_norm": 0.8485273122787476, "learning_rate": 2.153421586177038e-06, "loss": 0.8298628926277161, "step": 1148 }, { "epoch": 2.431289640591966, "grad_norm": 1.0596591234207153, "learning_rate": 2.145839731640208e-06, "loss": 0.5695077180862427, "step": 1150 }, { "epoch": 2.4355179704016914, "grad_norm": 0.32878732681274414, "learning_rate": 2.138265263205384e-06, "loss": 0.6108872890472412, "step": 1152 }, { "epoch": 2.4397463002114166, "grad_norm": 0.47924017906188965, "learning_rate": 2.130698273473486e-06, "loss": 0.575315535068512, "step": 1154 }, { "epoch": 2.443974630021142, "grad_norm": 0.5258365273475647, "learning_rate": 2.1231388549540045e-06, "loss": 0.9532243609428406, "step": 1156 }, { "epoch": 2.448202959830867, "grad_norm": 4.6877546310424805, "learning_rate": 2.115587100063868e-06, "loss": 0.5808656811714172, "step": 1158 }, { "epoch": 2.452431289640592, "grad_norm": 0.8416226506233215, "learning_rate": 2.108043101126312e-06, "loss": 1.0306192636489868, "step": 1160 }, { "epoch": 2.456659619450317, "grad_norm": 3.2165985107421875, "learning_rate": 2.1005069503697566e-06, "loss": 1.0111299753189087, "step": 1162 }, { "epoch": 2.460887949260042, "grad_norm": 0.6864579916000366, "learning_rate": 2.092978739926672e-06, "loss": 0.8028541207313538, "step": 1164 }, { "epoch": 2.4651162790697674, "grad_norm": 0.9489989280700684, "learning_rate": 2.0854585618324548e-06, "loss": 1.2172460556030273, "step": 1166 }, { "epoch": 2.4693446088794926, "grad_norm": 1.215120553970337, "learning_rate": 2.0779465080243037e-06, "loss": 1.3246065378189087, "step": 1168 }, { "epoch": 2.473572938689218, "grad_norm": 0.6394163370132446, "learning_rate": 2.0704426703400944e-06, "loss": 0.7735956311225891, "step": 1170 }, { "epoch": 2.477801268498943, "grad_norm": 1.1398952007293701, "learning_rate": 2.0629471405172585e-06, "loss": 0.8254691362380981, "step": 1172 }, { "epoch": 2.482029598308668, "grad_norm": 0.5559751987457275, "learning_rate": 2.055460010191658e-06, "loss": 0.7504424452781677, "step": 1174 }, { "epoch": 2.4862579281183934, "grad_norm": 0.8105632066726685, "learning_rate": 2.0479813708964693e-06, "loss": 0.7769438028335571, "step": 1176 }, { "epoch": 2.4904862579281186, "grad_norm": 1.449171781539917, "learning_rate": 2.0405113140610634e-06, "loss": 0.8921318650245667, "step": 1178 }, { "epoch": 2.4947145877378434, "grad_norm": 1.4208768606185913, "learning_rate": 2.033049931009885e-06, "loss": 0.6842445135116577, "step": 1180 }, { "epoch": 2.4989429175475686, "grad_norm": 0.4888696074485779, "learning_rate": 2.0255973129613406e-06, "loss": 0.567357063293457, "step": 1182 }, { "epoch": 2.503171247357294, "grad_norm": 0.8814659118652344, "learning_rate": 2.0181535510266796e-06, "loss": 0.1589071899652481, "step": 1184 }, { "epoch": 2.507399577167019, "grad_norm": 1.7633031606674194, "learning_rate": 2.0107187362088816e-06, "loss": 0.9725368618965149, "step": 1186 }, { "epoch": 2.511627906976744, "grad_norm": 2.5048136711120605, "learning_rate": 2.0032929594015456e-06, "loss": 0.9178006649017334, "step": 1188 }, { "epoch": 2.5158562367864694, "grad_norm": 1.5520225763320923, "learning_rate": 1.9958763113877755e-06, "loss": 0.7678893804550171, "step": 1190 }, { "epoch": 2.5200845665961946, "grad_norm": 0.5215038061141968, "learning_rate": 1.988468882839075e-06, "loss": 1.001523733139038, "step": 1192 }, { "epoch": 2.52431289640592, "grad_norm": 0.6024693846702576, "learning_rate": 1.9810707643142325e-06, "loss": 0.6263225674629211, "step": 1194 }, { "epoch": 2.528541226215645, "grad_norm": 1.617968201637268, "learning_rate": 1.9736820462582186e-06, "loss": 1.0076720714569092, "step": 1196 }, { "epoch": 2.53276955602537, "grad_norm": 0.7982508540153503, "learning_rate": 1.9663028190010815e-06, "loss": 1.0421154499053955, "step": 1198 }, { "epoch": 2.536997885835095, "grad_norm": 1.1996971368789673, "learning_rate": 1.9589331727568384e-06, "loss": 0.7256770133972168, "step": 1200 }, { "epoch": 2.54122621564482, "grad_norm": 0.744490921497345, "learning_rate": 1.9515731976223746e-06, "loss": 1.0210518836975098, "step": 1202 }, { "epoch": 2.5454545454545454, "grad_norm": 1.66182541847229, "learning_rate": 1.9442229835763454e-06, "loss": 0.44427788257598877, "step": 1204 }, { "epoch": 2.5496828752642706, "grad_norm": 0.6226742267608643, "learning_rate": 1.936882620478069e-06, "loss": 1.068085789680481, "step": 1206 }, { "epoch": 2.553911205073996, "grad_norm": 1.4527463912963867, "learning_rate": 1.9295521980664317e-06, "loss": 1.060996174812317, "step": 1208 }, { "epoch": 2.558139534883721, "grad_norm": 0.6856507062911987, "learning_rate": 1.922231805958795e-06, "loss": 1.039587140083313, "step": 1210 }, { "epoch": 2.5623678646934462, "grad_norm": 1.3432971239089966, "learning_rate": 1.914921533649894e-06, "loss": 0.7191824316978455, "step": 1212 }, { "epoch": 2.5665961945031714, "grad_norm": 0.7632008194923401, "learning_rate": 1.9076214705107417e-06, "loss": 1.0393006801605225, "step": 1214 }, { "epoch": 2.570824524312896, "grad_norm": 1.0369495153427124, "learning_rate": 1.9003317057875443e-06, "loss": 0.6147840023040771, "step": 1216 }, { "epoch": 2.5750528541226214, "grad_norm": 1.336530089378357, "learning_rate": 1.8930523286006052e-06, "loss": 0.6377484202384949, "step": 1218 }, { "epoch": 2.5792811839323466, "grad_norm": 2.0891432762145996, "learning_rate": 1.8857834279432336e-06, "loss": 0.509937584400177, "step": 1220 }, { "epoch": 2.583509513742072, "grad_norm": 3.55784010887146, "learning_rate": 1.8785250926806613e-06, "loss": 0.5913651585578918, "step": 1222 }, { "epoch": 2.587737843551797, "grad_norm": 4.819112777709961, "learning_rate": 1.8712774115489524e-06, "loss": 0.8116767406463623, "step": 1224 }, { "epoch": 2.5919661733615222, "grad_norm": 0.43531814217567444, "learning_rate": 1.8640404731539218e-06, "loss": 0.47326603531837463, "step": 1226 }, { "epoch": 2.5961945031712474, "grad_norm": 0.8789650201797485, "learning_rate": 1.8568143659700472e-06, "loss": 0.7499734163284302, "step": 1228 }, { "epoch": 2.6004228329809727, "grad_norm": 1.4755181074142456, "learning_rate": 1.8495991783393924e-06, "loss": 0.8303921222686768, "step": 1230 }, { "epoch": 2.604651162790698, "grad_norm": 3.1309523582458496, "learning_rate": 1.8423949984705257e-06, "loss": 0.7273667454719543, "step": 1232 }, { "epoch": 2.6088794926004226, "grad_norm": 1.6632975339889526, "learning_rate": 1.8352019144374406e-06, "loss": 0.8571827411651611, "step": 1234 }, { "epoch": 2.613107822410148, "grad_norm": 0.7448071241378784, "learning_rate": 1.8280200141784771e-06, "loss": 0.8664517998695374, "step": 1236 }, { "epoch": 2.617336152219873, "grad_norm": 0.8705071210861206, "learning_rate": 1.8208493854952535e-06, "loss": 0.9958084225654602, "step": 1238 }, { "epoch": 2.6215644820295982, "grad_norm": 0.7441583275794983, "learning_rate": 1.8136901160515869e-06, "loss": 0.7479358315467834, "step": 1240 }, { "epoch": 2.6257928118393234, "grad_norm": 0.6350056529045105, "learning_rate": 1.8065422933724192e-06, "loss": 0.8547337651252747, "step": 1242 }, { "epoch": 2.6300211416490487, "grad_norm": 1.2663848400115967, "learning_rate": 1.799406004842757e-06, "loss": 1.0284228324890137, "step": 1244 }, { "epoch": 2.634249471458774, "grad_norm": 2.9096410274505615, "learning_rate": 1.7922813377065946e-06, "loss": 0.6996232867240906, "step": 1246 }, { "epoch": 2.638477801268499, "grad_norm": 2.602738857269287, "learning_rate": 1.7851683790658492e-06, "loss": 0.5642688274383545, "step": 1248 }, { "epoch": 2.6427061310782243, "grad_norm": 0.6103304028511047, "learning_rate": 1.7780672158792979e-06, "loss": 1.0508077144622803, "step": 1250 }, { "epoch": 2.646934460887949, "grad_norm": 3.92741322517395, "learning_rate": 1.7709779349615152e-06, "loss": 0.5398973822593689, "step": 1252 }, { "epoch": 2.6511627906976747, "grad_norm": 3.5654373168945312, "learning_rate": 1.763900622981805e-06, "loss": 0.7100467681884766, "step": 1254 }, { "epoch": 2.6553911205073994, "grad_norm": 0.8442944288253784, "learning_rate": 1.7568353664631528e-06, "loss": 1.0310944318771362, "step": 1256 }, { "epoch": 2.6596194503171247, "grad_norm": 0.7679892778396606, "learning_rate": 1.7497822517811576e-06, "loss": 0.3732684850692749, "step": 1258 }, { "epoch": 2.66384778012685, "grad_norm": 2.0411362648010254, "learning_rate": 1.7427413651629787e-06, "loss": 0.5446974635124207, "step": 1260 }, { "epoch": 2.668076109936575, "grad_norm": 7.280208587646484, "learning_rate": 1.735712792686285e-06, "loss": 0.7429797649383545, "step": 1262 }, { "epoch": 2.6723044397463003, "grad_norm": 1.769396185874939, "learning_rate": 1.7286966202781983e-06, "loss": 0.7472846508026123, "step": 1264 }, { "epoch": 2.6765327695560255, "grad_norm": 7.442590236663818, "learning_rate": 1.7216929337142447e-06, "loss": 0.4331527650356293, "step": 1266 }, { "epoch": 2.6807610993657507, "grad_norm": 2.3331828117370605, "learning_rate": 1.714701818617307e-06, "loss": 0.7867488861083984, "step": 1268 }, { "epoch": 2.6849894291754755, "grad_norm": 0.9535698890686035, "learning_rate": 1.7077233604565758e-06, "loss": 1.0159664154052734, "step": 1270 }, { "epoch": 2.689217758985201, "grad_norm": 1.7152396440505981, "learning_rate": 1.7007576445465054e-06, "loss": 0.8122742176055908, "step": 1272 }, { "epoch": 2.693446088794926, "grad_norm": 0.8219133019447327, "learning_rate": 1.6938047560457716e-06, "loss": 0.49331924319267273, "step": 1274 }, { "epoch": 2.697674418604651, "grad_norm": 1.0974379777908325, "learning_rate": 1.6868647799562296e-06, "loss": 0.5021317601203918, "step": 1276 }, { "epoch": 2.7019027484143763, "grad_norm": 1.0276836156845093, "learning_rate": 1.6799378011218753e-06, "loss": 1.0597912073135376, "step": 1278 }, { "epoch": 2.7061310782241015, "grad_norm": 1.305923581123352, "learning_rate": 1.6730239042278078e-06, "loss": 0.7645857334136963, "step": 1280 }, { "epoch": 2.7103594080338267, "grad_norm": 0.5968372225761414, "learning_rate": 1.666123173799195e-06, "loss": 1.1030560731887817, "step": 1282 }, { "epoch": 2.714587737843552, "grad_norm": 1.4355765581130981, "learning_rate": 1.659235694200238e-06, "loss": 0.8181160092353821, "step": 1284 }, { "epoch": 2.718816067653277, "grad_norm": 0.18032093346118927, "learning_rate": 1.6523615496331417e-06, "loss": 1.1456607580184937, "step": 1286 }, { "epoch": 2.723044397463002, "grad_norm": 1.4734760522842407, "learning_rate": 1.6455008241370874e-06, "loss": 0.5729717016220093, "step": 1288 }, { "epoch": 2.7272727272727275, "grad_norm": 0.7612594962120056, "learning_rate": 1.6386536015871976e-06, "loss": 1.0644044876098633, "step": 1290 }, { "epoch": 2.7315010570824523, "grad_norm": 0.8396774530410767, "learning_rate": 1.6318199656935195e-06, "loss": 1.0980335474014282, "step": 1292 }, { "epoch": 2.7357293868921775, "grad_norm": 0.7830753922462463, "learning_rate": 1.6250000000000007e-06, "loss": 1.054038405418396, "step": 1294 }, { "epoch": 2.7399577167019027, "grad_norm": 0.3286767899990082, "learning_rate": 1.618193787883458e-06, "loss": 0.3908301591873169, "step": 1296 }, { "epoch": 2.744186046511628, "grad_norm": 0.6592679619789124, "learning_rate": 1.611401412552569e-06, "loss": 0.5546691417694092, "step": 1298 }, { "epoch": 2.748414376321353, "grad_norm": 0.5871724486351013, "learning_rate": 1.604622957046854e-06, "loss": 0.6974177360534668, "step": 1300 }, { "epoch": 2.7526427061310783, "grad_norm": 3.476322650909424, "learning_rate": 1.5978585042356526e-06, "loss": 0.9717587828636169, "step": 1302 }, { "epoch": 2.7568710359408035, "grad_norm": 1.426347017288208, "learning_rate": 1.5911081368171174e-06, "loss": 0.696022093296051, "step": 1304 }, { "epoch": 2.7610993657505283, "grad_norm": 1.42485511302948, "learning_rate": 1.5843719373172043e-06, "loss": 0.8914967775344849, "step": 1306 }, { "epoch": 2.765327695560254, "grad_norm": 0.9887190461158752, "learning_rate": 1.5776499880886583e-06, "loss": 0.8718952536582947, "step": 1308 }, { "epoch": 2.7695560253699787, "grad_norm": 0.5860939621925354, "learning_rate": 1.5709423713100066e-06, "loss": 1.0336132049560547, "step": 1310 }, { "epoch": 2.773784355179704, "grad_norm": 0.6642679572105408, "learning_rate": 1.5642491689845623e-06, "loss": 0.9066874980926514, "step": 1312 }, { "epoch": 2.778012684989429, "grad_norm": 0.6993511319160461, "learning_rate": 1.5575704629394118e-06, "loss": 0.5353021025657654, "step": 1314 }, { "epoch": 2.7822410147991543, "grad_norm": 0.8484950065612793, "learning_rate": 1.550906334824419e-06, "loss": 0.979564905166626, "step": 1316 }, { "epoch": 2.7864693446088795, "grad_norm": 0.3303600251674652, "learning_rate": 1.5442568661112273e-06, "loss": 0.6826730966567993, "step": 1318 }, { "epoch": 2.7906976744186047, "grad_norm": 3.2293996810913086, "learning_rate": 1.5376221380922645e-06, "loss": 0.9952559471130371, "step": 1320 }, { "epoch": 2.79492600422833, "grad_norm": 2.9947149753570557, "learning_rate": 1.5310022318797468e-06, "loss": 0.5234836339950562, "step": 1322 }, { "epoch": 2.7991543340380547, "grad_norm": 0.8693253397941589, "learning_rate": 1.5243972284046843e-06, "loss": 1.0908644199371338, "step": 1324 }, { "epoch": 2.8033826638477803, "grad_norm": 1.8999295234680176, "learning_rate": 1.5178072084159006e-06, "loss": 0.30439692735671997, "step": 1326 }, { "epoch": 2.807610993657505, "grad_norm": 1.2835586071014404, "learning_rate": 1.5112322524790373e-06, "loss": 0.3868151009082794, "step": 1328 }, { "epoch": 2.8118393234672303, "grad_norm": 0.6148664355278015, "learning_rate": 1.5046724409755708e-06, "loss": 0.655669093132019, "step": 1330 }, { "epoch": 2.8160676532769555, "grad_norm": 0.9131718277931213, "learning_rate": 1.4981278541018338e-06, "loss": 1.027086615562439, "step": 1332 }, { "epoch": 2.8202959830866807, "grad_norm": 0.9928625226020813, "learning_rate": 1.4915985718680303e-06, "loss": 0.6656888723373413, "step": 1334 }, { "epoch": 2.824524312896406, "grad_norm": 0.759575366973877, "learning_rate": 1.4850846740972566e-06, "loss": 1.0963438749313354, "step": 1336 }, { "epoch": 2.828752642706131, "grad_norm": 1.3372092247009277, "learning_rate": 1.478586240424532e-06, "loss": 1.0407531261444092, "step": 1338 }, { "epoch": 2.8329809725158563, "grad_norm": 1.6881779432296753, "learning_rate": 1.4721033502958188e-06, "loss": 0.8279685974121094, "step": 1340 }, { "epoch": 2.8372093023255816, "grad_norm": 0.5533450245857239, "learning_rate": 1.4656360829670524e-06, "loss": 1.0067516565322876, "step": 1342 }, { "epoch": 2.8414376321353068, "grad_norm": 0.5380539298057556, "learning_rate": 1.4591845175031755e-06, "loss": 0.7166640162467957, "step": 1344 }, { "epoch": 2.8456659619450315, "grad_norm": 1.2064310312271118, "learning_rate": 1.4527487327771667e-06, "loss": 0.6947576403617859, "step": 1346 }, { "epoch": 2.8498942917547567, "grad_norm": 0.6393163800239563, "learning_rate": 1.44632880746908e-06, "loss": 0.9154616594314575, "step": 1348 }, { "epoch": 2.854122621564482, "grad_norm": 2.2646851539611816, "learning_rate": 1.4399248200650822e-06, "loss": 0.5946722626686096, "step": 1350 }, { "epoch": 2.858350951374207, "grad_norm": 6.771807670593262, "learning_rate": 1.4335368488564921e-06, "loss": 0.9889756441116333, "step": 1352 }, { "epoch": 2.8625792811839323, "grad_norm": 0.6024413108825684, "learning_rate": 1.4271649719388235e-06, "loss": 1.0145889520645142, "step": 1354 }, { "epoch": 2.8668076109936576, "grad_norm": 6.5942912101745605, "learning_rate": 1.420809267210832e-06, "loss": 0.4889359176158905, "step": 1356 }, { "epoch": 2.8710359408033828, "grad_norm": 1.2514537572860718, "learning_rate": 1.4144698123735614e-06, "loss": 1.060815453529358, "step": 1358 }, { "epoch": 2.875264270613108, "grad_norm": 3.7441012859344482, "learning_rate": 1.408146684929394e-06, "loss": 0.795141875743866, "step": 1360 }, { "epoch": 2.879492600422833, "grad_norm": 1.1793290376663208, "learning_rate": 1.401839962181103e-06, "loss": 0.7162335515022278, "step": 1362 }, { "epoch": 2.883720930232558, "grad_norm": 1.296712875366211, "learning_rate": 1.3955497212309082e-06, "loss": 1.0849847793579102, "step": 1364 }, { "epoch": 2.887949260042283, "grad_norm": 3.475389003753662, "learning_rate": 1.389276038979532e-06, "loss": 0.875495970249176, "step": 1366 }, { "epoch": 2.8921775898520083, "grad_norm": 0.5793375372886658, "learning_rate": 1.3830189921252605e-06, "loss": 1.020584225654602, "step": 1368 }, { "epoch": 2.8964059196617336, "grad_norm": 0.7720953226089478, "learning_rate": 1.3767786571630054e-06, "loss": 1.035544753074646, "step": 1370 }, { "epoch": 2.9006342494714588, "grad_norm": 0.6335976123809814, "learning_rate": 1.3705551103833687e-06, "loss": 1.0688656568527222, "step": 1372 }, { "epoch": 2.904862579281184, "grad_norm": 0.5152540802955627, "learning_rate": 1.364348427871709e-06, "loss": 0.8726412057876587, "step": 1374 }, { "epoch": 2.909090909090909, "grad_norm": 0.8800987601280212, "learning_rate": 1.3581586855072162e-06, "loss": 1.01813542842865, "step": 1376 }, { "epoch": 2.9133192389006344, "grad_norm": 3.5038645267486572, "learning_rate": 1.3519859589619756e-06, "loss": 0.7246266603469849, "step": 1378 }, { "epoch": 2.9175475687103596, "grad_norm": 0.6655816435813904, "learning_rate": 1.3458303237000483e-06, "loss": 0.7696600556373596, "step": 1380 }, { "epoch": 2.9217758985200843, "grad_norm": 0.9207125902175903, "learning_rate": 1.3396918549765514e-06, "loss": 1.0463422536849976, "step": 1382 }, { "epoch": 2.92600422832981, "grad_norm": 1.4580425024032593, "learning_rate": 1.3335706278367289e-06, "loss": 0.9288692474365234, "step": 1384 }, { "epoch": 2.9302325581395348, "grad_norm": 0.6088332533836365, "learning_rate": 1.3274667171150422e-06, "loss": 0.7489819526672363, "step": 1386 }, { "epoch": 2.93446088794926, "grad_norm": 1.341879963874817, "learning_rate": 1.3213801974342516e-06, "loss": 1.0183134078979492, "step": 1388 }, { "epoch": 2.938689217758985, "grad_norm": 0.258132666349411, "learning_rate": 1.3153111432045079e-06, "loss": 0.8709487318992615, "step": 1390 }, { "epoch": 2.9429175475687104, "grad_norm": 0.5611593127250671, "learning_rate": 1.309259628622435e-06, "loss": 1.007150411605835, "step": 1392 }, { "epoch": 2.9471458773784356, "grad_norm": 0.7201411724090576, "learning_rate": 1.3032257276702296e-06, "loss": 0.32561811804771423, "step": 1394 }, { "epoch": 2.951374207188161, "grad_norm": 0.5295475721359253, "learning_rate": 1.2972095141147578e-06, "loss": 0.529960572719574, "step": 1396 }, { "epoch": 2.955602536997886, "grad_norm": 2.0177695751190186, "learning_rate": 1.2912110615066447e-06, "loss": 0.9622781276702881, "step": 1398 }, { "epoch": 2.9598308668076108, "grad_norm": 0.5386593341827393, "learning_rate": 1.2852304431793838e-06, "loss": 1.2505404949188232, "step": 1400 }, { "epoch": 2.9640591966173364, "grad_norm": 4.687948226928711, "learning_rate": 1.2792677322484386e-06, "loss": 0.8016545176506042, "step": 1402 }, { "epoch": 2.968287526427061, "grad_norm": 1.594322681427002, "learning_rate": 1.2733230016103436e-06, "loss": 0.5189470052719116, "step": 1404 }, { "epoch": 2.9725158562367864, "grad_norm": 0.9102961421012878, "learning_rate": 1.26739632394182e-06, "loss": 0.9059958457946777, "step": 1406 }, { "epoch": 2.9767441860465116, "grad_norm": 0.8692654371261597, "learning_rate": 1.2614877716988845e-06, "loss": 0.8937259316444397, "step": 1408 }, { "epoch": 2.980972515856237, "grad_norm": 2.1760952472686768, "learning_rate": 1.255597417115961e-06, "loss": 0.833085834980011, "step": 1410 }, { "epoch": 2.985200845665962, "grad_norm": 1.076922059059143, "learning_rate": 1.249725332205e-06, "loss": 1.064079999923706, "step": 1412 }, { "epoch": 2.989429175475687, "grad_norm": 0.4375395178794861, "learning_rate": 1.2438715887546002e-06, "loss": 0.8243948221206665, "step": 1414 }, { "epoch": 2.9936575052854124, "grad_norm": 2.233292579650879, "learning_rate": 1.2380362583291272e-06, "loss": 0.8824648261070251, "step": 1416 }, { "epoch": 2.997885835095137, "grad_norm": 0.4582400321960449, "learning_rate": 1.2322194122678375e-06, "loss": 0.5593487620353699, "step": 1418 }, { "epoch": 3.0021141649048624, "grad_norm": 0.6782700419425964, "learning_rate": 1.226421121684014e-06, "loss": 1.03118097782135, "step": 1420 }, { "epoch": 3.0063424947145876, "grad_norm": 1.0694071054458618, "learning_rate": 1.2206414574640868e-06, "loss": 0.6397127509117126, "step": 1422 }, { "epoch": 3.010570824524313, "grad_norm": 1.2534350156784058, "learning_rate": 1.2148804902667736e-06, "loss": 1.1392219066619873, "step": 1424 }, { "epoch": 3.014799154334038, "grad_norm": 1.9186782836914062, "learning_rate": 1.2091382905222132e-06, "loss": 0.520480215549469, "step": 1426 }, { "epoch": 3.019027484143763, "grad_norm": 0.8564389944076538, "learning_rate": 1.2034149284311041e-06, "loss": 0.6777791976928711, "step": 1428 }, { "epoch": 3.0232558139534884, "grad_norm": 0.7034538388252258, "learning_rate": 1.197710473963847e-06, "loss": 0.8563777804374695, "step": 1430 }, { "epoch": 3.0274841437632136, "grad_norm": 0.7909392714500427, "learning_rate": 1.1920249968596902e-06, "loss": 1.0257045030593872, "step": 1432 }, { "epoch": 3.031712473572939, "grad_norm": 0.47468486428260803, "learning_rate": 1.1863585666258748e-06, "loss": 0.9145489931106567, "step": 1434 }, { "epoch": 3.035940803382664, "grad_norm": 1.9823009967803955, "learning_rate": 1.1807112525367876e-06, "loss": 0.6615996360778809, "step": 1436 }, { "epoch": 3.040169133192389, "grad_norm": 0.8342152833938599, "learning_rate": 1.1750831236331117e-06, "loss": 0.2739180326461792, "step": 1438 }, { "epoch": 3.044397463002114, "grad_norm": 1.0090795755386353, "learning_rate": 1.1694742487209842e-06, "loss": 1.0122308731079102, "step": 1440 }, { "epoch": 3.048625792811839, "grad_norm": 0.5868484973907471, "learning_rate": 1.1638846963711545e-06, "loss": 0.7627484798431396, "step": 1442 }, { "epoch": 3.0528541226215644, "grad_norm": 2.0811753273010254, "learning_rate": 1.1583145349181456e-06, "loss": 0.21038176119327545, "step": 1444 }, { "epoch": 3.0570824524312896, "grad_norm": 2.1059317588806152, "learning_rate": 1.152763832459419e-06, "loss": 0.6727972030639648, "step": 1446 }, { "epoch": 3.061310782241015, "grad_norm": 2.8190877437591553, "learning_rate": 1.1472326568545424e-06, "loss": 0.7937036156654358, "step": 1448 }, { "epoch": 3.06553911205074, "grad_norm": 0.6504483819007874, "learning_rate": 1.1417210757243603e-06, "loss": 0.7131494879722595, "step": 1450 }, { "epoch": 3.0697674418604652, "grad_norm": 2.133265972137451, "learning_rate": 1.136229156450165e-06, "loss": 0.7005563378334045, "step": 1452 }, { "epoch": 3.0739957716701904, "grad_norm": 0.7323704957962036, "learning_rate": 1.1307569661728775e-06, "loss": 0.9205468893051147, "step": 1454 }, { "epoch": 3.0782241014799157, "grad_norm": 0.2374901920557022, "learning_rate": 1.1253045717922215e-06, "loss": 0.3031374216079712, "step": 1456 }, { "epoch": 3.0824524312896404, "grad_norm": 1.803215742111206, "learning_rate": 1.119872039965909e-06, "loss": 0.7160661220550537, "step": 1458 }, { "epoch": 3.0866807610993656, "grad_norm": 1.3725308179855347, "learning_rate": 1.1144594371088245e-06, "loss": 1.020361065864563, "step": 1460 }, { "epoch": 3.090909090909091, "grad_norm": 0.6326039433479309, "learning_rate": 1.1090668293922122e-06, "loss": 0.971651554107666, "step": 1462 }, { "epoch": 3.095137420718816, "grad_norm": 0.6070262789726257, "learning_rate": 1.103694282742868e-06, "loss": 0.6768549680709839, "step": 1464 }, { "epoch": 3.0993657505285412, "grad_norm": 0.5303124785423279, "learning_rate": 1.098341862842333e-06, "loss": 0.7792209982872009, "step": 1466 }, { "epoch": 3.1035940803382664, "grad_norm": 0.7507800459861755, "learning_rate": 1.0930096351260913e-06, "loss": 0.9888483881950378, "step": 1468 }, { "epoch": 3.1078224101479917, "grad_norm": 0.6695652008056641, "learning_rate": 1.0876976647827677e-06, "loss": 0.9820244312286377, "step": 1470 }, { "epoch": 3.112050739957717, "grad_norm": 4.5699615478515625, "learning_rate": 1.0824060167533365e-06, "loss": 0.6230260133743286, "step": 1472 }, { "epoch": 3.116279069767442, "grad_norm": 1.4406520128250122, "learning_rate": 1.0771347557303184e-06, "loss": 1.0396496057510376, "step": 1474 }, { "epoch": 3.120507399577167, "grad_norm": 0.8460061550140381, "learning_rate": 1.0718839461569972e-06, "loss": 0.9403010606765747, "step": 1476 }, { "epoch": 3.124735729386892, "grad_norm": 2.2458934783935547, "learning_rate": 1.0666536522266314e-06, "loss": 0.4271532893180847, "step": 1478 }, { "epoch": 3.1289640591966172, "grad_norm": 0.7539458870887756, "learning_rate": 1.0614439378816634e-06, "loss": 0.9892304539680481, "step": 1480 }, { "epoch": 3.1331923890063424, "grad_norm": 0.8904014825820923, "learning_rate": 1.0562548668129449e-06, "loss": 0.9543983340263367, "step": 1482 }, { "epoch": 3.1374207188160677, "grad_norm": 0.8391467928886414, "learning_rate": 1.0510865024589558e-06, "loss": 0.33414945006370544, "step": 1484 }, { "epoch": 3.141649048625793, "grad_norm": 2.2866015434265137, "learning_rate": 1.045938908005025e-06, "loss": 1.0479934215545654, "step": 1486 }, { "epoch": 3.145877378435518, "grad_norm": 0.8269973397254944, "learning_rate": 1.0408121463825627e-06, "loss": 1.0214964151382446, "step": 1488 }, { "epoch": 3.1501057082452433, "grad_norm": 2.0901854038238525, "learning_rate": 1.0357062802682905e-06, "loss": 0.7124687433242798, "step": 1490 }, { "epoch": 3.1543340380549685, "grad_norm": 2.464489459991455, "learning_rate": 1.0306213720834738e-06, "loss": 0.7923527956008911, "step": 1492 }, { "epoch": 3.1585623678646932, "grad_norm": 0.5960375666618347, "learning_rate": 1.0255574839931555e-06, "loss": 0.5037514567375183, "step": 1494 }, { "epoch": 3.1627906976744184, "grad_norm": 0.2680164575576782, "learning_rate": 1.0205146779054037e-06, "loss": 0.8170030117034912, "step": 1496 }, { "epoch": 3.1670190274841437, "grad_norm": 0.6705971360206604, "learning_rate": 1.0154930154705493e-06, "loss": 0.9746053814888, "step": 1498 }, { "epoch": 3.171247357293869, "grad_norm": 1.046158790588379, "learning_rate": 1.0104925580804307e-06, "loss": 1.0264575481414795, "step": 1500 }, { "epoch": 3.175475687103594, "grad_norm": 2.6368725299835205, "learning_rate": 1.0055133668676505e-06, "loss": 0.46951693296432495, "step": 1502 }, { "epoch": 3.1797040169133193, "grad_norm": 0.954997181892395, "learning_rate": 1.0005555027048216e-06, "loss": 0.5769892930984497, "step": 1504 }, { "epoch": 3.1839323467230445, "grad_norm": 0.8056331276893616, "learning_rate": 9.956190262038252e-07, "loss": 0.7956379055976868, "step": 1506 }, { "epoch": 3.1881606765327697, "grad_norm": 1.2383183240890503, "learning_rate": 9.90703997715068e-07, "loss": 0.4002586901187897, "step": 1508 }, { "epoch": 3.192389006342495, "grad_norm": 3.1095306873321533, "learning_rate": 9.8581047732675e-07, "loss": 0.3678751289844513, "step": 1510 }, { "epoch": 3.1966173361522197, "grad_norm": 0.2970428764820099, "learning_rate": 9.809385248641244e-07, "loss": 0.10512058436870575, "step": 1512 }, { "epoch": 3.200845665961945, "grad_norm": 2.6694912910461426, "learning_rate": 9.760881998887647e-07, "loss": 0.7792633771896362, "step": 1514 }, { "epoch": 3.20507399577167, "grad_norm": 1.5287692546844482, "learning_rate": 9.712595616978445e-07, "loss": 1.0101102590560913, "step": 1516 }, { "epoch": 3.2093023255813953, "grad_norm": 0.7071142792701721, "learning_rate": 9.66452669323406e-07, "loss": 0.6074497699737549, "step": 1518 }, { "epoch": 3.2135306553911205, "grad_norm": 1.0035736560821533, "learning_rate": 9.616675815316373e-07, "loss": 0.8396947383880615, "step": 1520 }, { "epoch": 3.2177589852008457, "grad_norm": 0.7858723998069763, "learning_rate": 9.569043568221613e-07, "loss": 0.9395447969436646, "step": 1522 }, { "epoch": 3.221987315010571, "grad_norm": 4.942752361297607, "learning_rate": 9.52163053427313e-07, "loss": 1.0000540018081665, "step": 1524 }, { "epoch": 3.226215644820296, "grad_norm": 0.7960143685340881, "learning_rate": 9.474437293114311e-07, "loss": 0.948387086391449, "step": 1526 }, { "epoch": 3.2304439746300213, "grad_norm": 2.0574419498443604, "learning_rate": 9.427464421701493e-07, "loss": 0.2774934768676758, "step": 1528 }, { "epoch": 3.234672304439746, "grad_norm": 1.1152596473693848, "learning_rate": 9.380712494296898e-07, "loss": 0.823591411113739, "step": 1530 }, { "epoch": 3.2389006342494713, "grad_norm": 2.095369338989258, "learning_rate": 9.334182082461624e-07, "loss": 0.8626236319541931, "step": 1532 }, { "epoch": 3.2431289640591965, "grad_norm": 0.8906185626983643, "learning_rate": 9.287873755048647e-07, "loss": 0.9925634264945984, "step": 1534 }, { "epoch": 3.2473572938689217, "grad_norm": 0.876634955406189, "learning_rate": 9.241788078195874e-07, "loss": 0.8858959078788757, "step": 1536 }, { "epoch": 3.251585623678647, "grad_norm": 0.8159791231155396, "learning_rate": 9.195925615319221e-07, "loss": 0.7304887175559998, "step": 1538 }, { "epoch": 3.255813953488372, "grad_norm": 0.8356714248657227, "learning_rate": 9.150286927105726e-07, "loss": 0.6133416891098022, "step": 1540 }, { "epoch": 3.2600422832980973, "grad_norm": 1.4572813510894775, "learning_rate": 9.104872571506682e-07, "loss": 1.211620807647705, "step": 1542 }, { "epoch": 3.2642706131078225, "grad_norm": 0.5943049788475037, "learning_rate": 9.059683103730835e-07, "loss": 0.9767951369285583, "step": 1544 }, { "epoch": 3.2684989429175477, "grad_norm": 1.6723552942276, "learning_rate": 9.014719076237579e-07, "loss": 0.9184189438819885, "step": 1546 }, { "epoch": 3.2727272727272725, "grad_norm": 0.5673151016235352, "learning_rate": 8.969981038730224e-07, "loss": 0.3618415892124176, "step": 1548 }, { "epoch": 3.276955602536998, "grad_norm": 1.0060195922851562, "learning_rate": 8.925469538149245e-07, "loss": 0.9330455660820007, "step": 1550 }, { "epoch": 3.281183932346723, "grad_norm": 0.9557608366012573, "learning_rate": 8.881185118665616e-07, "loss": 1.0155820846557617, "step": 1552 }, { "epoch": 3.285412262156448, "grad_norm": 0.13276489078998566, "learning_rate": 8.837128321674174e-07, "loss": 0.1570519506931305, "step": 1554 }, { "epoch": 3.2896405919661733, "grad_norm": 0.714574933052063, "learning_rate": 8.793299685786944e-07, "loss": 0.942793607711792, "step": 1556 }, { "epoch": 3.2938689217758985, "grad_norm": 0.9168136715888977, "learning_rate": 8.749699746826612e-07, "loss": 0.5292172431945801, "step": 1558 }, { "epoch": 3.2980972515856237, "grad_norm": 1.3022035360336304, "learning_rate": 8.706329037819961e-07, "loss": 1.1990944147109985, "step": 1560 }, { "epoch": 3.302325581395349, "grad_norm": 1.6504409313201904, "learning_rate": 8.663188088991317e-07, "loss": 0.7757396697998047, "step": 1562 }, { "epoch": 3.306553911205074, "grad_norm": 1.3289718627929688, "learning_rate": 8.620277427756112e-07, "loss": 0.5169369578361511, "step": 1564 }, { "epoch": 3.3107822410147993, "grad_norm": 0.875095546245575, "learning_rate": 8.577597578714439e-07, "loss": 0.7265094518661499, "step": 1566 }, { "epoch": 3.3150105708245245, "grad_norm": 0.33962443470954895, "learning_rate": 8.53514906364458e-07, "loss": 0.12319551408290863, "step": 1568 }, { "epoch": 3.3192389006342493, "grad_norm": 1.1597821712493896, "learning_rate": 8.492932401496683e-07, "loss": 0.5623422861099243, "step": 1570 }, { "epoch": 3.3234672304439745, "grad_norm": 3.1550745964050293, "learning_rate": 8.45094810838642e-07, "loss": 0.7283601760864258, "step": 1572 }, { "epoch": 3.3276955602536997, "grad_norm": 1.145011305809021, "learning_rate": 8.40919669758864e-07, "loss": 0.26868027448654175, "step": 1574 }, { "epoch": 3.331923890063425, "grad_norm": 3.8039538860321045, "learning_rate": 8.3676786795311e-07, "loss": 0.4419690668582916, "step": 1576 }, { "epoch": 3.33615221987315, "grad_norm": 0.6252729892730713, "learning_rate": 8.326394561788257e-07, "loss": 0.5640559196472168, "step": 1578 }, { "epoch": 3.3403805496828753, "grad_norm": 0.55072021484375, "learning_rate": 8.285344849075047e-07, "loss": 0.6380379796028137, "step": 1580 }, { "epoch": 3.3446088794926006, "grad_norm": 1.0008291006088257, "learning_rate": 8.244530043240687e-07, "loss": 0.98517906665802, "step": 1582 }, { "epoch": 3.3488372093023258, "grad_norm": 1.933143138885498, "learning_rate": 8.203950643262576e-07, "loss": 0.717485785484314, "step": 1584 }, { "epoch": 3.353065539112051, "grad_norm": 0.8978578448295593, "learning_rate": 8.163607145240191e-07, "loss": 0.6533565521240234, "step": 1586 }, { "epoch": 3.3572938689217757, "grad_norm": 1.672323226928711, "learning_rate": 8.123500042389003e-07, "loss": 1.1361911296844482, "step": 1588 }, { "epoch": 3.361522198731501, "grad_norm": 0.7658936381340027, "learning_rate": 8.083629825034443e-07, "loss": 0.6171827912330627, "step": 1590 }, { "epoch": 3.365750528541226, "grad_norm": 1.5416232347488403, "learning_rate": 8.043996980605952e-07, "loss": 0.8929522633552551, "step": 1592 }, { "epoch": 3.3699788583509513, "grad_norm": 0.6201046109199524, "learning_rate": 8.004601993630979e-07, "loss": 0.4101506471633911, "step": 1594 }, { "epoch": 3.3742071881606766, "grad_norm": 0.8717901706695557, "learning_rate": 7.965445345729045e-07, "loss": 0.9818314909934998, "step": 1596 }, { "epoch": 3.3784355179704018, "grad_norm": 2.7879254817962646, "learning_rate": 7.926527515605922e-07, "loss": 0.644636332988739, "step": 1598 }, { "epoch": 3.382663847780127, "grad_norm": 1.3160312175750732, "learning_rate": 7.88784897904772e-07, "loss": 0.41142430901527405, "step": 1600 }, { "epoch": 3.386892177589852, "grad_norm": 1.4033868312835693, "learning_rate": 7.849410208915069e-07, "loss": 0.5842673778533936, "step": 1602 }, { "epoch": 3.3911205073995774, "grad_norm": 1.1946991682052612, "learning_rate": 7.811211675137392e-07, "loss": 1.0320261716842651, "step": 1604 }, { "epoch": 3.395348837209302, "grad_norm": 0.639847993850708, "learning_rate": 7.773253844707108e-07, "loss": 1.0384889841079712, "step": 1606 }, { "epoch": 3.3995771670190273, "grad_norm": 4.001772403717041, "learning_rate": 7.735537181673947e-07, "loss": 0.6584277749061584, "step": 1608 }, { "epoch": 3.4038054968287526, "grad_norm": 0.6378755569458008, "learning_rate": 7.69806214713926e-07, "loss": 1.018156886100769, "step": 1610 }, { "epoch": 3.4080338266384778, "grad_norm": 0.7776346802711487, "learning_rate": 7.660829199250404e-07, "loss": 0.8746322393417358, "step": 1612 }, { "epoch": 3.412262156448203, "grad_norm": 1.3227170705795288, "learning_rate": 7.623838793195128e-07, "loss": 0.8452064990997314, "step": 1614 }, { "epoch": 3.416490486257928, "grad_norm": 1.253333330154419, "learning_rate": 7.587091381196004e-07, "loss": 0.9873075485229492, "step": 1616 }, { "epoch": 3.4207188160676534, "grad_norm": 0.5858563184738159, "learning_rate": 7.550587412504907e-07, "loss": 0.9376651644706726, "step": 1618 }, { "epoch": 3.4249471458773786, "grad_norm": 1.6567012071609497, "learning_rate": 7.514327333397521e-07, "loss": 0.9783826470375061, "step": 1620 }, { "epoch": 3.429175475687104, "grad_norm": 7.168039321899414, "learning_rate": 7.47831158716788e-07, "loss": 0.6209827661514282, "step": 1622 }, { "epoch": 3.4334038054968286, "grad_norm": 0.9959341883659363, "learning_rate": 7.442540614122954e-07, "loss": 0.9962281584739685, "step": 1624 }, { "epoch": 3.4376321353065538, "grad_norm": 0.6434539556503296, "learning_rate": 7.407014851577257e-07, "loss": 0.7141914367675781, "step": 1626 }, { "epoch": 3.441860465116279, "grad_norm": 1.174318552017212, "learning_rate": 7.371734733847509e-07, "loss": 0.9825333952903748, "step": 1628 }, { "epoch": 3.446088794926004, "grad_norm": 3.244459867477417, "learning_rate": 7.336700692247326e-07, "loss": 0.598316490650177, "step": 1630 }, { "epoch": 3.4503171247357294, "grad_norm": 0.6823888421058655, "learning_rate": 7.301913155081937e-07, "loss": 0.9444507360458374, "step": 1632 }, { "epoch": 3.4545454545454546, "grad_norm": 3.044529676437378, "learning_rate": 7.267372547642965e-07, "loss": 0.6880492568016052, "step": 1634 }, { "epoch": 3.45877378435518, "grad_norm": 0.7098934650421143, "learning_rate": 7.23307929220321e-07, "loss": 0.8510515689849854, "step": 1636 }, { "epoch": 3.463002114164905, "grad_norm": 2.740060806274414, "learning_rate": 7.199033808011497e-07, "loss": 0.4582882225513458, "step": 1638 }, { "epoch": 3.46723044397463, "grad_norm": 0.6570109724998474, "learning_rate": 7.16523651128755e-07, "loss": 0.5597135424613953, "step": 1640 }, { "epoch": 3.471458773784355, "grad_norm": 0.6645305156707764, "learning_rate": 7.131687815216901e-07, "loss": 0.22359013557434082, "step": 1642 }, { "epoch": 3.47568710359408, "grad_norm": 0.8287932872772217, "learning_rate": 7.098388129945833e-07, "loss": 0.9671212434768677, "step": 1644 }, { "epoch": 3.4799154334038054, "grad_norm": 1.3744875192642212, "learning_rate": 7.065337862576381e-07, "loss": 0.9185785055160522, "step": 1646 }, { "epoch": 3.4841437632135306, "grad_norm": 0.5585587024688721, "learning_rate": 7.032537417161339e-07, "loss": 0.5719754695892334, "step": 1648 }, { "epoch": 3.488372093023256, "grad_norm": 0.6185411810874939, "learning_rate": 6.999987194699334e-07, "loss": 0.5411649942398071, "step": 1650 }, { "epoch": 3.492600422832981, "grad_norm": 1.760116457939148, "learning_rate": 6.967687593129909e-07, "loss": 0.6113811731338501, "step": 1652 }, { "epoch": 3.496828752642706, "grad_norm": 1.6597703695297241, "learning_rate": 6.935639007328666e-07, "loss": 0.9229161143302917, "step": 1654 }, { "epoch": 3.5010570824524314, "grad_norm": 0.8579858541488647, "learning_rate": 6.903841829102457e-07, "loss": 0.9809255003929138, "step": 1656 }, { "epoch": 3.5052854122621566, "grad_norm": 0.428204208612442, "learning_rate": 6.872296447184546e-07, "loss": 0.843367338180542, "step": 1658 }, { "epoch": 3.5095137420718814, "grad_norm": 0.5644919276237488, "learning_rate": 6.841003247229903e-07, "loss": 0.6564947962760925, "step": 1660 }, { "epoch": 3.513742071881607, "grad_norm": 4.3857879638671875, "learning_rate": 6.80996261181048e-07, "loss": 0.6874603629112244, "step": 1662 }, { "epoch": 3.517970401691332, "grad_norm": 2.124926805496216, "learning_rate": 6.779174920410505e-07, "loss": 0.9908625483512878, "step": 1664 }, { "epoch": 3.522198731501057, "grad_norm": 1.0560848712921143, "learning_rate": 6.748640549421873e-07, "loss": 1.0359817743301392, "step": 1666 }, { "epoch": 3.526427061310782, "grad_norm": 0.21484586596488953, "learning_rate": 6.71835987213955e-07, "loss": 0.2849699854850769, "step": 1668 }, { "epoch": 3.5306553911205074, "grad_norm": 0.4259510040283203, "learning_rate": 6.688333258756966e-07, "loss": 0.8330371975898743, "step": 1670 }, { "epoch": 3.5348837209302326, "grad_norm": 0.9718641042709351, "learning_rate": 6.658561076361539e-07, "loss": 0.6728772521018982, "step": 1672 }, { "epoch": 3.539112050739958, "grad_norm": 1.3043420314788818, "learning_rate": 6.629043688930161e-07, "loss": 1.06952702999115, "step": 1674 }, { "epoch": 3.543340380549683, "grad_norm": 2.601339340209961, "learning_rate": 6.599781457324759e-07, "loss": 0.7122786641120911, "step": 1676 }, { "epoch": 3.547568710359408, "grad_norm": 0.5953323841094971, "learning_rate": 6.570774739287855e-07, "loss": 0.9681164026260376, "step": 1678 }, { "epoch": 3.5517970401691334, "grad_norm": 0.6733061075210571, "learning_rate": 6.542023889438244e-07, "loss": 0.660723090171814, "step": 1680 }, { "epoch": 3.556025369978858, "grad_norm": 0.5774943828582764, "learning_rate": 6.513529259266614e-07, "loss": 0.6790302991867065, "step": 1682 }, { "epoch": 3.5602536997885834, "grad_norm": 1.2416257858276367, "learning_rate": 6.485291197131258e-07, "loss": 0.6007125377655029, "step": 1684 }, { "epoch": 3.5644820295983086, "grad_norm": 1.9640257358551025, "learning_rate": 6.45731004825384e-07, "loss": 0.29832443594932556, "step": 1686 }, { "epoch": 3.568710359408034, "grad_norm": 1.3899872303009033, "learning_rate": 6.429586154715143e-07, "loss": 0.7014768719673157, "step": 1688 }, { "epoch": 3.572938689217759, "grad_norm": 0.20190729200839996, "learning_rate": 6.402119855450905e-07, "loss": 0.33684778213500977, "step": 1690 }, { "epoch": 3.5771670190274842, "grad_norm": 1.3706597089767456, "learning_rate": 6.374911486247666e-07, "loss": 0.4806325137615204, "step": 1692 }, { "epoch": 3.5813953488372094, "grad_norm": 1.9231373071670532, "learning_rate": 6.347961379738678e-07, "loss": 0.6597048044204712, "step": 1694 }, { "epoch": 3.585623678646934, "grad_norm": 3.228268623352051, "learning_rate": 6.321269865399811e-07, "loss": 0.44895780086517334, "step": 1696 }, { "epoch": 3.58985200845666, "grad_norm": 0.6945326328277588, "learning_rate": 6.294837269545557e-07, "loss": 0.9701504111289978, "step": 1698 }, { "epoch": 3.5940803382663846, "grad_norm": 0.8373157978057861, "learning_rate": 6.268663915325021e-07, "loss": 1.074630856513977, "step": 1700 }, { "epoch": 3.59830866807611, "grad_norm": 1.8083549737930298, "learning_rate": 6.24275012271797e-07, "loss": 1.0470349788665771, "step": 1702 }, { "epoch": 3.602536997885835, "grad_norm": 3.190058469772339, "learning_rate": 6.217096208530931e-07, "loss": 0.4534735679626465, "step": 1704 }, { "epoch": 3.6067653276955602, "grad_norm": 0.6707348823547363, "learning_rate": 6.191702486393313e-07, "loss": 0.5571319460868835, "step": 1706 }, { "epoch": 3.6109936575052854, "grad_norm": 2.686514377593994, "learning_rate": 6.166569266753569e-07, "loss": 0.8430109620094299, "step": 1708 }, { "epoch": 3.6152219873150107, "grad_norm": 1.0745434761047363, "learning_rate": 6.141696856875408e-07, "loss": 0.8707183599472046, "step": 1710 }, { "epoch": 3.619450317124736, "grad_norm": 1.010704517364502, "learning_rate": 6.117085560834034e-07, "loss": 0.5877060890197754, "step": 1712 }, { "epoch": 3.6236786469344606, "grad_norm": 0.396779328584671, "learning_rate": 6.092735679512427e-07, "loss": 0.49770990014076233, "step": 1714 }, { "epoch": 3.6279069767441863, "grad_norm": 3.3722569942474365, "learning_rate": 6.068647510597671e-07, "loss": 0.7864755988121033, "step": 1716 }, { "epoch": 3.632135306553911, "grad_norm": 0.5849418044090271, "learning_rate": 6.044821348577306e-07, "loss": 0.588261604309082, "step": 1718 }, { "epoch": 3.6363636363636362, "grad_norm": 1.558585286140442, "learning_rate": 6.021257484735737e-07, "loss": 0.7706260681152344, "step": 1720 }, { "epoch": 3.6405919661733614, "grad_norm": 0.7773210406303406, "learning_rate": 5.997956207150664e-07, "loss": 0.8451033234596252, "step": 1722 }, { "epoch": 3.6448202959830867, "grad_norm": 1.4222577810287476, "learning_rate": 5.974917800689572e-07, "loss": 0.7600279450416565, "step": 1724 }, { "epoch": 3.649048625792812, "grad_norm": 0.6138176918029785, "learning_rate": 5.952142547006232e-07, "loss": 1.0202842950820923, "step": 1726 }, { "epoch": 3.653276955602537, "grad_norm": 0.6351314783096313, "learning_rate": 5.92963072453727e-07, "loss": 0.9493424296379089, "step": 1728 }, { "epoch": 3.6575052854122623, "grad_norm": 1.68190598487854, "learning_rate": 5.907382608498761e-07, "loss": 0.8003555536270142, "step": 1730 }, { "epoch": 3.6617336152219875, "grad_norm": 0.9876241683959961, "learning_rate": 5.885398470882863e-07, "loss": 0.9022297263145447, "step": 1732 }, { "epoch": 3.6659619450317127, "grad_norm": 1.069425106048584, "learning_rate": 5.863678580454489e-07, "loss": 0.9579256772994995, "step": 1734 }, { "epoch": 3.6701902748414374, "grad_norm": 0.5899412035942078, "learning_rate": 5.842223202748026e-07, "loss": 1.0141502618789673, "step": 1736 }, { "epoch": 3.6744186046511627, "grad_norm": 2.7078421115875244, "learning_rate": 5.821032600064089e-07, "loss": 0.31864723563194275, "step": 1738 }, { "epoch": 3.678646934460888, "grad_norm": 1.3227430582046509, "learning_rate": 5.800107031466306e-07, "loss": 0.52090984582901, "step": 1740 }, { "epoch": 3.682875264270613, "grad_norm": 1.3572659492492676, "learning_rate": 5.779446752778158e-07, "loss": 0.40007254481315613, "step": 1742 }, { "epoch": 3.6871035940803383, "grad_norm": 0.9358891248703003, "learning_rate": 5.759052016579858e-07, "loss": 0.9531795382499695, "step": 1744 }, { "epoch": 3.6913319238900635, "grad_norm": 0.22946986556053162, "learning_rate": 5.738923072205247e-07, "loss": 0.6118672490119934, "step": 1746 }, { "epoch": 3.6955602536997887, "grad_norm": 1.7882148027420044, "learning_rate": 5.719060165738753e-07, "loss": 0.5476849675178528, "step": 1748 }, { "epoch": 3.699788583509514, "grad_norm": 0.6446103453636169, "learning_rate": 5.699463540012398e-07, "loss": 1.0358470678329468, "step": 1750 }, { "epoch": 3.704016913319239, "grad_norm": 1.7342896461486816, "learning_rate": 5.680133434602796e-07, "loss": 0.43426331877708435, "step": 1752 }, { "epoch": 3.708245243128964, "grad_norm": 3.078566789627075, "learning_rate": 5.661070085828253e-07, "loss": 0.5601077079772949, "step": 1754 }, { "epoch": 3.712473572938689, "grad_norm": 0.696753978729248, "learning_rate": 5.642273726745867e-07, "loss": 0.8815577030181885, "step": 1756 }, { "epoch": 3.7167019027484143, "grad_norm": 3.4322452545166016, "learning_rate": 5.623744587148686e-07, "loss": 0.55597984790802, "step": 1758 }, { "epoch": 3.7209302325581395, "grad_norm": 0.21880486607551575, "learning_rate": 5.605482893562872e-07, "loss": 0.49099811911582947, "step": 1760 }, { "epoch": 3.7251585623678647, "grad_norm": 0.806815505027771, "learning_rate": 5.587488869244977e-07, "loss": 0.9834616184234619, "step": 1762 }, { "epoch": 3.72938689217759, "grad_norm": 0.905757486820221, "learning_rate": 5.569762734179175e-07, "loss": 0.5867785215377808, "step": 1764 }, { "epoch": 3.733615221987315, "grad_norm": 1.050884485244751, "learning_rate": 5.552304705074587e-07, "loss": 0.8268157243728638, "step": 1766 }, { "epoch": 3.7378435517970403, "grad_norm": 3.77276611328125, "learning_rate": 5.535114995362631e-07, "loss": 0.9136216044425964, "step": 1768 }, { "epoch": 3.7420718816067655, "grad_norm": 0.35950765013694763, "learning_rate": 5.518193815194421e-07, "loss": 0.3232070505619049, "step": 1770 }, { "epoch": 3.7463002114164903, "grad_norm": 1.1717166900634766, "learning_rate": 5.50154137143818e-07, "loss": 0.586397111415863, "step": 1772 }, { "epoch": 3.7505285412262155, "grad_norm": 1.6452980041503906, "learning_rate": 5.485157867676717e-07, "loss": 1.2943792343139648, "step": 1774 }, { "epoch": 3.7547568710359407, "grad_norm": 1.183484673500061, "learning_rate": 5.469043504204954e-07, "loss": 1.0138071775436401, "step": 1776 }, { "epoch": 3.758985200845666, "grad_norm": 0.6358800530433655, "learning_rate": 5.453198478027459e-07, "loss": 1.0095187425613403, "step": 1778 }, { "epoch": 3.763213530655391, "grad_norm": 1.2916791439056396, "learning_rate": 5.437622982856039e-07, "loss": 1.0655014514923096, "step": 1780 }, { "epoch": 3.7674418604651163, "grad_norm": 1.0883994102478027, "learning_rate": 5.422317209107381e-07, "loss": 0.856255829334259, "step": 1782 }, { "epoch": 3.7716701902748415, "grad_norm": 5.774519443511963, "learning_rate": 5.407281343900724e-07, "loss": 0.20018130540847778, "step": 1784 }, { "epoch": 3.7758985200845667, "grad_norm": 1.4228441715240479, "learning_rate": 5.392515571055551e-07, "loss": 0.7519955039024353, "step": 1786 }, { "epoch": 3.780126849894292, "grad_norm": 2.516164779663086, "learning_rate": 5.378020071089375e-07, "loss": 0.6696423292160034, "step": 1788 }, { "epoch": 3.7843551797040167, "grad_norm": 1.3914198875427246, "learning_rate": 5.363795021215504e-07, "loss": 0.354766309261322, "step": 1790 }, { "epoch": 3.7885835095137423, "grad_norm": 0.269010454416275, "learning_rate": 5.349840595340888e-07, "loss": 0.953768253326416, "step": 1792 }, { "epoch": 3.792811839323467, "grad_norm": 0.7044401168823242, "learning_rate": 5.33615696406399e-07, "loss": 0.9254974722862244, "step": 1794 }, { "epoch": 3.7970401691331923, "grad_norm": 2.0106935501098633, "learning_rate": 5.322744294672698e-07, "loss": 0.5682697296142578, "step": 1796 }, { "epoch": 3.8012684989429175, "grad_norm": 2.6919407844543457, "learning_rate": 5.309602751142287e-07, "loss": 0.9588193297386169, "step": 1798 }, { "epoch": 3.8054968287526427, "grad_norm": 1.6973198652267456, "learning_rate": 5.296732494133406e-07, "loss": 1.0144344568252563, "step": 1800 }, { "epoch": 3.809725158562368, "grad_norm": 1.7578473091125488, "learning_rate": 5.284133680990113e-07, "loss": 0.7145028114318848, "step": 1802 }, { "epoch": 3.813953488372093, "grad_norm": 0.8779058456420898, "learning_rate": 5.271806465737967e-07, "loss": 0.9277461767196655, "step": 1804 }, { "epoch": 3.8181818181818183, "grad_norm": 0.7843415141105652, "learning_rate": 5.259750999082123e-07, "loss": 1.0387165546417236, "step": 1806 }, { "epoch": 3.822410147991543, "grad_norm": 1.58511483669281, "learning_rate": 5.247967428405505e-07, "loss": 0.1425338089466095, "step": 1808 }, { "epoch": 3.8266384778012688, "grad_norm": 1.0016520023345947, "learning_rate": 5.236455897766998e-07, "loss": 0.9441636204719543, "step": 1810 }, { "epoch": 3.8308668076109935, "grad_norm": 0.822861909866333, "learning_rate": 5.22521654789969e-07, "loss": 1.029585838317871, "step": 1812 }, { "epoch": 3.8350951374207187, "grad_norm": 0.6523001194000244, "learning_rate": 5.214249516209148e-07, "loss": 0.7822322249412537, "step": 1814 }, { "epoch": 3.839323467230444, "grad_norm": 0.8254392743110657, "learning_rate": 5.203554936771742e-07, "loss": 0.6645534634590149, "step": 1816 }, { "epoch": 3.843551797040169, "grad_norm": 1.8470152616500854, "learning_rate": 5.193132940332998e-07, "loss": 0.6678524613380432, "step": 1818 }, { "epoch": 3.8477801268498943, "grad_norm": 0.7378912568092346, "learning_rate": 5.182983654306015e-07, "loss": 0.660444438457489, "step": 1820 }, { "epoch": 3.8520084566596196, "grad_norm": 0.21690633893013, "learning_rate": 5.173107202769891e-07, "loss": 0.77535080909729, "step": 1822 }, { "epoch": 3.8562367864693448, "grad_norm": 0.9497125148773193, "learning_rate": 5.163503706468209e-07, "loss": 0.6644335389137268, "step": 1824 }, { "epoch": 3.8604651162790695, "grad_norm": 2.4505763053894043, "learning_rate": 5.154173282807579e-07, "loss": 0.6357966065406799, "step": 1826 }, { "epoch": 3.864693446088795, "grad_norm": 0.5043659806251526, "learning_rate": 5.145116045856168e-07, "loss": 0.9884635210037231, "step": 1828 }, { "epoch": 3.86892177589852, "grad_norm": 1.1076487302780151, "learning_rate": 5.136332106342344e-07, "loss": 1.014207124710083, "step": 1830 }, { "epoch": 3.873150105708245, "grad_norm": 0.912322461605072, "learning_rate": 5.127821571653295e-07, "loss": 0.7557728886604309, "step": 1832 }, { "epoch": 3.8773784355179703, "grad_norm": 4.824005126953125, "learning_rate": 5.119584545833723e-07, "loss": 0.5752384066581726, "step": 1834 }, { "epoch": 3.8816067653276956, "grad_norm": 0.30750563740730286, "learning_rate": 5.111621129584585e-07, "loss": 0.6163195371627808, "step": 1836 }, { "epoch": 3.8858350951374208, "grad_norm": 0.22983339428901672, "learning_rate": 5.103931420261836e-07, "loss": 0.5606608986854553, "step": 1838 }, { "epoch": 3.890063424947146, "grad_norm": 1.0208609104156494, "learning_rate": 5.096515511875267e-07, "loss": 0.9524738788604736, "step": 1840 }, { "epoch": 3.894291754756871, "grad_norm": 0.7148008942604065, "learning_rate": 5.08937349508734e-07, "loss": 0.9512585997581482, "step": 1842 }, { "epoch": 3.898520084566596, "grad_norm": 0.737912654876709, "learning_rate": 5.082505457212071e-07, "loss": 0.6485314965248108, "step": 1844 }, { "epoch": 3.9027484143763216, "grad_norm": 8.045441627502441, "learning_rate": 5.07591148221399e-07, "loss": 0.5995697379112244, "step": 1846 }, { "epoch": 3.9069767441860463, "grad_norm": 1.6817905902862549, "learning_rate": 5.069591650707088e-07, "loss": 0.21968799829483032, "step": 1848 }, { "epoch": 3.9112050739957716, "grad_norm": 0.6323149800300598, "learning_rate": 5.063546039953841e-07, "loss": 0.9831611514091492, "step": 1850 }, { "epoch": 3.9154334038054968, "grad_norm": 0.21932940185070038, "learning_rate": 5.057774723864276e-07, "loss": 0.584568977355957, "step": 1852 }, { "epoch": 3.919661733615222, "grad_norm": 0.534582793712616, "learning_rate": 5.052277772995044e-07, "loss": 0.9615625143051147, "step": 1854 }, { "epoch": 3.923890063424947, "grad_norm": 1.219119668006897, "learning_rate": 5.04705525454858e-07, "loss": 0.5662239193916321, "step": 1856 }, { "epoch": 3.9281183932346724, "grad_norm": 1.430123209953308, "learning_rate": 5.042107232372275e-07, "loss": 0.8200953006744385, "step": 1858 }, { "epoch": 3.9323467230443976, "grad_norm": 1.7414133548736572, "learning_rate": 5.037433766957684e-07, "loss": 0.35313427448272705, "step": 1860 }, { "epoch": 3.9365750528541223, "grad_norm": 2.733624219894409, "learning_rate": 5.033034915439797e-07, "loss": 1.0163811445236206, "step": 1862 }, { "epoch": 3.940803382663848, "grad_norm": 1.1092981100082397, "learning_rate": 5.028910731596344e-07, "loss": 0.9771573543548584, "step": 1864 }, { "epoch": 3.9450317124735728, "grad_norm": 1.123976707458496, "learning_rate": 5.02506126584713e-07, "loss": 0.9256449937820435, "step": 1866 }, { "epoch": 3.949260042283298, "grad_norm": 0.1970531940460205, "learning_rate": 5.021486565253419e-07, "loss": 0.006525847129523754, "step": 1868 }, { "epoch": 3.953488372093023, "grad_norm": 1.2699693441390991, "learning_rate": 5.01818667351736e-07, "loss": 0.9931344389915466, "step": 1870 }, { "epoch": 3.9577167019027484, "grad_norm": 0.8406357765197754, "learning_rate": 5.015161630981461e-07, "loss": 0.7480917572975159, "step": 1872 }, { "epoch": 3.9619450317124736, "grad_norm": 1.2351728677749634, "learning_rate": 5.012411474628075e-07, "loss": 0.6757962703704834, "step": 1874 }, { "epoch": 3.966173361522199, "grad_norm": 0.8207041025161743, "learning_rate": 5.009936238078976e-07, "loss": 0.9615821838378906, "step": 1876 }, { "epoch": 3.970401691331924, "grad_norm": 3.9564754962921143, "learning_rate": 5.007735951594917e-07, "loss": 0.47021615505218506, "step": 1878 }, { "epoch": 3.974630021141649, "grad_norm": 0.9730548858642578, "learning_rate": 5.005810642075292e-07, "loss": 0.5955108404159546, "step": 1880 }, { "epoch": 3.9788583509513744, "grad_norm": 1.1672887802124023, "learning_rate": 5.00416033305778e-07, "loss": 0.8675932884216309, "step": 1882 }, { "epoch": 3.983086680761099, "grad_norm": 0.9139389395713806, "learning_rate": 5.002785044718068e-07, "loss": 1.0263160467147827, "step": 1884 }, { "epoch": 3.9873150105708244, "grad_norm": 0.7081848978996277, "learning_rate": 5.001684793869617e-07, "loss": 0.7986045479774475, "step": 1886 }, { "epoch": 3.9915433403805496, "grad_norm": 0.6204723119735718, "learning_rate": 5.000859593963427e-07, "loss": 0.962172269821167, "step": 1888 }, { "epoch": 3.995771670190275, "grad_norm": 0.7160171270370483, "learning_rate": 5.000309455087906e-07, "loss": 0.9778663516044617, "step": 1890 }, { "epoch": 4.0, "grad_norm": 0.8123812079429626, "learning_rate": 5.000034383968715e-07, "loss": 0.5614367723464966, "step": 1892 }, { "epoch": 4.0, "step": 1892, "total_flos": 3.554237146892075e+18, "train_loss": 0.9275046758280858, "train_runtime": 19158.8521, "train_samples_per_second": 2.963, "train_steps_per_second": 0.099 } ], "logging_steps": 2, "max_steps": 1892, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.554237146892075e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }