{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.505457282649605, "eval_steps": 200, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015054572826496049, "grad_norm": 63.155174255371094, "learning_rate": 2.0000000000000002e-07, "loss": 6.6427, "step": 1 }, { "epoch": 0.0030109145652992097, "grad_norm": 55.86313247680664, "learning_rate": 4.0000000000000003e-07, "loss": 6.6448, "step": 2 }, { "epoch": 0.004516371847948814, "grad_norm": 56.68950653076172, "learning_rate": 6.000000000000001e-07, "loss": 6.5767, "step": 3 }, { "epoch": 0.0060218291305984195, "grad_norm": 46.76124954223633, "learning_rate": 8.000000000000001e-07, "loss": 6.3008, "step": 4 }, { "epoch": 0.007527286413248024, "grad_norm": 35.34189224243164, "learning_rate": 1.0000000000000002e-06, "loss": 6.0732, "step": 5 }, { "epoch": 0.009032743695897629, "grad_norm": 29.266027450561523, "learning_rate": 1.2000000000000002e-06, "loss": 5.779, "step": 6 }, { "epoch": 0.010538200978547234, "grad_norm": 16.791263580322266, "learning_rate": 1.4000000000000001e-06, "loss": 5.5269, "step": 7 }, { "epoch": 0.012043658261196839, "grad_norm": 14.960224151611328, "learning_rate": 1.6000000000000001e-06, "loss": 5.4139, "step": 8 }, { "epoch": 0.013549115543846444, "grad_norm": 14.124578475952148, "learning_rate": 1.8e-06, "loss": 5.3145, "step": 9 }, { "epoch": 0.015054572826496047, "grad_norm": 11.18884563446045, "learning_rate": 2.0000000000000003e-06, "loss": 5.277, "step": 10 }, { "epoch": 0.016560030109145654, "grad_norm": 15.184775352478027, "learning_rate": 2.2e-06, "loss": 5.2318, "step": 11 }, { "epoch": 0.018065487391795258, "grad_norm": 11.866249084472656, "learning_rate": 2.4000000000000003e-06, "loss": 5.2019, "step": 12 }, { "epoch": 0.01957094467444486, "grad_norm": 8.771293640136719, "learning_rate": 2.6e-06, "loss": 5.1043, "step": 13 }, { "epoch": 0.021076401957094468, "grad_norm": 11.253835678100586, "learning_rate": 2.8000000000000003e-06, "loss": 5.0695, "step": 14 }, { "epoch": 0.02258185923974407, "grad_norm": 9.082535743713379, "learning_rate": 3e-06, "loss": 4.9781, "step": 15 }, { "epoch": 0.024087316522393678, "grad_norm": 8.113845825195312, "learning_rate": 3.2000000000000003e-06, "loss": 4.8502, "step": 16 }, { "epoch": 0.02559277380504328, "grad_norm": 9.939194679260254, "learning_rate": 3.4000000000000005e-06, "loss": 4.769, "step": 17 }, { "epoch": 0.027098231087692888, "grad_norm": 6.464789867401123, "learning_rate": 3.6e-06, "loss": 4.6613, "step": 18 }, { "epoch": 0.02860368837034249, "grad_norm": 7.796135425567627, "learning_rate": 3.8e-06, "loss": 4.5888, "step": 19 }, { "epoch": 0.030109145652992095, "grad_norm": 7.129495143890381, "learning_rate": 4.000000000000001e-06, "loss": 4.452, "step": 20 }, { "epoch": 0.0316146029356417, "grad_norm": 6.915582180023193, "learning_rate": 4.2000000000000004e-06, "loss": 4.3405, "step": 21 }, { "epoch": 0.03312006021829131, "grad_norm": 6.997885704040527, "learning_rate": 4.4e-06, "loss": 4.26, "step": 22 }, { "epoch": 0.03462551750094091, "grad_norm": 5.4080424308776855, "learning_rate": 4.6e-06, "loss": 4.1937, "step": 23 }, { "epoch": 0.036130974783590515, "grad_norm": 6.4277520179748535, "learning_rate": 4.800000000000001e-06, "loss": 4.0509, "step": 24 }, { "epoch": 0.03763643206624012, "grad_norm": 4.858531475067139, "learning_rate": 5e-06, "loss": 3.9245, "step": 25 }, { "epoch": 0.03914188934888972, "grad_norm": 6.183747291564941, "learning_rate": 5.2e-06, "loss": 3.872, "step": 26 }, { "epoch": 0.04064734663153933, "grad_norm": 4.410830020904541, "learning_rate": 5.4e-06, "loss": 3.8336, "step": 27 }, { "epoch": 0.042152803914188935, "grad_norm": 4.874277114868164, "learning_rate": 5.600000000000001e-06, "loss": 3.6406, "step": 28 }, { "epoch": 0.04365826119683854, "grad_norm": 3.976102113723755, "learning_rate": 5.8e-06, "loss": 3.5753, "step": 29 }, { "epoch": 0.04516371847948814, "grad_norm": 4.300251483917236, "learning_rate": 6e-06, "loss": 3.4761, "step": 30 }, { "epoch": 0.04666917576213775, "grad_norm": 3.6960384845733643, "learning_rate": 6.2e-06, "loss": 3.3445, "step": 31 }, { "epoch": 0.048174633044787356, "grad_norm": 3.8473455905914307, "learning_rate": 6.4000000000000006e-06, "loss": 3.3039, "step": 32 }, { "epoch": 0.049680090327436956, "grad_norm": 3.054046869277954, "learning_rate": 6.6e-06, "loss": 3.2148, "step": 33 }, { "epoch": 0.05118554761008656, "grad_norm": 2.6907546520233154, "learning_rate": 6.800000000000001e-06, "loss": 3.1963, "step": 34 }, { "epoch": 0.05269100489273617, "grad_norm": 3.037846803665161, "learning_rate": 7.000000000000001e-06, "loss": 3.0423, "step": 35 }, { "epoch": 0.054196462175385776, "grad_norm": 2.304053783416748, "learning_rate": 7.2e-06, "loss": 2.9456, "step": 36 }, { "epoch": 0.055701919458035376, "grad_norm": 5.218963623046875, "learning_rate": 7.4e-06, "loss": 2.8942, "step": 37 }, { "epoch": 0.05720737674068498, "grad_norm": 2.280093193054199, "learning_rate": 7.6e-06, "loss": 2.8749, "step": 38 }, { "epoch": 0.05871283402333459, "grad_norm": 2.5850820541381836, "learning_rate": 7.8e-06, "loss": 2.8534, "step": 39 }, { "epoch": 0.06021829130598419, "grad_norm": 2.293491840362549, "learning_rate": 8.000000000000001e-06, "loss": 2.7124, "step": 40 }, { "epoch": 0.061723748588633796, "grad_norm": 2.0723798274993896, "learning_rate": 8.200000000000001e-06, "loss": 2.7766, "step": 41 }, { "epoch": 0.0632292058712834, "grad_norm": 1.8761773109436035, "learning_rate": 8.400000000000001e-06, "loss": 2.6995, "step": 42 }, { "epoch": 0.064734663153933, "grad_norm": 1.6726175546646118, "learning_rate": 8.599999999999999e-06, "loss": 2.6171, "step": 43 }, { "epoch": 0.06624012043658262, "grad_norm": 1.605536937713623, "learning_rate": 8.8e-06, "loss": 2.636, "step": 44 }, { "epoch": 0.06774557771923222, "grad_norm": 1.5461937189102173, "learning_rate": 9e-06, "loss": 2.5438, "step": 45 }, { "epoch": 0.06925103500188182, "grad_norm": 1.2948230504989624, "learning_rate": 9.2e-06, "loss": 2.4146, "step": 46 }, { "epoch": 0.07075649228453143, "grad_norm": 1.2311327457427979, "learning_rate": 9.4e-06, "loss": 2.3832, "step": 47 }, { "epoch": 0.07226194956718103, "grad_norm": 1.2051506042480469, "learning_rate": 9.600000000000001e-06, "loss": 2.3325, "step": 48 }, { "epoch": 0.07376740684983063, "grad_norm": 1.087603211402893, "learning_rate": 9.800000000000001e-06, "loss": 2.2887, "step": 49 }, { "epoch": 0.07527286413248024, "grad_norm": 1.7362000942230225, "learning_rate": 1e-05, "loss": 2.3247, "step": 50 }, { "epoch": 0.07677832141512984, "grad_norm": 1.1651347875595093, "learning_rate": 1.02e-05, "loss": 2.2928, "step": 51 }, { "epoch": 0.07828377869777944, "grad_norm": 1.3218785524368286, "learning_rate": 1.04e-05, "loss": 2.2621, "step": 52 }, { "epoch": 0.07978923598042906, "grad_norm": 1.1790775060653687, "learning_rate": 1.06e-05, "loss": 2.1222, "step": 53 }, { "epoch": 0.08129469326307866, "grad_norm": 1.3794686794281006, "learning_rate": 1.08e-05, "loss": 2.1912, "step": 54 }, { "epoch": 0.08280015054572827, "grad_norm": 1.233445644378662, "learning_rate": 1.1000000000000001e-05, "loss": 2.1253, "step": 55 }, { "epoch": 0.08430560782837787, "grad_norm": 0.9659006595611572, "learning_rate": 1.1200000000000001e-05, "loss": 2.0266, "step": 56 }, { "epoch": 0.08581106511102747, "grad_norm": 0.9523867964744568, "learning_rate": 1.1400000000000001e-05, "loss": 1.9521, "step": 57 }, { "epoch": 0.08731652239367708, "grad_norm": 0.9418737888336182, "learning_rate": 1.16e-05, "loss": 1.967, "step": 58 }, { "epoch": 0.08882197967632668, "grad_norm": 0.9651983380317688, "learning_rate": 1.18e-05, "loss": 2.0307, "step": 59 }, { "epoch": 0.09032743695897628, "grad_norm": 0.9079494476318359, "learning_rate": 1.2e-05, "loss": 1.901, "step": 60 }, { "epoch": 0.0918328942416259, "grad_norm": 0.8458155393600464, "learning_rate": 1.22e-05, "loss": 1.8694, "step": 61 }, { "epoch": 0.0933383515242755, "grad_norm": 0.7945417761802673, "learning_rate": 1.24e-05, "loss": 1.7656, "step": 62 }, { "epoch": 0.0948438088069251, "grad_norm": 0.8871682286262512, "learning_rate": 1.2600000000000001e-05, "loss": 1.7878, "step": 63 }, { "epoch": 0.09634926608957471, "grad_norm": 0.8552000522613525, "learning_rate": 1.2800000000000001e-05, "loss": 1.8196, "step": 64 }, { "epoch": 0.09785472337222431, "grad_norm": 0.8254013061523438, "learning_rate": 1.3000000000000001e-05, "loss": 1.7422, "step": 65 }, { "epoch": 0.09936018065487391, "grad_norm": 0.7358948588371277, "learning_rate": 1.32e-05, "loss": 1.6966, "step": 66 }, { "epoch": 0.10086563793752353, "grad_norm": 0.7240864634513855, "learning_rate": 1.3400000000000002e-05, "loss": 1.6988, "step": 67 }, { "epoch": 0.10237109522017313, "grad_norm": 0.7599056363105774, "learning_rate": 1.3600000000000002e-05, "loss": 1.7366, "step": 68 }, { "epoch": 0.10387655250282274, "grad_norm": 0.7086566686630249, "learning_rate": 1.3800000000000002e-05, "loss": 1.6752, "step": 69 }, { "epoch": 0.10538200978547234, "grad_norm": 0.6926806569099426, "learning_rate": 1.4000000000000001e-05, "loss": 1.6806, "step": 70 }, { "epoch": 0.10688746706812194, "grad_norm": 0.7792842388153076, "learning_rate": 1.42e-05, "loss": 1.6114, "step": 71 }, { "epoch": 0.10839292435077155, "grad_norm": 0.6350343823432922, "learning_rate": 1.44e-05, "loss": 1.6184, "step": 72 }, { "epoch": 0.10989838163342115, "grad_norm": 0.7897392511367798, "learning_rate": 1.4599999999999999e-05, "loss": 1.6358, "step": 73 }, { "epoch": 0.11140383891607075, "grad_norm": 0.6388362050056458, "learning_rate": 1.48e-05, "loss": 1.6807, "step": 74 }, { "epoch": 0.11290929619872037, "grad_norm": 0.6364633440971375, "learning_rate": 1.5e-05, "loss": 1.6156, "step": 75 }, { "epoch": 0.11441475348136997, "grad_norm": 0.6043971180915833, "learning_rate": 1.52e-05, "loss": 1.5104, "step": 76 }, { "epoch": 0.11592021076401957, "grad_norm": 0.6650689244270325, "learning_rate": 1.54e-05, "loss": 1.5799, "step": 77 }, { "epoch": 0.11742566804666918, "grad_norm": 0.5849890112876892, "learning_rate": 1.56e-05, "loss": 1.4814, "step": 78 }, { "epoch": 0.11893112532931878, "grad_norm": 0.514636754989624, "learning_rate": 1.58e-05, "loss": 1.4309, "step": 79 }, { "epoch": 0.12043658261196838, "grad_norm": 0.705170750617981, "learning_rate": 1.6000000000000003e-05, "loss": 1.565, "step": 80 }, { "epoch": 0.12194203989461799, "grad_norm": 0.526858925819397, "learning_rate": 1.62e-05, "loss": 1.3995, "step": 81 }, { "epoch": 0.12344749717726759, "grad_norm": 0.5791212320327759, "learning_rate": 1.6400000000000002e-05, "loss": 1.4811, "step": 82 }, { "epoch": 0.1249529544599172, "grad_norm": 0.44450345635414124, "learning_rate": 1.66e-05, "loss": 1.3394, "step": 83 }, { "epoch": 0.1264584117425668, "grad_norm": 0.5785414576530457, "learning_rate": 1.6800000000000002e-05, "loss": 1.4328, "step": 84 }, { "epoch": 0.12796386902521642, "grad_norm": 0.5075299739837646, "learning_rate": 1.7000000000000003e-05, "loss": 1.3963, "step": 85 }, { "epoch": 0.129469326307866, "grad_norm": 0.5202359557151794, "learning_rate": 1.7199999999999998e-05, "loss": 1.4095, "step": 86 }, { "epoch": 0.13097478359051562, "grad_norm": 0.5533699989318848, "learning_rate": 1.74e-05, "loss": 1.4128, "step": 87 }, { "epoch": 0.13248024087316523, "grad_norm": 0.46378159523010254, "learning_rate": 1.76e-05, "loss": 1.3467, "step": 88 }, { "epoch": 0.13398569815581482, "grad_norm": 0.6286801695823669, "learning_rate": 1.78e-05, "loss": 1.3894, "step": 89 }, { "epoch": 0.13549115543846443, "grad_norm": 0.4489421844482422, "learning_rate": 1.8e-05, "loss": 1.2999, "step": 90 }, { "epoch": 0.13699661272111405, "grad_norm": 0.5930526256561279, "learning_rate": 1.8200000000000002e-05, "loss": 1.3682, "step": 91 }, { "epoch": 0.13850207000376363, "grad_norm": 0.4687874913215637, "learning_rate": 1.84e-05, "loss": 1.3329, "step": 92 }, { "epoch": 0.14000752728641325, "grad_norm": 0.5241579413414001, "learning_rate": 1.86e-05, "loss": 1.2559, "step": 93 }, { "epoch": 0.14151298456906286, "grad_norm": 0.563031017780304, "learning_rate": 1.88e-05, "loss": 1.288, "step": 94 }, { "epoch": 0.14301844185171245, "grad_norm": 0.4354332685470581, "learning_rate": 1.9e-05, "loss": 1.2263, "step": 95 }, { "epoch": 0.14452389913436206, "grad_norm": 0.5619744658470154, "learning_rate": 1.9200000000000003e-05, "loss": 1.3335, "step": 96 }, { "epoch": 0.14602935641701167, "grad_norm": 0.6178876161575317, "learning_rate": 1.94e-05, "loss": 1.1442, "step": 97 }, { "epoch": 0.14753481369966126, "grad_norm": 0.4359278976917267, "learning_rate": 1.9600000000000002e-05, "loss": 1.1829, "step": 98 }, { "epoch": 0.14904027098231087, "grad_norm": 0.5734010338783264, "learning_rate": 1.9800000000000004e-05, "loss": 1.2409, "step": 99 }, { "epoch": 0.1505457282649605, "grad_norm": 0.4638058543205261, "learning_rate": 2e-05, "loss": 1.2247, "step": 100 }, { "epoch": 0.15205118554761007, "grad_norm": 0.5258275270462036, "learning_rate": 2.0200000000000003e-05, "loss": 1.2802, "step": 101 }, { "epoch": 0.1535566428302597, "grad_norm": 0.48311203718185425, "learning_rate": 2.04e-05, "loss": 1.1902, "step": 102 }, { "epoch": 0.1550621001129093, "grad_norm": 0.5980188250541687, "learning_rate": 2.06e-05, "loss": 1.2217, "step": 103 }, { "epoch": 0.1565675573955589, "grad_norm": 0.5909335017204285, "learning_rate": 2.08e-05, "loss": 1.1911, "step": 104 }, { "epoch": 0.1580730146782085, "grad_norm": 0.4553234279155731, "learning_rate": 2.1e-05, "loss": 1.1141, "step": 105 }, { "epoch": 0.15957847196085811, "grad_norm": 0.5494498610496521, "learning_rate": 2.12e-05, "loss": 1.1793, "step": 106 }, { "epoch": 0.16108392924350773, "grad_norm": 0.4412781298160553, "learning_rate": 2.1400000000000002e-05, "loss": 1.0819, "step": 107 }, { "epoch": 0.16258938652615731, "grad_norm": 0.397083044052124, "learning_rate": 2.16e-05, "loss": 1.0711, "step": 108 }, { "epoch": 0.16409484380880693, "grad_norm": 0.37314021587371826, "learning_rate": 2.18e-05, "loss": 1.0495, "step": 109 }, { "epoch": 0.16560030109145654, "grad_norm": 0.3985329568386078, "learning_rate": 2.2000000000000003e-05, "loss": 1.1286, "step": 110 }, { "epoch": 0.16710575837410613, "grad_norm": 0.36453479528427124, "learning_rate": 2.22e-05, "loss": 1.0213, "step": 111 }, { "epoch": 0.16861121565675574, "grad_norm": 0.31520411372184753, "learning_rate": 2.2400000000000002e-05, "loss": 1.0479, "step": 112 }, { "epoch": 0.17011667293940536, "grad_norm": 0.35660097002983093, "learning_rate": 2.26e-05, "loss": 1.0902, "step": 113 }, { "epoch": 0.17162213022205494, "grad_norm": 0.457228422164917, "learning_rate": 2.2800000000000002e-05, "loss": 1.0843, "step": 114 }, { "epoch": 0.17312758750470456, "grad_norm": 0.270781010389328, "learning_rate": 2.3000000000000003e-05, "loss": 0.965, "step": 115 }, { "epoch": 0.17463304478735417, "grad_norm": 0.2844024896621704, "learning_rate": 2.32e-05, "loss": 0.9826, "step": 116 }, { "epoch": 0.17613850207000376, "grad_norm": 0.3486024737358093, "learning_rate": 2.3400000000000003e-05, "loss": 0.9778, "step": 117 }, { "epoch": 0.17764395935265337, "grad_norm": 0.2946693003177643, "learning_rate": 2.36e-05, "loss": 1.015, "step": 118 }, { "epoch": 0.17914941663530298, "grad_norm": 0.2843293249607086, "learning_rate": 2.38e-05, "loss": 0.9889, "step": 119 }, { "epoch": 0.18065487391795257, "grad_norm": 0.2589290142059326, "learning_rate": 2.4e-05, "loss": 0.9161, "step": 120 }, { "epoch": 0.18216033120060218, "grad_norm": 0.30723389983177185, "learning_rate": 2.4200000000000002e-05, "loss": 0.9931, "step": 121 }, { "epoch": 0.1836657884832518, "grad_norm": 0.32859113812446594, "learning_rate": 2.44e-05, "loss": 1.0171, "step": 122 }, { "epoch": 0.18517124576590138, "grad_norm": 0.3820136785507202, "learning_rate": 2.46e-05, "loss": 1.0207, "step": 123 }, { "epoch": 0.186676703048551, "grad_norm": 0.28890880942344666, "learning_rate": 2.48e-05, "loss": 0.9508, "step": 124 }, { "epoch": 0.1881821603312006, "grad_norm": 0.27351444959640503, "learning_rate": 2.5e-05, "loss": 0.9726, "step": 125 }, { "epoch": 0.1896876176138502, "grad_norm": 0.26240354776382446, "learning_rate": 2.5200000000000003e-05, "loss": 1.0269, "step": 126 }, { "epoch": 0.1911930748964998, "grad_norm": 0.22678494453430176, "learning_rate": 2.54e-05, "loss": 0.9282, "step": 127 }, { "epoch": 0.19269853217914942, "grad_norm": 0.213886559009552, "learning_rate": 2.5600000000000002e-05, "loss": 0.9375, "step": 128 }, { "epoch": 0.194203989461799, "grad_norm": 0.2372247576713562, "learning_rate": 2.58e-05, "loss": 0.9493, "step": 129 }, { "epoch": 0.19570944674444862, "grad_norm": 0.47555163502693176, "learning_rate": 2.6000000000000002e-05, "loss": 1.0098, "step": 130 }, { "epoch": 0.19721490402709824, "grad_norm": 0.2284597009420395, "learning_rate": 2.6200000000000003e-05, "loss": 0.9387, "step": 131 }, { "epoch": 0.19872036130974782, "grad_norm": 0.3116055727005005, "learning_rate": 2.64e-05, "loss": 0.94, "step": 132 }, { "epoch": 0.20022581859239744, "grad_norm": 0.2571500539779663, "learning_rate": 2.6600000000000003e-05, "loss": 0.9209, "step": 133 }, { "epoch": 0.20173127587504705, "grad_norm": 0.416579008102417, "learning_rate": 2.6800000000000004e-05, "loss": 1.0122, "step": 134 }, { "epoch": 0.20323673315769666, "grad_norm": 0.45883670449256897, "learning_rate": 2.7000000000000002e-05, "loss": 0.952, "step": 135 }, { "epoch": 0.20474219044034625, "grad_norm": 0.32968148589134216, "learning_rate": 2.7200000000000004e-05, "loss": 0.9719, "step": 136 }, { "epoch": 0.20624764772299586, "grad_norm": 0.25214099884033203, "learning_rate": 2.7400000000000002e-05, "loss": 0.9521, "step": 137 }, { "epoch": 0.20775310500564548, "grad_norm": 0.3086577355861664, "learning_rate": 2.7600000000000003e-05, "loss": 0.9515, "step": 138 }, { "epoch": 0.20925856228829506, "grad_norm": 0.30896857380867004, "learning_rate": 2.7800000000000005e-05, "loss": 1.0079, "step": 139 }, { "epoch": 0.21076401957094468, "grad_norm": 0.21344344317913055, "learning_rate": 2.8000000000000003e-05, "loss": 0.9319, "step": 140 }, { "epoch": 0.2122694768535943, "grad_norm": 0.27175232768058777, "learning_rate": 2.8199999999999998e-05, "loss": 0.97, "step": 141 }, { "epoch": 0.21377493413624388, "grad_norm": 0.2680148184299469, "learning_rate": 2.84e-05, "loss": 0.8967, "step": 142 }, { "epoch": 0.2152803914188935, "grad_norm": 0.28377294540405273, "learning_rate": 2.86e-05, "loss": 0.9444, "step": 143 }, { "epoch": 0.2167858487015431, "grad_norm": 0.25373780727386475, "learning_rate": 2.88e-05, "loss": 0.9369, "step": 144 }, { "epoch": 0.2182913059841927, "grad_norm": 0.30653658509254456, "learning_rate": 2.9e-05, "loss": 0.9024, "step": 145 }, { "epoch": 0.2197967632668423, "grad_norm": 0.3118561804294586, "learning_rate": 2.9199999999999998e-05, "loss": 0.9467, "step": 146 }, { "epoch": 0.22130222054949192, "grad_norm": 0.24516993761062622, "learning_rate": 2.94e-05, "loss": 0.9064, "step": 147 }, { "epoch": 0.2228076778321415, "grad_norm": 0.8260800838470459, "learning_rate": 2.96e-05, "loss": 0.969, "step": 148 }, { "epoch": 0.22431313511479112, "grad_norm": 0.3053131401538849, "learning_rate": 2.98e-05, "loss": 0.9254, "step": 149 }, { "epoch": 0.22581859239744073, "grad_norm": 0.44213801622390747, "learning_rate": 3e-05, "loss": 0.9011, "step": 150 }, { "epoch": 0.22732404968009032, "grad_norm": 0.2972167730331421, "learning_rate": 3.02e-05, "loss": 0.9707, "step": 151 }, { "epoch": 0.22882950696273993, "grad_norm": 0.24427281320095062, "learning_rate": 3.04e-05, "loss": 0.8905, "step": 152 }, { "epoch": 0.23033496424538955, "grad_norm": 0.34817543625831604, "learning_rate": 3.06e-05, "loss": 0.9174, "step": 153 }, { "epoch": 0.23184042152803913, "grad_norm": 0.2672892212867737, "learning_rate": 3.08e-05, "loss": 0.8742, "step": 154 }, { "epoch": 0.23334587881068874, "grad_norm": 0.29176586866378784, "learning_rate": 3.1e-05, "loss": 0.9458, "step": 155 }, { "epoch": 0.23485133609333836, "grad_norm": 0.2704721689224243, "learning_rate": 3.12e-05, "loss": 0.8928, "step": 156 }, { "epoch": 0.23635679337598794, "grad_norm": 0.22202567756175995, "learning_rate": 3.1400000000000004e-05, "loss": 0.8809, "step": 157 }, { "epoch": 0.23786225065863756, "grad_norm": 0.23295825719833374, "learning_rate": 3.16e-05, "loss": 0.87, "step": 158 }, { "epoch": 0.23936770794128717, "grad_norm": 0.23125235736370087, "learning_rate": 3.18e-05, "loss": 0.9412, "step": 159 }, { "epoch": 0.24087316522393676, "grad_norm": 0.3662780225276947, "learning_rate": 3.2000000000000005e-05, "loss": 0.9312, "step": 160 }, { "epoch": 0.24237862250658637, "grad_norm": 0.2715233266353607, "learning_rate": 3.2200000000000003e-05, "loss": 0.9416, "step": 161 }, { "epoch": 0.24388407978923599, "grad_norm": 0.23098795115947723, "learning_rate": 3.24e-05, "loss": 0.8794, "step": 162 }, { "epoch": 0.24538953707188557, "grad_norm": 0.3910009562969208, "learning_rate": 3.26e-05, "loss": 0.913, "step": 163 }, { "epoch": 0.24689499435453519, "grad_norm": 0.32783907651901245, "learning_rate": 3.2800000000000004e-05, "loss": 0.9167, "step": 164 }, { "epoch": 0.2484004516371848, "grad_norm": 0.2803574204444885, "learning_rate": 3.3e-05, "loss": 0.9173, "step": 165 }, { "epoch": 0.2499059089198344, "grad_norm": 0.30303284525871277, "learning_rate": 3.32e-05, "loss": 0.9475, "step": 166 }, { "epoch": 0.251411366202484, "grad_norm": 0.3482511043548584, "learning_rate": 3.3400000000000005e-05, "loss": 0.8604, "step": 167 }, { "epoch": 0.2529168234851336, "grad_norm": 0.2995555102825165, "learning_rate": 3.3600000000000004e-05, "loss": 0.916, "step": 168 }, { "epoch": 0.2544222807677832, "grad_norm": 0.3321720361709595, "learning_rate": 3.38e-05, "loss": 0.8501, "step": 169 }, { "epoch": 0.25592773805043284, "grad_norm": 0.2636343240737915, "learning_rate": 3.4000000000000007e-05, "loss": 0.8339, "step": 170 }, { "epoch": 0.2574331953330824, "grad_norm": 0.3316866159439087, "learning_rate": 3.4200000000000005e-05, "loss": 0.848, "step": 171 }, { "epoch": 0.258938652615732, "grad_norm": 0.4684114158153534, "learning_rate": 3.4399999999999996e-05, "loss": 0.8245, "step": 172 }, { "epoch": 0.2604441098983816, "grad_norm": 0.5100336670875549, "learning_rate": 3.46e-05, "loss": 0.8977, "step": 173 }, { "epoch": 0.26194956718103124, "grad_norm": 0.49124741554260254, "learning_rate": 3.48e-05, "loss": 0.8839, "step": 174 }, { "epoch": 0.26345502446368085, "grad_norm": 0.32672053575515747, "learning_rate": 3.5e-05, "loss": 0.8245, "step": 175 }, { "epoch": 0.26496048174633047, "grad_norm": 0.5820422172546387, "learning_rate": 3.52e-05, "loss": 0.925, "step": 176 }, { "epoch": 0.26646593902898, "grad_norm": 0.3213149607181549, "learning_rate": 3.54e-05, "loss": 0.886, "step": 177 }, { "epoch": 0.26797139631162964, "grad_norm": 0.35795313119888306, "learning_rate": 3.56e-05, "loss": 0.8472, "step": 178 }, { "epoch": 0.26947685359427925, "grad_norm": 0.2326056808233261, "learning_rate": 3.58e-05, "loss": 0.828, "step": 179 }, { "epoch": 0.27098231087692887, "grad_norm": 0.350570410490036, "learning_rate": 3.6e-05, "loss": 0.8271, "step": 180 }, { "epoch": 0.2724877681595785, "grad_norm": 0.3433835506439209, "learning_rate": 3.62e-05, "loss": 0.8983, "step": 181 }, { "epoch": 0.2739932254422281, "grad_norm": 0.31187453866004944, "learning_rate": 3.6400000000000004e-05, "loss": 0.8287, "step": 182 }, { "epoch": 0.2754986827248777, "grad_norm": 0.2332259714603424, "learning_rate": 3.66e-05, "loss": 0.7837, "step": 183 }, { "epoch": 0.27700414000752727, "grad_norm": 0.2927328646183014, "learning_rate": 3.68e-05, "loss": 0.8398, "step": 184 }, { "epoch": 0.2785095972901769, "grad_norm": 0.18824100494384766, "learning_rate": 3.7e-05, "loss": 0.8727, "step": 185 }, { "epoch": 0.2800150545728265, "grad_norm": 0.27012181282043457, "learning_rate": 3.72e-05, "loss": 0.8205, "step": 186 }, { "epoch": 0.2815205118554761, "grad_norm": 0.45509180426597595, "learning_rate": 3.74e-05, "loss": 0.8704, "step": 187 }, { "epoch": 0.2830259691381257, "grad_norm": 0.26858875155448914, "learning_rate": 3.76e-05, "loss": 0.8453, "step": 188 }, { "epoch": 0.28453142642077534, "grad_norm": 0.259222149848938, "learning_rate": 3.7800000000000004e-05, "loss": 0.8132, "step": 189 }, { "epoch": 0.2860368837034249, "grad_norm": 0.26587730646133423, "learning_rate": 3.8e-05, "loss": 0.8372, "step": 190 }, { "epoch": 0.2875423409860745, "grad_norm": 0.34051695466041565, "learning_rate": 3.82e-05, "loss": 0.8167, "step": 191 }, { "epoch": 0.2890477982687241, "grad_norm": 0.3592548668384552, "learning_rate": 3.8400000000000005e-05, "loss": 0.8675, "step": 192 }, { "epoch": 0.29055325555137373, "grad_norm": 0.25640058517456055, "learning_rate": 3.86e-05, "loss": 0.7839, "step": 193 }, { "epoch": 0.29205871283402335, "grad_norm": 0.4663311541080475, "learning_rate": 3.88e-05, "loss": 0.8315, "step": 194 }, { "epoch": 0.29356417011667296, "grad_norm": 0.31232011318206787, "learning_rate": 3.9000000000000006e-05, "loss": 0.8711, "step": 195 }, { "epoch": 0.2950696273993225, "grad_norm": 0.6571543216705322, "learning_rate": 3.9200000000000004e-05, "loss": 0.8553, "step": 196 }, { "epoch": 0.29657508468197213, "grad_norm": 0.47625118494033813, "learning_rate": 3.94e-05, "loss": 0.8603, "step": 197 }, { "epoch": 0.29808054196462175, "grad_norm": 0.5184832811355591, "learning_rate": 3.960000000000001e-05, "loss": 0.8301, "step": 198 }, { "epoch": 0.29958599924727136, "grad_norm": 0.364950567483902, "learning_rate": 3.9800000000000005e-05, "loss": 0.8341, "step": 199 }, { "epoch": 0.301091456529921, "grad_norm": 0.2601494789123535, "learning_rate": 4e-05, "loss": 0.8409, "step": 200 }, { "epoch": 0.301091456529921, "eval_loss": 0.8078026175498962, "eval_runtime": 538.8455, "eval_samples_per_second": 17.866, "eval_steps_per_second": 0.559, "step": 200 }, { "epoch": 0.3025969138125706, "grad_norm": 0.36397236585617065, "learning_rate": 4.02e-05, "loss": 0.7692, "step": 201 }, { "epoch": 0.30410237109522015, "grad_norm": 0.3101062476634979, "learning_rate": 4.0400000000000006e-05, "loss": 0.842, "step": 202 }, { "epoch": 0.30560782837786976, "grad_norm": 0.26615700125694275, "learning_rate": 4.0600000000000004e-05, "loss": 0.8562, "step": 203 }, { "epoch": 0.3071132856605194, "grad_norm": 0.2995416522026062, "learning_rate": 4.08e-05, "loss": 0.8305, "step": 204 }, { "epoch": 0.308618742943169, "grad_norm": 0.24238595366477966, "learning_rate": 4.1e-05, "loss": 0.7777, "step": 205 }, { "epoch": 0.3101242002258186, "grad_norm": 0.3775932788848877, "learning_rate": 4.12e-05, "loss": 0.8446, "step": 206 }, { "epoch": 0.3116296575084682, "grad_norm": 0.4034956991672516, "learning_rate": 4.14e-05, "loss": 0.8291, "step": 207 }, { "epoch": 0.3131351147911178, "grad_norm": 0.34179702401161194, "learning_rate": 4.16e-05, "loss": 0.872, "step": 208 }, { "epoch": 0.3146405720737674, "grad_norm": 0.283225953578949, "learning_rate": 4.18e-05, "loss": 0.8339, "step": 209 }, { "epoch": 0.316146029356417, "grad_norm": 0.22886602580547333, "learning_rate": 4.2e-05, "loss": 0.7849, "step": 210 }, { "epoch": 0.3176514866390666, "grad_norm": 0.2473565638065338, "learning_rate": 4.22e-05, "loss": 0.8763, "step": 211 }, { "epoch": 0.31915694392171623, "grad_norm": 0.289762407541275, "learning_rate": 4.24e-05, "loss": 0.8701, "step": 212 }, { "epoch": 0.32066240120436584, "grad_norm": 0.20741982758045197, "learning_rate": 4.26e-05, "loss": 0.8379, "step": 213 }, { "epoch": 0.32216785848701546, "grad_norm": 0.25838783383369446, "learning_rate": 4.2800000000000004e-05, "loss": 0.828, "step": 214 }, { "epoch": 0.323673315769665, "grad_norm": 0.2797646224498749, "learning_rate": 4.3e-05, "loss": 0.8277, "step": 215 }, { "epoch": 0.32517877305231463, "grad_norm": 0.32470548152923584, "learning_rate": 4.32e-05, "loss": 0.7577, "step": 216 }, { "epoch": 0.32668423033496424, "grad_norm": 0.3792063891887665, "learning_rate": 4.3400000000000005e-05, "loss": 0.7786, "step": 217 }, { "epoch": 0.32818968761761386, "grad_norm": 0.4814249873161316, "learning_rate": 4.36e-05, "loss": 0.841, "step": 218 }, { "epoch": 0.32969514490026347, "grad_norm": 0.30982473492622375, "learning_rate": 4.38e-05, "loss": 0.8092, "step": 219 }, { "epoch": 0.3312006021829131, "grad_norm": 0.3729328215122223, "learning_rate": 4.4000000000000006e-05, "loss": 0.7879, "step": 220 }, { "epoch": 0.33270605946556264, "grad_norm": 0.339505672454834, "learning_rate": 4.4200000000000004e-05, "loss": 0.7906, "step": 221 }, { "epoch": 0.33421151674821226, "grad_norm": 0.3058149814605713, "learning_rate": 4.44e-05, "loss": 0.8234, "step": 222 }, { "epoch": 0.33571697403086187, "grad_norm": 0.27973473072052, "learning_rate": 4.46e-05, "loss": 0.7736, "step": 223 }, { "epoch": 0.3372224313135115, "grad_norm": 0.3056331276893616, "learning_rate": 4.4800000000000005e-05, "loss": 0.7935, "step": 224 }, { "epoch": 0.3387278885961611, "grad_norm": 0.3207216262817383, "learning_rate": 4.5e-05, "loss": 0.779, "step": 225 }, { "epoch": 0.3402333458788107, "grad_norm": 0.40863484144210815, "learning_rate": 4.52e-05, "loss": 0.7843, "step": 226 }, { "epoch": 0.34173880316146027, "grad_norm": 0.3993776738643646, "learning_rate": 4.5400000000000006e-05, "loss": 0.7689, "step": 227 }, { "epoch": 0.3432442604441099, "grad_norm": 0.39806005358695984, "learning_rate": 4.5600000000000004e-05, "loss": 0.7711, "step": 228 }, { "epoch": 0.3447497177267595, "grad_norm": 0.2713351547718048, "learning_rate": 4.58e-05, "loss": 0.8022, "step": 229 }, { "epoch": 0.3462551750094091, "grad_norm": 0.25446799397468567, "learning_rate": 4.600000000000001e-05, "loss": 0.7797, "step": 230 }, { "epoch": 0.3477606322920587, "grad_norm": 0.3554419279098511, "learning_rate": 4.6200000000000005e-05, "loss": 0.7752, "step": 231 }, { "epoch": 0.34926608957470834, "grad_norm": 0.3545427620410919, "learning_rate": 4.64e-05, "loss": 0.8036, "step": 232 }, { "epoch": 0.3507715468573579, "grad_norm": 0.27903667092323303, "learning_rate": 4.660000000000001e-05, "loss": 0.7717, "step": 233 }, { "epoch": 0.3522770041400075, "grad_norm": 0.2630469501018524, "learning_rate": 4.6800000000000006e-05, "loss": 0.8714, "step": 234 }, { "epoch": 0.3537824614226571, "grad_norm": 0.37187036871910095, "learning_rate": 4.7e-05, "loss": 0.7458, "step": 235 }, { "epoch": 0.35528791870530674, "grad_norm": 0.28952011466026306, "learning_rate": 4.72e-05, "loss": 0.7642, "step": 236 }, { "epoch": 0.35679337598795635, "grad_norm": 0.1870245635509491, "learning_rate": 4.74e-05, "loss": 0.7895, "step": 237 }, { "epoch": 0.35829883327060597, "grad_norm": 0.26773858070373535, "learning_rate": 4.76e-05, "loss": 0.8115, "step": 238 }, { "epoch": 0.3598042905532556, "grad_norm": 0.20740048587322235, "learning_rate": 4.78e-05, "loss": 0.8101, "step": 239 }, { "epoch": 0.36130974783590514, "grad_norm": 0.2120191603899002, "learning_rate": 4.8e-05, "loss": 0.7782, "step": 240 }, { "epoch": 0.36281520511855475, "grad_norm": 0.29032421112060547, "learning_rate": 4.82e-05, "loss": 0.7286, "step": 241 }, { "epoch": 0.36432066240120436, "grad_norm": 0.234977126121521, "learning_rate": 4.8400000000000004e-05, "loss": 0.8059, "step": 242 }, { "epoch": 0.365826119683854, "grad_norm": 0.27322936058044434, "learning_rate": 4.86e-05, "loss": 0.7815, "step": 243 }, { "epoch": 0.3673315769665036, "grad_norm": 0.3235185742378235, "learning_rate": 4.88e-05, "loss": 0.8285, "step": 244 }, { "epoch": 0.3688370342491532, "grad_norm": 0.39334139227867126, "learning_rate": 4.9e-05, "loss": 0.7872, "step": 245 }, { "epoch": 0.37034249153180276, "grad_norm": 0.6116060614585876, "learning_rate": 4.92e-05, "loss": 0.7652, "step": 246 }, { "epoch": 0.3718479488144524, "grad_norm": 0.684800922870636, "learning_rate": 4.94e-05, "loss": 0.7509, "step": 247 }, { "epoch": 0.373353406097102, "grad_norm": 0.5566190481185913, "learning_rate": 4.96e-05, "loss": 0.7809, "step": 248 }, { "epoch": 0.3748588633797516, "grad_norm": 0.5867395401000977, "learning_rate": 4.9800000000000004e-05, "loss": 0.818, "step": 249 }, { "epoch": 0.3763643206624012, "grad_norm": 0.568831205368042, "learning_rate": 5e-05, "loss": 0.8212, "step": 250 }, { "epoch": 0.37786977794505083, "grad_norm": 0.3483644425868988, "learning_rate": 5.02e-05, "loss": 0.8141, "step": 251 }, { "epoch": 0.3793752352277004, "grad_norm": 0.5137937664985657, "learning_rate": 5.0400000000000005e-05, "loss": 0.779, "step": 252 }, { "epoch": 0.38088069251035, "grad_norm": 0.4609558582305908, "learning_rate": 5.0600000000000003e-05, "loss": 0.8327, "step": 253 }, { "epoch": 0.3823861497929996, "grad_norm": 0.34963372349739075, "learning_rate": 5.08e-05, "loss": 0.7721, "step": 254 }, { "epoch": 0.38389160707564923, "grad_norm": 0.4319046437740326, "learning_rate": 5.1000000000000006e-05, "loss": 0.7615, "step": 255 }, { "epoch": 0.38539706435829885, "grad_norm": 0.3027240037918091, "learning_rate": 5.1200000000000004e-05, "loss": 0.7181, "step": 256 }, { "epoch": 0.38690252164094846, "grad_norm": 0.3576965928077698, "learning_rate": 5.14e-05, "loss": 0.8361, "step": 257 }, { "epoch": 0.388407978923598, "grad_norm": 0.3538447618484497, "learning_rate": 5.16e-05, "loss": 0.7461, "step": 258 }, { "epoch": 0.38991343620624763, "grad_norm": 0.29571327567100525, "learning_rate": 5.1800000000000005e-05, "loss": 0.7516, "step": 259 }, { "epoch": 0.39141889348889725, "grad_norm": 0.32102593779563904, "learning_rate": 5.2000000000000004e-05, "loss": 0.736, "step": 260 }, { "epoch": 0.39292435077154686, "grad_norm": 0.33467647433280945, "learning_rate": 5.22e-05, "loss": 0.7501, "step": 261 }, { "epoch": 0.3944298080541965, "grad_norm": 0.2528577446937561, "learning_rate": 5.2400000000000007e-05, "loss": 0.7754, "step": 262 }, { "epoch": 0.3959352653368461, "grad_norm": 0.24776317179203033, "learning_rate": 5.2600000000000005e-05, "loss": 0.738, "step": 263 }, { "epoch": 0.39744072261949565, "grad_norm": 0.2503894567489624, "learning_rate": 5.28e-05, "loss": 0.7252, "step": 264 }, { "epoch": 0.39894617990214526, "grad_norm": 0.33288031816482544, "learning_rate": 5.300000000000001e-05, "loss": 0.7795, "step": 265 }, { "epoch": 0.4004516371847949, "grad_norm": 0.4533836841583252, "learning_rate": 5.3200000000000006e-05, "loss": 0.7746, "step": 266 }, { "epoch": 0.4019570944674445, "grad_norm": 0.2981036901473999, "learning_rate": 5.3400000000000004e-05, "loss": 0.8007, "step": 267 }, { "epoch": 0.4034625517500941, "grad_norm": 0.26069486141204834, "learning_rate": 5.360000000000001e-05, "loss": 0.8108, "step": 268 }, { "epoch": 0.4049680090327437, "grad_norm": 0.2519361078739166, "learning_rate": 5.380000000000001e-05, "loss": 0.7604, "step": 269 }, { "epoch": 0.40647346631539333, "grad_norm": 0.27320560812950134, "learning_rate": 5.4000000000000005e-05, "loss": 0.7739, "step": 270 }, { "epoch": 0.4079789235980429, "grad_norm": 0.263278067111969, "learning_rate": 5.420000000000001e-05, "loss": 0.7203, "step": 271 }, { "epoch": 0.4094843808806925, "grad_norm": 0.24173833429813385, "learning_rate": 5.440000000000001e-05, "loss": 0.7437, "step": 272 }, { "epoch": 0.4109898381633421, "grad_norm": 0.26429563760757446, "learning_rate": 5.4600000000000006e-05, "loss": 0.7265, "step": 273 }, { "epoch": 0.4124952954459917, "grad_norm": 0.2572035491466522, "learning_rate": 5.4800000000000004e-05, "loss": 0.7469, "step": 274 }, { "epoch": 0.41400075272864134, "grad_norm": 0.2426484078168869, "learning_rate": 5.500000000000001e-05, "loss": 0.7523, "step": 275 }, { "epoch": 0.41550621001129096, "grad_norm": 0.2684064209461212, "learning_rate": 5.520000000000001e-05, "loss": 0.7391, "step": 276 }, { "epoch": 0.4170116672939405, "grad_norm": 0.29265454411506653, "learning_rate": 5.5400000000000005e-05, "loss": 0.863, "step": 277 }, { "epoch": 0.4185171245765901, "grad_norm": 0.2641523480415344, "learning_rate": 5.560000000000001e-05, "loss": 0.8033, "step": 278 }, { "epoch": 0.42002258185923974, "grad_norm": 0.2706596851348877, "learning_rate": 5.580000000000001e-05, "loss": 0.7254, "step": 279 }, { "epoch": 0.42152803914188935, "grad_norm": 0.2349986433982849, "learning_rate": 5.6000000000000006e-05, "loss": 0.6736, "step": 280 }, { "epoch": 0.42303349642453897, "grad_norm": 0.22394873201847076, "learning_rate": 5.620000000000001e-05, "loss": 0.7135, "step": 281 }, { "epoch": 0.4245389537071886, "grad_norm": 0.20895801484584808, "learning_rate": 5.6399999999999995e-05, "loss": 0.8077, "step": 282 }, { "epoch": 0.42604441098983814, "grad_norm": 0.20906692743301392, "learning_rate": 5.66e-05, "loss": 0.7609, "step": 283 }, { "epoch": 0.42754986827248775, "grad_norm": 0.18515034019947052, "learning_rate": 5.68e-05, "loss": 0.7783, "step": 284 }, { "epoch": 0.42905532555513737, "grad_norm": 0.2389136254787445, "learning_rate": 5.6999999999999996e-05, "loss": 0.7621, "step": 285 }, { "epoch": 0.430560782837787, "grad_norm": 0.4240041673183441, "learning_rate": 5.72e-05, "loss": 0.68, "step": 286 }, { "epoch": 0.4320662401204366, "grad_norm": 0.5081321597099304, "learning_rate": 5.74e-05, "loss": 0.7607, "step": 287 }, { "epoch": 0.4335716974030862, "grad_norm": 0.46824517846107483, "learning_rate": 5.76e-05, "loss": 0.767, "step": 288 }, { "epoch": 0.43507715468573577, "grad_norm": 0.8372516632080078, "learning_rate": 5.7799999999999995e-05, "loss": 0.7539, "step": 289 }, { "epoch": 0.4365826119683854, "grad_norm": 0.7257559895515442, "learning_rate": 5.8e-05, "loss": 0.8044, "step": 290 }, { "epoch": 0.438088069251035, "grad_norm": 0.41699114441871643, "learning_rate": 5.82e-05, "loss": 0.778, "step": 291 }, { "epoch": 0.4395935265336846, "grad_norm": 0.530401885509491, "learning_rate": 5.8399999999999997e-05, "loss": 0.6967, "step": 292 }, { "epoch": 0.4410989838163342, "grad_norm": 0.5249519348144531, "learning_rate": 5.86e-05, "loss": 0.748, "step": 293 }, { "epoch": 0.44260444109898384, "grad_norm": 0.38007670640945435, "learning_rate": 5.88e-05, "loss": 0.7887, "step": 294 }, { "epoch": 0.4441098983816334, "grad_norm": 0.3110082447528839, "learning_rate": 5.9e-05, "loss": 0.7916, "step": 295 }, { "epoch": 0.445615355664283, "grad_norm": 0.3982278108596802, "learning_rate": 5.92e-05, "loss": 0.7485, "step": 296 }, { "epoch": 0.4471208129469326, "grad_norm": 0.3266528844833374, "learning_rate": 5.94e-05, "loss": 0.7193, "step": 297 }, { "epoch": 0.44862627022958224, "grad_norm": 0.2485976219177246, "learning_rate": 5.96e-05, "loss": 0.7449, "step": 298 }, { "epoch": 0.45013172751223185, "grad_norm": 0.365421324968338, "learning_rate": 5.9800000000000003e-05, "loss": 0.7265, "step": 299 }, { "epoch": 0.45163718479488146, "grad_norm": 0.43881702423095703, "learning_rate": 6e-05, "loss": 0.7302, "step": 300 }, { "epoch": 0.4531426420775311, "grad_norm": 0.37681517004966736, "learning_rate": 6.02e-05, "loss": 0.7663, "step": 301 }, { "epoch": 0.45464809936018064, "grad_norm": 0.20925012230873108, "learning_rate": 6.04e-05, "loss": 0.7744, "step": 302 }, { "epoch": 0.45615355664283025, "grad_norm": 0.2645818293094635, "learning_rate": 6.06e-05, "loss": 0.7627, "step": 303 }, { "epoch": 0.45765901392547986, "grad_norm": 0.2561851739883423, "learning_rate": 6.08e-05, "loss": 0.755, "step": 304 }, { "epoch": 0.4591644712081295, "grad_norm": 0.2750508487224579, "learning_rate": 6.1e-05, "loss": 0.7711, "step": 305 }, { "epoch": 0.4606699284907791, "grad_norm": 0.20848870277404785, "learning_rate": 6.12e-05, "loss": 0.7547, "step": 306 }, { "epoch": 0.4621753857734287, "grad_norm": 0.22842425107955933, "learning_rate": 6.14e-05, "loss": 0.7113, "step": 307 }, { "epoch": 0.46368084305607826, "grad_norm": 0.2514175772666931, "learning_rate": 6.16e-05, "loss": 0.765, "step": 308 }, { "epoch": 0.4651863003387279, "grad_norm": 0.22453932464122772, "learning_rate": 6.18e-05, "loss": 0.7444, "step": 309 }, { "epoch": 0.4666917576213775, "grad_norm": 0.21454894542694092, "learning_rate": 6.2e-05, "loss": 0.8084, "step": 310 }, { "epoch": 0.4681972149040271, "grad_norm": 0.30135148763656616, "learning_rate": 6.220000000000001e-05, "loss": 0.7517, "step": 311 }, { "epoch": 0.4697026721866767, "grad_norm": 0.35120803117752075, "learning_rate": 6.24e-05, "loss": 0.7968, "step": 312 }, { "epoch": 0.47120812946932633, "grad_norm": 0.385955810546875, "learning_rate": 6.26e-05, "loss": 0.8169, "step": 313 }, { "epoch": 0.4727135867519759, "grad_norm": 0.3152190148830414, "learning_rate": 6.280000000000001e-05, "loss": 0.7943, "step": 314 }, { "epoch": 0.4742190440346255, "grad_norm": 0.29962560534477234, "learning_rate": 6.3e-05, "loss": 0.7708, "step": 315 }, { "epoch": 0.4757245013172751, "grad_norm": 0.21756671369075775, "learning_rate": 6.32e-05, "loss": 0.6893, "step": 316 }, { "epoch": 0.47722995859992473, "grad_norm": 0.2050560563802719, "learning_rate": 6.340000000000001e-05, "loss": 0.7119, "step": 317 }, { "epoch": 0.47873541588257434, "grad_norm": 0.24789363145828247, "learning_rate": 6.36e-05, "loss": 0.6869, "step": 318 }, { "epoch": 0.48024087316522396, "grad_norm": 0.2183133214712143, "learning_rate": 6.38e-05, "loss": 0.7721, "step": 319 }, { "epoch": 0.4817463304478735, "grad_norm": 0.29270339012145996, "learning_rate": 6.400000000000001e-05, "loss": 0.8053, "step": 320 }, { "epoch": 0.48325178773052313, "grad_norm": 0.4857398569583893, "learning_rate": 6.42e-05, "loss": 0.736, "step": 321 }, { "epoch": 0.48475724501317274, "grad_norm": 0.48766207695007324, "learning_rate": 6.440000000000001e-05, "loss": 0.7135, "step": 322 }, { "epoch": 0.48626270229582236, "grad_norm": 0.33891916275024414, "learning_rate": 6.460000000000001e-05, "loss": 0.7137, "step": 323 }, { "epoch": 0.48776815957847197, "grad_norm": 1.0309803485870361, "learning_rate": 6.48e-05, "loss": 0.7534, "step": 324 }, { "epoch": 0.4892736168611216, "grad_norm": 0.429781436920166, "learning_rate": 6.500000000000001e-05, "loss": 0.7675, "step": 325 }, { "epoch": 0.49077907414377114, "grad_norm": 0.3307371139526367, "learning_rate": 6.52e-05, "loss": 0.7779, "step": 326 }, { "epoch": 0.49228453142642076, "grad_norm": 0.35158663988113403, "learning_rate": 6.54e-05, "loss": 0.7186, "step": 327 }, { "epoch": 0.49378998870907037, "grad_norm": 0.32371506094932556, "learning_rate": 6.560000000000001e-05, "loss": 0.6972, "step": 328 }, { "epoch": 0.49529544599172, "grad_norm": 0.2972734868526459, "learning_rate": 6.58e-05, "loss": 0.7563, "step": 329 }, { "epoch": 0.4968009032743696, "grad_norm": 0.3078702390193939, "learning_rate": 6.6e-05, "loss": 0.8017, "step": 330 }, { "epoch": 0.4983063605570192, "grad_norm": 0.3091720640659332, "learning_rate": 6.620000000000001e-05, "loss": 0.7364, "step": 331 }, { "epoch": 0.4998118178396688, "grad_norm": 0.2884233295917511, "learning_rate": 6.64e-05, "loss": 0.7179, "step": 332 }, { "epoch": 0.5013172751223184, "grad_norm": 0.2761831283569336, "learning_rate": 6.66e-05, "loss": 0.6589, "step": 333 }, { "epoch": 0.502822732404968, "grad_norm": 0.2868676483631134, "learning_rate": 6.680000000000001e-05, "loss": 0.7554, "step": 334 }, { "epoch": 0.5043281896876176, "grad_norm": 0.32935410737991333, "learning_rate": 6.7e-05, "loss": 0.658, "step": 335 }, { "epoch": 0.5058336469702672, "grad_norm": 0.30504581332206726, "learning_rate": 6.720000000000001e-05, "loss": 0.7177, "step": 336 }, { "epoch": 0.5073391042529168, "grad_norm": 0.27981528639793396, "learning_rate": 6.740000000000001e-05, "loss": 0.743, "step": 337 }, { "epoch": 0.5088445615355665, "grad_norm": 0.2802298069000244, "learning_rate": 6.76e-05, "loss": 0.6236, "step": 338 }, { "epoch": 0.5103500188182161, "grad_norm": 0.3625543713569641, "learning_rate": 6.780000000000001e-05, "loss": 0.7644, "step": 339 }, { "epoch": 0.5118554761008657, "grad_norm": 0.7372437119483948, "learning_rate": 6.800000000000001e-05, "loss": 0.8017, "step": 340 }, { "epoch": 0.5133609333835153, "grad_norm": 0.687118411064148, "learning_rate": 6.82e-05, "loss": 0.7368, "step": 341 }, { "epoch": 0.5148663906661648, "grad_norm": 0.4725663661956787, "learning_rate": 6.840000000000001e-05, "loss": 0.7615, "step": 342 }, { "epoch": 0.5163718479488144, "grad_norm": 0.5564609169960022, "learning_rate": 6.860000000000001e-05, "loss": 0.7505, "step": 343 }, { "epoch": 0.517877305231464, "grad_norm": 0.4471881091594696, "learning_rate": 6.879999999999999e-05, "loss": 0.6643, "step": 344 }, { "epoch": 0.5193827625141136, "grad_norm": 0.44243407249450684, "learning_rate": 6.9e-05, "loss": 0.7559, "step": 345 }, { "epoch": 0.5208882197967633, "grad_norm": 0.5498892664909363, "learning_rate": 6.92e-05, "loss": 0.7712, "step": 346 }, { "epoch": 0.5223936770794129, "grad_norm": 0.4084393084049225, "learning_rate": 6.939999999999999e-05, "loss": 0.7263, "step": 347 }, { "epoch": 0.5238991343620625, "grad_norm": 0.3166748881340027, "learning_rate": 6.96e-05, "loss": 0.7756, "step": 348 }, { "epoch": 0.5254045916447121, "grad_norm": 0.34484153985977173, "learning_rate": 6.98e-05, "loss": 0.7373, "step": 349 }, { "epoch": 0.5269100489273617, "grad_norm": 0.3640713691711426, "learning_rate": 7e-05, "loss": 0.7274, "step": 350 }, { "epoch": 0.5284155062100113, "grad_norm": 0.2946350574493408, "learning_rate": 7.02e-05, "loss": 0.7144, "step": 351 }, { "epoch": 0.5299209634926609, "grad_norm": 0.2939924895763397, "learning_rate": 7.04e-05, "loss": 0.7331, "step": 352 }, { "epoch": 0.5314264207753105, "grad_norm": 0.28686198592185974, "learning_rate": 7.06e-05, "loss": 0.716, "step": 353 }, { "epoch": 0.53293187805796, "grad_norm": 0.4396781921386719, "learning_rate": 7.08e-05, "loss": 0.7416, "step": 354 }, { "epoch": 0.5344373353406097, "grad_norm": 0.36540764570236206, "learning_rate": 7.1e-05, "loss": 0.7408, "step": 355 }, { "epoch": 0.5359427926232593, "grad_norm": 0.40335190296173096, "learning_rate": 7.12e-05, "loss": 0.7948, "step": 356 }, { "epoch": 0.5374482499059089, "grad_norm": 0.440483957529068, "learning_rate": 7.14e-05, "loss": 0.6964, "step": 357 }, { "epoch": 0.5389537071885585, "grad_norm": 0.4854414165019989, "learning_rate": 7.16e-05, "loss": 0.7186, "step": 358 }, { "epoch": 0.5404591644712081, "grad_norm": 0.46936362981796265, "learning_rate": 7.18e-05, "loss": 0.7278, "step": 359 }, { "epoch": 0.5419646217538577, "grad_norm": 0.31703782081604004, "learning_rate": 7.2e-05, "loss": 0.7084, "step": 360 }, { "epoch": 0.5434700790365073, "grad_norm": 0.3700137436389923, "learning_rate": 7.22e-05, "loss": 0.7368, "step": 361 }, { "epoch": 0.544975536319157, "grad_norm": 0.47710710763931274, "learning_rate": 7.24e-05, "loss": 0.7607, "step": 362 }, { "epoch": 0.5464809936018066, "grad_norm": 0.42255106568336487, "learning_rate": 7.26e-05, "loss": 0.7253, "step": 363 }, { "epoch": 0.5479864508844562, "grad_norm": 0.33289387822151184, "learning_rate": 7.280000000000001e-05, "loss": 0.6879, "step": 364 }, { "epoch": 0.5494919081671058, "grad_norm": 0.3040705621242523, "learning_rate": 7.3e-05, "loss": 0.7558, "step": 365 }, { "epoch": 0.5509973654497554, "grad_norm": 0.309908926486969, "learning_rate": 7.32e-05, "loss": 0.6809, "step": 366 }, { "epoch": 0.5525028227324049, "grad_norm": 0.3529672920703888, "learning_rate": 7.340000000000001e-05, "loss": 0.712, "step": 367 }, { "epoch": 0.5540082800150545, "grad_norm": 0.3098001182079315, "learning_rate": 7.36e-05, "loss": 0.7711, "step": 368 }, { "epoch": 0.5555137372977041, "grad_norm": 0.37864065170288086, "learning_rate": 7.38e-05, "loss": 0.6859, "step": 369 }, { "epoch": 0.5570191945803538, "grad_norm": 0.33707544207572937, "learning_rate": 7.4e-05, "loss": 0.7429, "step": 370 }, { "epoch": 0.5585246518630034, "grad_norm": 0.3064389228820801, "learning_rate": 7.42e-05, "loss": 0.7702, "step": 371 }, { "epoch": 0.560030109145653, "grad_norm": 0.36716747283935547, "learning_rate": 7.44e-05, "loss": 0.7183, "step": 372 }, { "epoch": 0.5615355664283026, "grad_norm": 0.2685989439487457, "learning_rate": 7.46e-05, "loss": 0.6552, "step": 373 }, { "epoch": 0.5630410237109522, "grad_norm": 0.2722727656364441, "learning_rate": 7.48e-05, "loss": 0.6681, "step": 374 }, { "epoch": 0.5645464809936018, "grad_norm": 0.2766365110874176, "learning_rate": 7.500000000000001e-05, "loss": 0.6979, "step": 375 }, { "epoch": 0.5660519382762514, "grad_norm": 0.253886878490448, "learning_rate": 7.52e-05, "loss": 0.7053, "step": 376 }, { "epoch": 0.5675573955589011, "grad_norm": 0.28142106533050537, "learning_rate": 7.54e-05, "loss": 0.7236, "step": 377 }, { "epoch": 0.5690628528415507, "grad_norm": 0.2610190212726593, "learning_rate": 7.560000000000001e-05, "loss": 0.6933, "step": 378 }, { "epoch": 0.5705683101242002, "grad_norm": 0.22964432835578918, "learning_rate": 7.58e-05, "loss": 0.6986, "step": 379 }, { "epoch": 0.5720737674068498, "grad_norm": 0.19585250318050385, "learning_rate": 7.6e-05, "loss": 0.6451, "step": 380 }, { "epoch": 0.5735792246894994, "grad_norm": 0.41539835929870605, "learning_rate": 7.620000000000001e-05, "loss": 0.7345, "step": 381 }, { "epoch": 0.575084681972149, "grad_norm": 0.26485443115234375, "learning_rate": 7.64e-05, "loss": 0.7513, "step": 382 }, { "epoch": 0.5765901392547986, "grad_norm": 0.24852228164672852, "learning_rate": 7.66e-05, "loss": 0.6987, "step": 383 }, { "epoch": 0.5780955965374482, "grad_norm": 0.28459101915359497, "learning_rate": 7.680000000000001e-05, "loss": 0.6337, "step": 384 }, { "epoch": 0.5796010538200979, "grad_norm": 0.45533862709999084, "learning_rate": 7.7e-05, "loss": 0.7427, "step": 385 }, { "epoch": 0.5811065111027475, "grad_norm": 0.5105135440826416, "learning_rate": 7.72e-05, "loss": 0.7164, "step": 386 }, { "epoch": 0.5826119683853971, "grad_norm": 0.3584785759449005, "learning_rate": 7.740000000000001e-05, "loss": 0.673, "step": 387 }, { "epoch": 0.5841174256680467, "grad_norm": 0.507972776889801, "learning_rate": 7.76e-05, "loss": 0.67, "step": 388 }, { "epoch": 0.5856228829506963, "grad_norm": 0.5392345190048218, "learning_rate": 7.780000000000001e-05, "loss": 0.722, "step": 389 }, { "epoch": 0.5871283402333459, "grad_norm": 0.542124330997467, "learning_rate": 7.800000000000001e-05, "loss": 0.7287, "step": 390 }, { "epoch": 0.5886337975159955, "grad_norm": 0.6711815595626831, "learning_rate": 7.82e-05, "loss": 0.685, "step": 391 }, { "epoch": 0.590139254798645, "grad_norm": 0.5401089787483215, "learning_rate": 7.840000000000001e-05, "loss": 0.7937, "step": 392 }, { "epoch": 0.5916447120812947, "grad_norm": 0.43701601028442383, "learning_rate": 7.860000000000001e-05, "loss": 0.6782, "step": 393 }, { "epoch": 0.5931501693639443, "grad_norm": 0.31606802344322205, "learning_rate": 7.88e-05, "loss": 0.7502, "step": 394 }, { "epoch": 0.5946556266465939, "grad_norm": 0.3622533977031708, "learning_rate": 7.900000000000001e-05, "loss": 0.676, "step": 395 }, { "epoch": 0.5961610839292435, "grad_norm": 0.5144320130348206, "learning_rate": 7.920000000000001e-05, "loss": 0.7254, "step": 396 }, { "epoch": 0.5976665412118931, "grad_norm": 0.6986407041549683, "learning_rate": 7.94e-05, "loss": 0.7307, "step": 397 }, { "epoch": 0.5991719984945427, "grad_norm": 0.5935542583465576, "learning_rate": 7.960000000000001e-05, "loss": 0.7263, "step": 398 }, { "epoch": 0.6006774557771923, "grad_norm": 0.28223922848701477, "learning_rate": 7.98e-05, "loss": 0.6487, "step": 399 }, { "epoch": 0.602182913059842, "grad_norm": 0.373828649520874, "learning_rate": 8e-05, "loss": 0.6918, "step": 400 }, { "epoch": 0.602182913059842, "eval_loss": 0.6712279915809631, "eval_runtime": 548.8157, "eval_samples_per_second": 17.541, "eval_steps_per_second": 0.548, "step": 400 }, { "epoch": 0.6036883703424916, "grad_norm": 0.5308519005775452, "learning_rate": 8.020000000000001e-05, "loss": 0.6298, "step": 401 }, { "epoch": 0.6051938276251412, "grad_norm": 0.2921159863471985, "learning_rate": 8.04e-05, "loss": 0.644, "step": 402 }, { "epoch": 0.6066992849077908, "grad_norm": 0.31240278482437134, "learning_rate": 8.060000000000001e-05, "loss": 0.5933, "step": 403 }, { "epoch": 0.6082047421904403, "grad_norm": 0.31068915128707886, "learning_rate": 8.080000000000001e-05, "loss": 0.7272, "step": 404 }, { "epoch": 0.6097101994730899, "grad_norm": 0.3131721317768097, "learning_rate": 8.1e-05, "loss": 0.7016, "step": 405 }, { "epoch": 0.6112156567557395, "grad_norm": 0.32005438208580017, "learning_rate": 8.120000000000001e-05, "loss": 0.6641, "step": 406 }, { "epoch": 0.6127211140383891, "grad_norm": 0.3316677212715149, "learning_rate": 8.14e-05, "loss": 0.739, "step": 407 }, { "epoch": 0.6142265713210388, "grad_norm": 0.36413905024528503, "learning_rate": 8.16e-05, "loss": 0.7292, "step": 408 }, { "epoch": 0.6157320286036884, "grad_norm": 0.45895034074783325, "learning_rate": 8.18e-05, "loss": 0.7103, "step": 409 }, { "epoch": 0.617237485886338, "grad_norm": 0.409968763589859, "learning_rate": 8.2e-05, "loss": 0.714, "step": 410 }, { "epoch": 0.6187429431689876, "grad_norm": 0.2708582580089569, "learning_rate": 8.22e-05, "loss": 0.7329, "step": 411 }, { "epoch": 0.6202484004516372, "grad_norm": 0.3839437961578369, "learning_rate": 8.24e-05, "loss": 0.6957, "step": 412 }, { "epoch": 0.6217538577342868, "grad_norm": 0.40152233839035034, "learning_rate": 8.26e-05, "loss": 0.6625, "step": 413 }, { "epoch": 0.6232593150169364, "grad_norm": 0.38379624485969543, "learning_rate": 8.28e-05, "loss": 0.7404, "step": 414 }, { "epoch": 0.624764772299586, "grad_norm": 0.3534543216228485, "learning_rate": 8.3e-05, "loss": 0.7043, "step": 415 }, { "epoch": 0.6262702295822355, "grad_norm": 0.3164896070957184, "learning_rate": 8.32e-05, "loss": 0.7211, "step": 416 }, { "epoch": 0.6277756868648852, "grad_norm": 0.3204507529735565, "learning_rate": 8.34e-05, "loss": 0.6243, "step": 417 }, { "epoch": 0.6292811441475348, "grad_norm": 0.3112359344959259, "learning_rate": 8.36e-05, "loss": 0.6813, "step": 418 }, { "epoch": 0.6307866014301844, "grad_norm": 0.2624550759792328, "learning_rate": 8.38e-05, "loss": 0.7211, "step": 419 }, { "epoch": 0.632292058712834, "grad_norm": 0.2881220281124115, "learning_rate": 8.4e-05, "loss": 0.5781, "step": 420 }, { "epoch": 0.6337975159954836, "grad_norm": 0.38161832094192505, "learning_rate": 8.42e-05, "loss": 0.7029, "step": 421 }, { "epoch": 0.6353029732781332, "grad_norm": 0.33642327785491943, "learning_rate": 8.44e-05, "loss": 0.75, "step": 422 }, { "epoch": 0.6368084305607828, "grad_norm": 0.30393174290657043, "learning_rate": 8.46e-05, "loss": 0.6334, "step": 423 }, { "epoch": 0.6383138878434325, "grad_norm": 0.2804274260997772, "learning_rate": 8.48e-05, "loss": 0.664, "step": 424 }, { "epoch": 0.6398193451260821, "grad_norm": 0.2536040246486664, "learning_rate": 8.5e-05, "loss": 0.6187, "step": 425 }, { "epoch": 0.6413248024087317, "grad_norm": 0.3047247529029846, "learning_rate": 8.52e-05, "loss": 0.6935, "step": 426 }, { "epoch": 0.6428302596913813, "grad_norm": 0.35099199414253235, "learning_rate": 8.54e-05, "loss": 0.7543, "step": 427 }, { "epoch": 0.6443357169740309, "grad_norm": 0.4929967224597931, "learning_rate": 8.560000000000001e-05, "loss": 0.6414, "step": 428 }, { "epoch": 0.6458411742566804, "grad_norm": 0.6982940435409546, "learning_rate": 8.58e-05, "loss": 0.7006, "step": 429 }, { "epoch": 0.64734663153933, "grad_norm": 0.6605833768844604, "learning_rate": 8.6e-05, "loss": 0.7029, "step": 430 }, { "epoch": 0.6488520888219796, "grad_norm": 0.39012813568115234, "learning_rate": 8.620000000000001e-05, "loss": 0.7202, "step": 431 }, { "epoch": 0.6503575461046293, "grad_norm": 0.4172554016113281, "learning_rate": 8.64e-05, "loss": 0.7295, "step": 432 }, { "epoch": 0.6518630033872789, "grad_norm": 0.39375823736190796, "learning_rate": 8.66e-05, "loss": 0.639, "step": 433 }, { "epoch": 0.6533684606699285, "grad_norm": 0.3437797427177429, "learning_rate": 8.680000000000001e-05, "loss": 0.705, "step": 434 }, { "epoch": 0.6548739179525781, "grad_norm": 0.287806898355484, "learning_rate": 8.7e-05, "loss": 0.6947, "step": 435 }, { "epoch": 0.6563793752352277, "grad_norm": 0.3323732018470764, "learning_rate": 8.72e-05, "loss": 0.6886, "step": 436 }, { "epoch": 0.6578848325178773, "grad_norm": 0.29263949394226074, "learning_rate": 8.740000000000001e-05, "loss": 0.7098, "step": 437 }, { "epoch": 0.6593902898005269, "grad_norm": 0.2812378406524658, "learning_rate": 8.76e-05, "loss": 0.6122, "step": 438 }, { "epoch": 0.6608957470831766, "grad_norm": 0.33268821239471436, "learning_rate": 8.78e-05, "loss": 0.6445, "step": 439 }, { "epoch": 0.6624012043658262, "grad_norm": 0.37078025937080383, "learning_rate": 8.800000000000001e-05, "loss": 0.67, "step": 440 }, { "epoch": 0.6639066616484757, "grad_norm": 0.5317936539649963, "learning_rate": 8.82e-05, "loss": 0.674, "step": 441 }, { "epoch": 0.6654121189311253, "grad_norm": 0.5790961384773254, "learning_rate": 8.840000000000001e-05, "loss": 0.6383, "step": 442 }, { "epoch": 0.6669175762137749, "grad_norm": 0.41792622208595276, "learning_rate": 8.86e-05, "loss": 0.7259, "step": 443 }, { "epoch": 0.6684230334964245, "grad_norm": 0.36798086762428284, "learning_rate": 8.88e-05, "loss": 0.6232, "step": 444 }, { "epoch": 0.6699284907790741, "grad_norm": 0.2959485352039337, "learning_rate": 8.900000000000001e-05, "loss": 0.6335, "step": 445 }, { "epoch": 0.6714339480617237, "grad_norm": 0.3528854250907898, "learning_rate": 8.92e-05, "loss": 0.6407, "step": 446 }, { "epoch": 0.6729394053443734, "grad_norm": 0.33971965312957764, "learning_rate": 8.94e-05, "loss": 0.6935, "step": 447 }, { "epoch": 0.674444862627023, "grad_norm": 0.32364147901535034, "learning_rate": 8.960000000000001e-05, "loss": 0.6397, "step": 448 }, { "epoch": 0.6759503199096726, "grad_norm": 0.28870630264282227, "learning_rate": 8.98e-05, "loss": 0.7041, "step": 449 }, { "epoch": 0.6774557771923222, "grad_norm": 0.25540757179260254, "learning_rate": 9e-05, "loss": 0.6576, "step": 450 }, { "epoch": 0.6789612344749718, "grad_norm": 0.21494922041893005, "learning_rate": 9.020000000000001e-05, "loss": 0.6925, "step": 451 }, { "epoch": 0.6804666917576214, "grad_norm": 0.2404584139585495, "learning_rate": 9.04e-05, "loss": 0.6829, "step": 452 }, { "epoch": 0.681972149040271, "grad_norm": 0.2070576548576355, "learning_rate": 9.06e-05, "loss": 0.6542, "step": 453 }, { "epoch": 0.6834776063229205, "grad_norm": 0.2087641954421997, "learning_rate": 9.080000000000001e-05, "loss": 0.6521, "step": 454 }, { "epoch": 0.6849830636055702, "grad_norm": 0.2183593064546585, "learning_rate": 9.1e-05, "loss": 0.6291, "step": 455 }, { "epoch": 0.6864885208882198, "grad_norm": 0.21753111481666565, "learning_rate": 9.120000000000001e-05, "loss": 0.6512, "step": 456 }, { "epoch": 0.6879939781708694, "grad_norm": 0.27960076928138733, "learning_rate": 9.140000000000001e-05, "loss": 0.685, "step": 457 }, { "epoch": 0.689499435453519, "grad_norm": 0.3172072172164917, "learning_rate": 9.16e-05, "loss": 0.6774, "step": 458 }, { "epoch": 0.6910048927361686, "grad_norm": 0.2956441044807434, "learning_rate": 9.180000000000001e-05, "loss": 0.6936, "step": 459 }, { "epoch": 0.6925103500188182, "grad_norm": 0.3919297158718109, "learning_rate": 9.200000000000001e-05, "loss": 0.6076, "step": 460 }, { "epoch": 0.6940158073014678, "grad_norm": 0.5185704827308655, "learning_rate": 9.22e-05, "loss": 0.7226, "step": 461 }, { "epoch": 0.6955212645841174, "grad_norm": 0.48447656631469727, "learning_rate": 9.240000000000001e-05, "loss": 0.7131, "step": 462 }, { "epoch": 0.6970267218667671, "grad_norm": 0.3868970274925232, "learning_rate": 9.260000000000001e-05, "loss": 0.656, "step": 463 }, { "epoch": 0.6985321791494167, "grad_norm": 0.3064926862716675, "learning_rate": 9.28e-05, "loss": 0.577, "step": 464 }, { "epoch": 0.7000376364320663, "grad_norm": 0.35238251090049744, "learning_rate": 9.300000000000001e-05, "loss": 0.6172, "step": 465 }, { "epoch": 0.7015430937147158, "grad_norm": 0.358140766620636, "learning_rate": 9.320000000000002e-05, "loss": 0.6912, "step": 466 }, { "epoch": 0.7030485509973654, "grad_norm": 0.3821800947189331, "learning_rate": 9.340000000000001e-05, "loss": 0.732, "step": 467 }, { "epoch": 0.704554008280015, "grad_norm": 0.3690665364265442, "learning_rate": 9.360000000000001e-05, "loss": 0.6771, "step": 468 }, { "epoch": 0.7060594655626646, "grad_norm": 0.3207572400569916, "learning_rate": 9.38e-05, "loss": 0.6865, "step": 469 }, { "epoch": 0.7075649228453142, "grad_norm": 0.40814974904060364, "learning_rate": 9.4e-05, "loss": 0.6497, "step": 470 }, { "epoch": 0.7090703801279639, "grad_norm": 0.3813323378562927, "learning_rate": 9.42e-05, "loss": 0.7112, "step": 471 }, { "epoch": 0.7105758374106135, "grad_norm": 0.3345155715942383, "learning_rate": 9.44e-05, "loss": 0.6997, "step": 472 }, { "epoch": 0.7120812946932631, "grad_norm": 0.34112271666526794, "learning_rate": 9.46e-05, "loss": 0.6437, "step": 473 }, { "epoch": 0.7135867519759127, "grad_norm": 0.4004102051258087, "learning_rate": 9.48e-05, "loss": 0.63, "step": 474 }, { "epoch": 0.7150922092585623, "grad_norm": 0.2775402069091797, "learning_rate": 9.5e-05, "loss": 0.6726, "step": 475 }, { "epoch": 0.7165976665412119, "grad_norm": 0.316678524017334, "learning_rate": 9.52e-05, "loss": 0.6748, "step": 476 }, { "epoch": 0.7181031238238615, "grad_norm": 0.34156563878059387, "learning_rate": 9.54e-05, "loss": 0.5792, "step": 477 }, { "epoch": 0.7196085811065112, "grad_norm": 0.38976773619651794, "learning_rate": 9.56e-05, "loss": 0.6171, "step": 478 }, { "epoch": 0.7211140383891607, "grad_norm": 0.6082056760787964, "learning_rate": 9.58e-05, "loss": 0.6027, "step": 479 }, { "epoch": 0.7226194956718103, "grad_norm": 0.6903731822967529, "learning_rate": 9.6e-05, "loss": 0.6588, "step": 480 }, { "epoch": 0.7241249529544599, "grad_norm": 0.4408109784126282, "learning_rate": 9.620000000000001e-05, "loss": 0.5886, "step": 481 }, { "epoch": 0.7256304102371095, "grad_norm": 0.5093309283256531, "learning_rate": 9.64e-05, "loss": 0.6865, "step": 482 }, { "epoch": 0.7271358675197591, "grad_norm": 0.5310457944869995, "learning_rate": 9.66e-05, "loss": 0.6102, "step": 483 }, { "epoch": 0.7286413248024087, "grad_norm": 0.3234555423259735, "learning_rate": 9.680000000000001e-05, "loss": 0.7005, "step": 484 }, { "epoch": 0.7301467820850583, "grad_norm": 0.4140044152736664, "learning_rate": 9.7e-05, "loss": 0.677, "step": 485 }, { "epoch": 0.731652239367708, "grad_norm": 0.4175383746623993, "learning_rate": 9.72e-05, "loss": 0.6169, "step": 486 }, { "epoch": 0.7331576966503576, "grad_norm": 0.3190940022468567, "learning_rate": 9.74e-05, "loss": 0.6046, "step": 487 }, { "epoch": 0.7346631539330072, "grad_norm": 0.32278335094451904, "learning_rate": 9.76e-05, "loss": 0.6751, "step": 488 }, { "epoch": 0.7361686112156568, "grad_norm": 0.28864163160324097, "learning_rate": 9.78e-05, "loss": 0.6962, "step": 489 }, { "epoch": 0.7376740684983064, "grad_norm": 0.2708832621574402, "learning_rate": 9.8e-05, "loss": 0.6741, "step": 490 }, { "epoch": 0.7391795257809559, "grad_norm": 0.34861132502555847, "learning_rate": 9.82e-05, "loss": 0.6719, "step": 491 }, { "epoch": 0.7406849830636055, "grad_norm": 0.29654332995414734, "learning_rate": 9.84e-05, "loss": 0.6365, "step": 492 }, { "epoch": 0.7421904403462551, "grad_norm": 0.27361512184143066, "learning_rate": 9.86e-05, "loss": 0.6557, "step": 493 }, { "epoch": 0.7436958976289048, "grad_norm": 0.276079922914505, "learning_rate": 9.88e-05, "loss": 0.6386, "step": 494 }, { "epoch": 0.7452013549115544, "grad_norm": 0.294210284948349, "learning_rate": 9.900000000000001e-05, "loss": 0.6828, "step": 495 }, { "epoch": 0.746706812194204, "grad_norm": 0.33543601632118225, "learning_rate": 9.92e-05, "loss": 0.6796, "step": 496 }, { "epoch": 0.7482122694768536, "grad_norm": 0.35436686873435974, "learning_rate": 9.94e-05, "loss": 0.6615, "step": 497 }, { "epoch": 0.7497177267595032, "grad_norm": 0.4633318781852722, "learning_rate": 9.960000000000001e-05, "loss": 0.6459, "step": 498 }, { "epoch": 0.7512231840421528, "grad_norm": 0.5765914916992188, "learning_rate": 9.98e-05, "loss": 0.614, "step": 499 }, { "epoch": 0.7527286413248024, "grad_norm": 0.5637470483779907, "learning_rate": 0.0001, "loss": 0.6213, "step": 500 }, { "epoch": 0.754234098607452, "grad_norm": 0.3848998248577118, "learning_rate": 9.999999753943383e-05, "loss": 0.6341, "step": 501 }, { "epoch": 0.7557395558901017, "grad_norm": 0.3616611361503601, "learning_rate": 9.999999015773558e-05, "loss": 0.6978, "step": 502 }, { "epoch": 0.7572450131727512, "grad_norm": 0.34052878618240356, "learning_rate": 9.999997785490607e-05, "loss": 0.6348, "step": 503 }, { "epoch": 0.7587504704554008, "grad_norm": 0.28904789686203003, "learning_rate": 9.999996063094663e-05, "loss": 0.6249, "step": 504 }, { "epoch": 0.7602559277380504, "grad_norm": 0.2707881033420563, "learning_rate": 9.999993848585916e-05, "loss": 0.6449, "step": 505 }, { "epoch": 0.7617613850207, "grad_norm": 0.2850693464279175, "learning_rate": 9.999991141964607e-05, "loss": 0.6514, "step": 506 }, { "epoch": 0.7632668423033496, "grad_norm": 0.3302256762981415, "learning_rate": 9.999987943231033e-05, "loss": 0.6341, "step": 507 }, { "epoch": 0.7647722995859992, "grad_norm": 0.37732550501823425, "learning_rate": 9.999984252385543e-05, "loss": 0.654, "step": 508 }, { "epoch": 0.7662777568686489, "grad_norm": 0.5058630108833313, "learning_rate": 9.99998006942854e-05, "loss": 0.6444, "step": 509 }, { "epoch": 0.7677832141512985, "grad_norm": 0.5311433672904968, "learning_rate": 9.999975394360483e-05, "loss": 0.5374, "step": 510 }, { "epoch": 0.7692886714339481, "grad_norm": 0.356442928314209, "learning_rate": 9.999970227181881e-05, "loss": 0.6616, "step": 511 }, { "epoch": 0.7707941287165977, "grad_norm": 0.5017839670181274, "learning_rate": 9.999964567893302e-05, "loss": 0.6675, "step": 512 }, { "epoch": 0.7722995859992473, "grad_norm": 0.4589559733867645, "learning_rate": 9.999958416495364e-05, "loss": 0.6955, "step": 513 }, { "epoch": 0.7738050432818969, "grad_norm": 0.40493592619895935, "learning_rate": 9.999951772988738e-05, "loss": 0.6974, "step": 514 }, { "epoch": 0.7753105005645465, "grad_norm": 0.4177801012992859, "learning_rate": 9.99994463737415e-05, "loss": 0.5741, "step": 515 }, { "epoch": 0.776815957847196, "grad_norm": 0.39802566170692444, "learning_rate": 9.999937009652385e-05, "loss": 0.6287, "step": 516 }, { "epoch": 0.7783214151298457, "grad_norm": 0.4049002528190613, "learning_rate": 9.999928889824273e-05, "loss": 0.7332, "step": 517 }, { "epoch": 0.7798268724124953, "grad_norm": 0.3847580850124359, "learning_rate": 9.999920277890703e-05, "loss": 0.6096, "step": 518 }, { "epoch": 0.7813323296951449, "grad_norm": 0.2968679964542389, "learning_rate": 9.999911173852618e-05, "loss": 0.673, "step": 519 }, { "epoch": 0.7828377869777945, "grad_norm": 0.29896458983421326, "learning_rate": 9.999901577711012e-05, "loss": 0.6692, "step": 520 }, { "epoch": 0.7843432442604441, "grad_norm": 0.30007416009902954, "learning_rate": 9.999891489466934e-05, "loss": 0.6492, "step": 521 }, { "epoch": 0.7858487015430937, "grad_norm": 0.28299978375434875, "learning_rate": 9.999880909121488e-05, "loss": 0.6642, "step": 522 }, { "epoch": 0.7873541588257433, "grad_norm": 0.24298696219921112, "learning_rate": 9.999869836675833e-05, "loss": 0.6734, "step": 523 }, { "epoch": 0.788859616108393, "grad_norm": 0.2391703724861145, "learning_rate": 9.999858272131177e-05, "loss": 0.6772, "step": 524 }, { "epoch": 0.7903650733910426, "grad_norm": 0.2779070734977722, "learning_rate": 9.999846215488786e-05, "loss": 0.6233, "step": 525 }, { "epoch": 0.7918705306736922, "grad_norm": 0.32368016242980957, "learning_rate": 9.999833666749979e-05, "loss": 0.6109, "step": 526 }, { "epoch": 0.7933759879563418, "grad_norm": 0.3766157329082489, "learning_rate": 9.999820625916127e-05, "loss": 0.572, "step": 527 }, { "epoch": 0.7948814452389913, "grad_norm": 0.4231093227863312, "learning_rate": 9.999807092988656e-05, "loss": 0.6395, "step": 528 }, { "epoch": 0.7963869025216409, "grad_norm": 0.41506892442703247, "learning_rate": 9.999793067969047e-05, "loss": 0.6609, "step": 529 }, { "epoch": 0.7978923598042905, "grad_norm": 0.3310212194919586, "learning_rate": 9.999778550858834e-05, "loss": 0.6506, "step": 530 }, { "epoch": 0.7993978170869401, "grad_norm": 0.2667292356491089, "learning_rate": 9.999763541659605e-05, "loss": 0.5864, "step": 531 }, { "epoch": 0.8009032743695897, "grad_norm": 0.2827354371547699, "learning_rate": 9.999748040372998e-05, "loss": 0.6359, "step": 532 }, { "epoch": 0.8024087316522394, "grad_norm": 0.3227585256099701, "learning_rate": 9.999732047000711e-05, "loss": 0.6371, "step": 533 }, { "epoch": 0.803914188934889, "grad_norm": 0.3222917914390564, "learning_rate": 9.999715561544494e-05, "loss": 0.648, "step": 534 }, { "epoch": 0.8054196462175386, "grad_norm": 0.2543361186981201, "learning_rate": 9.999698584006149e-05, "loss": 0.6035, "step": 535 }, { "epoch": 0.8069251035001882, "grad_norm": 0.24246038496494293, "learning_rate": 9.999681114387529e-05, "loss": 0.5856, "step": 536 }, { "epoch": 0.8084305607828378, "grad_norm": 0.29857516288757324, "learning_rate": 9.999663152690549e-05, "loss": 0.5868, "step": 537 }, { "epoch": 0.8099360180654874, "grad_norm": 0.29770004749298096, "learning_rate": 9.999644698917173e-05, "loss": 0.56, "step": 538 }, { "epoch": 0.811441475348137, "grad_norm": 0.27247726917266846, "learning_rate": 9.999625753069417e-05, "loss": 0.6345, "step": 539 }, { "epoch": 0.8129469326307867, "grad_norm": 0.29341915249824524, "learning_rate": 9.999606315149354e-05, "loss": 0.6649, "step": 540 }, { "epoch": 0.8144523899134362, "grad_norm": 0.3347874879837036, "learning_rate": 9.999586385159108e-05, "loss": 0.6732, "step": 541 }, { "epoch": 0.8159578471960858, "grad_norm": 0.30653226375579834, "learning_rate": 9.999565963100862e-05, "loss": 0.6369, "step": 542 }, { "epoch": 0.8174633044787354, "grad_norm": 0.2884942293167114, "learning_rate": 9.999545048976846e-05, "loss": 0.6003, "step": 543 }, { "epoch": 0.818968761761385, "grad_norm": 0.2777225077152252, "learning_rate": 9.999523642789348e-05, "loss": 0.6191, "step": 544 }, { "epoch": 0.8204742190440346, "grad_norm": 0.2798054814338684, "learning_rate": 9.999501744540712e-05, "loss": 0.5707, "step": 545 }, { "epoch": 0.8219796763266842, "grad_norm": 0.3316041827201843, "learning_rate": 9.999479354233326e-05, "loss": 0.6756, "step": 546 }, { "epoch": 0.8234851336093338, "grad_norm": 0.35636207461357117, "learning_rate": 9.999456471869645e-05, "loss": 0.6575, "step": 547 }, { "epoch": 0.8249905908919835, "grad_norm": 0.3288339376449585, "learning_rate": 9.99943309745217e-05, "loss": 0.5809, "step": 548 }, { "epoch": 0.8264960481746331, "grad_norm": 0.2819138169288635, "learning_rate": 9.999409230983455e-05, "loss": 0.6929, "step": 549 }, { "epoch": 0.8280015054572827, "grad_norm": 0.28062209486961365, "learning_rate": 9.999384872466111e-05, "loss": 0.6279, "step": 550 }, { "epoch": 0.8295069627399323, "grad_norm": 0.2533743679523468, "learning_rate": 9.999360021902802e-05, "loss": 0.5057, "step": 551 }, { "epoch": 0.8310124200225819, "grad_norm": 0.27681031823158264, "learning_rate": 9.999334679296246e-05, "loss": 0.6242, "step": 552 }, { "epoch": 0.8325178773052314, "grad_norm": 0.2651994228363037, "learning_rate": 9.999308844649214e-05, "loss": 0.5974, "step": 553 }, { "epoch": 0.834023334587881, "grad_norm": 0.2582781910896301, "learning_rate": 9.999282517964532e-05, "loss": 0.6009, "step": 554 }, { "epoch": 0.8355287918705306, "grad_norm": 0.3430691063404083, "learning_rate": 9.999255699245078e-05, "loss": 0.6152, "step": 555 }, { "epoch": 0.8370342491531803, "grad_norm": 0.510353147983551, "learning_rate": 9.999228388493786e-05, "loss": 0.6051, "step": 556 }, { "epoch": 0.8385397064358299, "grad_norm": 0.6556945443153381, "learning_rate": 9.99920058571364e-05, "loss": 0.6685, "step": 557 }, { "epoch": 0.8400451637184795, "grad_norm": 0.507698118686676, "learning_rate": 9.999172290907685e-05, "loss": 0.586, "step": 558 }, { "epoch": 0.8415506210011291, "grad_norm": 0.3213531970977783, "learning_rate": 9.999143504079011e-05, "loss": 0.6058, "step": 559 }, { "epoch": 0.8430560782837787, "grad_norm": 0.37891507148742676, "learning_rate": 9.999114225230768e-05, "loss": 0.6011, "step": 560 }, { "epoch": 0.8445615355664283, "grad_norm": 0.274384081363678, "learning_rate": 9.999084454366159e-05, "loss": 0.5419, "step": 561 }, { "epoch": 0.8460669928490779, "grad_norm": 0.31390610337257385, "learning_rate": 9.999054191488436e-05, "loss": 0.6224, "step": 562 }, { "epoch": 0.8475724501317276, "grad_norm": 0.32865554094314575, "learning_rate": 9.999023436600911e-05, "loss": 0.636, "step": 563 }, { "epoch": 0.8490779074143772, "grad_norm": 0.27636590600013733, "learning_rate": 9.998992189706949e-05, "loss": 0.721, "step": 564 }, { "epoch": 0.8505833646970267, "grad_norm": 0.3084441125392914, "learning_rate": 9.998960450809965e-05, "loss": 0.6662, "step": 565 }, { "epoch": 0.8520888219796763, "grad_norm": 0.29659223556518555, "learning_rate": 9.998928219913428e-05, "loss": 0.6071, "step": 566 }, { "epoch": 0.8535942792623259, "grad_norm": 0.2933308482170105, "learning_rate": 9.998895497020868e-05, "loss": 0.6207, "step": 567 }, { "epoch": 0.8550997365449755, "grad_norm": 0.2428172528743744, "learning_rate": 9.998862282135857e-05, "loss": 0.5517, "step": 568 }, { "epoch": 0.8566051938276251, "grad_norm": 0.23797614872455597, "learning_rate": 9.998828575262034e-05, "loss": 0.631, "step": 569 }, { "epoch": 0.8581106511102747, "grad_norm": 0.25072309374809265, "learning_rate": 9.99879437640308e-05, "loss": 0.5977, "step": 570 }, { "epoch": 0.8596161083929243, "grad_norm": 0.3036382496356964, "learning_rate": 9.998759685562737e-05, "loss": 0.5588, "step": 571 }, { "epoch": 0.861121565675574, "grad_norm": 0.8402568101882935, "learning_rate": 9.9987245027448e-05, "loss": 0.6141, "step": 572 }, { "epoch": 0.8626270229582236, "grad_norm": 0.38741546869277954, "learning_rate": 9.998688827953114e-05, "loss": 0.6211, "step": 573 }, { "epoch": 0.8641324802408732, "grad_norm": 0.40497928857803345, "learning_rate": 9.99865266119158e-05, "loss": 0.5283, "step": 574 }, { "epoch": 0.8656379375235228, "grad_norm": 0.5457287430763245, "learning_rate": 9.998616002464157e-05, "loss": 0.594, "step": 575 }, { "epoch": 0.8671433948061724, "grad_norm": 0.5043533444404602, "learning_rate": 9.99857885177485e-05, "loss": 0.6163, "step": 576 }, { "epoch": 0.868648852088822, "grad_norm": 0.33741357922554016, "learning_rate": 9.998541209127725e-05, "loss": 0.6231, "step": 577 }, { "epoch": 0.8701543093714715, "grad_norm": 0.45172446966171265, "learning_rate": 9.998503074526896e-05, "loss": 0.5847, "step": 578 }, { "epoch": 0.8716597666541211, "grad_norm": 0.48149022459983826, "learning_rate": 9.998464447976533e-05, "loss": 0.6193, "step": 579 }, { "epoch": 0.8731652239367708, "grad_norm": 0.4466894865036011, "learning_rate": 9.998425329480863e-05, "loss": 0.5733, "step": 580 }, { "epoch": 0.8746706812194204, "grad_norm": 0.46116548776626587, "learning_rate": 9.99838571904416e-05, "loss": 0.5661, "step": 581 }, { "epoch": 0.87617613850207, "grad_norm": 0.47398942708969116, "learning_rate": 9.99834561667076e-05, "loss": 0.5577, "step": 582 }, { "epoch": 0.8776815957847196, "grad_norm": 0.4290790557861328, "learning_rate": 9.998305022365047e-05, "loss": 0.5952, "step": 583 }, { "epoch": 0.8791870530673692, "grad_norm": 0.3092474937438965, "learning_rate": 9.998263936131458e-05, "loss": 0.6399, "step": 584 }, { "epoch": 0.8806925103500188, "grad_norm": 0.31108543276786804, "learning_rate": 9.998222357974488e-05, "loss": 0.6018, "step": 585 }, { "epoch": 0.8821979676326684, "grad_norm": 0.397303968667984, "learning_rate": 9.998180287898685e-05, "loss": 0.6101, "step": 586 }, { "epoch": 0.8837034249153181, "grad_norm": 0.38202545046806335, "learning_rate": 9.99813772590865e-05, "loss": 0.6103, "step": 587 }, { "epoch": 0.8852088821979677, "grad_norm": 1.4755860567092896, "learning_rate": 9.998094672009034e-05, "loss": 0.5596, "step": 588 }, { "epoch": 0.8867143394806173, "grad_norm": 0.3889983892440796, "learning_rate": 9.998051126204548e-05, "loss": 0.645, "step": 589 }, { "epoch": 0.8882197967632668, "grad_norm": 0.4962857663631439, "learning_rate": 9.998007088499952e-05, "loss": 0.6589, "step": 590 }, { "epoch": 0.8897252540459164, "grad_norm": 0.31521522998809814, "learning_rate": 9.997962558900065e-05, "loss": 0.5459, "step": 591 }, { "epoch": 0.891230711328566, "grad_norm": 0.36530882120132446, "learning_rate": 9.997917537409755e-05, "loss": 0.5571, "step": 592 }, { "epoch": 0.8927361686112156, "grad_norm": 0.32494455575942993, "learning_rate": 9.997872024033948e-05, "loss": 0.6157, "step": 593 }, { "epoch": 0.8942416258938652, "grad_norm": 0.27840861678123474, "learning_rate": 9.997826018777613e-05, "loss": 0.6692, "step": 594 }, { "epoch": 0.8957470831765149, "grad_norm": 0.31395629048347473, "learning_rate": 9.997779521645793e-05, "loss": 0.617, "step": 595 }, { "epoch": 0.8972525404591645, "grad_norm": 0.27839577198028564, "learning_rate": 9.997732532643564e-05, "loss": 0.555, "step": 596 }, { "epoch": 0.8987579977418141, "grad_norm": 0.3039555251598358, "learning_rate": 9.997685051776068e-05, "loss": 0.6011, "step": 597 }, { "epoch": 0.9002634550244637, "grad_norm": 0.2958638072013855, "learning_rate": 9.997637079048497e-05, "loss": 0.5814, "step": 598 }, { "epoch": 0.9017689123071133, "grad_norm": 0.33248159289360046, "learning_rate": 9.997588614466096e-05, "loss": 0.5753, "step": 599 }, { "epoch": 0.9032743695897629, "grad_norm": 0.3922744393348694, "learning_rate": 9.997539658034168e-05, "loss": 0.659, "step": 600 }, { "epoch": 0.9032743695897629, "eval_loss": 0.575820803642273, "eval_runtime": 546.431, "eval_samples_per_second": 17.618, "eval_steps_per_second": 0.551, "step": 600 }, { "epoch": 0.9047798268724125, "grad_norm": 0.3854648470878601, "learning_rate": 9.997490209758062e-05, "loss": 0.6047, "step": 601 }, { "epoch": 0.9062852841550622, "grad_norm": 0.3421621024608612, "learning_rate": 9.997440269643191e-05, "loss": 0.63, "step": 602 }, { "epoch": 0.9077907414377117, "grad_norm": 0.28015318512916565, "learning_rate": 9.997389837695014e-05, "loss": 0.5652, "step": 603 }, { "epoch": 0.9092961987203613, "grad_norm": 0.32436296343803406, "learning_rate": 9.997338913919045e-05, "loss": 0.5691, "step": 604 }, { "epoch": 0.9108016560030109, "grad_norm": 0.36596181988716125, "learning_rate": 9.997287498320855e-05, "loss": 0.6188, "step": 605 }, { "epoch": 0.9123071132856605, "grad_norm": 0.36790966987609863, "learning_rate": 9.997235590906067e-05, "loss": 0.5772, "step": 606 }, { "epoch": 0.9138125705683101, "grad_norm": 0.29626965522766113, "learning_rate": 9.997183191680353e-05, "loss": 0.5719, "step": 607 }, { "epoch": 0.9153180278509597, "grad_norm": 0.2163577377796173, "learning_rate": 9.997130300649448e-05, "loss": 0.6666, "step": 608 }, { "epoch": 0.9168234851336093, "grad_norm": 0.311340868473053, "learning_rate": 9.997076917819135e-05, "loss": 0.5127, "step": 609 }, { "epoch": 0.918328942416259, "grad_norm": 0.2811194360256195, "learning_rate": 9.997023043195251e-05, "loss": 0.5912, "step": 610 }, { "epoch": 0.9198343996989086, "grad_norm": 0.23890355229377747, "learning_rate": 9.99696867678369e-05, "loss": 0.5799, "step": 611 }, { "epoch": 0.9213398569815582, "grad_norm": 0.316986083984375, "learning_rate": 9.996913818590394e-05, "loss": 0.6008, "step": 612 }, { "epoch": 0.9228453142642078, "grad_norm": 0.35798943042755127, "learning_rate": 9.996858468621365e-05, "loss": 0.6217, "step": 613 }, { "epoch": 0.9243507715468574, "grad_norm": 0.2610478401184082, "learning_rate": 9.996802626882653e-05, "loss": 0.593, "step": 614 }, { "epoch": 0.9258562288295069, "grad_norm": 0.3002730906009674, "learning_rate": 9.996746293380366e-05, "loss": 0.6192, "step": 615 }, { "epoch": 0.9273616861121565, "grad_norm": 0.33754396438598633, "learning_rate": 9.996689468120665e-05, "loss": 0.5712, "step": 616 }, { "epoch": 0.9288671433948061, "grad_norm": 0.25317078828811646, "learning_rate": 9.996632151109768e-05, "loss": 0.5197, "step": 617 }, { "epoch": 0.9303726006774558, "grad_norm": 0.2719792425632477, "learning_rate": 9.996574342353936e-05, "loss": 0.5797, "step": 618 }, { "epoch": 0.9318780579601054, "grad_norm": 0.2641884684562683, "learning_rate": 9.996516041859496e-05, "loss": 0.5836, "step": 619 }, { "epoch": 0.933383515242755, "grad_norm": 0.2556197941303253, "learning_rate": 9.996457249632824e-05, "loss": 0.4964, "step": 620 }, { "epoch": 0.9348889725254046, "grad_norm": 0.2968374192714691, "learning_rate": 9.996397965680344e-05, "loss": 0.6192, "step": 621 }, { "epoch": 0.9363944298080542, "grad_norm": 0.39176058769226074, "learning_rate": 9.996338190008544e-05, "loss": 0.6072, "step": 622 }, { "epoch": 0.9378998870907038, "grad_norm": 0.42349621653556824, "learning_rate": 9.99627792262396e-05, "loss": 0.6304, "step": 623 }, { "epoch": 0.9394053443733534, "grad_norm": 0.35202571749687195, "learning_rate": 9.996217163533183e-05, "loss": 0.6077, "step": 624 }, { "epoch": 0.940910801656003, "grad_norm": 0.2999664545059204, "learning_rate": 9.996155912742855e-05, "loss": 0.5074, "step": 625 }, { "epoch": 0.9424162589386527, "grad_norm": 0.2814263105392456, "learning_rate": 9.996094170259677e-05, "loss": 0.6398, "step": 626 }, { "epoch": 0.9439217162213023, "grad_norm": 0.28173789381980896, "learning_rate": 9.996031936090401e-05, "loss": 0.6088, "step": 627 }, { "epoch": 0.9454271735039518, "grad_norm": 0.3348361551761627, "learning_rate": 9.995969210241833e-05, "loss": 0.5599, "step": 628 }, { "epoch": 0.9469326307866014, "grad_norm": 0.31884628534317017, "learning_rate": 9.995905992720829e-05, "loss": 0.5939, "step": 629 }, { "epoch": 0.948438088069251, "grad_norm": 0.34052011370658875, "learning_rate": 9.995842283534307e-05, "loss": 0.6127, "step": 630 }, { "epoch": 0.9499435453519006, "grad_norm": 0.3259519934654236, "learning_rate": 9.995778082689233e-05, "loss": 0.599, "step": 631 }, { "epoch": 0.9514490026345502, "grad_norm": 0.3147607743740082, "learning_rate": 9.995713390192624e-05, "loss": 0.5543, "step": 632 }, { "epoch": 0.9529544599171998, "grad_norm": 0.3206441402435303, "learning_rate": 9.995648206051563e-05, "loss": 0.537, "step": 633 }, { "epoch": 0.9544599171998495, "grad_norm": 0.28357985615730286, "learning_rate": 9.995582530273169e-05, "loss": 0.5671, "step": 634 }, { "epoch": 0.9559653744824991, "grad_norm": 0.2519621253013611, "learning_rate": 9.995516362864629e-05, "loss": 0.6644, "step": 635 }, { "epoch": 0.9574708317651487, "grad_norm": 0.2566375732421875, "learning_rate": 9.99544970383318e-05, "loss": 0.5931, "step": 636 }, { "epoch": 0.9589762890477983, "grad_norm": 0.24366281926631927, "learning_rate": 9.99538255318611e-05, "loss": 0.5754, "step": 637 }, { "epoch": 0.9604817463304479, "grad_norm": 0.2890430986881256, "learning_rate": 9.995314910930762e-05, "loss": 0.5968, "step": 638 }, { "epoch": 0.9619872036130975, "grad_norm": 0.24141143262386322, "learning_rate": 9.995246777074535e-05, "loss": 0.6078, "step": 639 }, { "epoch": 0.963492660895747, "grad_norm": 0.24308958649635315, "learning_rate": 9.995178151624878e-05, "loss": 0.5594, "step": 640 }, { "epoch": 0.9649981181783966, "grad_norm": 0.23390457034111023, "learning_rate": 9.995109034589296e-05, "loss": 0.5976, "step": 641 }, { "epoch": 0.9665035754610463, "grad_norm": 0.3044535517692566, "learning_rate": 9.995039425975348e-05, "loss": 0.5967, "step": 642 }, { "epoch": 0.9680090327436959, "grad_norm": 0.33916589617729187, "learning_rate": 9.99496932579065e-05, "loss": 0.579, "step": 643 }, { "epoch": 0.9695144900263455, "grad_norm": 0.3589169979095459, "learning_rate": 9.994898734042863e-05, "loss": 0.5741, "step": 644 }, { "epoch": 0.9710199473089951, "grad_norm": 0.4123527705669403, "learning_rate": 9.994827650739707e-05, "loss": 0.6205, "step": 645 }, { "epoch": 0.9725254045916447, "grad_norm": 0.3449872136116028, "learning_rate": 9.994756075888956e-05, "loss": 0.6142, "step": 646 }, { "epoch": 0.9740308618742943, "grad_norm": 0.32518914341926575, "learning_rate": 9.99468400949844e-05, "loss": 0.6543, "step": 647 }, { "epoch": 0.9755363191569439, "grad_norm": 0.35469335317611694, "learning_rate": 9.994611451576038e-05, "loss": 0.5919, "step": 648 }, { "epoch": 0.9770417764395936, "grad_norm": 0.29939255118370056, "learning_rate": 9.994538402129686e-05, "loss": 0.5871, "step": 649 }, { "epoch": 0.9785472337222432, "grad_norm": 0.3126184940338135, "learning_rate": 9.994464861167372e-05, "loss": 0.5112, "step": 650 }, { "epoch": 0.9800526910048928, "grad_norm": 0.39503297209739685, "learning_rate": 9.994390828697138e-05, "loss": 0.5214, "step": 651 }, { "epoch": 0.9815581482875423, "grad_norm": 0.4486844539642334, "learning_rate": 9.99431630472708e-05, "loss": 0.5505, "step": 652 }, { "epoch": 0.9830636055701919, "grad_norm": 0.4653003513813019, "learning_rate": 9.994241289265347e-05, "loss": 0.5412, "step": 653 }, { "epoch": 0.9845690628528415, "grad_norm": 0.30222323536872864, "learning_rate": 9.994165782320145e-05, "loss": 0.5637, "step": 654 }, { "epoch": 0.9860745201354911, "grad_norm": 0.2823014259338379, "learning_rate": 9.994089783899728e-05, "loss": 0.6389, "step": 655 }, { "epoch": 0.9875799774181407, "grad_norm": 0.27761542797088623, "learning_rate": 9.99401329401241e-05, "loss": 0.5772, "step": 656 }, { "epoch": 0.9890854347007904, "grad_norm": 0.3586849868297577, "learning_rate": 9.993936312666557e-05, "loss": 0.5976, "step": 657 }, { "epoch": 0.99059089198344, "grad_norm": 0.2824101150035858, "learning_rate": 9.993858839870581e-05, "loss": 0.5278, "step": 658 }, { "epoch": 0.9920963492660896, "grad_norm": 0.25616389513015747, "learning_rate": 9.993780875632962e-05, "loss": 0.5397, "step": 659 }, { "epoch": 0.9936018065487392, "grad_norm": 0.3072245419025421, "learning_rate": 9.993702419962222e-05, "loss": 0.593, "step": 660 }, { "epoch": 0.9951072638313888, "grad_norm": 0.3206118941307068, "learning_rate": 9.993623472866942e-05, "loss": 0.5757, "step": 661 }, { "epoch": 0.9966127211140384, "grad_norm": 0.3026239275932312, "learning_rate": 9.993544034355754e-05, "loss": 0.4766, "step": 662 }, { "epoch": 0.998118178396688, "grad_norm": 0.31667497754096985, "learning_rate": 9.993464104437346e-05, "loss": 0.4843, "step": 663 }, { "epoch": 0.9996236356793377, "grad_norm": 0.2897581458091736, "learning_rate": 9.993383683120461e-05, "loss": 0.608, "step": 664 }, { "epoch": 1.0011290929619872, "grad_norm": 0.2570367753505707, "learning_rate": 9.99330277041389e-05, "loss": 0.5358, "step": 665 }, { "epoch": 1.0026345502446368, "grad_norm": 0.25398674607276917, "learning_rate": 9.993221366326486e-05, "loss": 0.5296, "step": 666 }, { "epoch": 1.0041400075272864, "grad_norm": 0.2899864614009857, "learning_rate": 9.993139470867147e-05, "loss": 0.6011, "step": 667 }, { "epoch": 1.005645464809936, "grad_norm": 0.29653260111808777, "learning_rate": 9.993057084044832e-05, "loss": 0.5704, "step": 668 }, { "epoch": 1.0071509220925856, "grad_norm": 0.2846008539199829, "learning_rate": 9.992974205868549e-05, "loss": 0.6072, "step": 669 }, { "epoch": 1.0086563793752352, "grad_norm": 0.24604348838329315, "learning_rate": 9.992890836347361e-05, "loss": 0.5928, "step": 670 }, { "epoch": 1.0101618366578848, "grad_norm": 0.23412632942199707, "learning_rate": 9.992806975490389e-05, "loss": 0.62, "step": 671 }, { "epoch": 1.0116672939405345, "grad_norm": 0.2653500437736511, "learning_rate": 9.992722623306799e-05, "loss": 0.5978, "step": 672 }, { "epoch": 1.013172751223184, "grad_norm": 0.3266112208366394, "learning_rate": 9.992637779805817e-05, "loss": 0.5219, "step": 673 }, { "epoch": 1.0146782085058337, "grad_norm": 0.34338921308517456, "learning_rate": 9.992552444996722e-05, "loss": 0.6024, "step": 674 }, { "epoch": 1.0161836657884833, "grad_norm": 0.38998299837112427, "learning_rate": 9.992466618888847e-05, "loss": 0.5901, "step": 675 }, { "epoch": 1.017689123071133, "grad_norm": 0.3364081084728241, "learning_rate": 9.992380301491576e-05, "loss": 0.5619, "step": 676 }, { "epoch": 1.0191945803537825, "grad_norm": 0.2710467278957367, "learning_rate": 9.992293492814351e-05, "loss": 0.4962, "step": 677 }, { "epoch": 1.0207000376364321, "grad_norm": 0.40224525332450867, "learning_rate": 9.992206192866663e-05, "loss": 0.5803, "step": 678 }, { "epoch": 1.0222054949190817, "grad_norm": 0.3461301326751709, "learning_rate": 9.99211840165806e-05, "loss": 0.573, "step": 679 }, { "epoch": 1.0237109522017314, "grad_norm": 0.27956071496009827, "learning_rate": 9.992030119198141e-05, "loss": 0.5547, "step": 680 }, { "epoch": 1.025216409484381, "grad_norm": 0.41758468747138977, "learning_rate": 9.991941345496562e-05, "loss": 0.5687, "step": 681 }, { "epoch": 1.0267218667670306, "grad_norm": 0.3594836890697479, "learning_rate": 9.991852080563033e-05, "loss": 0.5585, "step": 682 }, { "epoch": 1.0282273240496802, "grad_norm": 0.2543596029281616, "learning_rate": 9.991762324407312e-05, "loss": 0.5899, "step": 683 }, { "epoch": 1.0297327813323296, "grad_norm": 0.35829585790634155, "learning_rate": 9.991672077039217e-05, "loss": 0.5147, "step": 684 }, { "epoch": 1.0312382386149792, "grad_norm": 0.3943381607532501, "learning_rate": 9.991581338468616e-05, "loss": 0.5649, "step": 685 }, { "epoch": 1.0327436958976288, "grad_norm": 0.5129222869873047, "learning_rate": 9.991490108705433e-05, "loss": 0.5219, "step": 686 }, { "epoch": 1.0342491531802784, "grad_norm": 0.4704694151878357, "learning_rate": 9.991398387759645e-05, "loss": 0.5995, "step": 687 }, { "epoch": 1.035754610462928, "grad_norm": 0.34548017382621765, "learning_rate": 9.991306175641283e-05, "loss": 0.5513, "step": 688 }, { "epoch": 1.0372600677455777, "grad_norm": 0.2937384843826294, "learning_rate": 9.991213472360429e-05, "loss": 0.6299, "step": 689 }, { "epoch": 1.0387655250282273, "grad_norm": 0.33524781465530396, "learning_rate": 9.991120277927223e-05, "loss": 0.5406, "step": 690 }, { "epoch": 1.040270982310877, "grad_norm": 0.3298502564430237, "learning_rate": 9.991026592351854e-05, "loss": 0.5472, "step": 691 }, { "epoch": 1.0417764395935265, "grad_norm": 0.3240150213241577, "learning_rate": 9.990932415644571e-05, "loss": 0.5448, "step": 692 }, { "epoch": 1.0432818968761761, "grad_norm": 0.2822765111923218, "learning_rate": 9.990837747815669e-05, "loss": 0.5611, "step": 693 }, { "epoch": 1.0447873541588257, "grad_norm": 0.28342050313949585, "learning_rate": 9.990742588875505e-05, "loss": 0.5986, "step": 694 }, { "epoch": 1.0462928114414753, "grad_norm": 0.35371944308280945, "learning_rate": 9.990646938834483e-05, "loss": 0.5492, "step": 695 }, { "epoch": 1.047798268724125, "grad_norm": 0.3047260046005249, "learning_rate": 9.990550797703062e-05, "loss": 0.5736, "step": 696 }, { "epoch": 1.0493037260067746, "grad_norm": 0.2402530163526535, "learning_rate": 9.990454165491757e-05, "loss": 0.506, "step": 697 }, { "epoch": 1.0508091832894242, "grad_norm": 0.263163685798645, "learning_rate": 9.990357042211137e-05, "loss": 0.551, "step": 698 }, { "epoch": 1.0523146405720738, "grad_norm": 0.2519015669822693, "learning_rate": 9.990259427871822e-05, "loss": 0.4759, "step": 699 }, { "epoch": 1.0538200978547234, "grad_norm": 0.22210446000099182, "learning_rate": 9.990161322484486e-05, "loss": 0.5533, "step": 700 }, { "epoch": 1.055325555137373, "grad_norm": 0.23049770295619965, "learning_rate": 9.99006272605986e-05, "loss": 0.4932, "step": 701 }, { "epoch": 1.0568310124200226, "grad_norm": 0.24564331769943237, "learning_rate": 9.989963638608722e-05, "loss": 0.5356, "step": 702 }, { "epoch": 1.0583364697026723, "grad_norm": 0.2731838822364807, "learning_rate": 9.989864060141914e-05, "loss": 0.5917, "step": 703 }, { "epoch": 1.0598419269853219, "grad_norm": 0.27545201778411865, "learning_rate": 9.989763990670322e-05, "loss": 0.613, "step": 704 }, { "epoch": 1.0613473842679715, "grad_norm": 0.2819155156612396, "learning_rate": 9.989663430204891e-05, "loss": 0.5667, "step": 705 }, { "epoch": 1.062852841550621, "grad_norm": 0.24749858677387238, "learning_rate": 9.989562378756616e-05, "loss": 0.5568, "step": 706 }, { "epoch": 1.0643582988332705, "grad_norm": 0.24747398495674133, "learning_rate": 9.989460836336549e-05, "loss": 0.5425, "step": 707 }, { "epoch": 1.0658637561159203, "grad_norm": 0.2697569727897644, "learning_rate": 9.989358802955798e-05, "loss": 0.572, "step": 708 }, { "epoch": 1.0673692133985697, "grad_norm": 0.3674769401550293, "learning_rate": 9.989256278625514e-05, "loss": 0.5889, "step": 709 }, { "epoch": 1.0688746706812193, "grad_norm": 0.36048057675361633, "learning_rate": 9.989153263356914e-05, "loss": 0.5415, "step": 710 }, { "epoch": 1.070380127963869, "grad_norm": 0.3676128685474396, "learning_rate": 9.989049757161264e-05, "loss": 0.5074, "step": 711 }, { "epoch": 1.0718855852465186, "grad_norm": 0.2987547218799591, "learning_rate": 9.98894576004988e-05, "loss": 0.5435, "step": 712 }, { "epoch": 1.0733910425291682, "grad_norm": 0.29166892170906067, "learning_rate": 9.988841272034137e-05, "loss": 0.5158, "step": 713 }, { "epoch": 1.0748964998118178, "grad_norm": 0.3056153655052185, "learning_rate": 9.988736293125462e-05, "loss": 0.5077, "step": 714 }, { "epoch": 1.0764019570944674, "grad_norm": 0.2894218862056732, "learning_rate": 9.988630823335334e-05, "loss": 0.5667, "step": 715 }, { "epoch": 1.077907414377117, "grad_norm": 0.2581893801689148, "learning_rate": 9.988524862675288e-05, "loss": 0.5855, "step": 716 }, { "epoch": 1.0794128716597666, "grad_norm": 0.23700621724128723, "learning_rate": 9.988418411156911e-05, "loss": 0.5156, "step": 717 }, { "epoch": 1.0809183289424162, "grad_norm": 0.25843408703804016, "learning_rate": 9.988311468791846e-05, "loss": 0.5802, "step": 718 }, { "epoch": 1.0824237862250659, "grad_norm": 0.3152880072593689, "learning_rate": 9.988204035591786e-05, "loss": 0.5589, "step": 719 }, { "epoch": 1.0839292435077155, "grad_norm": 0.2970457971096039, "learning_rate": 9.98809611156848e-05, "loss": 0.5405, "step": 720 }, { "epoch": 1.085434700790365, "grad_norm": 0.2758914828300476, "learning_rate": 9.987987696733731e-05, "loss": 0.5628, "step": 721 }, { "epoch": 1.0869401580730147, "grad_norm": 0.28382444381713867, "learning_rate": 9.987878791099397e-05, "loss": 0.4831, "step": 722 }, { "epoch": 1.0884456153556643, "grad_norm": 0.28824615478515625, "learning_rate": 9.987769394677384e-05, "loss": 0.564, "step": 723 }, { "epoch": 1.089951072638314, "grad_norm": 0.3021320700645447, "learning_rate": 9.987659507479657e-05, "loss": 0.5175, "step": 724 }, { "epoch": 1.0914565299209635, "grad_norm": 0.34339675307273865, "learning_rate": 9.987549129518235e-05, "loss": 0.5705, "step": 725 }, { "epoch": 1.0929619872036132, "grad_norm": 0.3676524758338928, "learning_rate": 9.987438260805186e-05, "loss": 0.5821, "step": 726 }, { "epoch": 1.0944674444862628, "grad_norm": 0.4391572177410126, "learning_rate": 9.987326901352638e-05, "loss": 0.4714, "step": 727 }, { "epoch": 1.0959729017689124, "grad_norm": 0.46437451243400574, "learning_rate": 9.987215051172763e-05, "loss": 0.5374, "step": 728 }, { "epoch": 1.097478359051562, "grad_norm": 0.46723487973213196, "learning_rate": 9.987102710277798e-05, "loss": 0.4925, "step": 729 }, { "epoch": 1.0989838163342116, "grad_norm": 0.4434554874897003, "learning_rate": 9.986989878680028e-05, "loss": 0.5815, "step": 730 }, { "epoch": 1.1004892736168612, "grad_norm": 0.3581846356391907, "learning_rate": 9.986876556391788e-05, "loss": 0.638, "step": 731 }, { "epoch": 1.1019947308995106, "grad_norm": 0.28490981459617615, "learning_rate": 9.986762743425476e-05, "loss": 0.5317, "step": 732 }, { "epoch": 1.1035001881821604, "grad_norm": 0.31563693284988403, "learning_rate": 9.986648439793536e-05, "loss": 0.5516, "step": 733 }, { "epoch": 1.1050056454648098, "grad_norm": 0.2697666883468628, "learning_rate": 9.986533645508467e-05, "loss": 0.4557, "step": 734 }, { "epoch": 1.1065111027474595, "grad_norm": 0.260818749666214, "learning_rate": 9.986418360582826e-05, "loss": 0.5412, "step": 735 }, { "epoch": 1.108016560030109, "grad_norm": 0.2482614517211914, "learning_rate": 9.986302585029216e-05, "loss": 0.4636, "step": 736 }, { "epoch": 1.1095220173127587, "grad_norm": 0.2636620104312897, "learning_rate": 9.986186318860302e-05, "loss": 0.5095, "step": 737 }, { "epoch": 1.1110274745954083, "grad_norm": 0.2765592336654663, "learning_rate": 9.986069562088795e-05, "loss": 0.5385, "step": 738 }, { "epoch": 1.112532931878058, "grad_norm": 0.23747645318508148, "learning_rate": 9.985952314727468e-05, "loss": 0.5539, "step": 739 }, { "epoch": 1.1140383891607075, "grad_norm": 0.27083268761634827, "learning_rate": 9.985834576789139e-05, "loss": 0.5117, "step": 740 }, { "epoch": 1.1155438464433571, "grad_norm": 0.2865273058414459, "learning_rate": 9.985716348286685e-05, "loss": 0.5238, "step": 741 }, { "epoch": 1.1170493037260067, "grad_norm": 0.2530610263347626, "learning_rate": 9.985597629233038e-05, "loss": 0.5201, "step": 742 }, { "epoch": 1.1185547610086564, "grad_norm": 0.2581002414226532, "learning_rate": 9.985478419641174e-05, "loss": 0.4972, "step": 743 }, { "epoch": 1.120060218291306, "grad_norm": 0.30204835534095764, "learning_rate": 9.985358719524136e-05, "loss": 0.5529, "step": 744 }, { "epoch": 1.1215656755739556, "grad_norm": 0.2305976003408432, "learning_rate": 9.985238528895012e-05, "loss": 0.5587, "step": 745 }, { "epoch": 1.1230711328566052, "grad_norm": 0.2704053819179535, "learning_rate": 9.985117847766946e-05, "loss": 0.5687, "step": 746 }, { "epoch": 1.1245765901392548, "grad_norm": 0.26578542590141296, "learning_rate": 9.984996676153134e-05, "loss": 0.5278, "step": 747 }, { "epoch": 1.1260820474219044, "grad_norm": 0.29321202635765076, "learning_rate": 9.984875014066832e-05, "loss": 0.6364, "step": 748 }, { "epoch": 1.127587504704554, "grad_norm": 0.29486754536628723, "learning_rate": 9.984752861521338e-05, "loss": 0.5238, "step": 749 }, { "epoch": 1.1290929619872037, "grad_norm": 0.34489476680755615, "learning_rate": 9.984630218530014e-05, "loss": 0.5153, "step": 750 }, { "epoch": 1.1305984192698533, "grad_norm": 0.42486169934272766, "learning_rate": 9.984507085106273e-05, "loss": 0.5791, "step": 751 }, { "epoch": 1.1321038765525029, "grad_norm": 0.48322850465774536, "learning_rate": 9.98438346126358e-05, "loss": 0.564, "step": 752 }, { "epoch": 1.1336093338351525, "grad_norm": 0.3562379479408264, "learning_rate": 9.984259347015453e-05, "loss": 0.4944, "step": 753 }, { "epoch": 1.1351147911178021, "grad_norm": 0.2635498046875, "learning_rate": 9.984134742375466e-05, "loss": 0.5293, "step": 754 }, { "epoch": 1.1366202484004517, "grad_norm": 0.3005084693431854, "learning_rate": 9.984009647357244e-05, "loss": 0.5023, "step": 755 }, { "epoch": 1.1381257056831013, "grad_norm": 0.320726603269577, "learning_rate": 9.983884061974471e-05, "loss": 0.425, "step": 756 }, { "epoch": 1.1396311629657507, "grad_norm": 0.3098583519458771, "learning_rate": 9.983757986240877e-05, "loss": 0.5431, "step": 757 }, { "epoch": 1.1411366202484006, "grad_norm": 0.27733442187309265, "learning_rate": 9.983631420170252e-05, "loss": 0.5109, "step": 758 }, { "epoch": 1.14264207753105, "grad_norm": 0.26903486251831055, "learning_rate": 9.983504363776435e-05, "loss": 0.5423, "step": 759 }, { "epoch": 1.1441475348136996, "grad_norm": 0.23091334104537964, "learning_rate": 9.98337681707332e-05, "loss": 0.5417, "step": 760 }, { "epoch": 1.1456529920963492, "grad_norm": 0.23310162127017975, "learning_rate": 9.98324878007486e-05, "loss": 0.5558, "step": 761 }, { "epoch": 1.1471584493789988, "grad_norm": 0.28925296664237976, "learning_rate": 9.983120252795053e-05, "loss": 0.5014, "step": 762 }, { "epoch": 1.1486639066616484, "grad_norm": 0.3254275321960449, "learning_rate": 9.982991235247954e-05, "loss": 0.5635, "step": 763 }, { "epoch": 1.150169363944298, "grad_norm": 0.3315838575363159, "learning_rate": 9.982861727447675e-05, "loss": 0.5074, "step": 764 }, { "epoch": 1.1516748212269476, "grad_norm": 0.30173423886299133, "learning_rate": 9.982731729408377e-05, "loss": 0.4347, "step": 765 }, { "epoch": 1.1531802785095973, "grad_norm": 0.25608593225479126, "learning_rate": 9.982601241144277e-05, "loss": 0.5035, "step": 766 }, { "epoch": 1.1546857357922469, "grad_norm": 0.3001561164855957, "learning_rate": 9.982470262669643e-05, "loss": 0.465, "step": 767 }, { "epoch": 1.1561911930748965, "grad_norm": 0.30384886264801025, "learning_rate": 9.982338793998802e-05, "loss": 0.4877, "step": 768 }, { "epoch": 1.157696650357546, "grad_norm": 0.24815376102924347, "learning_rate": 9.982206835146131e-05, "loss": 0.5079, "step": 769 }, { "epoch": 1.1592021076401957, "grad_norm": 0.22394786775112152, "learning_rate": 9.982074386126057e-05, "loss": 0.5258, "step": 770 }, { "epoch": 1.1607075649228453, "grad_norm": 0.26728004217147827, "learning_rate": 9.981941446953066e-05, "loss": 0.6173, "step": 771 }, { "epoch": 1.162213022205495, "grad_norm": 0.25506728887557983, "learning_rate": 9.981808017641699e-05, "loss": 0.591, "step": 772 }, { "epoch": 1.1637184794881446, "grad_norm": 0.21364690363407135, "learning_rate": 9.981674098206545e-05, "loss": 0.5029, "step": 773 }, { "epoch": 1.1652239367707942, "grad_norm": 0.24653097987174988, "learning_rate": 9.98153968866225e-05, "loss": 0.5555, "step": 774 }, { "epoch": 1.1667293940534438, "grad_norm": 0.2509082853794098, "learning_rate": 9.981404789023512e-05, "loss": 0.569, "step": 775 }, { "epoch": 1.1682348513360934, "grad_norm": 0.21022826433181763, "learning_rate": 9.981269399305084e-05, "loss": 0.5075, "step": 776 }, { "epoch": 1.169740308618743, "grad_norm": 0.2321189045906067, "learning_rate": 9.981133519521773e-05, "loss": 0.5238, "step": 777 }, { "epoch": 1.1712457659013926, "grad_norm": 0.2759626507759094, "learning_rate": 9.980997149688437e-05, "loss": 0.5332, "step": 778 }, { "epoch": 1.1727512231840422, "grad_norm": 0.39810091257095337, "learning_rate": 9.980860289819989e-05, "loss": 0.5039, "step": 779 }, { "epoch": 1.1742566804666918, "grad_norm": 0.536338210105896, "learning_rate": 9.980722939931397e-05, "loss": 0.5805, "step": 780 }, { "epoch": 1.1757621377493415, "grad_norm": 0.5537400245666504, "learning_rate": 9.980585100037681e-05, "loss": 0.6278, "step": 781 }, { "epoch": 1.1772675950319909, "grad_norm": 0.42508673667907715, "learning_rate": 9.980446770153917e-05, "loss": 0.5645, "step": 782 }, { "epoch": 1.1787730523146407, "grad_norm": 0.3537555932998657, "learning_rate": 9.980307950295228e-05, "loss": 0.5588, "step": 783 }, { "epoch": 1.18027850959729, "grad_norm": 0.36869490146636963, "learning_rate": 9.980168640476797e-05, "loss": 0.521, "step": 784 }, { "epoch": 1.1817839668799397, "grad_norm": 0.32589590549468994, "learning_rate": 9.980028840713861e-05, "loss": 0.5049, "step": 785 }, { "epoch": 1.1832894241625893, "grad_norm": 0.2855128347873688, "learning_rate": 9.979888551021705e-05, "loss": 0.5118, "step": 786 }, { "epoch": 1.184794881445239, "grad_norm": 0.31708016991615295, "learning_rate": 9.979747771415675e-05, "loss": 0.5762, "step": 787 }, { "epoch": 1.1863003387278885, "grad_norm": 0.26960134506225586, "learning_rate": 9.97960650191116e-05, "loss": 0.4978, "step": 788 }, { "epoch": 1.1878057960105382, "grad_norm": 0.28655895590782166, "learning_rate": 9.979464742523617e-05, "loss": 0.5125, "step": 789 }, { "epoch": 1.1893112532931878, "grad_norm": 0.26243293285369873, "learning_rate": 9.979322493268543e-05, "loss": 0.5396, "step": 790 }, { "epoch": 1.1908167105758374, "grad_norm": 0.6095902323722839, "learning_rate": 9.979179754161494e-05, "loss": 0.5436, "step": 791 }, { "epoch": 1.192322167858487, "grad_norm": 0.292708158493042, "learning_rate": 9.979036525218084e-05, "loss": 0.5391, "step": 792 }, { "epoch": 1.1938276251411366, "grad_norm": 0.30076664686203003, "learning_rate": 9.978892806453973e-05, "loss": 0.5189, "step": 793 }, { "epoch": 1.1953330824237862, "grad_norm": 0.2671612799167633, "learning_rate": 9.978748597884878e-05, "loss": 0.505, "step": 794 }, { "epoch": 1.1968385397064358, "grad_norm": 0.25366607308387756, "learning_rate": 9.97860389952657e-05, "loss": 0.5662, "step": 795 }, { "epoch": 1.1983439969890854, "grad_norm": 0.2729104161262512, "learning_rate": 9.978458711394873e-05, "loss": 0.5433, "step": 796 }, { "epoch": 1.199849454271735, "grad_norm": 0.30660614371299744, "learning_rate": 9.978313033505665e-05, "loss": 0.5168, "step": 797 }, { "epoch": 1.2013549115543847, "grad_norm": 0.31235042214393616, "learning_rate": 9.978166865874878e-05, "loss": 0.4959, "step": 798 }, { "epoch": 1.2028603688370343, "grad_norm": 0.2949833869934082, "learning_rate": 9.978020208518493e-05, "loss": 0.5632, "step": 799 }, { "epoch": 1.204365826119684, "grad_norm": 0.33027273416519165, "learning_rate": 9.977873061452552e-05, "loss": 0.4972, "step": 800 }, { "epoch": 1.204365826119684, "eval_loss": 0.5054838061332703, "eval_runtime": 552.0876, "eval_samples_per_second": 17.437, "eval_steps_per_second": 0.545, "step": 800 }, { "epoch": 1.2058712834023335, "grad_norm": 0.34392914175987244, "learning_rate": 9.977725424693145e-05, "loss": 0.5518, "step": 801 }, { "epoch": 1.2073767406849831, "grad_norm": 0.2747610807418823, "learning_rate": 9.977577298256417e-05, "loss": 0.5133, "step": 802 }, { "epoch": 1.2088821979676327, "grad_norm": 0.2982468605041504, "learning_rate": 9.977428682158569e-05, "loss": 0.5579, "step": 803 }, { "epoch": 1.2103876552502824, "grad_norm": 0.23008692264556885, "learning_rate": 9.977279576415853e-05, "loss": 0.517, "step": 804 }, { "epoch": 1.211893112532932, "grad_norm": 0.2635594606399536, "learning_rate": 9.97712998104457e-05, "loss": 0.394, "step": 805 }, { "epoch": 1.2133985698155816, "grad_norm": 0.22498904168605804, "learning_rate": 9.976979896061086e-05, "loss": 0.4926, "step": 806 }, { "epoch": 1.214904027098231, "grad_norm": 0.22572748363018036, "learning_rate": 9.976829321481812e-05, "loss": 0.5679, "step": 807 }, { "epoch": 1.2164094843808808, "grad_norm": 0.22728826105594635, "learning_rate": 9.976678257323213e-05, "loss": 0.5289, "step": 808 }, { "epoch": 1.2179149416635302, "grad_norm": 0.24827007949352264, "learning_rate": 9.976526703601811e-05, "loss": 0.5177, "step": 809 }, { "epoch": 1.2194203989461798, "grad_norm": 0.3232664465904236, "learning_rate": 9.97637466033418e-05, "loss": 0.4926, "step": 810 }, { "epoch": 1.2209258562288294, "grad_norm": 0.36909204721450806, "learning_rate": 9.976222127536944e-05, "loss": 0.5361, "step": 811 }, { "epoch": 1.222431313511479, "grad_norm": 0.3242628574371338, "learning_rate": 9.976069105226788e-05, "loss": 0.5034, "step": 812 }, { "epoch": 1.2239367707941287, "grad_norm": 0.2686716914176941, "learning_rate": 9.975915593420444e-05, "loss": 0.5463, "step": 813 }, { "epoch": 1.2254422280767783, "grad_norm": 0.31871554255485535, "learning_rate": 9.975761592134699e-05, "loss": 0.477, "step": 814 }, { "epoch": 1.2269476853594279, "grad_norm": 0.36424651741981506, "learning_rate": 9.975607101386398e-05, "loss": 0.4746, "step": 815 }, { "epoch": 1.2284531426420775, "grad_norm": 0.34471192955970764, "learning_rate": 9.975452121192431e-05, "loss": 0.4812, "step": 816 }, { "epoch": 1.2299585999247271, "grad_norm": 0.36195191740989685, "learning_rate": 9.97529665156975e-05, "loss": 0.5264, "step": 817 }, { "epoch": 1.2314640572073767, "grad_norm": 0.3377469480037689, "learning_rate": 9.975140692535354e-05, "loss": 0.4979, "step": 818 }, { "epoch": 1.2329695144900263, "grad_norm": 0.30651649832725525, "learning_rate": 9.974984244106302e-05, "loss": 0.5383, "step": 819 }, { "epoch": 1.234474971772676, "grad_norm": 0.3267380893230438, "learning_rate": 9.974827306299701e-05, "loss": 0.6158, "step": 820 }, { "epoch": 1.2359804290553256, "grad_norm": 0.2883305549621582, "learning_rate": 9.974669879132713e-05, "loss": 0.5324, "step": 821 }, { "epoch": 1.2374858863379752, "grad_norm": 0.43880847096443176, "learning_rate": 9.974511962622555e-05, "loss": 0.5674, "step": 822 }, { "epoch": 1.2389913436206248, "grad_norm": 0.3640534281730652, "learning_rate": 9.974353556786496e-05, "loss": 0.5115, "step": 823 }, { "epoch": 1.2404968009032744, "grad_norm": 0.3338591456413269, "learning_rate": 9.974194661641859e-05, "loss": 0.487, "step": 824 }, { "epoch": 1.242002258185924, "grad_norm": 0.2852989137172699, "learning_rate": 9.974035277206021e-05, "loss": 0.5001, "step": 825 }, { "epoch": 1.2435077154685736, "grad_norm": 0.2692085802555084, "learning_rate": 9.97387540349641e-05, "loss": 0.4853, "step": 826 }, { "epoch": 1.2450131727512233, "grad_norm": 0.2324955314397812, "learning_rate": 9.973715040530514e-05, "loss": 0.5841, "step": 827 }, { "epoch": 1.2465186300338729, "grad_norm": 0.2514539361000061, "learning_rate": 9.973554188325865e-05, "loss": 0.5431, "step": 828 }, { "epoch": 1.2480240873165225, "grad_norm": 0.28265196084976196, "learning_rate": 9.973392846900056e-05, "loss": 0.4562, "step": 829 }, { "epoch": 1.249529544599172, "grad_norm": 0.3667682409286499, "learning_rate": 9.973231016270731e-05, "loss": 0.5062, "step": 830 }, { "epoch": 1.2510350018818217, "grad_norm": 0.3228123188018799, "learning_rate": 9.973068696455589e-05, "loss": 0.5589, "step": 831 }, { "epoch": 1.252540459164471, "grad_norm": 0.27619943022727966, "learning_rate": 9.972905887472377e-05, "loss": 0.5611, "step": 832 }, { "epoch": 1.254045916447121, "grad_norm": 0.35707157850265503, "learning_rate": 9.972742589338905e-05, "loss": 0.4503, "step": 833 }, { "epoch": 1.2555513737297703, "grad_norm": 0.34359556436538696, "learning_rate": 9.972578802073026e-05, "loss": 0.4692, "step": 834 }, { "epoch": 1.25705683101242, "grad_norm": 0.2777124345302582, "learning_rate": 9.972414525692653e-05, "loss": 0.5314, "step": 835 }, { "epoch": 1.2585622882950696, "grad_norm": 0.29886364936828613, "learning_rate": 9.972249760215754e-05, "loss": 0.5009, "step": 836 }, { "epoch": 1.2600677455777192, "grad_norm": 0.2759563624858856, "learning_rate": 9.972084505660344e-05, "loss": 0.418, "step": 837 }, { "epoch": 1.2615732028603688, "grad_norm": 0.316806823015213, "learning_rate": 9.971918762044496e-05, "loss": 0.4806, "step": 838 }, { "epoch": 1.2630786601430184, "grad_norm": 0.29551348090171814, "learning_rate": 9.971752529386336e-05, "loss": 0.5174, "step": 839 }, { "epoch": 1.264584117425668, "grad_norm": 0.2506606876850128, "learning_rate": 9.971585807704043e-05, "loss": 0.4716, "step": 840 }, { "epoch": 1.2660895747083176, "grad_norm": 0.2507305443286896, "learning_rate": 9.971418597015848e-05, "loss": 0.4999, "step": 841 }, { "epoch": 1.2675950319909672, "grad_norm": 0.28042295575141907, "learning_rate": 9.971250897340038e-05, "loss": 0.4386, "step": 842 }, { "epoch": 1.2691004892736168, "grad_norm": 0.2842555642127991, "learning_rate": 9.971082708694953e-05, "loss": 0.4754, "step": 843 }, { "epoch": 1.2706059465562665, "grad_norm": 0.28565120697021484, "learning_rate": 9.970914031098984e-05, "loss": 0.5071, "step": 844 }, { "epoch": 1.272111403838916, "grad_norm": 0.2862533926963806, "learning_rate": 9.97074486457058e-05, "loss": 0.4664, "step": 845 }, { "epoch": 1.2736168611215657, "grad_norm": 0.2919695973396301, "learning_rate": 9.970575209128238e-05, "loss": 0.5605, "step": 846 }, { "epoch": 1.2751223184042153, "grad_norm": 0.25543758273124695, "learning_rate": 9.970405064790513e-05, "loss": 0.4902, "step": 847 }, { "epoch": 1.276627775686865, "grad_norm": 0.2789463996887207, "learning_rate": 9.970234431576011e-05, "loss": 0.552, "step": 848 }, { "epoch": 1.2781332329695145, "grad_norm": 0.3262972831726074, "learning_rate": 9.970063309503394e-05, "loss": 0.5835, "step": 849 }, { "epoch": 1.2796386902521641, "grad_norm": 0.3741668462753296, "learning_rate": 9.969891698591372e-05, "loss": 0.5211, "step": 850 }, { "epoch": 1.2811441475348138, "grad_norm": 0.28328046202659607, "learning_rate": 9.969719598858715e-05, "loss": 0.4953, "step": 851 }, { "epoch": 1.2826496048174634, "grad_norm": 0.21409907937049866, "learning_rate": 9.969547010324244e-05, "loss": 0.4792, "step": 852 }, { "epoch": 1.284155062100113, "grad_norm": 0.277927041053772, "learning_rate": 9.96937393300683e-05, "loss": 0.5041, "step": 853 }, { "epoch": 1.2856605193827626, "grad_norm": 0.30272743105888367, "learning_rate": 9.969200366925404e-05, "loss": 0.5111, "step": 854 }, { "epoch": 1.287165976665412, "grad_norm": 0.25509539246559143, "learning_rate": 9.969026312098942e-05, "loss": 0.4905, "step": 855 }, { "epoch": 1.2886714339480618, "grad_norm": 0.29279211163520813, "learning_rate": 9.968851768546486e-05, "loss": 0.5215, "step": 856 }, { "epoch": 1.2901768912307112, "grad_norm": 0.3318697512149811, "learning_rate": 9.968676736287116e-05, "loss": 0.4541, "step": 857 }, { "epoch": 1.291682348513361, "grad_norm": 0.2767161428928375, "learning_rate": 9.968501215339978e-05, "loss": 0.5885, "step": 858 }, { "epoch": 1.2931878057960104, "grad_norm": 0.30865639448165894, "learning_rate": 9.968325205724265e-05, "loss": 0.5486, "step": 859 }, { "epoch": 1.29469326307866, "grad_norm": 0.29317837953567505, "learning_rate": 9.968148707459226e-05, "loss": 0.4805, "step": 860 }, { "epoch": 1.2961987203613097, "grad_norm": 0.29015618562698364, "learning_rate": 9.967971720564162e-05, "loss": 0.4851, "step": 861 }, { "epoch": 1.2977041776439593, "grad_norm": 0.31585222482681274, "learning_rate": 9.967794245058428e-05, "loss": 0.5056, "step": 862 }, { "epoch": 1.299209634926609, "grad_norm": 0.32462796568870544, "learning_rate": 9.967616280961433e-05, "loss": 0.4916, "step": 863 }, { "epoch": 1.3007150922092585, "grad_norm": 0.34981557726860046, "learning_rate": 9.967437828292637e-05, "loss": 0.4746, "step": 864 }, { "epoch": 1.3022205494919081, "grad_norm": 0.27161893248558044, "learning_rate": 9.96725888707156e-05, "loss": 0.5113, "step": 865 }, { "epoch": 1.3037260067745577, "grad_norm": 0.22684252262115479, "learning_rate": 9.967079457317764e-05, "loss": 0.4932, "step": 866 }, { "epoch": 1.3052314640572074, "grad_norm": 0.2535623610019684, "learning_rate": 9.966899539050877e-05, "loss": 0.5235, "step": 867 }, { "epoch": 1.306736921339857, "grad_norm": 0.2550107538700104, "learning_rate": 9.96671913229057e-05, "loss": 0.4863, "step": 868 }, { "epoch": 1.3082423786225066, "grad_norm": 0.25545719265937805, "learning_rate": 9.966538237056577e-05, "loss": 0.4426, "step": 869 }, { "epoch": 1.3097478359051562, "grad_norm": 0.31868574023246765, "learning_rate": 9.966356853368677e-05, "loss": 0.5055, "step": 870 }, { "epoch": 1.3112532931878058, "grad_norm": 0.32200318574905396, "learning_rate": 9.966174981246705e-05, "loss": 0.4658, "step": 871 }, { "epoch": 1.3127587504704554, "grad_norm": 0.33224427700042725, "learning_rate": 9.965992620710552e-05, "loss": 0.503, "step": 872 }, { "epoch": 1.314264207753105, "grad_norm": 0.3260560631752014, "learning_rate": 9.965809771780162e-05, "loss": 0.5671, "step": 873 }, { "epoch": 1.3157696650357547, "grad_norm": 0.2627885639667511, "learning_rate": 9.96562643447553e-05, "loss": 0.5055, "step": 874 }, { "epoch": 1.3172751223184043, "grad_norm": 0.23896300792694092, "learning_rate": 9.965442608816703e-05, "loss": 0.4835, "step": 875 }, { "epoch": 1.3187805796010539, "grad_norm": 0.2925032079219818, "learning_rate": 9.965258294823787e-05, "loss": 0.4403, "step": 876 }, { "epoch": 1.3202860368837035, "grad_norm": 0.29810482263565063, "learning_rate": 9.965073492516937e-05, "loss": 0.4808, "step": 877 }, { "epoch": 1.321791494166353, "grad_norm": 0.32734817266464233, "learning_rate": 9.964888201916364e-05, "loss": 0.5284, "step": 878 }, { "epoch": 1.3232969514490027, "grad_norm": 0.2809845805168152, "learning_rate": 9.964702423042331e-05, "loss": 0.5027, "step": 879 }, { "epoch": 1.3248024087316521, "grad_norm": 0.23813806474208832, "learning_rate": 9.964516155915151e-05, "loss": 0.524, "step": 880 }, { "epoch": 1.326307866014302, "grad_norm": 0.3195044994354248, "learning_rate": 9.964329400555197e-05, "loss": 0.5441, "step": 881 }, { "epoch": 1.3278133232969513, "grad_norm": 0.35543137788772583, "learning_rate": 9.964142156982894e-05, "loss": 0.4789, "step": 882 }, { "epoch": 1.3293187805796012, "grad_norm": 0.3088974356651306, "learning_rate": 9.963954425218713e-05, "loss": 0.4927, "step": 883 }, { "epoch": 1.3308242378622506, "grad_norm": 0.30977723002433777, "learning_rate": 9.96376620528319e-05, "loss": 0.4809, "step": 884 }, { "epoch": 1.3323296951449002, "grad_norm": 0.28863999247550964, "learning_rate": 9.963577497196905e-05, "loss": 0.5147, "step": 885 }, { "epoch": 1.3338351524275498, "grad_norm": 0.28481176495552063, "learning_rate": 9.963388300980495e-05, "loss": 0.5521, "step": 886 }, { "epoch": 1.3353406097101994, "grad_norm": 0.3073102533817291, "learning_rate": 9.963198616654653e-05, "loss": 0.5089, "step": 887 }, { "epoch": 1.336846066992849, "grad_norm": 0.35250821709632874, "learning_rate": 9.96300844424012e-05, "loss": 0.4664, "step": 888 }, { "epoch": 1.3383515242754986, "grad_norm": 0.3215198516845703, "learning_rate": 9.962817783757693e-05, "loss": 0.4646, "step": 889 }, { "epoch": 1.3398569815581483, "grad_norm": 0.25331127643585205, "learning_rate": 9.962626635228223e-05, "loss": 0.5385, "step": 890 }, { "epoch": 1.3413624388407979, "grad_norm": 0.2247442603111267, "learning_rate": 9.962434998672614e-05, "loss": 0.4348, "step": 891 }, { "epoch": 1.3428678961234475, "grad_norm": 0.2597985863685608, "learning_rate": 9.962242874111823e-05, "loss": 0.5129, "step": 892 }, { "epoch": 1.344373353406097, "grad_norm": 0.31802982091903687, "learning_rate": 9.962050261566859e-05, "loss": 0.5583, "step": 893 }, { "epoch": 1.3458788106887467, "grad_norm": 0.4150056838989258, "learning_rate": 9.961857161058789e-05, "loss": 0.5535, "step": 894 }, { "epoch": 1.3473842679713963, "grad_norm": 0.37266969680786133, "learning_rate": 9.961663572608725e-05, "loss": 0.4958, "step": 895 }, { "epoch": 1.348889725254046, "grad_norm": 0.2825692892074585, "learning_rate": 9.961469496237844e-05, "loss": 0.5224, "step": 896 }, { "epoch": 1.3503951825366955, "grad_norm": 0.431229829788208, "learning_rate": 9.961274931967365e-05, "loss": 0.4847, "step": 897 }, { "epoch": 1.3519006398193452, "grad_norm": 0.36701059341430664, "learning_rate": 9.961079879818567e-05, "loss": 0.4387, "step": 898 }, { "epoch": 1.3534060971019948, "grad_norm": 0.2716524302959442, "learning_rate": 9.960884339812781e-05, "loss": 0.5409, "step": 899 }, { "epoch": 1.3549115543846444, "grad_norm": 0.30059143900871277, "learning_rate": 9.96068831197139e-05, "loss": 0.5046, "step": 900 }, { "epoch": 1.356417011667294, "grad_norm": 0.27900809049606323, "learning_rate": 9.96049179631583e-05, "loss": 0.3837, "step": 901 }, { "epoch": 1.3579224689499436, "grad_norm": 0.26161590218544006, "learning_rate": 9.960294792867596e-05, "loss": 0.5446, "step": 902 }, { "epoch": 1.3594279262325932, "grad_norm": 0.2705991268157959, "learning_rate": 9.96009730164823e-05, "loss": 0.4888, "step": 903 }, { "epoch": 1.3609333835152428, "grad_norm": 0.26527148485183716, "learning_rate": 9.959899322679326e-05, "loss": 0.4799, "step": 904 }, { "epoch": 1.3624388407978922, "grad_norm": 0.2376362830400467, "learning_rate": 9.959700855982538e-05, "loss": 0.5252, "step": 905 }, { "epoch": 1.363944298080542, "grad_norm": 0.2501612901687622, "learning_rate": 9.95950190157957e-05, "loss": 0.4656, "step": 906 }, { "epoch": 1.3654497553631915, "grad_norm": 0.2704228162765503, "learning_rate": 9.95930245949218e-05, "loss": 0.4442, "step": 907 }, { "epoch": 1.3669552126458413, "grad_norm": 0.3042653799057007, "learning_rate": 9.959102529742175e-05, "loss": 0.4853, "step": 908 }, { "epoch": 1.3684606699284907, "grad_norm": 0.3517780005931854, "learning_rate": 9.958902112351423e-05, "loss": 0.4551, "step": 909 }, { "epoch": 1.3699661272111403, "grad_norm": 0.321281760931015, "learning_rate": 9.95870120734184e-05, "loss": 0.5131, "step": 910 }, { "epoch": 1.37147158449379, "grad_norm": 0.2326970398426056, "learning_rate": 9.958499814735397e-05, "loss": 0.4567, "step": 911 }, { "epoch": 1.3729770417764395, "grad_norm": 0.27153491973876953, "learning_rate": 9.958297934554117e-05, "loss": 0.5685, "step": 912 }, { "epoch": 1.3744824990590891, "grad_norm": 0.2875882089138031, "learning_rate": 9.958095566820078e-05, "loss": 0.4506, "step": 913 }, { "epoch": 1.3759879563417388, "grad_norm": 0.27493521571159363, "learning_rate": 9.957892711555409e-05, "loss": 0.52, "step": 914 }, { "epoch": 1.3774934136243884, "grad_norm": 0.2827979624271393, "learning_rate": 9.957689368782297e-05, "loss": 0.464, "step": 915 }, { "epoch": 1.378998870907038, "grad_norm": 0.2816639840602875, "learning_rate": 9.957485538522978e-05, "loss": 0.4538, "step": 916 }, { "epoch": 1.3805043281896876, "grad_norm": 0.27265772223472595, "learning_rate": 9.95728122079974e-05, "loss": 0.56, "step": 917 }, { "epoch": 1.3820097854723372, "grad_norm": 0.2275013029575348, "learning_rate": 9.95707641563493e-05, "loss": 0.4656, "step": 918 }, { "epoch": 1.3835152427549868, "grad_norm": 0.23691163957118988, "learning_rate": 9.956871123050946e-05, "loss": 0.5096, "step": 919 }, { "epoch": 1.3850207000376364, "grad_norm": 0.2816791832447052, "learning_rate": 9.956665343070234e-05, "loss": 0.5201, "step": 920 }, { "epoch": 1.386526157320286, "grad_norm": 0.30931273102760315, "learning_rate": 9.956459075715305e-05, "loss": 0.5068, "step": 921 }, { "epoch": 1.3880316146029357, "grad_norm": 0.30102744698524475, "learning_rate": 9.956252321008707e-05, "loss": 0.5492, "step": 922 }, { "epoch": 1.3895370718855853, "grad_norm": 0.326251745223999, "learning_rate": 9.956045078973058e-05, "loss": 0.5041, "step": 923 }, { "epoch": 1.391042529168235, "grad_norm": 0.28816846013069153, "learning_rate": 9.955837349631016e-05, "loss": 0.5026, "step": 924 }, { "epoch": 1.3925479864508845, "grad_norm": 0.23179884254932404, "learning_rate": 9.955629133005302e-05, "loss": 0.4871, "step": 925 }, { "epoch": 1.3940534437335341, "grad_norm": 0.2574617266654968, "learning_rate": 9.955420429118688e-05, "loss": 0.4928, "step": 926 }, { "epoch": 1.3955589010161837, "grad_norm": 0.3084236681461334, "learning_rate": 9.955211237993989e-05, "loss": 0.5522, "step": 927 }, { "epoch": 1.3970643582988334, "grad_norm": 0.2953372001647949, "learning_rate": 9.955001559654091e-05, "loss": 0.4885, "step": 928 }, { "epoch": 1.398569815581483, "grad_norm": 0.21835748851299286, "learning_rate": 9.95479139412192e-05, "loss": 0.4442, "step": 929 }, { "epoch": 1.4000752728641324, "grad_norm": 0.23751597106456757, "learning_rate": 9.95458074142046e-05, "loss": 0.4509, "step": 930 }, { "epoch": 1.4015807301467822, "grad_norm": 0.30901968479156494, "learning_rate": 9.954369601572747e-05, "loss": 0.4124, "step": 931 }, { "epoch": 1.4030861874294316, "grad_norm": 0.28328272700309753, "learning_rate": 9.95415797460187e-05, "loss": 0.4749, "step": 932 }, { "epoch": 1.4045916447120814, "grad_norm": 0.23289081454277039, "learning_rate": 9.953945860530976e-05, "loss": 0.4888, "step": 933 }, { "epoch": 1.4060971019947308, "grad_norm": 0.2695089280605316, "learning_rate": 9.953733259383258e-05, "loss": 0.5027, "step": 934 }, { "epoch": 1.4076025592773804, "grad_norm": 0.2596133053302765, "learning_rate": 9.953520171181965e-05, "loss": 0.4703, "step": 935 }, { "epoch": 1.40910801656003, "grad_norm": 0.2349337488412857, "learning_rate": 9.953306595950405e-05, "loss": 0.5108, "step": 936 }, { "epoch": 1.4106134738426797, "grad_norm": 0.20314298570156097, "learning_rate": 9.95309253371193e-05, "loss": 0.4178, "step": 937 }, { "epoch": 1.4121189311253293, "grad_norm": 0.2095893770456314, "learning_rate": 9.952877984489951e-05, "loss": 0.478, "step": 938 }, { "epoch": 1.4136243884079789, "grad_norm": 0.21095207333564758, "learning_rate": 9.95266294830793e-05, "loss": 0.4549, "step": 939 }, { "epoch": 1.4151298456906285, "grad_norm": 0.21672549843788147, "learning_rate": 9.952447425189382e-05, "loss": 0.5155, "step": 940 }, { "epoch": 1.416635302973278, "grad_norm": 0.24427184462547302, "learning_rate": 9.95223141515788e-05, "loss": 0.3804, "step": 941 }, { "epoch": 1.4181407602559277, "grad_norm": 0.32649582624435425, "learning_rate": 9.952014918237043e-05, "loss": 0.467, "step": 942 }, { "epoch": 1.4196462175385773, "grad_norm": 0.34711599349975586, "learning_rate": 9.951797934450548e-05, "loss": 0.5217, "step": 943 }, { "epoch": 1.421151674821227, "grad_norm": 0.3851451277732849, "learning_rate": 9.951580463822124e-05, "loss": 0.3614, "step": 944 }, { "epoch": 1.4226571321038766, "grad_norm": 0.7124738097190857, "learning_rate": 9.951362506375555e-05, "loss": 0.4853, "step": 945 }, { "epoch": 1.4241625893865262, "grad_norm": 0.7188159227371216, "learning_rate": 9.951144062134673e-05, "loss": 0.4709, "step": 946 }, { "epoch": 1.4256680466691758, "grad_norm": 0.40072551369667053, "learning_rate": 9.950925131123369e-05, "loss": 0.5844, "step": 947 }, { "epoch": 1.4271735039518254, "grad_norm": 0.4713194966316223, "learning_rate": 9.950705713365585e-05, "loss": 0.4555, "step": 948 }, { "epoch": 1.428678961234475, "grad_norm": 0.4641496539115906, "learning_rate": 9.950485808885315e-05, "loss": 0.4905, "step": 949 }, { "epoch": 1.4301844185171246, "grad_norm": 0.3925652503967285, "learning_rate": 9.950265417706608e-05, "loss": 0.4587, "step": 950 }, { "epoch": 1.4316898757997742, "grad_norm": 0.32440194487571716, "learning_rate": 9.950044539853567e-05, "loss": 0.4699, "step": 951 }, { "epoch": 1.4331953330824239, "grad_norm": 0.3550563454627991, "learning_rate": 9.949823175350345e-05, "loss": 0.5062, "step": 952 }, { "epoch": 1.4347007903650733, "grad_norm": 0.29468682408332825, "learning_rate": 9.949601324221151e-05, "loss": 0.4678, "step": 953 }, { "epoch": 1.436206247647723, "grad_norm": 0.2624119222164154, "learning_rate": 9.949378986490245e-05, "loss": 0.4452, "step": 954 }, { "epoch": 1.4377117049303725, "grad_norm": 0.25032052397727966, "learning_rate": 9.949156162181944e-05, "loss": 0.4903, "step": 955 }, { "epoch": 1.4392171622130223, "grad_norm": 0.24277722835540771, "learning_rate": 9.948932851320614e-05, "loss": 0.5316, "step": 956 }, { "epoch": 1.4407226194956717, "grad_norm": 0.26410001516342163, "learning_rate": 9.948709053930674e-05, "loss": 0.4694, "step": 957 }, { "epoch": 1.4422280767783215, "grad_norm": 0.24850164353847504, "learning_rate": 9.948484770036605e-05, "loss": 0.4385, "step": 958 }, { "epoch": 1.443733534060971, "grad_norm": 0.2651531994342804, "learning_rate": 9.948259999662925e-05, "loss": 0.4932, "step": 959 }, { "epoch": 1.4452389913436205, "grad_norm": 0.21966899931430817, "learning_rate": 9.948034742834223e-05, "loss": 0.4223, "step": 960 }, { "epoch": 1.4467444486262702, "grad_norm": 0.21035884320735931, "learning_rate": 9.947808999575127e-05, "loss": 0.4799, "step": 961 }, { "epoch": 1.4482499059089198, "grad_norm": 0.21419383585453033, "learning_rate": 9.947582769910326e-05, "loss": 0.4475, "step": 962 }, { "epoch": 1.4497553631915694, "grad_norm": 0.22559773921966553, "learning_rate": 9.94735605386456e-05, "loss": 0.3839, "step": 963 }, { "epoch": 1.451260820474219, "grad_norm": 0.21623972058296204, "learning_rate": 9.947128851462624e-05, "loss": 0.434, "step": 964 }, { "epoch": 1.4527662777568686, "grad_norm": 0.21256133913993835, "learning_rate": 9.94690116272936e-05, "loss": 0.4479, "step": 965 }, { "epoch": 1.4542717350395182, "grad_norm": 0.2064635455608368, "learning_rate": 9.946672987689674e-05, "loss": 0.4468, "step": 966 }, { "epoch": 1.4557771923221678, "grad_norm": 0.24071155488491058, "learning_rate": 9.946444326368515e-05, "loss": 0.4353, "step": 967 }, { "epoch": 1.4572826496048175, "grad_norm": 0.2777935266494751, "learning_rate": 9.946215178790888e-05, "loss": 0.4443, "step": 968 }, { "epoch": 1.458788106887467, "grad_norm": 0.2982131242752075, "learning_rate": 9.945985544981854e-05, "loss": 0.4508, "step": 969 }, { "epoch": 1.4602935641701167, "grad_norm": 0.23222076892852783, "learning_rate": 9.945755424966527e-05, "loss": 0.5284, "step": 970 }, { "epoch": 1.4617990214527663, "grad_norm": 0.21467755734920502, "learning_rate": 9.945524818770069e-05, "loss": 0.5064, "step": 971 }, { "epoch": 1.463304478735416, "grad_norm": 0.3190455734729767, "learning_rate": 9.945293726417702e-05, "loss": 0.5336, "step": 972 }, { "epoch": 1.4648099360180655, "grad_norm": 0.32308337092399597, "learning_rate": 9.945062147934694e-05, "loss": 0.5059, "step": 973 }, { "epoch": 1.4663153933007151, "grad_norm": 0.2846772074699402, "learning_rate": 9.944830083346374e-05, "loss": 0.4464, "step": 974 }, { "epoch": 1.4678208505833648, "grad_norm": 0.2641288638114929, "learning_rate": 9.94459753267812e-05, "loss": 0.4548, "step": 975 }, { "epoch": 1.4693263078660144, "grad_norm": 0.2515201270580292, "learning_rate": 9.944364495955362e-05, "loss": 0.5019, "step": 976 }, { "epoch": 1.470831765148664, "grad_norm": 0.23379258811473846, "learning_rate": 9.944130973203584e-05, "loss": 0.4522, "step": 977 }, { "epoch": 1.4723372224313134, "grad_norm": 0.23574711382389069, "learning_rate": 9.943896964448324e-05, "loss": 0.4733, "step": 978 }, { "epoch": 1.4738426797139632, "grad_norm": 0.23959407210350037, "learning_rate": 9.943662469715174e-05, "loss": 0.5477, "step": 979 }, { "epoch": 1.4753481369966126, "grad_norm": 0.2705731689929962, "learning_rate": 9.943427489029776e-05, "loss": 0.4718, "step": 980 }, { "epoch": 1.4768535942792624, "grad_norm": 0.28158625960350037, "learning_rate": 9.943192022417829e-05, "loss": 0.4057, "step": 981 }, { "epoch": 1.4783590515619118, "grad_norm": 0.27715685963630676, "learning_rate": 9.942956069905083e-05, "loss": 0.4754, "step": 982 }, { "epoch": 1.4798645088445617, "grad_norm": 0.2988499104976654, "learning_rate": 9.942719631517341e-05, "loss": 0.4624, "step": 983 }, { "epoch": 1.481369966127211, "grad_norm": 0.24969597160816193, "learning_rate": 9.94248270728046e-05, "loss": 0.4506, "step": 984 }, { "epoch": 1.4828754234098607, "grad_norm": 0.24824057519435883, "learning_rate": 9.94224529722035e-05, "loss": 0.4831, "step": 985 }, { "epoch": 1.4843808806925103, "grad_norm": 0.27671003341674805, "learning_rate": 9.94200740136297e-05, "loss": 0.4827, "step": 986 }, { "epoch": 1.48588633797516, "grad_norm": 0.2549966871738434, "learning_rate": 9.941769019734341e-05, "loss": 0.435, "step": 987 }, { "epoch": 1.4873917952578095, "grad_norm": 7.653620719909668, "learning_rate": 9.941530152360531e-05, "loss": 0.4592, "step": 988 }, { "epoch": 1.4888972525404591, "grad_norm": 0.5677655935287476, "learning_rate": 9.941290799267661e-05, "loss": 0.4617, "step": 989 }, { "epoch": 1.4904027098231087, "grad_norm": 0.6604883670806885, "learning_rate": 9.941050960481906e-05, "loss": 0.5761, "step": 990 }, { "epoch": 1.4919081671057584, "grad_norm": 0.4322024881839752, "learning_rate": 9.940810636029496e-05, "loss": 0.4829, "step": 991 }, { "epoch": 1.493413624388408, "grad_norm": 0.44252604246139526, "learning_rate": 9.940569825936709e-05, "loss": 0.477, "step": 992 }, { "epoch": 1.4949190816710576, "grad_norm": 0.3620954155921936, "learning_rate": 9.940328530229883e-05, "loss": 0.373, "step": 993 }, { "epoch": 1.4964245389537072, "grad_norm": 0.32205504179000854, "learning_rate": 9.940086748935406e-05, "loss": 0.4826, "step": 994 }, { "epoch": 1.4979299962363568, "grad_norm": 0.3359576463699341, "learning_rate": 9.939844482079718e-05, "loss": 0.4265, "step": 995 }, { "epoch": 1.4994354535190064, "grad_norm": 0.31381651759147644, "learning_rate": 9.939601729689312e-05, "loss": 0.5075, "step": 996 }, { "epoch": 1.500940910801656, "grad_norm": 0.26423943042755127, "learning_rate": 9.939358491790735e-05, "loss": 0.4599, "step": 997 }, { "epoch": 1.5024463680843057, "grad_norm": 0.24680866301059723, "learning_rate": 9.93911476841059e-05, "loss": 0.3927, "step": 998 }, { "epoch": 1.5039518253669553, "grad_norm": 0.27464717626571655, "learning_rate": 9.938870559575526e-05, "loss": 0.4778, "step": 999 }, { "epoch": 1.5054572826496049, "grad_norm": 0.2745337188243866, "learning_rate": 9.938625865312251e-05, "loss": 0.4542, "step": 1000 }, { "epoch": 1.5054572826496049, "eval_loss": 0.448403537273407, "eval_runtime": 549.7338, "eval_samples_per_second": 17.512, "eval_steps_per_second": 0.548, "step": 1000 }, { "epoch": 1.5069627399322543, "grad_norm": 0.23503173887729645, "learning_rate": 9.938380685647525e-05, "loss": 0.4427, "step": 1001 }, { "epoch": 1.508468197214904, "grad_norm": 0.23698735237121582, "learning_rate": 9.938135020608163e-05, "loss": 0.5188, "step": 1002 }, { "epoch": 1.5099736544975535, "grad_norm": 0.25086188316345215, "learning_rate": 9.937888870221023e-05, "loss": 0.4613, "step": 1003 }, { "epoch": 1.5114791117802033, "grad_norm": 0.22732360661029816, "learning_rate": 9.937642234513032e-05, "loss": 0.4774, "step": 1004 }, { "epoch": 1.5129845690628527, "grad_norm": 0.2439388632774353, "learning_rate": 9.937395113511156e-05, "loss": 0.4065, "step": 1005 }, { "epoch": 1.5144900263455026, "grad_norm": 0.2359287291765213, "learning_rate": 9.937147507242424e-05, "loss": 0.5056, "step": 1006 }, { "epoch": 1.515995483628152, "grad_norm": 0.22149021923542023, "learning_rate": 9.936899415733911e-05, "loss": 0.3644, "step": 1007 }, { "epoch": 1.5175009409108018, "grad_norm": 0.23879432678222656, "learning_rate": 9.936650839012749e-05, "loss": 0.4915, "step": 1008 }, { "epoch": 1.5190063981934512, "grad_norm": 0.2547174394130707, "learning_rate": 9.93640177710612e-05, "loss": 0.4042, "step": 1009 }, { "epoch": 1.520511855476101, "grad_norm": 0.2876720130443573, "learning_rate": 9.936152230041264e-05, "loss": 0.4587, "step": 1010 }, { "epoch": 1.5220173127587504, "grad_norm": 0.24860022962093353, "learning_rate": 9.935902197845471e-05, "loss": 0.4418, "step": 1011 }, { "epoch": 1.5235227700414, "grad_norm": 0.23863892257213593, "learning_rate": 9.93565168054608e-05, "loss": 0.4594, "step": 1012 }, { "epoch": 1.5250282273240496, "grad_norm": 0.2264079600572586, "learning_rate": 9.935400678170492e-05, "loss": 0.4161, "step": 1013 }, { "epoch": 1.5265336846066992, "grad_norm": 0.2153846025466919, "learning_rate": 9.935149190746153e-05, "loss": 0.4294, "step": 1014 }, { "epoch": 1.5280391418893489, "grad_norm": 0.23012618720531464, "learning_rate": 9.934897218300569e-05, "loss": 0.5225, "step": 1015 }, { "epoch": 1.5295445991719985, "grad_norm": 0.2712361812591553, "learning_rate": 9.934644760861292e-05, "loss": 0.4585, "step": 1016 }, { "epoch": 1.531050056454648, "grad_norm": 0.2915598750114441, "learning_rate": 9.934391818455931e-05, "loss": 0.4633, "step": 1017 }, { "epoch": 1.5325555137372977, "grad_norm": 0.2820572257041931, "learning_rate": 9.934138391112145e-05, "loss": 0.5222, "step": 1018 }, { "epoch": 1.5340609710199473, "grad_norm": 0.24416132271289825, "learning_rate": 9.933884478857655e-05, "loss": 0.4787, "step": 1019 }, { "epoch": 1.535566428302597, "grad_norm": 0.25742241740226746, "learning_rate": 9.933630081720224e-05, "loss": 0.467, "step": 1020 }, { "epoch": 1.5370718855852465, "grad_norm": 0.3087022006511688, "learning_rate": 9.933375199727672e-05, "loss": 0.4219, "step": 1021 }, { "epoch": 1.5385773428678962, "grad_norm": 0.33223146200180054, "learning_rate": 9.933119832907873e-05, "loss": 0.459, "step": 1022 }, { "epoch": 1.5400828001505458, "grad_norm": 0.30014196038246155, "learning_rate": 9.932863981288757e-05, "loss": 0.4634, "step": 1023 }, { "epoch": 1.5415882574331954, "grad_norm": 0.3006850481033325, "learning_rate": 9.932607644898299e-05, "loss": 0.4142, "step": 1024 }, { "epoch": 1.543093714715845, "grad_norm": 0.29699158668518066, "learning_rate": 9.932350823764534e-05, "loss": 0.4749, "step": 1025 }, { "epoch": 1.5445991719984944, "grad_norm": 0.2380169779062271, "learning_rate": 9.932093517915546e-05, "loss": 0.473, "step": 1026 }, { "epoch": 1.5461046292811442, "grad_norm": 0.26097723841667175, "learning_rate": 9.931835727379474e-05, "loss": 0.4835, "step": 1027 }, { "epoch": 1.5476100865637936, "grad_norm": 0.3370758891105652, "learning_rate": 9.931577452184512e-05, "loss": 0.4438, "step": 1028 }, { "epoch": 1.5491155438464435, "grad_norm": 0.321050226688385, "learning_rate": 9.931318692358901e-05, "loss": 0.4517, "step": 1029 }, { "epoch": 1.5506210011290928, "grad_norm": 0.3013628125190735, "learning_rate": 9.93105944793094e-05, "loss": 0.4254, "step": 1030 }, { "epoch": 1.5521264584117427, "grad_norm": 0.29696887731552124, "learning_rate": 9.93079971892898e-05, "loss": 0.4483, "step": 1031 }, { "epoch": 1.553631915694392, "grad_norm": 0.2591365575790405, "learning_rate": 9.930539505381426e-05, "loss": 0.4413, "step": 1032 }, { "epoch": 1.555137372977042, "grad_norm": 0.20795929431915283, "learning_rate": 9.930278807316729e-05, "loss": 0.3941, "step": 1033 }, { "epoch": 1.5566428302596913, "grad_norm": 0.2205631285905838, "learning_rate": 9.930017624763406e-05, "loss": 0.4407, "step": 1034 }, { "epoch": 1.5581482875423411, "grad_norm": 0.2821867763996124, "learning_rate": 9.929755957750015e-05, "loss": 0.445, "step": 1035 }, { "epoch": 1.5596537448249905, "grad_norm": 0.24069561064243317, "learning_rate": 9.929493806305173e-05, "loss": 0.475, "step": 1036 }, { "epoch": 1.5611592021076401, "grad_norm": 0.22999835014343262, "learning_rate": 9.929231170457546e-05, "loss": 0.3955, "step": 1037 }, { "epoch": 1.5626646593902898, "grad_norm": 0.30125170946121216, "learning_rate": 9.928968050235861e-05, "loss": 0.5264, "step": 1038 }, { "epoch": 1.5641701166729394, "grad_norm": 0.266787588596344, "learning_rate": 9.928704445668886e-05, "loss": 0.5201, "step": 1039 }, { "epoch": 1.565675573955589, "grad_norm": 0.26023849844932556, "learning_rate": 9.928440356785453e-05, "loss": 0.4492, "step": 1040 }, { "epoch": 1.5671810312382386, "grad_norm": 0.3732949197292328, "learning_rate": 9.928175783614438e-05, "loss": 0.4552, "step": 1041 }, { "epoch": 1.5686864885208882, "grad_norm": 0.35339727997779846, "learning_rate": 9.92791072618478e-05, "loss": 0.4104, "step": 1042 }, { "epoch": 1.5701919458035378, "grad_norm": 0.3114206790924072, "learning_rate": 9.927645184525462e-05, "loss": 0.4446, "step": 1043 }, { "epoch": 1.5716974030861874, "grad_norm": 0.3709624707698822, "learning_rate": 9.92737915866552e-05, "loss": 0.473, "step": 1044 }, { "epoch": 1.573202860368837, "grad_norm": 0.28377363085746765, "learning_rate": 9.927112648634053e-05, "loss": 0.4815, "step": 1045 }, { "epoch": 1.5747083176514867, "grad_norm": 0.33494940400123596, "learning_rate": 9.926845654460202e-05, "loss": 0.4737, "step": 1046 }, { "epoch": 1.5762137749341363, "grad_norm": 0.3575019836425781, "learning_rate": 9.926578176173166e-05, "loss": 0.4364, "step": 1047 }, { "epoch": 1.577719232216786, "grad_norm": 0.3405434787273407, "learning_rate": 9.926310213802196e-05, "loss": 0.5132, "step": 1048 }, { "epoch": 1.5792246894994355, "grad_norm": 0.3129113018512726, "learning_rate": 9.926041767376594e-05, "loss": 0.4749, "step": 1049 }, { "epoch": 1.5807301467820851, "grad_norm": 0.2524198293685913, "learning_rate": 9.92577283692572e-05, "loss": 0.4668, "step": 1050 }, { "epoch": 1.5822356040647345, "grad_norm": 0.275818794965744, "learning_rate": 9.925503422478984e-05, "loss": 0.5083, "step": 1051 }, { "epoch": 1.5837410613473843, "grad_norm": 0.2816396951675415, "learning_rate": 9.925233524065847e-05, "loss": 0.5051, "step": 1052 }, { "epoch": 1.5852465186300337, "grad_norm": 0.2969379127025604, "learning_rate": 9.924963141715824e-05, "loss": 0.4371, "step": 1053 }, { "epoch": 1.5867519759126836, "grad_norm": 0.24459819495677948, "learning_rate": 9.924692275458485e-05, "loss": 0.4635, "step": 1054 }, { "epoch": 1.588257433195333, "grad_norm": 0.21884480118751526, "learning_rate": 9.92442092532345e-05, "loss": 0.4105, "step": 1055 }, { "epoch": 1.5897628904779828, "grad_norm": 0.2509744167327881, "learning_rate": 9.924149091340397e-05, "loss": 0.4514, "step": 1056 }, { "epoch": 1.5912683477606322, "grad_norm": 0.27836376428604126, "learning_rate": 9.92387677353905e-05, "loss": 0.465, "step": 1057 }, { "epoch": 1.592773805043282, "grad_norm": 0.2746243476867676, "learning_rate": 9.923603971949189e-05, "loss": 0.4575, "step": 1058 }, { "epoch": 1.5942792623259314, "grad_norm": 0.26370930671691895, "learning_rate": 9.92333068660065e-05, "loss": 0.4723, "step": 1059 }, { "epoch": 1.5957847196085813, "grad_norm": 0.2219294309616089, "learning_rate": 9.923056917523317e-05, "loss": 0.524, "step": 1060 }, { "epoch": 1.5972901768912307, "grad_norm": 0.23850548267364502, "learning_rate": 9.922782664747129e-05, "loss": 0.4744, "step": 1061 }, { "epoch": 1.5987956341738803, "grad_norm": 0.25547099113464355, "learning_rate": 9.92250792830208e-05, "loss": 0.4442, "step": 1062 }, { "epoch": 1.6003010914565299, "grad_norm": 0.25148311257362366, "learning_rate": 9.92223270821821e-05, "loss": 0.3729, "step": 1063 }, { "epoch": 1.6018065487391795, "grad_norm": 0.28833600878715515, "learning_rate": 9.921957004525622e-05, "loss": 0.4304, "step": 1064 }, { "epoch": 1.603312006021829, "grad_norm": 0.2616676092147827, "learning_rate": 9.921680817254465e-05, "loss": 0.4566, "step": 1065 }, { "epoch": 1.6048174633044787, "grad_norm": 0.2326199859380722, "learning_rate": 9.92140414643494e-05, "loss": 0.4111, "step": 1066 }, { "epoch": 1.6063229205871283, "grad_norm": 0.2307058721780777, "learning_rate": 9.921126992097306e-05, "loss": 0.4135, "step": 1067 }, { "epoch": 1.607828377869778, "grad_norm": 0.21174857020378113, "learning_rate": 9.92084935427187e-05, "loss": 0.4858, "step": 1068 }, { "epoch": 1.6093338351524276, "grad_norm": 0.21527734398841858, "learning_rate": 9.920571232988996e-05, "loss": 0.4789, "step": 1069 }, { "epoch": 1.6108392924350772, "grad_norm": 0.199984610080719, "learning_rate": 9.920292628279099e-05, "loss": 0.4435, "step": 1070 }, { "epoch": 1.6123447497177268, "grad_norm": 0.19366131722927094, "learning_rate": 9.920013540172645e-05, "loss": 0.439, "step": 1071 }, { "epoch": 1.6138502070003764, "grad_norm": 0.26058319211006165, "learning_rate": 9.919733968700157e-05, "loss": 0.4868, "step": 1072 }, { "epoch": 1.615355664283026, "grad_norm": 0.31353676319122314, "learning_rate": 9.919453913892208e-05, "loss": 0.4143, "step": 1073 }, { "epoch": 1.6168611215656754, "grad_norm": 0.345813512802124, "learning_rate": 9.91917337577942e-05, "loss": 0.5171, "step": 1074 }, { "epoch": 1.6183665788483252, "grad_norm": 0.34476518630981445, "learning_rate": 9.918892354392477e-05, "loss": 0.505, "step": 1075 }, { "epoch": 1.6198720361309746, "grad_norm": 0.2806585729122162, "learning_rate": 9.91861084976211e-05, "loss": 0.431, "step": 1076 }, { "epoch": 1.6213774934136245, "grad_norm": 0.27474331855773926, "learning_rate": 9.918328861919104e-05, "loss": 0.4703, "step": 1077 }, { "epoch": 1.6228829506962739, "grad_norm": 0.2791348695755005, "learning_rate": 9.918046390894297e-05, "loss": 0.4578, "step": 1078 }, { "epoch": 1.6243884079789237, "grad_norm": 0.25129592418670654, "learning_rate": 9.917763436718579e-05, "loss": 0.4566, "step": 1079 }, { "epoch": 1.625893865261573, "grad_norm": 0.254594624042511, "learning_rate": 9.917479999422893e-05, "loss": 0.4652, "step": 1080 }, { "epoch": 1.627399322544223, "grad_norm": 0.20884394645690918, "learning_rate": 9.917196079038237e-05, "loss": 0.4456, "step": 1081 }, { "epoch": 1.6289047798268723, "grad_norm": 0.2145880162715912, "learning_rate": 9.916911675595656e-05, "loss": 0.4179, "step": 1082 }, { "epoch": 1.6304102371095222, "grad_norm": 0.23625211417675018, "learning_rate": 9.916626789126256e-05, "loss": 0.4387, "step": 1083 }, { "epoch": 1.6319156943921715, "grad_norm": 0.22156427800655365, "learning_rate": 9.916341419661193e-05, "loss": 0.5068, "step": 1084 }, { "epoch": 1.6334211516748214, "grad_norm": 0.24632206559181213, "learning_rate": 9.91605556723167e-05, "loss": 0.4026, "step": 1085 }, { "epoch": 1.6349266089574708, "grad_norm": 0.2571054995059967, "learning_rate": 9.91576923186895e-05, "loss": 0.3399, "step": 1086 }, { "epoch": 1.6364320662401204, "grad_norm": 0.2627997100353241, "learning_rate": 9.915482413604347e-05, "loss": 0.5007, "step": 1087 }, { "epoch": 1.63793752352277, "grad_norm": 0.2298496514558792, "learning_rate": 9.915195112469226e-05, "loss": 0.4436, "step": 1088 }, { "epoch": 1.6394429808054196, "grad_norm": 0.24908208847045898, "learning_rate": 9.914907328495003e-05, "loss": 0.4358, "step": 1089 }, { "epoch": 1.6409484380880692, "grad_norm": 0.24429230391979218, "learning_rate": 9.914619061713154e-05, "loss": 0.4334, "step": 1090 }, { "epoch": 1.6424538953707188, "grad_norm": 0.2670398950576782, "learning_rate": 9.914330312155202e-05, "loss": 0.3814, "step": 1091 }, { "epoch": 1.6439593526533685, "grad_norm": 0.2707434594631195, "learning_rate": 9.914041079852724e-05, "loss": 0.4363, "step": 1092 }, { "epoch": 1.645464809936018, "grad_norm": 0.24101929366588593, "learning_rate": 9.913751364837349e-05, "loss": 0.356, "step": 1093 }, { "epoch": 1.6469702672186677, "grad_norm": 0.23632732033729553, "learning_rate": 9.91346116714076e-05, "loss": 0.4443, "step": 1094 }, { "epoch": 1.6484757245013173, "grad_norm": 0.24018539488315582, "learning_rate": 9.913170486794697e-05, "loss": 0.4631, "step": 1095 }, { "epoch": 1.649981181783967, "grad_norm": 0.22361794114112854, "learning_rate": 9.91287932383094e-05, "loss": 0.4635, "step": 1096 }, { "epoch": 1.6514866390666165, "grad_norm": 0.2329414039850235, "learning_rate": 9.912587678281338e-05, "loss": 0.4194, "step": 1097 }, { "epoch": 1.6529920963492661, "grad_norm": 0.2588191032409668, "learning_rate": 9.91229555017778e-05, "loss": 0.3684, "step": 1098 }, { "epoch": 1.6544975536319155, "grad_norm": 0.2974998950958252, "learning_rate": 9.912002939552215e-05, "loss": 0.4435, "step": 1099 }, { "epoch": 1.6560030109145654, "grad_norm": 0.30447155237197876, "learning_rate": 9.911709846436641e-05, "loss": 0.373, "step": 1100 }, { "epoch": 1.6575084681972148, "grad_norm": 0.33181530237197876, "learning_rate": 9.911416270863113e-05, "loss": 0.4895, "step": 1101 }, { "epoch": 1.6590139254798646, "grad_norm": 0.3561093211174011, "learning_rate": 9.911122212863734e-05, "loss": 0.4256, "step": 1102 }, { "epoch": 1.660519382762514, "grad_norm": 0.3877561390399933, "learning_rate": 9.91082767247066e-05, "loss": 0.4461, "step": 1103 }, { "epoch": 1.6620248400451638, "grad_norm": 0.4682588577270508, "learning_rate": 9.910532649716105e-05, "loss": 0.4333, "step": 1104 }, { "epoch": 1.6635302973278132, "grad_norm": 0.433292418718338, "learning_rate": 9.91023714463233e-05, "loss": 0.3645, "step": 1105 }, { "epoch": 1.665035754610463, "grad_norm": 0.2796177566051483, "learning_rate": 9.909941157251651e-05, "loss": 0.3834, "step": 1106 }, { "epoch": 1.6665412118931124, "grad_norm": 0.28378307819366455, "learning_rate": 9.909644687606438e-05, "loss": 0.4625, "step": 1107 }, { "epoch": 1.6680466691757623, "grad_norm": 0.3089815080165863, "learning_rate": 9.909347735729111e-05, "loss": 0.4298, "step": 1108 }, { "epoch": 1.6695521264584117, "grad_norm": 0.2811943590641022, "learning_rate": 9.909050301652145e-05, "loss": 0.454, "step": 1109 }, { "epoch": 1.6710575837410615, "grad_norm": 0.23330336809158325, "learning_rate": 9.908752385408067e-05, "loss": 0.3847, "step": 1110 }, { "epoch": 1.672563041023711, "grad_norm": 0.27226200699806213, "learning_rate": 9.908453987029459e-05, "loss": 0.4406, "step": 1111 }, { "epoch": 1.6740684983063605, "grad_norm": 0.27582305669784546, "learning_rate": 9.908155106548947e-05, "loss": 0.4982, "step": 1112 }, { "epoch": 1.6755739555890101, "grad_norm": 0.28410303592681885, "learning_rate": 9.907855743999223e-05, "loss": 0.4921, "step": 1113 }, { "epoch": 1.6770794128716597, "grad_norm": 0.35232239961624146, "learning_rate": 9.90755589941302e-05, "loss": 0.5054, "step": 1114 }, { "epoch": 1.6785848701543093, "grad_norm": 0.35414817929267883, "learning_rate": 9.907255572823133e-05, "loss": 0.3757, "step": 1115 }, { "epoch": 1.680090327436959, "grad_norm": 0.34720227122306824, "learning_rate": 9.906954764262401e-05, "loss": 0.4302, "step": 1116 }, { "epoch": 1.6815957847196086, "grad_norm": 0.26937606930732727, "learning_rate": 9.90665347376372e-05, "loss": 0.4901, "step": 1117 }, { "epoch": 1.6831012420022582, "grad_norm": 0.2735351324081421, "learning_rate": 9.906351701360044e-05, "loss": 0.4288, "step": 1118 }, { "epoch": 1.6846066992849078, "grad_norm": 0.304582417011261, "learning_rate": 9.90604944708437e-05, "loss": 0.4561, "step": 1119 }, { "epoch": 1.6861121565675574, "grad_norm": 0.33186671137809753, "learning_rate": 9.905746710969752e-05, "loss": 0.4063, "step": 1120 }, { "epoch": 1.687617613850207, "grad_norm": 0.2617332637310028, "learning_rate": 9.905443493049296e-05, "loss": 0.4155, "step": 1121 }, { "epoch": 1.6891230711328566, "grad_norm": 0.26704853773117065, "learning_rate": 9.905139793356167e-05, "loss": 0.4944, "step": 1122 }, { "epoch": 1.6906285284155063, "grad_norm": 0.2655688524246216, "learning_rate": 9.904835611923571e-05, "loss": 0.41, "step": 1123 }, { "epoch": 1.6921339856981557, "grad_norm": 0.24907462298870087, "learning_rate": 9.904530948784774e-05, "loss": 0.4584, "step": 1124 }, { "epoch": 1.6936394429808055, "grad_norm": 0.2454483062028885, "learning_rate": 9.904225803973094e-05, "loss": 0.4661, "step": 1125 }, { "epoch": 1.6951449002634549, "grad_norm": 0.22096757590770721, "learning_rate": 9.903920177521906e-05, "loss": 0.4817, "step": 1126 }, { "epoch": 1.6966503575461047, "grad_norm": 0.22758568823337555, "learning_rate": 9.903614069464625e-05, "loss": 0.4039, "step": 1127 }, { "epoch": 1.698155814828754, "grad_norm": 0.24278514087200165, "learning_rate": 9.903307479834731e-05, "loss": 0.4408, "step": 1128 }, { "epoch": 1.699661272111404, "grad_norm": 0.2509666681289673, "learning_rate": 9.903000408665752e-05, "loss": 0.5063, "step": 1129 }, { "epoch": 1.7011667293940533, "grad_norm": 0.20750530064105988, "learning_rate": 9.902692855991266e-05, "loss": 0.446, "step": 1130 }, { "epoch": 1.7026721866767032, "grad_norm": 0.22669030725955963, "learning_rate": 9.902384821844911e-05, "loss": 0.4102, "step": 1131 }, { "epoch": 1.7041776439593526, "grad_norm": 0.24241143465042114, "learning_rate": 9.90207630626037e-05, "loss": 0.4972, "step": 1132 }, { "epoch": 1.7056831012420024, "grad_norm": 0.2509285509586334, "learning_rate": 9.901767309271383e-05, "loss": 0.5196, "step": 1133 }, { "epoch": 1.7071885585246518, "grad_norm": 0.28784653544425964, "learning_rate": 9.901457830911739e-05, "loss": 0.4425, "step": 1134 }, { "epoch": 1.7086940158073016, "grad_norm": 0.23060482740402222, "learning_rate": 9.901147871215286e-05, "loss": 0.3952, "step": 1135 }, { "epoch": 1.710199473089951, "grad_norm": 0.21818314492702484, "learning_rate": 9.900837430215918e-05, "loss": 0.4538, "step": 1136 }, { "epoch": 1.7117049303726006, "grad_norm": 0.25537538528442383, "learning_rate": 9.900526507947584e-05, "loss": 0.4143, "step": 1137 }, { "epoch": 1.7132103876552502, "grad_norm": 0.23999719321727753, "learning_rate": 9.90021510444429e-05, "loss": 0.4843, "step": 1138 }, { "epoch": 1.7147158449378999, "grad_norm": 0.23636163771152496, "learning_rate": 9.899903219740087e-05, "loss": 0.3895, "step": 1139 }, { "epoch": 1.7162213022205495, "grad_norm": 0.2590572237968445, "learning_rate": 9.899590853869082e-05, "loss": 0.3938, "step": 1140 }, { "epoch": 1.717726759503199, "grad_norm": 0.28640514612197876, "learning_rate": 9.899278006865437e-05, "loss": 0.3552, "step": 1141 }, { "epoch": 1.7192322167858487, "grad_norm": 0.2770548164844513, "learning_rate": 9.898964678763362e-05, "loss": 0.4719, "step": 1142 }, { "epoch": 1.7207376740684983, "grad_norm": 0.2775212228298187, "learning_rate": 9.898650869597124e-05, "loss": 0.4027, "step": 1143 }, { "epoch": 1.722243131351148, "grad_norm": 0.2430652230978012, "learning_rate": 9.898336579401042e-05, "loss": 0.4822, "step": 1144 }, { "epoch": 1.7237485886337975, "grad_norm": 0.2564743161201477, "learning_rate": 9.898021808209483e-05, "loss": 0.3857, "step": 1145 }, { "epoch": 1.7252540459164472, "grad_norm": 0.38059455156326294, "learning_rate": 9.897706556056872e-05, "loss": 0.4457, "step": 1146 }, { "epoch": 1.7267595031990968, "grad_norm": 0.471281498670578, "learning_rate": 9.897390822977682e-05, "loss": 0.4539, "step": 1147 }, { "epoch": 1.7282649604817464, "grad_norm": 0.4874376356601715, "learning_rate": 9.897074609006444e-05, "loss": 0.498, "step": 1148 }, { "epoch": 1.7297704177643958, "grad_norm": 0.4072951376438141, "learning_rate": 9.896757914177738e-05, "loss": 0.4603, "step": 1149 }, { "epoch": 1.7312758750470456, "grad_norm": 0.26683565974235535, "learning_rate": 9.896440738526198e-05, "loss": 0.4438, "step": 1150 }, { "epoch": 1.732781332329695, "grad_norm": 0.3107357323169708, "learning_rate": 9.896123082086507e-05, "loss": 0.3893, "step": 1151 }, { "epoch": 1.7342867896123448, "grad_norm": 0.33137500286102295, "learning_rate": 9.895804944893407e-05, "loss": 0.4102, "step": 1152 }, { "epoch": 1.7357922468949942, "grad_norm": 0.26062411069869995, "learning_rate": 9.895486326981684e-05, "loss": 0.451, "step": 1153 }, { "epoch": 1.737297704177644, "grad_norm": 0.2631147801876068, "learning_rate": 9.895167228386188e-05, "loss": 0.5227, "step": 1154 }, { "epoch": 1.7388031614602935, "grad_norm": 0.27147579193115234, "learning_rate": 9.89484764914181e-05, "loss": 0.4389, "step": 1155 }, { "epoch": 1.7403086187429433, "grad_norm": 0.2434566766023636, "learning_rate": 9.894527589283501e-05, "loss": 0.3779, "step": 1156 }, { "epoch": 1.7418140760255927, "grad_norm": 0.2360423058271408, "learning_rate": 9.894207048846263e-05, "loss": 0.4115, "step": 1157 }, { "epoch": 1.7433195333082425, "grad_norm": 0.23321694135665894, "learning_rate": 9.893886027865148e-05, "loss": 0.4121, "step": 1158 }, { "epoch": 1.744824990590892, "grad_norm": 0.219273641705513, "learning_rate": 9.893564526375263e-05, "loss": 0.3758, "step": 1159 }, { "epoch": 1.7463304478735417, "grad_norm": 0.22814658284187317, "learning_rate": 9.893242544411769e-05, "loss": 0.4678, "step": 1160 }, { "epoch": 1.7478359051561911, "grad_norm": 0.2296743541955948, "learning_rate": 9.892920082009872e-05, "loss": 0.3462, "step": 1161 }, { "epoch": 1.7493413624388408, "grad_norm": 0.2289915680885315, "learning_rate": 9.892597139204842e-05, "loss": 0.4221, "step": 1162 }, { "epoch": 1.7508468197214904, "grad_norm": 0.2313208431005478, "learning_rate": 9.892273716031991e-05, "loss": 0.4335, "step": 1163 }, { "epoch": 1.75235227700414, "grad_norm": 0.2224988490343094, "learning_rate": 9.891949812526691e-05, "loss": 0.418, "step": 1164 }, { "epoch": 1.7538577342867896, "grad_norm": 0.23870526254177094, "learning_rate": 9.891625428724363e-05, "loss": 0.5147, "step": 1165 }, { "epoch": 1.7553631915694392, "grad_norm": 0.26446911692619324, "learning_rate": 9.891300564660481e-05, "loss": 0.3786, "step": 1166 }, { "epoch": 1.7568686488520888, "grad_norm": 0.2514936625957489, "learning_rate": 9.890975220370572e-05, "loss": 0.385, "step": 1167 }, { "epoch": 1.7583741061347384, "grad_norm": 0.216860830783844, "learning_rate": 9.890649395890213e-05, "loss": 0.4312, "step": 1168 }, { "epoch": 1.759879563417388, "grad_norm": 0.21065756678581238, "learning_rate": 9.890323091255039e-05, "loss": 0.4788, "step": 1169 }, { "epoch": 1.7613850207000377, "grad_norm": 0.23322618007659912, "learning_rate": 9.889996306500732e-05, "loss": 0.5087, "step": 1170 }, { "epoch": 1.7628904779826873, "grad_norm": 0.20407578349113464, "learning_rate": 9.889669041663029e-05, "loss": 0.3571, "step": 1171 }, { "epoch": 1.764395935265337, "grad_norm": 0.2036370486021042, "learning_rate": 9.889341296777719e-05, "loss": 0.354, "step": 1172 }, { "epoch": 1.7659013925479865, "grad_norm": 0.23351651430130005, "learning_rate": 9.889013071880645e-05, "loss": 0.4723, "step": 1173 }, { "epoch": 1.767406849830636, "grad_norm": 0.21483919024467468, "learning_rate": 9.888684367007698e-05, "loss": 0.4069, "step": 1174 }, { "epoch": 1.7689123071132857, "grad_norm": 0.26591092348098755, "learning_rate": 9.888355182194829e-05, "loss": 0.3727, "step": 1175 }, { "epoch": 1.7704177643959351, "grad_norm": 0.2989531457424164, "learning_rate": 9.888025517478034e-05, "loss": 0.4573, "step": 1176 }, { "epoch": 1.771923221678585, "grad_norm": 0.36838579177856445, "learning_rate": 9.887695372893367e-05, "loss": 0.4742, "step": 1177 }, { "epoch": 1.7734286789612344, "grad_norm": 0.3513825237751007, "learning_rate": 9.887364748476929e-05, "loss": 0.3719, "step": 1178 }, { "epoch": 1.7749341362438842, "grad_norm": 0.3293044865131378, "learning_rate": 9.88703364426488e-05, "loss": 0.3958, "step": 1179 }, { "epoch": 1.7764395935265336, "grad_norm": 0.3458506464958191, "learning_rate": 9.886702060293428e-05, "loss": 0.4622, "step": 1180 }, { "epoch": 1.7779450508091834, "grad_norm": 0.325515478849411, "learning_rate": 9.886369996598832e-05, "loss": 0.4655, "step": 1181 }, { "epoch": 1.7794505080918328, "grad_norm": 0.2946903109550476, "learning_rate": 9.88603745321741e-05, "loss": 0.4004, "step": 1182 }, { "epoch": 1.7809559653744826, "grad_norm": 0.3090858459472656, "learning_rate": 9.885704430185525e-05, "loss": 0.3796, "step": 1183 }, { "epoch": 1.782461422657132, "grad_norm": 0.24113187193870544, "learning_rate": 9.885370927539598e-05, "loss": 0.3906, "step": 1184 }, { "epoch": 1.7839668799397819, "grad_norm": 0.261758029460907, "learning_rate": 9.885036945316098e-05, "loss": 0.4295, "step": 1185 }, { "epoch": 1.7854723372224313, "grad_norm": 0.27938076853752136, "learning_rate": 9.884702483551553e-05, "loss": 0.4165, "step": 1186 }, { "epoch": 1.7869777945050809, "grad_norm": 0.23691590130329132, "learning_rate": 9.884367542282534e-05, "loss": 0.4285, "step": 1187 }, { "epoch": 1.7884832517877305, "grad_norm": 0.22896534204483032, "learning_rate": 9.884032121545675e-05, "loss": 0.3839, "step": 1188 }, { "epoch": 1.78998870907038, "grad_norm": 0.33887365460395813, "learning_rate": 9.883696221377653e-05, "loss": 0.382, "step": 1189 }, { "epoch": 1.7914941663530297, "grad_norm": 0.29227349162101746, "learning_rate": 9.883359841815203e-05, "loss": 0.4142, "step": 1190 }, { "epoch": 1.7929996236356793, "grad_norm": 0.287222683429718, "learning_rate": 9.883022982895113e-05, "loss": 0.44, "step": 1191 }, { "epoch": 1.794505080918329, "grad_norm": 0.3389644920825958, "learning_rate": 9.882685644654218e-05, "loss": 0.4649, "step": 1192 }, { "epoch": 1.7960105382009786, "grad_norm": 0.3566650152206421, "learning_rate": 9.88234782712941e-05, "loss": 0.4644, "step": 1193 }, { "epoch": 1.7975159954836282, "grad_norm": 0.3279627561569214, "learning_rate": 9.882009530357632e-05, "loss": 0.4143, "step": 1194 }, { "epoch": 1.7990214527662778, "grad_norm": 0.28983160853385925, "learning_rate": 9.88167075437588e-05, "loss": 0.4569, "step": 1195 }, { "epoch": 1.8005269100489274, "grad_norm": 0.287333607673645, "learning_rate": 9.881331499221202e-05, "loss": 0.4353, "step": 1196 }, { "epoch": 1.802032367331577, "grad_norm": 0.2719230651855469, "learning_rate": 9.880991764930699e-05, "loss": 0.3661, "step": 1197 }, { "epoch": 1.8035378246142266, "grad_norm": 0.3037194311618805, "learning_rate": 9.880651551541523e-05, "loss": 0.3892, "step": 1198 }, { "epoch": 1.805043281896876, "grad_norm": 0.2856713831424713, "learning_rate": 9.88031085909088e-05, "loss": 0.4643, "step": 1199 }, { "epoch": 1.8065487391795259, "grad_norm": 0.24926087260246277, "learning_rate": 9.879969687616027e-05, "loss": 0.3835, "step": 1200 }, { "epoch": 1.8065487391795259, "eval_loss": 0.4038221836090088, "eval_runtime": 560.6214, "eval_samples_per_second": 17.172, "eval_steps_per_second": 0.537, "step": 1200 }, { "epoch": 1.8080541964621752, "grad_norm": 0.26019471883773804, "learning_rate": 9.879628037154274e-05, "loss": 0.5049, "step": 1201 }, { "epoch": 1.809559653744825, "grad_norm": 0.2746449410915375, "learning_rate": 9.879285907742984e-05, "loss": 0.3606, "step": 1202 }, { "epoch": 1.8110651110274745, "grad_norm": 0.3146328032016754, "learning_rate": 9.878943299419571e-05, "loss": 0.4415, "step": 1203 }, { "epoch": 1.8125705683101243, "grad_norm": 0.21940657496452332, "learning_rate": 9.8786002122215e-05, "loss": 0.4708, "step": 1204 }, { "epoch": 1.8140760255927737, "grad_norm": 0.22409531474113464, "learning_rate": 9.878256646186298e-05, "loss": 0.4324, "step": 1205 }, { "epoch": 1.8155814828754235, "grad_norm": 0.26370349526405334, "learning_rate": 9.877912601351527e-05, "loss": 0.4378, "step": 1206 }, { "epoch": 1.817086940158073, "grad_norm": 0.32071158289909363, "learning_rate": 9.877568077754819e-05, "loss": 0.3525, "step": 1207 }, { "epoch": 1.8185923974407228, "grad_norm": 0.31463703513145447, "learning_rate": 9.877223075433844e-05, "loss": 0.4335, "step": 1208 }, { "epoch": 1.8200978547233722, "grad_norm": 0.24507051706314087, "learning_rate": 9.876877594426339e-05, "loss": 0.3842, "step": 1209 }, { "epoch": 1.821603312006022, "grad_norm": 0.27356618642807007, "learning_rate": 9.876531634770078e-05, "loss": 0.4031, "step": 1210 }, { "epoch": 1.8231087692886714, "grad_norm": 0.3212708532810211, "learning_rate": 9.876185196502899e-05, "loss": 0.4408, "step": 1211 }, { "epoch": 1.824614226571321, "grad_norm": 0.3205471634864807, "learning_rate": 9.875838279662685e-05, "loss": 0.4213, "step": 1212 }, { "epoch": 1.8261196838539706, "grad_norm": 0.24508042633533478, "learning_rate": 9.875490884287377e-05, "loss": 0.4157, "step": 1213 }, { "epoch": 1.8276251411366202, "grad_norm": 0.26109784841537476, "learning_rate": 9.875143010414965e-05, "loss": 0.4123, "step": 1214 }, { "epoch": 1.8291305984192698, "grad_norm": 0.30436643958091736, "learning_rate": 9.874794658083488e-05, "loss": 0.4112, "step": 1215 }, { "epoch": 1.8306360557019195, "grad_norm": 0.3066383898258209, "learning_rate": 9.874445827331047e-05, "loss": 0.4373, "step": 1216 }, { "epoch": 1.832141512984569, "grad_norm": 0.3142216205596924, "learning_rate": 9.874096518195788e-05, "loss": 0.4317, "step": 1217 }, { "epoch": 1.8336469702672187, "grad_norm": 0.2768400311470032, "learning_rate": 9.873746730715909e-05, "loss": 0.4253, "step": 1218 }, { "epoch": 1.8351524275498683, "grad_norm": 0.2386002540588379, "learning_rate": 9.873396464929663e-05, "loss": 0.406, "step": 1219 }, { "epoch": 1.836657884832518, "grad_norm": 0.2523922920227051, "learning_rate": 9.873045720875356e-05, "loss": 0.4504, "step": 1220 }, { "epoch": 1.8381633421151675, "grad_norm": 0.3017627000808716, "learning_rate": 9.872694498591342e-05, "loss": 0.4184, "step": 1221 }, { "epoch": 1.8396687993978171, "grad_norm": 0.32137924432754517, "learning_rate": 9.872342798116033e-05, "loss": 0.4457, "step": 1222 }, { "epoch": 1.8411742566804667, "grad_norm": 0.28638869524002075, "learning_rate": 9.87199061948789e-05, "loss": 0.3997, "step": 1223 }, { "epoch": 1.8426797139631161, "grad_norm": 0.26857733726501465, "learning_rate": 9.871637962745425e-05, "loss": 0.3866, "step": 1224 }, { "epoch": 1.844185171245766, "grad_norm": 0.26952260732650757, "learning_rate": 9.871284827927205e-05, "loss": 0.3718, "step": 1225 }, { "epoch": 1.8456906285284154, "grad_norm": 0.2365962117910385, "learning_rate": 9.870931215071849e-05, "loss": 0.4273, "step": 1226 }, { "epoch": 1.8471960858110652, "grad_norm": 0.24911633133888245, "learning_rate": 9.870577124218027e-05, "loss": 0.4534, "step": 1227 }, { "epoch": 1.8487015430937146, "grad_norm": 0.2673266530036926, "learning_rate": 9.87022255540446e-05, "loss": 0.412, "step": 1228 }, { "epoch": 1.8502070003763644, "grad_norm": 0.2779243588447571, "learning_rate": 9.869867508669927e-05, "loss": 0.3929, "step": 1229 }, { "epoch": 1.8517124576590138, "grad_norm": 0.29756593704223633, "learning_rate": 9.869511984053252e-05, "loss": 0.4085, "step": 1230 }, { "epoch": 1.8532179149416637, "grad_norm": 0.29312393069267273, "learning_rate": 9.869155981593317e-05, "loss": 0.4147, "step": 1231 }, { "epoch": 1.854723372224313, "grad_norm": 0.3219340741634369, "learning_rate": 9.868799501329051e-05, "loss": 0.4319, "step": 1232 }, { "epoch": 1.8562288295069629, "grad_norm": 0.2990373969078064, "learning_rate": 9.868442543299442e-05, "loss": 0.4566, "step": 1233 }, { "epoch": 1.8577342867896123, "grad_norm": 0.28145846724510193, "learning_rate": 9.868085107543523e-05, "loss": 0.4402, "step": 1234 }, { "epoch": 1.8592397440722621, "grad_norm": 0.26857811212539673, "learning_rate": 9.867727194100384e-05, "loss": 0.4034, "step": 1235 }, { "epoch": 1.8607452013549115, "grad_norm": 0.3374073803424835, "learning_rate": 9.867368803009166e-05, "loss": 0.4384, "step": 1236 }, { "epoch": 1.8622506586375611, "grad_norm": 0.27892202138900757, "learning_rate": 9.867009934309063e-05, "loss": 0.4094, "step": 1237 }, { "epoch": 1.8637561159202107, "grad_norm": 0.2380436211824417, "learning_rate": 9.866650588039318e-05, "loss": 0.4727, "step": 1238 }, { "epoch": 1.8652615732028603, "grad_norm": 0.2664106488227844, "learning_rate": 9.86629076423923e-05, "loss": 0.4709, "step": 1239 }, { "epoch": 1.86676703048551, "grad_norm": 0.2821607291698456, "learning_rate": 9.86593046294815e-05, "loss": 0.3967, "step": 1240 }, { "epoch": 1.8682724877681596, "grad_norm": 0.290093332529068, "learning_rate": 9.865569684205477e-05, "loss": 0.4332, "step": 1241 }, { "epoch": 1.8697779450508092, "grad_norm": 0.2512272298336029, "learning_rate": 9.865208428050668e-05, "loss": 0.4817, "step": 1242 }, { "epoch": 1.8712834023334588, "grad_norm": 0.20119507610797882, "learning_rate": 9.864846694523227e-05, "loss": 0.4534, "step": 1243 }, { "epoch": 1.8727888596161084, "grad_norm": 0.2455272078514099, "learning_rate": 9.864484483662714e-05, "loss": 0.4038, "step": 1244 }, { "epoch": 1.874294316898758, "grad_norm": 0.23807786405086517, "learning_rate": 9.864121795508742e-05, "loss": 0.41, "step": 1245 }, { "epoch": 1.8757997741814076, "grad_norm": 0.23496729135513306, "learning_rate": 9.863758630100969e-05, "loss": 0.3999, "step": 1246 }, { "epoch": 1.8773052314640573, "grad_norm": 0.2831249237060547, "learning_rate": 9.863394987479114e-05, "loss": 0.4067, "step": 1247 }, { "epoch": 1.8788106887467069, "grad_norm": 0.3024727404117584, "learning_rate": 9.863030867682944e-05, "loss": 0.3757, "step": 1248 }, { "epoch": 1.8803161460293563, "grad_norm": 0.26520034670829773, "learning_rate": 9.862666270752277e-05, "loss": 0.433, "step": 1249 }, { "epoch": 1.881821603312006, "grad_norm": 0.25817811489105225, "learning_rate": 9.862301196726987e-05, "loss": 0.3826, "step": 1250 }, { "epoch": 1.8833270605946555, "grad_norm": 0.2960113286972046, "learning_rate": 9.861935645646997e-05, "loss": 0.3645, "step": 1251 }, { "epoch": 1.8848325178773053, "grad_norm": 0.2489994913339615, "learning_rate": 9.86156961755228e-05, "loss": 0.356, "step": 1252 }, { "epoch": 1.8863379751599547, "grad_norm": 0.2572759985923767, "learning_rate": 9.86120311248287e-05, "loss": 0.3921, "step": 1253 }, { "epoch": 1.8878434324426046, "grad_norm": 0.28790122270584106, "learning_rate": 9.860836130478844e-05, "loss": 0.4655, "step": 1254 }, { "epoch": 1.889348889725254, "grad_norm": 0.300534725189209, "learning_rate": 9.860468671580336e-05, "loss": 0.4406, "step": 1255 }, { "epoch": 1.8908543470079038, "grad_norm": 0.21623390913009644, "learning_rate": 9.860100735827528e-05, "loss": 0.3776, "step": 1256 }, { "epoch": 1.8923598042905532, "grad_norm": 0.27079424262046814, "learning_rate": 9.85973232326066e-05, "loss": 0.4298, "step": 1257 }, { "epoch": 1.893865261573203, "grad_norm": 0.2737237513065338, "learning_rate": 9.859363433920021e-05, "loss": 0.3805, "step": 1258 }, { "epoch": 1.8953707188558524, "grad_norm": 0.2831946909427643, "learning_rate": 9.85899406784595e-05, "loss": 0.4091, "step": 1259 }, { "epoch": 1.8968761761385022, "grad_norm": 0.28324785828590393, "learning_rate": 9.858624225078841e-05, "loss": 0.4238, "step": 1260 }, { "epoch": 1.8983816334211516, "grad_norm": 0.26971635222435, "learning_rate": 9.85825390565914e-05, "loss": 0.3633, "step": 1261 }, { "epoch": 1.8998870907038012, "grad_norm": 0.25808942317962646, "learning_rate": 9.857883109627344e-05, "loss": 0.3932, "step": 1262 }, { "epoch": 1.9013925479864509, "grad_norm": 0.2434474527835846, "learning_rate": 9.857511837024003e-05, "loss": 0.3483, "step": 1263 }, { "epoch": 1.9028980052691005, "grad_norm": 0.26245322823524475, "learning_rate": 9.857140087889719e-05, "loss": 0.3894, "step": 1264 }, { "epoch": 1.90440346255175, "grad_norm": 0.22110742330551147, "learning_rate": 9.856767862265147e-05, "loss": 0.5214, "step": 1265 }, { "epoch": 1.9059089198343997, "grad_norm": 0.22438175976276398, "learning_rate": 9.856395160190991e-05, "loss": 0.4367, "step": 1266 }, { "epoch": 1.9074143771170493, "grad_norm": 0.2613338530063629, "learning_rate": 9.85602198170801e-05, "loss": 0.4119, "step": 1267 }, { "epoch": 1.908919834399699, "grad_norm": 0.25597718358039856, "learning_rate": 9.855648326857015e-05, "loss": 0.375, "step": 1268 }, { "epoch": 1.9104252916823485, "grad_norm": 0.24269592761993408, "learning_rate": 9.855274195678868e-05, "loss": 0.4347, "step": 1269 }, { "epoch": 1.9119307489649982, "grad_norm": 0.4979152977466583, "learning_rate": 9.854899588214481e-05, "loss": 0.4746, "step": 1270 }, { "epoch": 1.9134362062476478, "grad_norm": 0.30758070945739746, "learning_rate": 9.854524504504824e-05, "loss": 0.3603, "step": 1271 }, { "epoch": 1.9149416635302974, "grad_norm": 0.2775554358959198, "learning_rate": 9.854148944590914e-05, "loss": 0.4584, "step": 1272 }, { "epoch": 1.916447120812947, "grad_norm": 0.29928916692733765, "learning_rate": 9.853772908513822e-05, "loss": 0.3367, "step": 1273 }, { "epoch": 1.9179525780955964, "grad_norm": 0.2377856969833374, "learning_rate": 9.853396396314669e-05, "loss": 0.4125, "step": 1274 }, { "epoch": 1.9194580353782462, "grad_norm": 0.2252822071313858, "learning_rate": 9.853019408034632e-05, "loss": 0.3367, "step": 1275 }, { "epoch": 1.9209634926608956, "grad_norm": 0.270648330450058, "learning_rate": 9.85264194371494e-05, "loss": 0.4088, "step": 1276 }, { "epoch": 1.9224689499435454, "grad_norm": 0.2713404595851898, "learning_rate": 9.852264003396866e-05, "loss": 0.4976, "step": 1277 }, { "epoch": 1.9239744072261948, "grad_norm": 0.34813836216926575, "learning_rate": 9.851885587121744e-05, "loss": 0.3883, "step": 1278 }, { "epoch": 1.9254798645088447, "grad_norm": 0.34717556834220886, "learning_rate": 9.851506694930958e-05, "loss": 0.4519, "step": 1279 }, { "epoch": 1.926985321791494, "grad_norm": 0.2991420030593872, "learning_rate": 9.851127326865942e-05, "loss": 0.4364, "step": 1280 }, { "epoch": 1.928490779074144, "grad_norm": 0.2942225933074951, "learning_rate": 9.850747482968184e-05, "loss": 0.447, "step": 1281 }, { "epoch": 1.9299962363567933, "grad_norm": 0.26032811403274536, "learning_rate": 9.850367163279222e-05, "loss": 0.4447, "step": 1282 }, { "epoch": 1.9315016936394431, "grad_norm": 0.2822900712490082, "learning_rate": 9.849986367840648e-05, "loss": 0.3704, "step": 1283 }, { "epoch": 1.9330071509220925, "grad_norm": 0.21571765840053558, "learning_rate": 9.849605096694105e-05, "loss": 0.3842, "step": 1284 }, { "epoch": 1.9345126082047424, "grad_norm": 0.2432664930820465, "learning_rate": 9.849223349881289e-05, "loss": 0.3904, "step": 1285 }, { "epoch": 1.9360180654873917, "grad_norm": 0.2353736311197281, "learning_rate": 9.848841127443944e-05, "loss": 0.4146, "step": 1286 }, { "epoch": 1.9375235227700414, "grad_norm": 0.22627313435077667, "learning_rate": 9.848458429423874e-05, "loss": 0.4383, "step": 1287 }, { "epoch": 1.939028980052691, "grad_norm": 0.2449052333831787, "learning_rate": 9.848075255862927e-05, "loss": 0.4456, "step": 1288 }, { "epoch": 1.9405344373353406, "grad_norm": 0.23998692631721497, "learning_rate": 9.847691606803006e-05, "loss": 0.3573, "step": 1289 }, { "epoch": 1.9420398946179902, "grad_norm": 0.22024226188659668, "learning_rate": 9.84730748228607e-05, "loss": 0.4036, "step": 1290 }, { "epoch": 1.9435453519006398, "grad_norm": 0.2584715485572815, "learning_rate": 9.846922882354123e-05, "loss": 0.3559, "step": 1291 }, { "epoch": 1.9450508091832894, "grad_norm": 0.25236913561820984, "learning_rate": 9.846537807049223e-05, "loss": 0.3912, "step": 1292 }, { "epoch": 1.946556266465939, "grad_norm": 0.2423625886440277, "learning_rate": 9.846152256413486e-05, "loss": 0.3892, "step": 1293 }, { "epoch": 1.9480617237485887, "grad_norm": 0.26535433530807495, "learning_rate": 9.845766230489071e-05, "loss": 0.41, "step": 1294 }, { "epoch": 1.9495671810312383, "grad_norm": 0.23453322052955627, "learning_rate": 9.845379729318196e-05, "loss": 0.3865, "step": 1295 }, { "epoch": 1.9510726383138879, "grad_norm": 0.22577714920043945, "learning_rate": 9.844992752943125e-05, "loss": 0.4419, "step": 1296 }, { "epoch": 1.9525780955965375, "grad_norm": 0.24640364944934845, "learning_rate": 9.844605301406181e-05, "loss": 0.4379, "step": 1297 }, { "epoch": 1.9540835528791871, "grad_norm": 0.2855130136013031, "learning_rate": 9.844217374749732e-05, "loss": 0.4442, "step": 1298 }, { "epoch": 1.9555890101618365, "grad_norm": 0.35312917828559875, "learning_rate": 9.843828973016204e-05, "loss": 0.4497, "step": 1299 }, { "epoch": 1.9570944674444863, "grad_norm": 0.46108150482177734, "learning_rate": 9.84344009624807e-05, "loss": 0.4656, "step": 1300 }, { "epoch": 1.9585999247271357, "grad_norm": 0.488442063331604, "learning_rate": 9.843050744487857e-05, "loss": 0.3454, "step": 1301 }, { "epoch": 1.9601053820097856, "grad_norm": 0.3519701063632965, "learning_rate": 9.842660917778144e-05, "loss": 0.4393, "step": 1302 }, { "epoch": 1.961610839292435, "grad_norm": 0.3138854503631592, "learning_rate": 9.842270616161562e-05, "loss": 0.3723, "step": 1303 }, { "epoch": 1.9631162965750848, "grad_norm": 0.35462701320648193, "learning_rate": 9.841879839680794e-05, "loss": 0.4097, "step": 1304 }, { "epoch": 1.9646217538577342, "grad_norm": 0.3593135476112366, "learning_rate": 9.841488588378575e-05, "loss": 0.3962, "step": 1305 }, { "epoch": 1.966127211140384, "grad_norm": 0.26819419860839844, "learning_rate": 9.841096862297691e-05, "loss": 0.447, "step": 1306 }, { "epoch": 1.9676326684230334, "grad_norm": 0.27441540360450745, "learning_rate": 9.840704661480981e-05, "loss": 0.4881, "step": 1307 }, { "epoch": 1.9691381257056833, "grad_norm": 0.2545740008354187, "learning_rate": 9.840311985971334e-05, "loss": 0.3282, "step": 1308 }, { "epoch": 1.9706435829883326, "grad_norm": 0.24152976274490356, "learning_rate": 9.839918835811695e-05, "loss": 0.4234, "step": 1309 }, { "epoch": 1.9721490402709823, "grad_norm": 0.22960218787193298, "learning_rate": 9.839525211045058e-05, "loss": 0.4425, "step": 1310 }, { "epoch": 1.9736544975536319, "grad_norm": 0.22753572463989258, "learning_rate": 9.839131111714467e-05, "loss": 0.3772, "step": 1311 }, { "epoch": 1.9751599548362815, "grad_norm": 0.22830738127231598, "learning_rate": 9.838736537863023e-05, "loss": 0.3528, "step": 1312 }, { "epoch": 1.976665412118931, "grad_norm": 0.2126360535621643, "learning_rate": 9.838341489533873e-05, "loss": 0.3999, "step": 1313 }, { "epoch": 1.9781708694015807, "grad_norm": 0.23068754374980927, "learning_rate": 9.83794596677022e-05, "loss": 0.4523, "step": 1314 }, { "epoch": 1.9796763266842303, "grad_norm": 0.23659472167491913, "learning_rate": 9.837549969615318e-05, "loss": 0.4366, "step": 1315 }, { "epoch": 1.98118178396688, "grad_norm": 0.2161279320716858, "learning_rate": 9.837153498112475e-05, "loss": 0.4329, "step": 1316 }, { "epoch": 1.9826872412495296, "grad_norm": 0.2743729054927826, "learning_rate": 9.836756552305044e-05, "loss": 0.4306, "step": 1317 }, { "epoch": 1.9841926985321792, "grad_norm": 0.30245155096054077, "learning_rate": 9.836359132236439e-05, "loss": 0.3691, "step": 1318 }, { "epoch": 1.9856981558148288, "grad_norm": 0.24833424389362335, "learning_rate": 9.835961237950115e-05, "loss": 0.4417, "step": 1319 }, { "epoch": 1.9872036130974784, "grad_norm": 0.2580533027648926, "learning_rate": 9.835562869489592e-05, "loss": 0.3997, "step": 1320 }, { "epoch": 1.988709070380128, "grad_norm": 0.2510397434234619, "learning_rate": 9.835164026898431e-05, "loss": 0.3703, "step": 1321 }, { "epoch": 1.9902145276627776, "grad_norm": 0.2385658621788025, "learning_rate": 9.834764710220251e-05, "loss": 0.4209, "step": 1322 }, { "epoch": 1.9917199849454272, "grad_norm": 0.2592366933822632, "learning_rate": 9.834364919498719e-05, "loss": 0.3916, "step": 1323 }, { "epoch": 1.9932254422280766, "grad_norm": 0.23420611023902893, "learning_rate": 9.833964654777556e-05, "loss": 0.3043, "step": 1324 }, { "epoch": 1.9947308995107265, "grad_norm": 0.21801812946796417, "learning_rate": 9.833563916100533e-05, "loss": 0.4254, "step": 1325 }, { "epoch": 1.9962363567933759, "grad_norm": 0.249384343624115, "learning_rate": 9.833162703511475e-05, "loss": 0.395, "step": 1326 }, { "epoch": 1.9977418140760257, "grad_norm": 0.24223510921001434, "learning_rate": 9.832761017054261e-05, "loss": 0.4482, "step": 1327 }, { "epoch": 1.999247271358675, "grad_norm": 0.26331859827041626, "learning_rate": 9.832358856772817e-05, "loss": 0.4147, "step": 1328 }, { "epoch": 2.000752728641325, "grad_norm": 0.24730722606182098, "learning_rate": 9.83195622271112e-05, "loss": 0.3914, "step": 1329 }, { "epoch": 2.0022581859239743, "grad_norm": 0.2775382995605469, "learning_rate": 9.831553114913204e-05, "loss": 0.4185, "step": 1330 }, { "epoch": 2.003763643206624, "grad_norm": 0.2861573100090027, "learning_rate": 9.831149533423152e-05, "loss": 0.3402, "step": 1331 }, { "epoch": 2.0052691004892735, "grad_norm": 0.29312601685523987, "learning_rate": 9.830745478285098e-05, "loss": 0.3669, "step": 1332 }, { "epoch": 2.0067745577719234, "grad_norm": 0.26257050037384033, "learning_rate": 9.83034094954323e-05, "loss": 0.3724, "step": 1333 }, { "epoch": 2.0082800150545728, "grad_norm": 0.30341270565986633, "learning_rate": 9.829935947241786e-05, "loss": 0.4215, "step": 1334 }, { "epoch": 2.0097854723372226, "grad_norm": 0.342742383480072, "learning_rate": 9.82953047142506e-05, "loss": 0.3734, "step": 1335 }, { "epoch": 2.011290929619872, "grad_norm": 0.3800235688686371, "learning_rate": 9.829124522137386e-05, "loss": 0.4275, "step": 1336 }, { "epoch": 2.012796386902522, "grad_norm": 0.38352951407432556, "learning_rate": 9.828718099423166e-05, "loss": 0.3547, "step": 1337 }, { "epoch": 2.014301844185171, "grad_norm": 0.33433979749679565, "learning_rate": 9.828311203326843e-05, "loss": 0.3238, "step": 1338 }, { "epoch": 2.015807301467821, "grad_norm": 0.29477694630622864, "learning_rate": 9.827903833892913e-05, "loss": 0.36, "step": 1339 }, { "epoch": 2.0173127587504704, "grad_norm": 0.22716692090034485, "learning_rate": 9.827495991165928e-05, "loss": 0.3822, "step": 1340 }, { "epoch": 2.01881821603312, "grad_norm": 0.24411283433437347, "learning_rate": 9.827087675190486e-05, "loss": 0.3803, "step": 1341 }, { "epoch": 2.0203236733157697, "grad_norm": 0.29305699467658997, "learning_rate": 9.826678886011243e-05, "loss": 0.4272, "step": 1342 }, { "epoch": 2.021829130598419, "grad_norm": 0.2665279805660248, "learning_rate": 9.826269623672901e-05, "loss": 0.3691, "step": 1343 }, { "epoch": 2.023334587881069, "grad_norm": 0.26634058356285095, "learning_rate": 9.825859888220216e-05, "loss": 0.3728, "step": 1344 }, { "epoch": 2.0248400451637183, "grad_norm": 0.25811517238616943, "learning_rate": 9.825449679698002e-05, "loss": 0.3413, "step": 1345 }, { "epoch": 2.026345502446368, "grad_norm": 0.2371729612350464, "learning_rate": 9.82503899815111e-05, "loss": 0.3799, "step": 1346 }, { "epoch": 2.0278509597290175, "grad_norm": 0.23612657189369202, "learning_rate": 9.824627843624458e-05, "loss": 0.4022, "step": 1347 }, { "epoch": 2.0293564170116674, "grad_norm": 0.24737001955509186, "learning_rate": 9.824216216163006e-05, "loss": 0.4116, "step": 1348 }, { "epoch": 2.0308618742943167, "grad_norm": 0.23997357487678528, "learning_rate": 9.823804115811772e-05, "loss": 0.4199, "step": 1349 }, { "epoch": 2.0323673315769666, "grad_norm": 0.2256711721420288, "learning_rate": 9.823391542615817e-05, "loss": 0.3486, "step": 1350 }, { "epoch": 2.033872788859616, "grad_norm": 0.2029789388179779, "learning_rate": 9.822978496620266e-05, "loss": 0.298, "step": 1351 }, { "epoch": 2.035378246142266, "grad_norm": 0.2157374918460846, "learning_rate": 9.822564977870284e-05, "loss": 0.378, "step": 1352 }, { "epoch": 2.036883703424915, "grad_norm": 0.2511293888092041, "learning_rate": 9.822150986411097e-05, "loss": 0.4394, "step": 1353 }, { "epoch": 2.038389160707565, "grad_norm": 0.2859601676464081, "learning_rate": 9.821736522287974e-05, "loss": 0.3973, "step": 1354 }, { "epoch": 2.0398946179902144, "grad_norm": 0.30690401792526245, "learning_rate": 9.821321585546244e-05, "loss": 0.4393, "step": 1355 }, { "epoch": 2.0414000752728643, "grad_norm": 0.28837859630584717, "learning_rate": 9.820906176231283e-05, "loss": 0.3774, "step": 1356 }, { "epoch": 2.0429055325555137, "grad_norm": 0.2526475489139557, "learning_rate": 9.820490294388517e-05, "loss": 0.394, "step": 1357 }, { "epoch": 2.0444109898381635, "grad_norm": 0.25312474370002747, "learning_rate": 9.82007394006343e-05, "loss": 0.4, "step": 1358 }, { "epoch": 2.045916447120813, "grad_norm": 0.33636701107025146, "learning_rate": 9.819657113301551e-05, "loss": 0.3518, "step": 1359 }, { "epoch": 2.0474219044034627, "grad_norm": 0.40002796053886414, "learning_rate": 9.819239814148465e-05, "loss": 0.4415, "step": 1360 }, { "epoch": 2.048927361686112, "grad_norm": 0.38934943079948425, "learning_rate": 9.818822042649807e-05, "loss": 0.3828, "step": 1361 }, { "epoch": 2.050432818968762, "grad_norm": 0.3106479048728943, "learning_rate": 9.818403798851264e-05, "loss": 0.3859, "step": 1362 }, { "epoch": 2.0519382762514113, "grad_norm": 0.277864933013916, "learning_rate": 9.817985082798574e-05, "loss": 0.4184, "step": 1363 }, { "epoch": 2.053443733534061, "grad_norm": 0.23519428074359894, "learning_rate": 9.817565894537526e-05, "loss": 0.3745, "step": 1364 }, { "epoch": 2.0549491908167106, "grad_norm": 5.78391695022583, "learning_rate": 9.817146234113964e-05, "loss": 0.3629, "step": 1365 }, { "epoch": 2.0564546480993604, "grad_norm": 0.4223296642303467, "learning_rate": 9.816726101573782e-05, "loss": 0.3919, "step": 1366 }, { "epoch": 2.05796010538201, "grad_norm": 0.5618540048599243, "learning_rate": 9.816305496962923e-05, "loss": 0.3653, "step": 1367 }, { "epoch": 2.059465562664659, "grad_norm": 0.3213176727294922, "learning_rate": 9.815884420327383e-05, "loss": 0.3184, "step": 1368 }, { "epoch": 2.060971019947309, "grad_norm": 0.3997434973716736, "learning_rate": 9.815462871713212e-05, "loss": 0.409, "step": 1369 }, { "epoch": 2.0624764772299584, "grad_norm": 0.2984638214111328, "learning_rate": 9.81504085116651e-05, "loss": 0.3477, "step": 1370 }, { "epoch": 2.0639819345126083, "grad_norm": 0.33969804644584656, "learning_rate": 9.814618358733428e-05, "loss": 0.4032, "step": 1371 }, { "epoch": 2.0654873917952576, "grad_norm": 0.4239584803581238, "learning_rate": 9.814195394460168e-05, "loss": 0.3991, "step": 1372 }, { "epoch": 2.0669928490779075, "grad_norm": 0.34809717535972595, "learning_rate": 9.813771958392989e-05, "loss": 0.3472, "step": 1373 }, { "epoch": 2.068498306360557, "grad_norm": 0.33596640825271606, "learning_rate": 9.813348050578191e-05, "loss": 0.4399, "step": 1374 }, { "epoch": 2.0700037636432067, "grad_norm": 0.23476514220237732, "learning_rate": 9.812923671062138e-05, "loss": 0.3999, "step": 1375 }, { "epoch": 2.071509220925856, "grad_norm": 0.2919875979423523, "learning_rate": 9.812498819891235e-05, "loss": 0.3226, "step": 1376 }, { "epoch": 2.073014678208506, "grad_norm": 0.27538245916366577, "learning_rate": 9.812073497111945e-05, "loss": 0.4428, "step": 1377 }, { "epoch": 2.0745201354911553, "grad_norm": 0.2401035577058792, "learning_rate": 9.811647702770782e-05, "loss": 0.3839, "step": 1378 }, { "epoch": 2.076025592773805, "grad_norm": 0.2879429757595062, "learning_rate": 9.811221436914307e-05, "loss": 0.3456, "step": 1379 }, { "epoch": 2.0775310500564546, "grad_norm": 0.26644137501716614, "learning_rate": 9.810794699589136e-05, "loss": 0.3228, "step": 1380 }, { "epoch": 2.0790365073391044, "grad_norm": 0.2563892900943756, "learning_rate": 9.81036749084194e-05, "loss": 0.3855, "step": 1381 }, { "epoch": 2.080541964621754, "grad_norm": 0.24328358471393585, "learning_rate": 9.809939810719436e-05, "loss": 0.4021, "step": 1382 }, { "epoch": 2.0820474219044036, "grad_norm": 0.21279695630073547, "learning_rate": 9.809511659268394e-05, "loss": 0.4021, "step": 1383 }, { "epoch": 2.083552879187053, "grad_norm": 0.2087385058403015, "learning_rate": 9.809083036535635e-05, "loss": 0.3471, "step": 1384 }, { "epoch": 2.085058336469703, "grad_norm": 0.2268313616514206, "learning_rate": 9.808653942568035e-05, "loss": 0.3652, "step": 1385 }, { "epoch": 2.0865637937523522, "grad_norm": 0.21308667957782745, "learning_rate": 9.808224377412517e-05, "loss": 0.3605, "step": 1386 }, { "epoch": 2.088069251035002, "grad_norm": 0.22240956127643585, "learning_rate": 9.807794341116058e-05, "loss": 0.4397, "step": 1387 }, { "epoch": 2.0895747083176515, "grad_norm": 0.1941228061914444, "learning_rate": 9.807363833725688e-05, "loss": 0.3344, "step": 1388 }, { "epoch": 2.0910801656003013, "grad_norm": 0.2037973254919052, "learning_rate": 9.806932855288485e-05, "loss": 0.339, "step": 1389 }, { "epoch": 2.0925856228829507, "grad_norm": 0.21761946380138397, "learning_rate": 9.806501405851579e-05, "loss": 0.3174, "step": 1390 }, { "epoch": 2.0940910801656, "grad_norm": 0.20516011118888855, "learning_rate": 9.806069485462155e-05, "loss": 0.3907, "step": 1391 }, { "epoch": 2.09559653744825, "grad_norm": 0.23409345746040344, "learning_rate": 9.805637094167446e-05, "loss": 0.3673, "step": 1392 }, { "epoch": 2.0971019947308993, "grad_norm": 0.2596559524536133, "learning_rate": 9.805204232014738e-05, "loss": 0.3012, "step": 1393 }, { "epoch": 2.098607452013549, "grad_norm": 0.23841282725334167, "learning_rate": 9.804770899051367e-05, "loss": 0.3082, "step": 1394 }, { "epoch": 2.1001129092961985, "grad_norm": 0.24948985874652863, "learning_rate": 9.804337095324724e-05, "loss": 0.4154, "step": 1395 }, { "epoch": 2.1016183665788484, "grad_norm": 0.24651610851287842, "learning_rate": 9.803902820882247e-05, "loss": 0.4135, "step": 1396 }, { "epoch": 2.1031238238614978, "grad_norm": 0.24353653192520142, "learning_rate": 9.803468075771427e-05, "loss": 0.3536, "step": 1397 }, { "epoch": 2.1046292811441476, "grad_norm": 0.24038651585578918, "learning_rate": 9.803032860039811e-05, "loss": 0.3436, "step": 1398 }, { "epoch": 2.106134738426797, "grad_norm": 0.22117245197296143, "learning_rate": 9.802597173734989e-05, "loss": 0.3326, "step": 1399 }, { "epoch": 2.107640195709447, "grad_norm": 0.22303546965122223, "learning_rate": 9.80216101690461e-05, "loss": 0.3281, "step": 1400 }, { "epoch": 2.107640195709447, "eval_loss": 0.3676844537258148, "eval_runtime": 550.504, "eval_samples_per_second": 17.488, "eval_steps_per_second": 0.547, "step": 1400 }, { "epoch": 2.109145652992096, "grad_norm": 0.19052313268184662, "learning_rate": 9.80172438959637e-05, "loss": 0.355, "step": 1401 }, { "epoch": 2.110651110274746, "grad_norm": 0.20407362282276154, "learning_rate": 9.801287291858019e-05, "loss": 0.3906, "step": 1402 }, { "epoch": 2.1121565675573954, "grad_norm": 0.21192318201065063, "learning_rate": 9.800849723737355e-05, "loss": 0.3438, "step": 1403 }, { "epoch": 2.1136620248400453, "grad_norm": 0.27874839305877686, "learning_rate": 9.800411685282232e-05, "loss": 0.3814, "step": 1404 }, { "epoch": 2.1151674821226947, "grad_norm": 0.28443917632102966, "learning_rate": 9.799973176540554e-05, "loss": 0.2871, "step": 1405 }, { "epoch": 2.1166729394053445, "grad_norm": 0.2587602436542511, "learning_rate": 9.799534197560274e-05, "loss": 0.4285, "step": 1406 }, { "epoch": 2.118178396687994, "grad_norm": 0.29638993740081787, "learning_rate": 9.799094748389397e-05, "loss": 0.4093, "step": 1407 }, { "epoch": 2.1196838539706437, "grad_norm": 0.3460374176502228, "learning_rate": 9.798654829075983e-05, "loss": 0.4034, "step": 1408 }, { "epoch": 2.121189311253293, "grad_norm": 0.36437493562698364, "learning_rate": 9.798214439668139e-05, "loss": 0.4013, "step": 1409 }, { "epoch": 2.122694768535943, "grad_norm": 0.2969818413257599, "learning_rate": 9.797773580214027e-05, "loss": 0.3656, "step": 1410 }, { "epoch": 2.1242002258185924, "grad_norm": 0.26683539152145386, "learning_rate": 9.797332250761857e-05, "loss": 0.3593, "step": 1411 }, { "epoch": 2.125705683101242, "grad_norm": 0.24968495965003967, "learning_rate": 9.796890451359894e-05, "loss": 0.2502, "step": 1412 }, { "epoch": 2.1272111403838916, "grad_norm": 0.2512093782424927, "learning_rate": 9.79644818205645e-05, "loss": 0.3312, "step": 1413 }, { "epoch": 2.128716597666541, "grad_norm": 0.28659316897392273, "learning_rate": 9.796005442899894e-05, "loss": 0.3322, "step": 1414 }, { "epoch": 2.130222054949191, "grad_norm": 0.2248782515525818, "learning_rate": 9.795562233938643e-05, "loss": 0.3839, "step": 1415 }, { "epoch": 2.1317275122318406, "grad_norm": 0.258372962474823, "learning_rate": 9.795118555221161e-05, "loss": 0.3583, "step": 1416 }, { "epoch": 2.13323296951449, "grad_norm": 0.2173958271741867, "learning_rate": 9.794674406795973e-05, "loss": 0.3808, "step": 1417 }, { "epoch": 2.1347384267971394, "grad_norm": 0.23506692051887512, "learning_rate": 9.794229788711648e-05, "loss": 0.3873, "step": 1418 }, { "epoch": 2.1362438840797893, "grad_norm": 0.23185594379901886, "learning_rate": 9.793784701016812e-05, "loss": 0.3107, "step": 1419 }, { "epoch": 2.1377493413624387, "grad_norm": 0.23090076446533203, "learning_rate": 9.793339143760134e-05, "loss": 0.3609, "step": 1420 }, { "epoch": 2.1392547986450885, "grad_norm": 0.2279752790927887, "learning_rate": 9.792893116990345e-05, "loss": 0.4016, "step": 1421 }, { "epoch": 2.140760255927738, "grad_norm": 0.2251323163509369, "learning_rate": 9.792446620756216e-05, "loss": 0.319, "step": 1422 }, { "epoch": 2.1422657132103877, "grad_norm": 0.23664581775665283, "learning_rate": 9.791999655106578e-05, "loss": 0.334, "step": 1423 }, { "epoch": 2.143771170493037, "grad_norm": 0.2767602205276489, "learning_rate": 9.791552220090312e-05, "loss": 0.3857, "step": 1424 }, { "epoch": 2.145276627775687, "grad_norm": 0.2939154803752899, "learning_rate": 9.791104315756349e-05, "loss": 0.3629, "step": 1425 }, { "epoch": 2.1467820850583363, "grad_norm": 0.7558176517486572, "learning_rate": 9.790655942153669e-05, "loss": 0.3926, "step": 1426 }, { "epoch": 2.148287542340986, "grad_norm": 0.4054524600505829, "learning_rate": 9.790207099331303e-05, "loss": 0.3782, "step": 1427 }, { "epoch": 2.1497929996236356, "grad_norm": 0.3659246265888214, "learning_rate": 9.789757787338342e-05, "loss": 0.3908, "step": 1428 }, { "epoch": 2.1512984569062854, "grad_norm": 0.2985648512840271, "learning_rate": 9.789308006223918e-05, "loss": 0.3507, "step": 1429 }, { "epoch": 2.152803914188935, "grad_norm": 0.38698577880859375, "learning_rate": 9.78885775603722e-05, "loss": 0.4112, "step": 1430 }, { "epoch": 2.1543093714715846, "grad_norm": 0.3368639051914215, "learning_rate": 9.788407036827486e-05, "loss": 0.3431, "step": 1431 }, { "epoch": 2.155814828754234, "grad_norm": 0.34498167037963867, "learning_rate": 9.787955848644004e-05, "loss": 0.3664, "step": 1432 }, { "epoch": 2.157320286036884, "grad_norm": 0.26490387320518494, "learning_rate": 9.78750419153612e-05, "loss": 0.3432, "step": 1433 }, { "epoch": 2.1588257433195333, "grad_norm": 0.3038268983364105, "learning_rate": 9.787052065553221e-05, "loss": 0.3548, "step": 1434 }, { "epoch": 2.160331200602183, "grad_norm": 0.2579648792743683, "learning_rate": 9.786599470744757e-05, "loss": 0.3726, "step": 1435 }, { "epoch": 2.1618366578848325, "grad_norm": 0.2793416976928711, "learning_rate": 9.786146407160215e-05, "loss": 0.3787, "step": 1436 }, { "epoch": 2.1633421151674823, "grad_norm": 0.2985106110572815, "learning_rate": 9.78569287484915e-05, "loss": 0.3612, "step": 1437 }, { "epoch": 2.1648475724501317, "grad_norm": 0.29295432567596436, "learning_rate": 9.785238873861154e-05, "loss": 0.3844, "step": 1438 }, { "epoch": 2.1663530297327815, "grad_norm": 0.251471608877182, "learning_rate": 9.784784404245877e-05, "loss": 0.3201, "step": 1439 }, { "epoch": 2.167858487015431, "grad_norm": 0.24060453474521637, "learning_rate": 9.78432946605302e-05, "loss": 0.4221, "step": 1440 }, { "epoch": 2.1693639442980803, "grad_norm": 0.25781455636024475, "learning_rate": 9.783874059332336e-05, "loss": 0.3375, "step": 1441 }, { "epoch": 2.17086940158073, "grad_norm": 0.25627565383911133, "learning_rate": 9.783418184133623e-05, "loss": 0.3985, "step": 1442 }, { "epoch": 2.1723748588633796, "grad_norm": 0.2357557862997055, "learning_rate": 9.782961840506738e-05, "loss": 0.3764, "step": 1443 }, { "epoch": 2.1738803161460294, "grad_norm": 0.229349747300148, "learning_rate": 9.782505028501586e-05, "loss": 0.4285, "step": 1444 }, { "epoch": 2.175385773428679, "grad_norm": 0.2383635938167572, "learning_rate": 9.782047748168122e-05, "loss": 0.3051, "step": 1445 }, { "epoch": 2.1768912307113286, "grad_norm": 0.20873446762561798, "learning_rate": 9.781589999556356e-05, "loss": 0.402, "step": 1446 }, { "epoch": 2.178396687993978, "grad_norm": 0.21543651819229126, "learning_rate": 9.781131782716343e-05, "loss": 0.3772, "step": 1447 }, { "epoch": 2.179902145276628, "grad_norm": 0.2078186273574829, "learning_rate": 9.780673097698197e-05, "loss": 0.3695, "step": 1448 }, { "epoch": 2.1814076025592772, "grad_norm": 0.23341906070709229, "learning_rate": 9.780213944552075e-05, "loss": 0.3245, "step": 1449 }, { "epoch": 2.182913059841927, "grad_norm": 0.21424774825572968, "learning_rate": 9.779754323328192e-05, "loss": 0.3555, "step": 1450 }, { "epoch": 2.1844185171245765, "grad_norm": 0.2309642881155014, "learning_rate": 9.77929423407681e-05, "loss": 0.4204, "step": 1451 }, { "epoch": 2.1859239744072263, "grad_norm": 0.22782298922538757, "learning_rate": 9.778833676848245e-05, "loss": 0.3405, "step": 1452 }, { "epoch": 2.1874294316898757, "grad_norm": 0.2481640875339508, "learning_rate": 9.778372651692863e-05, "loss": 0.3945, "step": 1453 }, { "epoch": 2.1889348889725255, "grad_norm": 0.2158629447221756, "learning_rate": 9.777911158661077e-05, "loss": 0.373, "step": 1454 }, { "epoch": 2.190440346255175, "grad_norm": 0.226676806807518, "learning_rate": 9.777449197803362e-05, "loss": 0.3575, "step": 1455 }, { "epoch": 2.1919458035378248, "grad_norm": 0.23301342129707336, "learning_rate": 9.776986769170232e-05, "loss": 0.3452, "step": 1456 }, { "epoch": 2.193451260820474, "grad_norm": 0.23776309192180634, "learning_rate": 9.77652387281226e-05, "loss": 0.3196, "step": 1457 }, { "epoch": 2.194956718103124, "grad_norm": 0.2349090725183487, "learning_rate": 9.776060508780066e-05, "loss": 0.378, "step": 1458 }, { "epoch": 2.1964621753857734, "grad_norm": 0.2312983274459839, "learning_rate": 9.775596677124326e-05, "loss": 0.3772, "step": 1459 }, { "epoch": 2.197967632668423, "grad_norm": 0.2272384613752365, "learning_rate": 9.775132377895759e-05, "loss": 0.3937, "step": 1460 }, { "epoch": 2.1994730899510726, "grad_norm": 0.22820937633514404, "learning_rate": 9.774667611145143e-05, "loss": 0.3981, "step": 1461 }, { "epoch": 2.2009785472337224, "grad_norm": 0.2331072986125946, "learning_rate": 9.774202376923306e-05, "loss": 0.3338, "step": 1462 }, { "epoch": 2.202484004516372, "grad_norm": 0.2834131717681885, "learning_rate": 9.773736675281122e-05, "loss": 0.3262, "step": 1463 }, { "epoch": 2.203989461799021, "grad_norm": 0.30230000615119934, "learning_rate": 9.77327050626952e-05, "loss": 0.4345, "step": 1464 }, { "epoch": 2.205494919081671, "grad_norm": 0.25965064764022827, "learning_rate": 9.772803869939481e-05, "loss": 0.3548, "step": 1465 }, { "epoch": 2.207000376364321, "grad_norm": 0.2506908178329468, "learning_rate": 9.772336766342034e-05, "loss": 0.331, "step": 1466 }, { "epoch": 2.2085058336469703, "grad_norm": 0.2848130762577057, "learning_rate": 9.771869195528264e-05, "loss": 0.3498, "step": 1467 }, { "epoch": 2.2100112909296197, "grad_norm": 0.2972510755062103, "learning_rate": 9.7714011575493e-05, "loss": 0.3995, "step": 1468 }, { "epoch": 2.2115167482122695, "grad_norm": 0.32108089327812195, "learning_rate": 9.770932652456326e-05, "loss": 0.3593, "step": 1469 }, { "epoch": 2.213022205494919, "grad_norm": 0.3571377992630005, "learning_rate": 9.77046368030058e-05, "loss": 0.3782, "step": 1470 }, { "epoch": 2.2145276627775687, "grad_norm": 0.34221160411834717, "learning_rate": 9.769994241133345e-05, "loss": 0.4096, "step": 1471 }, { "epoch": 2.216033120060218, "grad_norm": 0.26614123582839966, "learning_rate": 9.769524335005962e-05, "loss": 0.375, "step": 1472 }, { "epoch": 2.217538577342868, "grad_norm": 0.21681220829486847, "learning_rate": 9.769053961969814e-05, "loss": 0.3463, "step": 1473 }, { "epoch": 2.2190440346255174, "grad_norm": 0.25557610392570496, "learning_rate": 9.768583122076344e-05, "loss": 0.3298, "step": 1474 }, { "epoch": 2.220549491908167, "grad_norm": 0.26666441559791565, "learning_rate": 9.768111815377042e-05, "loss": 0.3789, "step": 1475 }, { "epoch": 2.2220549491908166, "grad_norm": 0.2677510678768158, "learning_rate": 9.767640041923449e-05, "loss": 0.3947, "step": 1476 }, { "epoch": 2.2235604064734664, "grad_norm": 0.23620538413524628, "learning_rate": 9.767167801767158e-05, "loss": 0.3415, "step": 1477 }, { "epoch": 2.225065863756116, "grad_norm": 0.2128394991159439, "learning_rate": 9.766695094959812e-05, "loss": 0.3527, "step": 1478 }, { "epoch": 2.2265713210387656, "grad_norm": 0.24980637431144714, "learning_rate": 9.766221921553104e-05, "loss": 0.3748, "step": 1479 }, { "epoch": 2.228076778321415, "grad_norm": 0.2483363300561905, "learning_rate": 9.765748281598781e-05, "loss": 0.31, "step": 1480 }, { "epoch": 2.229582235604065, "grad_norm": 0.21950232982635498, "learning_rate": 9.765274175148638e-05, "loss": 0.423, "step": 1481 }, { "epoch": 2.2310876928867143, "grad_norm": 0.2250346690416336, "learning_rate": 9.764799602254527e-05, "loss": 0.4266, "step": 1482 }, { "epoch": 2.232593150169364, "grad_norm": 0.2193259745836258, "learning_rate": 9.764324562968341e-05, "loss": 0.3311, "step": 1483 }, { "epoch": 2.2340986074520135, "grad_norm": 0.20830024778842926, "learning_rate": 9.763849057342035e-05, "loss": 0.3233, "step": 1484 }, { "epoch": 2.2356040647346633, "grad_norm": 0.2110752910375595, "learning_rate": 9.763373085427603e-05, "loss": 0.3448, "step": 1485 }, { "epoch": 2.2371095220173127, "grad_norm": 0.20963908731937408, "learning_rate": 9.762896647277102e-05, "loss": 0.3887, "step": 1486 }, { "epoch": 2.2386149792999626, "grad_norm": 0.2346908003091812, "learning_rate": 9.762419742942634e-05, "loss": 0.3744, "step": 1487 }, { "epoch": 2.240120436582612, "grad_norm": 0.2883414924144745, "learning_rate": 9.76194237247635e-05, "loss": 0.3754, "step": 1488 }, { "epoch": 2.241625893865262, "grad_norm": 0.3371957540512085, "learning_rate": 9.761464535930456e-05, "loss": 0.3661, "step": 1489 }, { "epoch": 2.243131351147911, "grad_norm": 0.3623206913471222, "learning_rate": 9.760986233357208e-05, "loss": 0.3584, "step": 1490 }, { "epoch": 2.2446368084305606, "grad_norm": 0.31798481941223145, "learning_rate": 9.760507464808911e-05, "loss": 0.3627, "step": 1491 }, { "epoch": 2.2461422657132104, "grad_norm": 0.308458149433136, "learning_rate": 9.760028230337923e-05, "loss": 0.2912, "step": 1492 }, { "epoch": 2.24764772299586, "grad_norm": 0.28963056206703186, "learning_rate": 9.759548529996653e-05, "loss": 0.388, "step": 1493 }, { "epoch": 2.2491531802785096, "grad_norm": 0.2442082315683365, "learning_rate": 9.75906836383756e-05, "loss": 0.3551, "step": 1494 }, { "epoch": 2.250658637561159, "grad_norm": 0.24694465100765228, "learning_rate": 9.758587731913156e-05, "loss": 0.3435, "step": 1495 }, { "epoch": 2.252164094843809, "grad_norm": 0.2633443772792816, "learning_rate": 9.758106634275998e-05, "loss": 0.3526, "step": 1496 }, { "epoch": 2.2536695521264583, "grad_norm": 0.21845099329948425, "learning_rate": 9.7576250709787e-05, "loss": 0.3694, "step": 1497 }, { "epoch": 2.255175009409108, "grad_norm": 0.23216935992240906, "learning_rate": 9.757143042073927e-05, "loss": 0.4077, "step": 1498 }, { "epoch": 2.2566804666917575, "grad_norm": 0.25341588258743286, "learning_rate": 9.756660547614391e-05, "loss": 0.3482, "step": 1499 }, { "epoch": 2.2581859239744073, "grad_norm": 0.24076008796691895, "learning_rate": 9.756177587652856e-05, "loss": 0.3816, "step": 1500 }, { "epoch": 2.2596913812570567, "grad_norm": 0.20349743962287903, "learning_rate": 9.75569416224214e-05, "loss": 0.3897, "step": 1501 }, { "epoch": 2.2611968385397065, "grad_norm": 0.2606901526451111, "learning_rate": 9.755210271435111e-05, "loss": 0.3617, "step": 1502 }, { "epoch": 2.262702295822356, "grad_norm": 0.23971697688102722, "learning_rate": 9.754725915284681e-05, "loss": 0.3287, "step": 1503 }, { "epoch": 2.2642077531050058, "grad_norm": 0.20085211098194122, "learning_rate": 9.754241093843822e-05, "loss": 0.3125, "step": 1504 }, { "epoch": 2.265713210387655, "grad_norm": 0.22267986834049225, "learning_rate": 9.753755807165555e-05, "loss": 0.3511, "step": 1505 }, { "epoch": 2.267218667670305, "grad_norm": 0.22273030877113342, "learning_rate": 9.753270055302947e-05, "loss": 0.312, "step": 1506 }, { "epoch": 2.2687241249529544, "grad_norm": 0.30009251832962036, "learning_rate": 9.752783838309123e-05, "loss": 0.3954, "step": 1507 }, { "epoch": 2.2702295822356042, "grad_norm": 0.23699623346328735, "learning_rate": 9.752297156237248e-05, "loss": 0.3342, "step": 1508 }, { "epoch": 2.2717350395182536, "grad_norm": 0.24379651248455048, "learning_rate": 9.751810009140554e-05, "loss": 0.273, "step": 1509 }, { "epoch": 2.2732404968009035, "grad_norm": 0.21114589273929596, "learning_rate": 9.751322397072307e-05, "loss": 0.3277, "step": 1510 }, { "epoch": 2.274745954083553, "grad_norm": 0.20395508408546448, "learning_rate": 9.750834320085835e-05, "loss": 0.3864, "step": 1511 }, { "epoch": 2.2762514113662027, "grad_norm": 0.24262535572052002, "learning_rate": 9.750345778234512e-05, "loss": 0.3637, "step": 1512 }, { "epoch": 2.277756868648852, "grad_norm": 0.214229017496109, "learning_rate": 9.749856771571766e-05, "loss": 0.3126, "step": 1513 }, { "epoch": 2.2792623259315015, "grad_norm": 0.22230106592178345, "learning_rate": 9.749367300151073e-05, "loss": 0.3157, "step": 1514 }, { "epoch": 2.2807677832141513, "grad_norm": 0.3077496886253357, "learning_rate": 9.748877364025961e-05, "loss": 0.3606, "step": 1515 }, { "epoch": 2.282273240496801, "grad_norm": 0.3183213174343109, "learning_rate": 9.748386963250009e-05, "loss": 0.3204, "step": 1516 }, { "epoch": 2.2837786977794505, "grad_norm": 0.29388150572776794, "learning_rate": 9.747896097876845e-05, "loss": 0.3815, "step": 1517 }, { "epoch": 2.2852841550621, "grad_norm": 0.36572080850601196, "learning_rate": 9.747404767960151e-05, "loss": 0.3247, "step": 1518 }, { "epoch": 2.2867896123447498, "grad_norm": 0.4260615408420563, "learning_rate": 9.746912973553657e-05, "loss": 0.3339, "step": 1519 }, { "epoch": 2.288295069627399, "grad_norm": 0.37554171681404114, "learning_rate": 9.746420714711145e-05, "loss": 0.376, "step": 1520 }, { "epoch": 2.289800526910049, "grad_norm": 0.3795201778411865, "learning_rate": 9.745927991486448e-05, "loss": 0.3979, "step": 1521 }, { "epoch": 2.2913059841926984, "grad_norm": 0.2915172278881073, "learning_rate": 9.745434803933451e-05, "loss": 0.3518, "step": 1522 }, { "epoch": 2.292811441475348, "grad_norm": 0.25990575551986694, "learning_rate": 9.744941152106085e-05, "loss": 0.32, "step": 1523 }, { "epoch": 2.2943168987579976, "grad_norm": 0.28339728713035583, "learning_rate": 9.744447036058338e-05, "loss": 0.3714, "step": 1524 }, { "epoch": 2.2958223560406474, "grad_norm": 0.3006259799003601, "learning_rate": 9.743952455844245e-05, "loss": 0.373, "step": 1525 }, { "epoch": 2.297327813323297, "grad_norm": 0.3081100583076477, "learning_rate": 9.743457411517892e-05, "loss": 0.3625, "step": 1526 }, { "epoch": 2.2988332706059467, "grad_norm": 0.2510354518890381, "learning_rate": 9.742961903133415e-05, "loss": 0.3442, "step": 1527 }, { "epoch": 2.300338727888596, "grad_norm": 0.22396309673786163, "learning_rate": 9.742465930745003e-05, "loss": 0.3442, "step": 1528 }, { "epoch": 2.301844185171246, "grad_norm": 0.2609856128692627, "learning_rate": 9.741969494406898e-05, "loss": 0.3501, "step": 1529 }, { "epoch": 2.3033496424538953, "grad_norm": 0.3193005323410034, "learning_rate": 9.741472594173384e-05, "loss": 0.3721, "step": 1530 }, { "epoch": 2.304855099736545, "grad_norm": 0.2394881248474121, "learning_rate": 9.740975230098805e-05, "loss": 0.3021, "step": 1531 }, { "epoch": 2.3063605570191945, "grad_norm": 0.23576922714710236, "learning_rate": 9.740477402237552e-05, "loss": 0.3506, "step": 1532 }, { "epoch": 2.3078660143018443, "grad_norm": 0.27353599667549133, "learning_rate": 9.739979110644066e-05, "loss": 0.3671, "step": 1533 }, { "epoch": 2.3093714715844937, "grad_norm": 0.3063275218009949, "learning_rate": 9.739480355372838e-05, "loss": 0.3383, "step": 1534 }, { "epoch": 2.3108769288671436, "grad_norm": 0.32081568241119385, "learning_rate": 9.738981136478411e-05, "loss": 0.375, "step": 1535 }, { "epoch": 2.312382386149793, "grad_norm": 0.33564525842666626, "learning_rate": 9.738481454015382e-05, "loss": 0.3631, "step": 1536 }, { "epoch": 2.3138878434324424, "grad_norm": 0.31125393509864807, "learning_rate": 9.737981308038394e-05, "loss": 0.3141, "step": 1537 }, { "epoch": 2.315393300715092, "grad_norm": 0.2724047005176544, "learning_rate": 9.737480698602142e-05, "loss": 0.4002, "step": 1538 }, { "epoch": 2.316898757997742, "grad_norm": 0.29797589778900146, "learning_rate": 9.73697962576137e-05, "loss": 0.3823, "step": 1539 }, { "epoch": 2.3184042152803914, "grad_norm": 0.2505238950252533, "learning_rate": 9.736478089570876e-05, "loss": 0.3987, "step": 1540 }, { "epoch": 2.319909672563041, "grad_norm": 0.27900490164756775, "learning_rate": 9.735976090085509e-05, "loss": 0.36, "step": 1541 }, { "epoch": 2.3214151298456907, "grad_norm": 0.26154470443725586, "learning_rate": 9.735473627360166e-05, "loss": 0.3681, "step": 1542 }, { "epoch": 2.32292058712834, "grad_norm": 0.2651683986186981, "learning_rate": 9.734970701449795e-05, "loss": 0.3669, "step": 1543 }, { "epoch": 2.32442604441099, "grad_norm": 0.2107134461402893, "learning_rate": 9.734467312409395e-05, "loss": 0.3787, "step": 1544 }, { "epoch": 2.3259315016936393, "grad_norm": 0.2377001941204071, "learning_rate": 9.733963460294015e-05, "loss": 0.4075, "step": 1545 }, { "epoch": 2.327436958976289, "grad_norm": 0.2626839280128479, "learning_rate": 9.733459145158758e-05, "loss": 0.3369, "step": 1546 }, { "epoch": 2.3289424162589385, "grad_norm": 0.2786749005317688, "learning_rate": 9.732954367058772e-05, "loss": 0.4022, "step": 1547 }, { "epoch": 2.3304478735415883, "grad_norm": 0.29839226603507996, "learning_rate": 9.732449126049262e-05, "loss": 0.3236, "step": 1548 }, { "epoch": 2.3319533308242377, "grad_norm": 0.27734115719795227, "learning_rate": 9.73194342218548e-05, "loss": 0.3414, "step": 1549 }, { "epoch": 2.3334587881068876, "grad_norm": 0.25651174783706665, "learning_rate": 9.731437255522727e-05, "loss": 0.3169, "step": 1550 }, { "epoch": 2.334964245389537, "grad_norm": 0.2453424632549286, "learning_rate": 9.730930626116356e-05, "loss": 0.3696, "step": 1551 }, { "epoch": 2.336469702672187, "grad_norm": 0.20060652494430542, "learning_rate": 9.730423534021775e-05, "loss": 0.3147, "step": 1552 }, { "epoch": 2.337975159954836, "grad_norm": 0.2266579121351242, "learning_rate": 9.729915979294434e-05, "loss": 0.3363, "step": 1553 }, { "epoch": 2.339480617237486, "grad_norm": 0.2413148581981659, "learning_rate": 9.729407961989845e-05, "loss": 0.3743, "step": 1554 }, { "epoch": 2.3409860745201354, "grad_norm": 0.26424044370651245, "learning_rate": 9.728899482163557e-05, "loss": 0.3171, "step": 1555 }, { "epoch": 2.3424915318027852, "grad_norm": 0.26158589124679565, "learning_rate": 9.728390539871181e-05, "loss": 0.3646, "step": 1556 }, { "epoch": 2.3439969890854346, "grad_norm": 0.2631857991218567, "learning_rate": 9.727881135168374e-05, "loss": 0.3708, "step": 1557 }, { "epoch": 2.3455024463680845, "grad_norm": 0.31587904691696167, "learning_rate": 9.72737126811084e-05, "loss": 0.3531, "step": 1558 }, { "epoch": 2.347007903650734, "grad_norm": 0.30778768658638, "learning_rate": 9.726860938754342e-05, "loss": 0.3952, "step": 1559 }, { "epoch": 2.3485133609333837, "grad_norm": 0.30633044242858887, "learning_rate": 9.726350147154684e-05, "loss": 0.4352, "step": 1560 }, { "epoch": 2.350018818216033, "grad_norm": 0.26238739490509033, "learning_rate": 9.725838893367731e-05, "loss": 0.3307, "step": 1561 }, { "epoch": 2.351524275498683, "grad_norm": 0.22489997744560242, "learning_rate": 9.725327177449389e-05, "loss": 0.3494, "step": 1562 }, { "epoch": 2.3530297327813323, "grad_norm": 0.2688733637332916, "learning_rate": 9.724814999455619e-05, "loss": 0.3007, "step": 1563 }, { "epoch": 2.3545351900639817, "grad_norm": 0.3055160343647003, "learning_rate": 9.724302359442434e-05, "loss": 0.3101, "step": 1564 }, { "epoch": 2.3560406473466315, "grad_norm": 0.27860167622566223, "learning_rate": 9.723789257465892e-05, "loss": 0.3925, "step": 1565 }, { "epoch": 2.3575461046292814, "grad_norm": 0.2726726531982422, "learning_rate": 9.72327569358211e-05, "loss": 0.3655, "step": 1566 }, { "epoch": 2.3590515619119308, "grad_norm": 0.21360479295253754, "learning_rate": 9.722761667847246e-05, "loss": 0.3877, "step": 1567 }, { "epoch": 2.36055701919458, "grad_norm": 0.2213324010372162, "learning_rate": 9.722247180317515e-05, "loss": 0.3646, "step": 1568 }, { "epoch": 2.36206247647723, "grad_norm": 0.2131071537733078, "learning_rate": 9.72173223104918e-05, "loss": 0.2951, "step": 1569 }, { "epoch": 2.3635679337598794, "grad_norm": 0.26314011216163635, "learning_rate": 9.721216820098556e-05, "loss": 0.3352, "step": 1570 }, { "epoch": 2.3650733910425292, "grad_norm": 0.26799097657203674, "learning_rate": 9.720700947522007e-05, "loss": 0.3319, "step": 1571 }, { "epoch": 2.3665788483251786, "grad_norm": 0.24170845746994019, "learning_rate": 9.720184613375947e-05, "loss": 0.3138, "step": 1572 }, { "epoch": 2.3680843056078285, "grad_norm": 0.234614297747612, "learning_rate": 9.719667817716844e-05, "loss": 0.3792, "step": 1573 }, { "epoch": 2.369589762890478, "grad_norm": 0.2206929326057434, "learning_rate": 9.719150560601212e-05, "loss": 0.3409, "step": 1574 }, { "epoch": 2.3710952201731277, "grad_norm": 0.27901530265808105, "learning_rate": 9.71863284208562e-05, "loss": 0.3464, "step": 1575 }, { "epoch": 2.372600677455777, "grad_norm": 0.2963927984237671, "learning_rate": 9.718114662226681e-05, "loss": 0.3438, "step": 1576 }, { "epoch": 2.374106134738427, "grad_norm": 0.26642200350761414, "learning_rate": 9.717596021081065e-05, "loss": 0.31, "step": 1577 }, { "epoch": 2.3756115920210763, "grad_norm": 0.2782747447490692, "learning_rate": 9.717076918705489e-05, "loss": 0.3897, "step": 1578 }, { "epoch": 2.377117049303726, "grad_norm": 0.3126664459705353, "learning_rate": 9.716557355156721e-05, "loss": 0.3621, "step": 1579 }, { "epoch": 2.3786225065863755, "grad_norm": 0.24073415994644165, "learning_rate": 9.716037330491581e-05, "loss": 0.2985, "step": 1580 }, { "epoch": 2.3801279638690254, "grad_norm": 0.202269047498703, "learning_rate": 9.715516844766936e-05, "loss": 0.3448, "step": 1581 }, { "epoch": 2.3816334211516748, "grad_norm": 0.2640990912914276, "learning_rate": 9.714995898039709e-05, "loss": 0.3654, "step": 1582 }, { "epoch": 2.3831388784343246, "grad_norm": 0.2342236340045929, "learning_rate": 9.714474490366866e-05, "loss": 0.3788, "step": 1583 }, { "epoch": 2.384644335716974, "grad_norm": 0.23425282537937164, "learning_rate": 9.71395262180543e-05, "loss": 0.3941, "step": 1584 }, { "epoch": 2.386149792999624, "grad_norm": 0.212358295917511, "learning_rate": 9.71343029241247e-05, "loss": 0.4016, "step": 1585 }, { "epoch": 2.387655250282273, "grad_norm": 0.2384428083896637, "learning_rate": 9.712907502245107e-05, "loss": 0.3489, "step": 1586 }, { "epoch": 2.3891607075649226, "grad_norm": 0.24950096011161804, "learning_rate": 9.712384251360517e-05, "loss": 0.3624, "step": 1587 }, { "epoch": 2.3906661648475724, "grad_norm": 0.2718904912471771, "learning_rate": 9.711860539815916e-05, "loss": 0.3513, "step": 1588 }, { "epoch": 2.3921716221302223, "grad_norm": 0.34140217304229736, "learning_rate": 9.711336367668579e-05, "loss": 0.3637, "step": 1589 }, { "epoch": 2.3936770794128717, "grad_norm": 0.3058236539363861, "learning_rate": 9.71081173497583e-05, "loss": 0.3326, "step": 1590 }, { "epoch": 2.395182536695521, "grad_norm": 0.27137917280197144, "learning_rate": 9.710286641795037e-05, "loss": 0.4042, "step": 1591 }, { "epoch": 2.396687993978171, "grad_norm": 0.3120802938938141, "learning_rate": 9.709761088183631e-05, "loss": 0.3509, "step": 1592 }, { "epoch": 2.3981934512608203, "grad_norm": 0.29302284121513367, "learning_rate": 9.709235074199079e-05, "loss": 0.3815, "step": 1593 }, { "epoch": 2.39969890854347, "grad_norm": 0.2456607222557068, "learning_rate": 9.708708599898909e-05, "loss": 0.295, "step": 1594 }, { "epoch": 2.4012043658261195, "grad_norm": 0.25681447982788086, "learning_rate": 9.708181665340693e-05, "loss": 0.4024, "step": 1595 }, { "epoch": 2.4027098231087693, "grad_norm": 0.24998372793197632, "learning_rate": 9.707654270582057e-05, "loss": 0.3571, "step": 1596 }, { "epoch": 2.4042152803914187, "grad_norm": 0.2343527376651764, "learning_rate": 9.707126415680679e-05, "loss": 0.3304, "step": 1597 }, { "epoch": 2.4057207376740686, "grad_norm": 0.23473796248435974, "learning_rate": 9.706598100694279e-05, "loss": 0.2756, "step": 1598 }, { "epoch": 2.407226194956718, "grad_norm": 0.24210317432880402, "learning_rate": 9.706069325680634e-05, "loss": 0.3487, "step": 1599 }, { "epoch": 2.408731652239368, "grad_norm": 0.2262105792760849, "learning_rate": 9.705540090697575e-05, "loss": 0.401, "step": 1600 }, { "epoch": 2.408731652239368, "eval_loss": 0.34067481756210327, "eval_runtime": 551.0967, "eval_samples_per_second": 17.469, "eval_steps_per_second": 0.546, "step": 1600 }, { "epoch": 2.410237109522017, "grad_norm": 0.2605046331882477, "learning_rate": 9.705010395802971e-05, "loss": 0.3357, "step": 1601 }, { "epoch": 2.411742566804667, "grad_norm": 0.2828574478626251, "learning_rate": 9.704480241054755e-05, "loss": 0.3918, "step": 1602 }, { "epoch": 2.4132480240873164, "grad_norm": 0.24501658976078033, "learning_rate": 9.7039496265109e-05, "loss": 0.3595, "step": 1603 }, { "epoch": 2.4147534813699663, "grad_norm": 0.26198866963386536, "learning_rate": 9.703418552229434e-05, "loss": 0.3111, "step": 1604 }, { "epoch": 2.4162589386526157, "grad_norm": 0.2498687356710434, "learning_rate": 9.702887018268435e-05, "loss": 0.3225, "step": 1605 }, { "epoch": 2.4177643959352655, "grad_norm": 0.22599071264266968, "learning_rate": 9.70235502468603e-05, "loss": 0.3224, "step": 1606 }, { "epoch": 2.419269853217915, "grad_norm": 0.22304268181324005, "learning_rate": 9.7018225715404e-05, "loss": 0.3173, "step": 1607 }, { "epoch": 2.4207753105005647, "grad_norm": 0.20867738127708435, "learning_rate": 9.701289658889769e-05, "loss": 0.341, "step": 1608 }, { "epoch": 2.422280767783214, "grad_norm": 0.26461851596832275, "learning_rate": 9.700756286792419e-05, "loss": 0.4173, "step": 1609 }, { "epoch": 2.423786225065864, "grad_norm": 0.23812054097652435, "learning_rate": 9.700222455306675e-05, "loss": 0.3927, "step": 1610 }, { "epoch": 2.4252916823485133, "grad_norm": 0.2298751026391983, "learning_rate": 9.69968816449092e-05, "loss": 0.2972, "step": 1611 }, { "epoch": 2.426797139631163, "grad_norm": 0.22779878973960876, "learning_rate": 9.69915341440358e-05, "loss": 0.3244, "step": 1612 }, { "epoch": 2.4283025969138126, "grad_norm": 0.24282342195510864, "learning_rate": 9.698618205103138e-05, "loss": 0.3521, "step": 1613 }, { "epoch": 2.429808054196462, "grad_norm": 0.2702562212944031, "learning_rate": 9.69808253664812e-05, "loss": 0.3044, "step": 1614 }, { "epoch": 2.431313511479112, "grad_norm": 0.3046414852142334, "learning_rate": 9.697546409097107e-05, "loss": 0.3751, "step": 1615 }, { "epoch": 2.4328189687617616, "grad_norm": 0.3397657573223114, "learning_rate": 9.69700982250873e-05, "loss": 0.3318, "step": 1616 }, { "epoch": 2.434324426044411, "grad_norm": 0.29471099376678467, "learning_rate": 9.696472776941671e-05, "loss": 0.2892, "step": 1617 }, { "epoch": 2.4358298833270604, "grad_norm": 0.27224570512771606, "learning_rate": 9.695935272454656e-05, "loss": 0.2814, "step": 1618 }, { "epoch": 2.4373353406097102, "grad_norm": 0.22532328963279724, "learning_rate": 9.69539730910647e-05, "loss": 0.2999, "step": 1619 }, { "epoch": 2.4388407978923596, "grad_norm": 0.2904978394508362, "learning_rate": 9.694858886955939e-05, "loss": 0.3197, "step": 1620 }, { "epoch": 2.4403462551750095, "grad_norm": 0.3558069169521332, "learning_rate": 9.694320006061949e-05, "loss": 0.3041, "step": 1621 }, { "epoch": 2.441851712457659, "grad_norm": 0.28344812989234924, "learning_rate": 9.693780666483429e-05, "loss": 0.3771, "step": 1622 }, { "epoch": 2.4433571697403087, "grad_norm": 0.24762804806232452, "learning_rate": 9.693240868279362e-05, "loss": 0.3145, "step": 1623 }, { "epoch": 2.444862627022958, "grad_norm": 0.25388577580451965, "learning_rate": 9.692700611508775e-05, "loss": 0.3983, "step": 1624 }, { "epoch": 2.446368084305608, "grad_norm": 0.2315082997083664, "learning_rate": 9.692159896230756e-05, "loss": 0.3432, "step": 1625 }, { "epoch": 2.4478735415882573, "grad_norm": 0.28740131855010986, "learning_rate": 9.691618722504431e-05, "loss": 0.362, "step": 1626 }, { "epoch": 2.449378998870907, "grad_norm": 0.28828659653663635, "learning_rate": 9.691077090388987e-05, "loss": 0.3362, "step": 1627 }, { "epoch": 2.4508844561535565, "grad_norm": 0.26627928018569946, "learning_rate": 9.690534999943652e-05, "loss": 0.3157, "step": 1628 }, { "epoch": 2.4523899134362064, "grad_norm": 0.305012047290802, "learning_rate": 9.68999245122771e-05, "loss": 0.3098, "step": 1629 }, { "epoch": 2.4538953707188558, "grad_norm": 0.2478407323360443, "learning_rate": 9.689449444300493e-05, "loss": 0.3318, "step": 1630 }, { "epoch": 2.4554008280015056, "grad_norm": 0.21625551581382751, "learning_rate": 9.688905979221384e-05, "loss": 0.366, "step": 1631 }, { "epoch": 2.456906285284155, "grad_norm": 0.24449501931667328, "learning_rate": 9.688362056049813e-05, "loss": 0.2788, "step": 1632 }, { "epoch": 2.458411742566805, "grad_norm": 0.22342254221439362, "learning_rate": 9.687817674845267e-05, "loss": 0.2527, "step": 1633 }, { "epoch": 2.4599171998494542, "grad_norm": 0.2207355946302414, "learning_rate": 9.687272835667275e-05, "loss": 0.3309, "step": 1634 }, { "epoch": 2.461422657132104, "grad_norm": 0.20463770627975464, "learning_rate": 9.686727538575422e-05, "loss": 0.3144, "step": 1635 }, { "epoch": 2.4629281144147535, "grad_norm": 0.21145153045654297, "learning_rate": 9.686181783629342e-05, "loss": 0.3466, "step": 1636 }, { "epoch": 2.464433571697403, "grad_norm": 0.21689888834953308, "learning_rate": 9.685635570888712e-05, "loss": 0.3276, "step": 1637 }, { "epoch": 2.4659390289800527, "grad_norm": 0.21637648344039917, "learning_rate": 9.68508890041327e-05, "loss": 0.3799, "step": 1638 }, { "epoch": 2.4674444862627025, "grad_norm": 0.2333110123872757, "learning_rate": 9.6845417722628e-05, "loss": 0.3239, "step": 1639 }, { "epoch": 2.468949943545352, "grad_norm": 0.2117045521736145, "learning_rate": 9.683994186497132e-05, "loss": 0.3133, "step": 1640 }, { "epoch": 2.4704554008280013, "grad_norm": 0.2068711221218109, "learning_rate": 9.683446143176151e-05, "loss": 0.2925, "step": 1641 }, { "epoch": 2.471960858110651, "grad_norm": 0.19457492232322693, "learning_rate": 9.682897642359789e-05, "loss": 0.2468, "step": 1642 }, { "epoch": 2.4734663153933005, "grad_norm": 0.2375800609588623, "learning_rate": 9.682348684108028e-05, "loss": 0.3931, "step": 1643 }, { "epoch": 2.4749717726759504, "grad_norm": 0.27704349160194397, "learning_rate": 9.681799268480905e-05, "loss": 0.3679, "step": 1644 }, { "epoch": 2.4764772299585998, "grad_norm": 0.261982262134552, "learning_rate": 9.6812493955385e-05, "loss": 0.3503, "step": 1645 }, { "epoch": 2.4779826872412496, "grad_norm": 0.2678753733634949, "learning_rate": 9.680699065340949e-05, "loss": 0.3593, "step": 1646 }, { "epoch": 2.479488144523899, "grad_norm": 0.29698193073272705, "learning_rate": 9.680148277948433e-05, "loss": 0.2862, "step": 1647 }, { "epoch": 2.480993601806549, "grad_norm": 0.2550787031650543, "learning_rate": 9.679597033421186e-05, "loss": 0.3638, "step": 1648 }, { "epoch": 2.482499059089198, "grad_norm": 0.2201438695192337, "learning_rate": 9.679045331819491e-05, "loss": 0.3354, "step": 1649 }, { "epoch": 2.484004516371848, "grad_norm": 0.3426288664340973, "learning_rate": 9.678493173203682e-05, "loss": 0.2936, "step": 1650 }, { "epoch": 2.4855099736544974, "grad_norm": 0.31695547699928284, "learning_rate": 9.677940557634142e-05, "loss": 0.3232, "step": 1651 }, { "epoch": 2.4870154309371473, "grad_norm": 0.3046022355556488, "learning_rate": 9.677387485171305e-05, "loss": 0.308, "step": 1652 }, { "epoch": 2.4885208882197967, "grad_norm": 0.31809765100479126, "learning_rate": 9.676833955875651e-05, "loss": 0.2774, "step": 1653 }, { "epoch": 2.4900263455024465, "grad_norm": 0.28002461791038513, "learning_rate": 9.676279969807717e-05, "loss": 0.324, "step": 1654 }, { "epoch": 2.491531802785096, "grad_norm": 0.23536716401576996, "learning_rate": 9.675725527028083e-05, "loss": 0.4039, "step": 1655 }, { "epoch": 2.4930372600677457, "grad_norm": 0.23583164811134338, "learning_rate": 9.675170627597386e-05, "loss": 0.3147, "step": 1656 }, { "epoch": 2.494542717350395, "grad_norm": 0.2388509064912796, "learning_rate": 9.674615271576305e-05, "loss": 0.3084, "step": 1657 }, { "epoch": 2.496048174633045, "grad_norm": 0.2533866763114929, "learning_rate": 9.674059459025575e-05, "loss": 0.3721, "step": 1658 }, { "epoch": 2.4975536319156943, "grad_norm": 0.24317815899848938, "learning_rate": 9.673503190005977e-05, "loss": 0.2861, "step": 1659 }, { "epoch": 2.499059089198344, "grad_norm": 0.2065383940935135, "learning_rate": 9.672946464578345e-05, "loss": 0.294, "step": 1660 }, { "epoch": 2.5005645464809936, "grad_norm": 0.23928245902061462, "learning_rate": 9.672389282803563e-05, "loss": 0.3106, "step": 1661 }, { "epoch": 2.5020700037636434, "grad_norm": 0.2855374813079834, "learning_rate": 9.67183164474256e-05, "loss": 0.3953, "step": 1662 }, { "epoch": 2.503575461046293, "grad_norm": 0.30243396759033203, "learning_rate": 9.671273550456322e-05, "loss": 0.3299, "step": 1663 }, { "epoch": 2.505080918328942, "grad_norm": 0.21601983904838562, "learning_rate": 9.670715000005878e-05, "loss": 0.3083, "step": 1664 }, { "epoch": 2.506586375611592, "grad_norm": 0.3087557256221771, "learning_rate": 9.670155993452314e-05, "loss": 0.427, "step": 1665 }, { "epoch": 2.508091832894242, "grad_norm": 0.3858621418476105, "learning_rate": 9.669596530856761e-05, "loss": 0.4173, "step": 1666 }, { "epoch": 2.5095972901768913, "grad_norm": 0.2938467860221863, "learning_rate": 9.6690366122804e-05, "loss": 0.3036, "step": 1667 }, { "epoch": 2.5111027474595407, "grad_norm": 0.33520248532295227, "learning_rate": 9.668476237784462e-05, "loss": 0.3155, "step": 1668 }, { "epoch": 2.5126082047421905, "grad_norm": 0.27852553129196167, "learning_rate": 9.66791540743023e-05, "loss": 0.3252, "step": 1669 }, { "epoch": 2.51411366202484, "grad_norm": 0.23968590795993805, "learning_rate": 9.667354121279035e-05, "loss": 0.2981, "step": 1670 }, { "epoch": 2.5156191193074897, "grad_norm": 0.23353546857833862, "learning_rate": 9.66679237939226e-05, "loss": 0.2931, "step": 1671 }, { "epoch": 2.517124576590139, "grad_norm": 0.2784257233142853, "learning_rate": 9.666230181831333e-05, "loss": 0.3753, "step": 1672 }, { "epoch": 2.518630033872789, "grad_norm": 0.26167863607406616, "learning_rate": 9.66566752865774e-05, "loss": 0.2735, "step": 1673 }, { "epoch": 2.5201354911554383, "grad_norm": 0.2635033428668976, "learning_rate": 9.665104419933009e-05, "loss": 0.3522, "step": 1674 }, { "epoch": 2.521640948438088, "grad_norm": 0.2717929184436798, "learning_rate": 9.66454085571872e-05, "loss": 0.3598, "step": 1675 }, { "epoch": 2.5231464057207376, "grad_norm": 0.27072662115097046, "learning_rate": 9.663976836076502e-05, "loss": 0.3668, "step": 1676 }, { "epoch": 2.5246518630033874, "grad_norm": 0.26915666460990906, "learning_rate": 9.66341236106804e-05, "loss": 0.3968, "step": 1677 }, { "epoch": 2.526157320286037, "grad_norm": 0.3024865984916687, "learning_rate": 9.66284743075506e-05, "loss": 0.368, "step": 1678 }, { "epoch": 2.5276627775686866, "grad_norm": 0.29090529680252075, "learning_rate": 9.662282045199345e-05, "loss": 0.2762, "step": 1679 }, { "epoch": 2.529168234851336, "grad_norm": 0.3499895930290222, "learning_rate": 9.661716204462726e-05, "loss": 0.3005, "step": 1680 }, { "epoch": 2.530673692133986, "grad_norm": 0.37457191944122314, "learning_rate": 9.661149908607077e-05, "loss": 0.2376, "step": 1681 }, { "epoch": 2.5321791494166352, "grad_norm": 0.34734392166137695, "learning_rate": 9.660583157694329e-05, "loss": 0.3155, "step": 1682 }, { "epoch": 2.5336846066992846, "grad_norm": 0.2639312446117401, "learning_rate": 9.660015951786465e-05, "loss": 0.3431, "step": 1683 }, { "epoch": 2.5351900639819345, "grad_norm": 0.285207062959671, "learning_rate": 9.65944829094551e-05, "loss": 0.362, "step": 1684 }, { "epoch": 2.5366955212645843, "grad_norm": 0.3118298649787903, "learning_rate": 9.658880175233544e-05, "loss": 0.3603, "step": 1685 }, { "epoch": 2.5382009785472337, "grad_norm": 0.23655925691127777, "learning_rate": 9.658311604712693e-05, "loss": 0.33, "step": 1686 }, { "epoch": 2.539706435829883, "grad_norm": 0.2715807259082794, "learning_rate": 9.657742579445138e-05, "loss": 0.3227, "step": 1687 }, { "epoch": 2.541211893112533, "grad_norm": 0.30384713411331177, "learning_rate": 9.657173099493107e-05, "loss": 0.408, "step": 1688 }, { "epoch": 2.5427173503951828, "grad_norm": 0.2332019954919815, "learning_rate": 9.656603164918873e-05, "loss": 0.3527, "step": 1689 }, { "epoch": 2.544222807677832, "grad_norm": 0.21020738780498505, "learning_rate": 9.656032775784769e-05, "loss": 0.4078, "step": 1690 }, { "epoch": 2.5457282649604815, "grad_norm": 0.29488256573677063, "learning_rate": 9.655461932153168e-05, "loss": 0.3125, "step": 1691 }, { "epoch": 2.5472337222431314, "grad_norm": 0.2905116379261017, "learning_rate": 9.654890634086497e-05, "loss": 0.3127, "step": 1692 }, { "epoch": 2.548739179525781, "grad_norm": 0.22115111351013184, "learning_rate": 9.654318881647235e-05, "loss": 0.3666, "step": 1693 }, { "epoch": 2.5502446368084306, "grad_norm": 0.2510709762573242, "learning_rate": 9.653746674897904e-05, "loss": 0.3296, "step": 1694 }, { "epoch": 2.55175009409108, "grad_norm": 0.2652914524078369, "learning_rate": 9.653174013901083e-05, "loss": 0.2976, "step": 1695 }, { "epoch": 2.55325555137373, "grad_norm": 0.2593052387237549, "learning_rate": 9.652600898719395e-05, "loss": 0.2703, "step": 1696 }, { "epoch": 2.5547610086563792, "grad_norm": 0.22498691082000732, "learning_rate": 9.652027329415517e-05, "loss": 0.317, "step": 1697 }, { "epoch": 2.556266465939029, "grad_norm": 0.2644052803516388, "learning_rate": 9.651453306052173e-05, "loss": 0.2822, "step": 1698 }, { "epoch": 2.5577719232216785, "grad_norm": 0.27412617206573486, "learning_rate": 9.650878828692137e-05, "loss": 0.3716, "step": 1699 }, { "epoch": 2.5592773805043283, "grad_norm": 0.22539226710796356, "learning_rate": 9.650303897398232e-05, "loss": 0.2777, "step": 1700 }, { "epoch": 2.5607828377869777, "grad_norm": 0.28394579887390137, "learning_rate": 9.649728512233333e-05, "loss": 0.3804, "step": 1701 }, { "epoch": 2.5622882950696275, "grad_norm": 0.2763226330280304, "learning_rate": 9.649152673260363e-05, "loss": 0.3434, "step": 1702 }, { "epoch": 2.563793752352277, "grad_norm": 2.1125943660736084, "learning_rate": 9.648576380542294e-05, "loss": 0.3266, "step": 1703 }, { "epoch": 2.5652992096349267, "grad_norm": 0.33309030532836914, "learning_rate": 9.647999634142151e-05, "loss": 0.313, "step": 1704 }, { "epoch": 2.566804666917576, "grad_norm": 0.3971288204193115, "learning_rate": 9.647422434123004e-05, "loss": 0.3008, "step": 1705 }, { "epoch": 2.568310124200226, "grad_norm": 0.2754718065261841, "learning_rate": 9.646844780547975e-05, "loss": 0.3539, "step": 1706 }, { "epoch": 2.5698155814828754, "grad_norm": 0.3145469129085541, "learning_rate": 9.646266673480235e-05, "loss": 0.3228, "step": 1707 }, { "epoch": 2.571321038765525, "grad_norm": 0.26361748576164246, "learning_rate": 9.645688112983006e-05, "loss": 0.235, "step": 1708 }, { "epoch": 2.5728264960481746, "grad_norm": 0.2693319320678711, "learning_rate": 9.645109099119556e-05, "loss": 0.347, "step": 1709 }, { "epoch": 2.574331953330824, "grad_norm": 0.2639162540435791, "learning_rate": 9.64452963195321e-05, "loss": 0.288, "step": 1710 }, { "epoch": 2.575837410613474, "grad_norm": 0.22633807361125946, "learning_rate": 9.643949711547333e-05, "loss": 0.3292, "step": 1711 }, { "epoch": 2.5773428678961237, "grad_norm": 0.27151182293891907, "learning_rate": 9.643369337965346e-05, "loss": 0.3262, "step": 1712 }, { "epoch": 2.578848325178773, "grad_norm": 0.25463607907295227, "learning_rate": 9.642788511270718e-05, "loss": 0.3487, "step": 1713 }, { "epoch": 2.5803537824614224, "grad_norm": 0.24089990556240082, "learning_rate": 9.642207231526968e-05, "loss": 0.2251, "step": 1714 }, { "epoch": 2.5818592397440723, "grad_norm": 0.2618332505226135, "learning_rate": 9.641625498797661e-05, "loss": 0.3292, "step": 1715 }, { "epoch": 2.583364697026722, "grad_norm": 0.23794429004192352, "learning_rate": 9.641043313146417e-05, "loss": 0.3703, "step": 1716 }, { "epoch": 2.5848701543093715, "grad_norm": 0.2832286059856415, "learning_rate": 9.640460674636902e-05, "loss": 0.43, "step": 1717 }, { "epoch": 2.586375611592021, "grad_norm": 0.3038676381111145, "learning_rate": 9.639877583332832e-05, "loss": 0.3443, "step": 1718 }, { "epoch": 2.5878810688746707, "grad_norm": 0.26356831192970276, "learning_rate": 9.639294039297976e-05, "loss": 0.3624, "step": 1719 }, { "epoch": 2.58938652615732, "grad_norm": 0.2645999491214752, "learning_rate": 9.638710042596146e-05, "loss": 0.3872, "step": 1720 }, { "epoch": 2.59089198343997, "grad_norm": 0.21897435188293457, "learning_rate": 9.638125593291208e-05, "loss": 0.3363, "step": 1721 }, { "epoch": 2.5923974407226194, "grad_norm": 0.2533116936683655, "learning_rate": 9.637540691447077e-05, "loss": 0.3103, "step": 1722 }, { "epoch": 2.593902898005269, "grad_norm": 0.3229147493839264, "learning_rate": 9.636955337127716e-05, "loss": 0.2827, "step": 1723 }, { "epoch": 2.5954083552879186, "grad_norm": 0.2415551245212555, "learning_rate": 9.636369530397141e-05, "loss": 0.2948, "step": 1724 }, { "epoch": 2.5969138125705684, "grad_norm": 0.24350422620773315, "learning_rate": 9.635783271319409e-05, "loss": 0.2911, "step": 1725 }, { "epoch": 2.598419269853218, "grad_norm": 0.2379985749721527, "learning_rate": 9.635196559958641e-05, "loss": 0.3029, "step": 1726 }, { "epoch": 2.5999247271358676, "grad_norm": 0.19600358605384827, "learning_rate": 9.634609396378992e-05, "loss": 0.2806, "step": 1727 }, { "epoch": 2.601430184418517, "grad_norm": 0.20162837207317352, "learning_rate": 9.634021780644676e-05, "loss": 0.3126, "step": 1728 }, { "epoch": 2.602935641701167, "grad_norm": 0.2284020334482193, "learning_rate": 9.633433712819955e-05, "loss": 0.3329, "step": 1729 }, { "epoch": 2.6044410989838163, "grad_norm": 0.23770831525325775, "learning_rate": 9.632845192969136e-05, "loss": 0.3329, "step": 1730 }, { "epoch": 2.605946556266466, "grad_norm": 0.26591432094573975, "learning_rate": 9.632256221156581e-05, "loss": 0.3392, "step": 1731 }, { "epoch": 2.6074520135491155, "grad_norm": 0.33824002742767334, "learning_rate": 9.631666797446696e-05, "loss": 0.3313, "step": 1732 }, { "epoch": 2.608957470831765, "grad_norm": 0.3185422122478485, "learning_rate": 9.631076921903945e-05, "loss": 0.2638, "step": 1733 }, { "epoch": 2.6104629281144147, "grad_norm": 0.2628394663333893, "learning_rate": 9.630486594592833e-05, "loss": 0.2961, "step": 1734 }, { "epoch": 2.6119683853970646, "grad_norm": 0.24738237261772156, "learning_rate": 9.629895815577916e-05, "loss": 0.3154, "step": 1735 }, { "epoch": 2.613473842679714, "grad_norm": 0.22129525244235992, "learning_rate": 9.629304584923802e-05, "loss": 0.3661, "step": 1736 }, { "epoch": 2.6149792999623633, "grad_norm": 0.20888201892375946, "learning_rate": 9.628712902695146e-05, "loss": 0.3163, "step": 1737 }, { "epoch": 2.616484757245013, "grad_norm": 0.22559459507465363, "learning_rate": 9.628120768956655e-05, "loss": 0.2984, "step": 1738 }, { "epoch": 2.617990214527663, "grad_norm": 0.21491681039333344, "learning_rate": 9.627528183773083e-05, "loss": 0.3066, "step": 1739 }, { "epoch": 2.6194956718103124, "grad_norm": 0.24843786656856537, "learning_rate": 9.626935147209235e-05, "loss": 0.2516, "step": 1740 }, { "epoch": 2.621001129092962, "grad_norm": 0.24127717316150665, "learning_rate": 9.626341659329963e-05, "loss": 0.332, "step": 1741 }, { "epoch": 2.6225065863756116, "grad_norm": 0.28660520911216736, "learning_rate": 9.625747720200173e-05, "loss": 0.3231, "step": 1742 }, { "epoch": 2.6240120436582615, "grad_norm": 0.25691109895706177, "learning_rate": 9.625153329884815e-05, "loss": 0.349, "step": 1743 }, { "epoch": 2.625517500940911, "grad_norm": 0.23975402116775513, "learning_rate": 9.624558488448889e-05, "loss": 0.3136, "step": 1744 }, { "epoch": 2.6270229582235602, "grad_norm": 0.3023480772972107, "learning_rate": 9.62396319595745e-05, "loss": 0.359, "step": 1745 }, { "epoch": 2.62852841550621, "grad_norm": 0.24491187930107117, "learning_rate": 9.623367452475594e-05, "loss": 0.2831, "step": 1746 }, { "epoch": 2.6300338727888595, "grad_norm": 0.24479512870311737, "learning_rate": 9.622771258068475e-05, "loss": 0.2938, "step": 1747 }, { "epoch": 2.6315393300715093, "grad_norm": 0.38338503241539, "learning_rate": 9.622174612801288e-05, "loss": 0.3378, "step": 1748 }, { "epoch": 2.6330447873541587, "grad_norm": 0.2978341579437256, "learning_rate": 9.621577516739284e-05, "loss": 0.3115, "step": 1749 }, { "epoch": 2.6345502446368085, "grad_norm": 0.3475058078765869, "learning_rate": 9.620979969947759e-05, "loss": 0.3692, "step": 1750 }, { "epoch": 2.636055701919458, "grad_norm": 0.2984987795352936, "learning_rate": 9.620381972492059e-05, "loss": 0.4367, "step": 1751 }, { "epoch": 2.6375611592021078, "grad_norm": 0.28113770484924316, "learning_rate": 9.619783524437583e-05, "loss": 0.2956, "step": 1752 }, { "epoch": 2.639066616484757, "grad_norm": 0.25922632217407227, "learning_rate": 9.619184625849775e-05, "loss": 0.2811, "step": 1753 }, { "epoch": 2.640572073767407, "grad_norm": 0.2636757791042328, "learning_rate": 9.618585276794129e-05, "loss": 0.3698, "step": 1754 }, { "epoch": 2.6420775310500564, "grad_norm": 0.25993070006370544, "learning_rate": 9.61798547733619e-05, "loss": 0.3878, "step": 1755 }, { "epoch": 2.643582988332706, "grad_norm": 0.22198276221752167, "learning_rate": 9.61738522754155e-05, "loss": 0.2734, "step": 1756 }, { "epoch": 2.6450884456153556, "grad_norm": 0.24420863389968872, "learning_rate": 9.61678452747585e-05, "loss": 0.3421, "step": 1757 }, { "epoch": 2.6465939028980054, "grad_norm": 0.2563653886318207, "learning_rate": 9.616183377204787e-05, "loss": 0.2896, "step": 1758 }, { "epoch": 2.648099360180655, "grad_norm": 0.23641377687454224, "learning_rate": 9.615581776794096e-05, "loss": 0.3323, "step": 1759 }, { "epoch": 2.6496048174633042, "grad_norm": 0.268172025680542, "learning_rate": 9.61497972630957e-05, "loss": 0.3076, "step": 1760 }, { "epoch": 2.651110274745954, "grad_norm": 0.24027957022190094, "learning_rate": 9.614377225817049e-05, "loss": 0.3301, "step": 1761 }, { "epoch": 2.652615732028604, "grad_norm": 0.25027555227279663, "learning_rate": 9.613774275382419e-05, "loss": 0.3101, "step": 1762 }, { "epoch": 2.6541211893112533, "grad_norm": 0.21793028712272644, "learning_rate": 9.61317087507162e-05, "loss": 0.3127, "step": 1763 }, { "epoch": 2.6556266465939027, "grad_norm": 0.20674856007099152, "learning_rate": 9.612567024950637e-05, "loss": 0.3025, "step": 1764 }, { "epoch": 2.6571321038765525, "grad_norm": 0.19988349080085754, "learning_rate": 9.611962725085509e-05, "loss": 0.2138, "step": 1765 }, { "epoch": 2.6586375611592024, "grad_norm": 0.21004337072372437, "learning_rate": 9.611357975542319e-05, "loss": 0.3447, "step": 1766 }, { "epoch": 2.6601430184418517, "grad_norm": 0.20406118035316467, "learning_rate": 9.610752776387203e-05, "loss": 0.2748, "step": 1767 }, { "epoch": 2.661648475724501, "grad_norm": 0.22126753628253937, "learning_rate": 9.610147127686342e-05, "loss": 0.358, "step": 1768 }, { "epoch": 2.663153933007151, "grad_norm": 0.2184433788061142, "learning_rate": 9.609541029505972e-05, "loss": 0.305, "step": 1769 }, { "epoch": 2.6646593902898004, "grad_norm": 0.220318004488945, "learning_rate": 9.608934481912374e-05, "loss": 0.2971, "step": 1770 }, { "epoch": 2.66616484757245, "grad_norm": 0.19747303426265717, "learning_rate": 9.608327484971876e-05, "loss": 0.2701, "step": 1771 }, { "epoch": 2.6676703048550996, "grad_norm": 0.22135108709335327, "learning_rate": 9.607720038750864e-05, "loss": 0.3158, "step": 1772 }, { "epoch": 2.6691757621377494, "grad_norm": 0.20375578105449677, "learning_rate": 9.607112143315763e-05, "loss": 0.2849, "step": 1773 }, { "epoch": 2.670681219420399, "grad_norm": 8.267990112304688, "learning_rate": 9.606503798733054e-05, "loss": 0.4304, "step": 1774 }, { "epoch": 2.6721866767030487, "grad_norm": 0.36352863907814026, "learning_rate": 9.605895005069262e-05, "loss": 0.3646, "step": 1775 }, { "epoch": 2.673692133985698, "grad_norm": 0.46604177355766296, "learning_rate": 9.605285762390967e-05, "loss": 0.2969, "step": 1776 }, { "epoch": 2.675197591268348, "grad_norm": 0.42937231063842773, "learning_rate": 9.604676070764791e-05, "loss": 0.3591, "step": 1777 }, { "epoch": 2.6767030485509973, "grad_norm": 0.4269549250602722, "learning_rate": 9.604065930257413e-05, "loss": 0.3268, "step": 1778 }, { "epoch": 2.678208505833647, "grad_norm": 0.31019049882888794, "learning_rate": 9.603455340935557e-05, "loss": 0.3003, "step": 1779 }, { "epoch": 2.6797139631162965, "grad_norm": 0.2868860363960266, "learning_rate": 9.602844302865991e-05, "loss": 0.3533, "step": 1780 }, { "epoch": 2.6812194203989463, "grad_norm": 0.26066768169403076, "learning_rate": 9.602232816115542e-05, "loss": 0.3325, "step": 1781 }, { "epoch": 2.6827248776815957, "grad_norm": 0.24034957587718964, "learning_rate": 9.60162088075108e-05, "loss": 0.2737, "step": 1782 }, { "epoch": 2.684230334964245, "grad_norm": 0.23517920076847076, "learning_rate": 9.601008496839525e-05, "loss": 0.3312, "step": 1783 }, { "epoch": 2.685735792246895, "grad_norm": 0.24239815771579742, "learning_rate": 9.600395664447846e-05, "loss": 0.3694, "step": 1784 }, { "epoch": 2.687241249529545, "grad_norm": 0.24726858735084534, "learning_rate": 9.599782383643062e-05, "loss": 0.38, "step": 1785 }, { "epoch": 2.688746706812194, "grad_norm": 0.22088292241096497, "learning_rate": 9.59916865449224e-05, "loss": 0.3801, "step": 1786 }, { "epoch": 2.6902521640948436, "grad_norm": 0.2132876068353653, "learning_rate": 9.5985544770625e-05, "loss": 0.2913, "step": 1787 }, { "epoch": 2.6917576213774934, "grad_norm": 0.2131241410970688, "learning_rate": 9.597939851421002e-05, "loss": 0.324, "step": 1788 }, { "epoch": 2.6932630786601433, "grad_norm": 0.2166922390460968, "learning_rate": 9.597324777634962e-05, "loss": 0.3182, "step": 1789 }, { "epoch": 2.6947685359427926, "grad_norm": 0.22590839862823486, "learning_rate": 9.596709255771647e-05, "loss": 0.3035, "step": 1790 }, { "epoch": 2.696273993225442, "grad_norm": 0.2944963276386261, "learning_rate": 9.596093285898366e-05, "loss": 0.2922, "step": 1791 }, { "epoch": 2.697779450508092, "grad_norm": 0.2601071298122406, "learning_rate": 9.595476868082481e-05, "loss": 0.3128, "step": 1792 }, { "epoch": 2.6992849077907417, "grad_norm": 0.24151138961315155, "learning_rate": 9.594860002391404e-05, "loss": 0.2741, "step": 1793 }, { "epoch": 2.700790365073391, "grad_norm": 0.31307458877563477, "learning_rate": 9.594242688892593e-05, "loss": 0.3547, "step": 1794 }, { "epoch": 2.7022958223560405, "grad_norm": 0.32010120153427124, "learning_rate": 9.593624927653557e-05, "loss": 0.4014, "step": 1795 }, { "epoch": 2.7038012796386903, "grad_norm": 0.27369916439056396, "learning_rate": 9.593006718741855e-05, "loss": 0.3528, "step": 1796 }, { "epoch": 2.7053067369213397, "grad_norm": 0.26442939043045044, "learning_rate": 9.592388062225091e-05, "loss": 0.3005, "step": 1797 }, { "epoch": 2.7068121942039896, "grad_norm": 0.22972232103347778, "learning_rate": 9.591768958170921e-05, "loss": 0.3329, "step": 1798 }, { "epoch": 2.708317651486639, "grad_norm": 0.2496950626373291, "learning_rate": 9.591149406647051e-05, "loss": 0.2818, "step": 1799 }, { "epoch": 2.709823108769289, "grad_norm": 0.2558193802833557, "learning_rate": 9.590529407721231e-05, "loss": 0.3131, "step": 1800 }, { "epoch": 2.709823108769289, "eval_loss": 0.3177741765975952, "eval_runtime": 542.6152, "eval_samples_per_second": 17.742, "eval_steps_per_second": 0.555, "step": 1800 }, { "epoch": 2.711328566051938, "grad_norm": 0.2658045291900635, "learning_rate": 9.589908961461267e-05, "loss": 0.4136, "step": 1801 }, { "epoch": 2.712834023334588, "grad_norm": 0.30200836062431335, "learning_rate": 9.589288067935007e-05, "loss": 0.2967, "step": 1802 }, { "epoch": 2.7143394806172374, "grad_norm": 0.2485121488571167, "learning_rate": 9.588666727210352e-05, "loss": 0.2559, "step": 1803 }, { "epoch": 2.7158449378998872, "grad_norm": 0.3105930685997009, "learning_rate": 9.58804493935525e-05, "loss": 0.3303, "step": 1804 }, { "epoch": 2.7173503951825366, "grad_norm": 0.2812676727771759, "learning_rate": 9.5874227044377e-05, "loss": 0.2725, "step": 1805 }, { "epoch": 2.7188558524651865, "grad_norm": 0.2403954416513443, "learning_rate": 9.58680002252575e-05, "loss": 0.3594, "step": 1806 }, { "epoch": 2.720361309747836, "grad_norm": 0.25064054131507874, "learning_rate": 9.586176893687494e-05, "loss": 0.3528, "step": 1807 }, { "epoch": 2.7218667670304857, "grad_norm": 0.24559059739112854, "learning_rate": 9.585553317991075e-05, "loss": 0.2791, "step": 1808 }, { "epoch": 2.723372224313135, "grad_norm": 0.23300178349018097, "learning_rate": 9.584929295504688e-05, "loss": 0.3276, "step": 1809 }, { "epoch": 2.7248776815957845, "grad_norm": 0.22857141494750977, "learning_rate": 9.584304826296575e-05, "loss": 0.3312, "step": 1810 }, { "epoch": 2.7263831388784343, "grad_norm": 0.2357628047466278, "learning_rate": 9.583679910435026e-05, "loss": 0.3077, "step": 1811 }, { "epoch": 2.727888596161084, "grad_norm": 0.21037554740905762, "learning_rate": 9.583054547988383e-05, "loss": 0.3069, "step": 1812 }, { "epoch": 2.7293940534437335, "grad_norm": 0.2044837921857834, "learning_rate": 9.582428739025033e-05, "loss": 0.2761, "step": 1813 }, { "epoch": 2.730899510726383, "grad_norm": 0.21871916949748993, "learning_rate": 9.581802483613414e-05, "loss": 0.2687, "step": 1814 }, { "epoch": 2.7324049680090328, "grad_norm": 0.21430335938930511, "learning_rate": 9.581175781822012e-05, "loss": 0.34, "step": 1815 }, { "epoch": 2.7339104252916826, "grad_norm": 0.24252304434776306, "learning_rate": 9.580548633719363e-05, "loss": 0.3069, "step": 1816 }, { "epoch": 2.735415882574332, "grad_norm": 0.21466441452503204, "learning_rate": 9.57992103937405e-05, "loss": 0.316, "step": 1817 }, { "epoch": 2.7369213398569814, "grad_norm": 0.21388466656208038, "learning_rate": 9.579292998854706e-05, "loss": 0.2522, "step": 1818 }, { "epoch": 2.738426797139631, "grad_norm": 0.25632449984550476, "learning_rate": 9.578664512230014e-05, "loss": 0.3134, "step": 1819 }, { "epoch": 2.7399322544222806, "grad_norm": 0.18966947495937347, "learning_rate": 9.5780355795687e-05, "loss": 0.3575, "step": 1820 }, { "epoch": 2.7414377117049304, "grad_norm": 0.21534857153892517, "learning_rate": 9.57740620093955e-05, "loss": 0.3701, "step": 1821 }, { "epoch": 2.74294316898758, "grad_norm": 0.19469863176345825, "learning_rate": 9.576776376411386e-05, "loss": 0.3142, "step": 1822 }, { "epoch": 2.7444486262702297, "grad_norm": 0.2568362355232239, "learning_rate": 9.576146106053088e-05, "loss": 0.3519, "step": 1823 }, { "epoch": 2.745954083552879, "grad_norm": 0.2152559459209442, "learning_rate": 9.575515389933579e-05, "loss": 0.3257, "step": 1824 }, { "epoch": 2.747459540835529, "grad_norm": 0.21199235320091248, "learning_rate": 9.574884228121836e-05, "loss": 0.3, "step": 1825 }, { "epoch": 2.7489649981181783, "grad_norm": 0.26545777916908264, "learning_rate": 9.574252620686879e-05, "loss": 0.3202, "step": 1826 }, { "epoch": 2.750470455400828, "grad_norm": 0.2390890121459961, "learning_rate": 9.57362056769778e-05, "loss": 0.3111, "step": 1827 }, { "epoch": 2.7519759126834775, "grad_norm": 0.24091432988643646, "learning_rate": 9.572988069223662e-05, "loss": 0.3234, "step": 1828 }, { "epoch": 2.7534813699661274, "grad_norm": 0.24146698415279388, "learning_rate": 9.572355125333691e-05, "loss": 0.3392, "step": 1829 }, { "epoch": 2.7549868272487767, "grad_norm": 0.32408255338668823, "learning_rate": 9.571721736097089e-05, "loss": 0.3197, "step": 1830 }, { "epoch": 2.7564922845314266, "grad_norm": 0.3675064444541931, "learning_rate": 9.571087901583117e-05, "loss": 0.3615, "step": 1831 }, { "epoch": 2.757997741814076, "grad_norm": 0.3669273555278778, "learning_rate": 9.570453621861093e-05, "loss": 0.2771, "step": 1832 }, { "epoch": 2.7595031990967254, "grad_norm": 0.3282705247402191, "learning_rate": 9.569818897000382e-05, "loss": 0.3106, "step": 1833 }, { "epoch": 2.761008656379375, "grad_norm": 0.27306005358695984, "learning_rate": 9.569183727070396e-05, "loss": 0.3191, "step": 1834 }, { "epoch": 2.762514113662025, "grad_norm": 0.29012972116470337, "learning_rate": 9.568548112140593e-05, "loss": 0.3179, "step": 1835 }, { "epoch": 2.7640195709446744, "grad_norm": 0.26072853803634644, "learning_rate": 9.567912052280486e-05, "loss": 0.3117, "step": 1836 }, { "epoch": 2.765525028227324, "grad_norm": 0.24972747266292572, "learning_rate": 9.567275547559635e-05, "loss": 0.3069, "step": 1837 }, { "epoch": 2.7670304855099737, "grad_norm": 0.2602355480194092, "learning_rate": 9.566638598047642e-05, "loss": 0.265, "step": 1838 }, { "epoch": 2.7685359427926235, "grad_norm": 0.24481818079948425, "learning_rate": 9.566001203814169e-05, "loss": 0.2862, "step": 1839 }, { "epoch": 2.770041400075273, "grad_norm": 0.20777802169322968, "learning_rate": 9.565363364928918e-05, "loss": 0.272, "step": 1840 }, { "epoch": 2.7715468573579223, "grad_norm": 0.23450233042240143, "learning_rate": 9.564725081461639e-05, "loss": 0.2632, "step": 1841 }, { "epoch": 2.773052314640572, "grad_norm": 0.23405419290065765, "learning_rate": 9.564086353482137e-05, "loss": 0.3246, "step": 1842 }, { "epoch": 2.774557771923222, "grad_norm": 0.2314044088125229, "learning_rate": 9.563447181060262e-05, "loss": 0.344, "step": 1843 }, { "epoch": 2.7760632292058713, "grad_norm": 0.2218218743801117, "learning_rate": 9.562807564265913e-05, "loss": 0.3359, "step": 1844 }, { "epoch": 2.7775686864885207, "grad_norm": 0.23392537236213684, "learning_rate": 9.562167503169036e-05, "loss": 0.3467, "step": 1845 }, { "epoch": 2.7790741437711706, "grad_norm": 0.2812747061252594, "learning_rate": 9.56152699783963e-05, "loss": 0.326, "step": 1846 }, { "epoch": 2.78057960105382, "grad_norm": 0.257294625043869, "learning_rate": 9.560886048347736e-05, "loss": 0.3001, "step": 1847 }, { "epoch": 2.78208505833647, "grad_norm": 0.23447862267494202, "learning_rate": 9.560244654763449e-05, "loss": 0.3063, "step": 1848 }, { "epoch": 2.783590515619119, "grad_norm": 0.27092599868774414, "learning_rate": 9.559602817156913e-05, "loss": 0.3205, "step": 1849 }, { "epoch": 2.785095972901769, "grad_norm": 0.21374084055423737, "learning_rate": 9.558960535598316e-05, "loss": 0.2992, "step": 1850 }, { "epoch": 2.7866014301844184, "grad_norm": 0.22544163465499878, "learning_rate": 9.558317810157897e-05, "loss": 0.3006, "step": 1851 }, { "epoch": 2.7881068874670683, "grad_norm": 0.2013038694858551, "learning_rate": 9.557674640905943e-05, "loss": 0.3521, "step": 1852 }, { "epoch": 2.7896123447497176, "grad_norm": 0.21984528005123138, "learning_rate": 9.557031027912792e-05, "loss": 0.3075, "step": 1853 }, { "epoch": 2.7911178020323675, "grad_norm": 0.23673377931118011, "learning_rate": 9.556386971248827e-05, "loss": 0.3355, "step": 1854 }, { "epoch": 2.792623259315017, "grad_norm": 0.2508200407028198, "learning_rate": 9.555742470984481e-05, "loss": 0.2883, "step": 1855 }, { "epoch": 2.7941287165976667, "grad_norm": 0.21471986174583435, "learning_rate": 9.555097527190237e-05, "loss": 0.3464, "step": 1856 }, { "epoch": 2.795634173880316, "grad_norm": 0.22459696233272552, "learning_rate": 9.554452139936623e-05, "loss": 0.3158, "step": 1857 }, { "epoch": 2.797139631162966, "grad_norm": 0.2307433933019638, "learning_rate": 9.553806309294221e-05, "loss": 0.3243, "step": 1858 }, { "epoch": 2.7986450884456153, "grad_norm": 0.23523744940757751, "learning_rate": 9.553160035333655e-05, "loss": 0.2982, "step": 1859 }, { "epoch": 2.8001505457282647, "grad_norm": 0.24816091358661652, "learning_rate": 9.552513318125601e-05, "loss": 0.296, "step": 1860 }, { "epoch": 2.8016560030109146, "grad_norm": 0.2617699205875397, "learning_rate": 9.551866157740782e-05, "loss": 0.2737, "step": 1861 }, { "epoch": 2.8031614602935644, "grad_norm": 0.23617953062057495, "learning_rate": 9.551218554249973e-05, "loss": 0.3336, "step": 1862 }, { "epoch": 2.804666917576214, "grad_norm": 0.2215937077999115, "learning_rate": 9.550570507723995e-05, "loss": 0.362, "step": 1863 }, { "epoch": 2.806172374858863, "grad_norm": 0.24171151220798492, "learning_rate": 9.549922018233714e-05, "loss": 0.2811, "step": 1864 }, { "epoch": 2.807677832141513, "grad_norm": 0.2237575799226761, "learning_rate": 9.549273085850051e-05, "loss": 0.3428, "step": 1865 }, { "epoch": 2.809183289424163, "grad_norm": 0.28737789392471313, "learning_rate": 9.548623710643972e-05, "loss": 0.3343, "step": 1866 }, { "epoch": 2.8106887467068122, "grad_norm": 0.3086473345756531, "learning_rate": 9.54797389268649e-05, "loss": 0.2833, "step": 1867 }, { "epoch": 2.8121942039894616, "grad_norm": 0.3330196440219879, "learning_rate": 9.54732363204867e-05, "loss": 0.3522, "step": 1868 }, { "epoch": 2.8136996612721115, "grad_norm": 0.2767601013183594, "learning_rate": 9.546672928801622e-05, "loss": 0.3294, "step": 1869 }, { "epoch": 2.815205118554761, "grad_norm": 0.21234530210494995, "learning_rate": 9.546021783016508e-05, "loss": 0.3008, "step": 1870 }, { "epoch": 2.8167105758374107, "grad_norm": 0.24821801483631134, "learning_rate": 9.545370194764534e-05, "loss": 0.2255, "step": 1871 }, { "epoch": 2.81821603312006, "grad_norm": 0.28939610719680786, "learning_rate": 9.544718164116956e-05, "loss": 0.3495, "step": 1872 }, { "epoch": 2.81972149040271, "grad_norm": 0.20509499311447144, "learning_rate": 9.544065691145082e-05, "loss": 0.3244, "step": 1873 }, { "epoch": 2.8212269476853593, "grad_norm": 0.24400928616523743, "learning_rate": 9.543412775920264e-05, "loss": 0.3326, "step": 1874 }, { "epoch": 2.822732404968009, "grad_norm": 0.2412003129720688, "learning_rate": 9.542759418513906e-05, "loss": 0.2524, "step": 1875 }, { "epoch": 2.8242378622506585, "grad_norm": 0.24332232773303986, "learning_rate": 9.542105618997453e-05, "loss": 0.3622, "step": 1876 }, { "epoch": 2.8257433195333084, "grad_norm": 0.2073681652545929, "learning_rate": 9.541451377442409e-05, "loss": 0.223, "step": 1877 }, { "epoch": 2.8272487768159578, "grad_norm": 0.19303859770298004, "learning_rate": 9.540796693920318e-05, "loss": 0.298, "step": 1878 }, { "epoch": 2.8287542340986076, "grad_norm": 0.2680846154689789, "learning_rate": 9.540141568502774e-05, "loss": 0.3957, "step": 1879 }, { "epoch": 2.830259691381257, "grad_norm": 0.28274330496788025, "learning_rate": 9.539486001261425e-05, "loss": 0.2605, "step": 1880 }, { "epoch": 2.831765148663907, "grad_norm": 0.28674212098121643, "learning_rate": 9.53882999226796e-05, "loss": 0.2577, "step": 1881 }, { "epoch": 2.833270605946556, "grad_norm": 0.3031870722770691, "learning_rate": 9.538173541594118e-05, "loss": 0.2953, "step": 1882 }, { "epoch": 2.8347760632292056, "grad_norm": 0.2575107216835022, "learning_rate": 9.53751664931169e-05, "loss": 0.3318, "step": 1883 }, { "epoch": 2.8362815205118554, "grad_norm": 0.2306624948978424, "learning_rate": 9.53685931549251e-05, "loss": 0.3042, "step": 1884 }, { "epoch": 2.8377869777945053, "grad_norm": 0.22793924808502197, "learning_rate": 9.536201540208466e-05, "loss": 0.3274, "step": 1885 }, { "epoch": 2.8392924350771547, "grad_norm": 0.2841707170009613, "learning_rate": 9.535543323531489e-05, "loss": 0.3077, "step": 1886 }, { "epoch": 2.840797892359804, "grad_norm": 0.30491960048675537, "learning_rate": 9.534884665533563e-05, "loss": 0.2762, "step": 1887 }, { "epoch": 2.842303349642454, "grad_norm": 0.2995319366455078, "learning_rate": 9.534225566286715e-05, "loss": 0.3468, "step": 1888 }, { "epoch": 2.8438088069251037, "grad_norm": 0.2596789002418518, "learning_rate": 9.533566025863023e-05, "loss": 0.2689, "step": 1889 }, { "epoch": 2.845314264207753, "grad_norm": 0.28057965636253357, "learning_rate": 9.532906044334616e-05, "loss": 0.3357, "step": 1890 }, { "epoch": 2.8468197214904025, "grad_norm": 0.31410154700279236, "learning_rate": 9.532245621773668e-05, "loss": 0.3708, "step": 1891 }, { "epoch": 2.8483251787730524, "grad_norm": 0.2571392357349396, "learning_rate": 9.531584758252401e-05, "loss": 0.2558, "step": 1892 }, { "epoch": 2.849830636055702, "grad_norm": 0.2864225506782532, "learning_rate": 9.530923453843086e-05, "loss": 0.277, "step": 1893 }, { "epoch": 2.8513360933383516, "grad_norm": 0.2835250496864319, "learning_rate": 9.530261708618043e-05, "loss": 0.3348, "step": 1894 }, { "epoch": 2.852841550621001, "grad_norm": 0.20823001861572266, "learning_rate": 9.529599522649639e-05, "loss": 0.2399, "step": 1895 }, { "epoch": 2.854347007903651, "grad_norm": 0.22276276350021362, "learning_rate": 9.528936896010288e-05, "loss": 0.2607, "step": 1896 }, { "epoch": 2.8558524651863, "grad_norm": 0.21125803887844086, "learning_rate": 9.528273828772458e-05, "loss": 0.2651, "step": 1897 }, { "epoch": 2.85735792246895, "grad_norm": 0.2514656186103821, "learning_rate": 9.527610321008657e-05, "loss": 0.3648, "step": 1898 }, { "epoch": 2.8588633797515994, "grad_norm": 0.23441962897777557, "learning_rate": 9.526946372791448e-05, "loss": 0.3664, "step": 1899 }, { "epoch": 2.8603688370342493, "grad_norm": 0.2558024227619171, "learning_rate": 9.526281984193436e-05, "loss": 0.2951, "step": 1900 }, { "epoch": 2.8618742943168987, "grad_norm": 0.25763463973999023, "learning_rate": 9.52561715528728e-05, "loss": 0.3138, "step": 1901 }, { "epoch": 2.8633797515995485, "grad_norm": 0.23278772830963135, "learning_rate": 9.524951886145686e-05, "loss": 0.2734, "step": 1902 }, { "epoch": 2.864885208882198, "grad_norm": 0.24436548352241516, "learning_rate": 9.524286176841404e-05, "loss": 0.3495, "step": 1903 }, { "epoch": 2.8663906661648477, "grad_norm": 0.2675202786922455, "learning_rate": 9.523620027447235e-05, "loss": 0.2769, "step": 1904 }, { "epoch": 2.867896123447497, "grad_norm": 0.24652834236621857, "learning_rate": 9.522953438036032e-05, "loss": 0.3338, "step": 1905 }, { "epoch": 2.8694015807301465, "grad_norm": 0.252328485250473, "learning_rate": 9.522286408680687e-05, "loss": 0.3765, "step": 1906 }, { "epoch": 2.8709070380127963, "grad_norm": 0.24060387909412384, "learning_rate": 9.521618939454149e-05, "loss": 0.2824, "step": 1907 }, { "epoch": 2.872412495295446, "grad_norm": 0.2091691792011261, "learning_rate": 9.520951030429409e-05, "loss": 0.3348, "step": 1908 }, { "epoch": 2.8739179525780956, "grad_norm": 0.19348196685314178, "learning_rate": 9.520282681679513e-05, "loss": 0.2823, "step": 1909 }, { "epoch": 2.875423409860745, "grad_norm": 0.23590444028377533, "learning_rate": 9.519613893277544e-05, "loss": 0.3278, "step": 1910 }, { "epoch": 2.876928867143395, "grad_norm": 0.24083060026168823, "learning_rate": 9.518944665296643e-05, "loss": 0.2916, "step": 1911 }, { "epoch": 2.8784343244260446, "grad_norm": 0.2201680690050125, "learning_rate": 9.518274997809998e-05, "loss": 0.2849, "step": 1912 }, { "epoch": 2.879939781708694, "grad_norm": 0.22204239666461945, "learning_rate": 9.517604890890837e-05, "loss": 0.2953, "step": 1913 }, { "epoch": 2.8814452389913434, "grad_norm": 0.22368693351745605, "learning_rate": 9.51693434461245e-05, "loss": 0.249, "step": 1914 }, { "epoch": 2.8829506962739933, "grad_norm": 0.28119462728500366, "learning_rate": 9.516263359048162e-05, "loss": 0.2673, "step": 1915 }, { "epoch": 2.884456153556643, "grad_norm": 0.3423693776130676, "learning_rate": 9.515591934271347e-05, "loss": 0.3189, "step": 1916 }, { "epoch": 2.8859616108392925, "grad_norm": 0.29981279373168945, "learning_rate": 9.51492007035544e-05, "loss": 0.2854, "step": 1917 }, { "epoch": 2.887467068121942, "grad_norm": 0.24637547135353088, "learning_rate": 9.514247767373907e-05, "loss": 0.3382, "step": 1918 }, { "epoch": 2.8889725254045917, "grad_norm": 0.2599222660064697, "learning_rate": 9.513575025400275e-05, "loss": 0.3134, "step": 1919 }, { "epoch": 2.890477982687241, "grad_norm": 0.29327911138534546, "learning_rate": 9.512901844508113e-05, "loss": 0.275, "step": 1920 }, { "epoch": 2.891983439969891, "grad_norm": 0.2410399615764618, "learning_rate": 9.512228224771038e-05, "loss": 0.2873, "step": 1921 }, { "epoch": 2.8934888972525403, "grad_norm": 0.24565373361110687, "learning_rate": 9.511554166262717e-05, "loss": 0.353, "step": 1922 }, { "epoch": 2.89499435453519, "grad_norm": 0.2705269753932953, "learning_rate": 9.510879669056863e-05, "loss": 0.3261, "step": 1923 }, { "epoch": 2.8964998118178396, "grad_norm": 0.347341388463974, "learning_rate": 9.510204733227239e-05, "loss": 0.2717, "step": 1924 }, { "epoch": 2.8980052691004894, "grad_norm": 0.24267084896564484, "learning_rate": 9.509529358847655e-05, "loss": 0.2725, "step": 1925 }, { "epoch": 2.899510726383139, "grad_norm": 0.27747422456741333, "learning_rate": 9.50885354599197e-05, "loss": 0.3259, "step": 1926 }, { "epoch": 2.9010161836657886, "grad_norm": 0.26718202233314514, "learning_rate": 9.508177294734086e-05, "loss": 0.3232, "step": 1927 }, { "epoch": 2.902521640948438, "grad_norm": 0.2070687860250473, "learning_rate": 9.50750060514796e-05, "loss": 0.2687, "step": 1928 }, { "epoch": 2.904027098231088, "grad_norm": 0.21407555043697357, "learning_rate": 9.506823477307593e-05, "loss": 0.2675, "step": 1929 }, { "epoch": 2.9055325555137372, "grad_norm": 0.23044246435165405, "learning_rate": 9.506145911287034e-05, "loss": 0.3819, "step": 1930 }, { "epoch": 2.907038012796387, "grad_norm": 0.21104183793067932, "learning_rate": 9.505467907160383e-05, "loss": 0.2965, "step": 1931 }, { "epoch": 2.9085434700790365, "grad_norm": 0.2526529133319855, "learning_rate": 9.504789465001783e-05, "loss": 0.2678, "step": 1932 }, { "epoch": 2.910048927361686, "grad_norm": 0.22812482714653015, "learning_rate": 9.504110584885429e-05, "loss": 0.3282, "step": 1933 }, { "epoch": 2.9115543846443357, "grad_norm": 0.2928198277950287, "learning_rate": 9.50343126688556e-05, "loss": 0.2648, "step": 1934 }, { "epoch": 2.9130598419269855, "grad_norm": 0.26750698685646057, "learning_rate": 9.502751511076468e-05, "loss": 0.3004, "step": 1935 }, { "epoch": 2.914565299209635, "grad_norm": 0.25916051864624023, "learning_rate": 9.502071317532488e-05, "loss": 0.2646, "step": 1936 }, { "epoch": 2.9160707564922843, "grad_norm": 0.28682559728622437, "learning_rate": 9.501390686328005e-05, "loss": 0.3309, "step": 1937 }, { "epoch": 2.917576213774934, "grad_norm": 0.29032179713249207, "learning_rate": 9.500709617537453e-05, "loss": 0.3595, "step": 1938 }, { "epoch": 2.919081671057584, "grad_norm": 0.26819688081741333, "learning_rate": 9.500028111235313e-05, "loss": 0.2549, "step": 1939 }, { "epoch": 2.9205871283402334, "grad_norm": 0.24570034444332123, "learning_rate": 9.499346167496111e-05, "loss": 0.2999, "step": 1940 }, { "epoch": 2.9220925856228828, "grad_norm": 0.2816712558269501, "learning_rate": 9.498663786394427e-05, "loss": 0.274, "step": 1941 }, { "epoch": 2.9235980429055326, "grad_norm": 3.28001070022583, "learning_rate": 9.497980968004884e-05, "loss": 0.3013, "step": 1942 }, { "epoch": 2.9251035001881824, "grad_norm": 0.3349991738796234, "learning_rate": 9.497297712402152e-05, "loss": 0.2728, "step": 1943 }, { "epoch": 2.926608957470832, "grad_norm": 0.2997612953186035, "learning_rate": 9.496614019660951e-05, "loss": 0.2961, "step": 1944 }, { "epoch": 2.928114414753481, "grad_norm": 0.29519739747047424, "learning_rate": 9.49592988985605e-05, "loss": 0.269, "step": 1945 }, { "epoch": 2.929619872036131, "grad_norm": 0.2940424084663391, "learning_rate": 9.495245323062265e-05, "loss": 0.2969, "step": 1946 }, { "epoch": 2.9311253293187804, "grad_norm": 0.3340001702308655, "learning_rate": 9.494560319354457e-05, "loss": 0.2682, "step": 1947 }, { "epoch": 2.9326307866014303, "grad_norm": 0.23749876022338867, "learning_rate": 9.49387487880754e-05, "loss": 0.3486, "step": 1948 }, { "epoch": 2.9341362438840797, "grad_norm": 0.2711319923400879, "learning_rate": 9.49318900149647e-05, "loss": 0.289, "step": 1949 }, { "epoch": 2.9356417011667295, "grad_norm": 0.24524246156215668, "learning_rate": 9.492502687496253e-05, "loss": 0.3269, "step": 1950 }, { "epoch": 2.937147158449379, "grad_norm": 0.22200171649456024, "learning_rate": 9.491815936881947e-05, "loss": 0.1853, "step": 1951 }, { "epoch": 2.9386526157320287, "grad_norm": 0.22970010340213776, "learning_rate": 9.49112874972865e-05, "loss": 0.2979, "step": 1952 }, { "epoch": 2.940158073014678, "grad_norm": 0.2351755052804947, "learning_rate": 9.490441126111515e-05, "loss": 0.2723, "step": 1953 }, { "epoch": 2.941663530297328, "grad_norm": 0.20889292657375336, "learning_rate": 9.489753066105738e-05, "loss": 0.3229, "step": 1954 }, { "epoch": 2.9431689875799774, "grad_norm": 0.24267928302288055, "learning_rate": 9.489064569786563e-05, "loss": 0.2489, "step": 1955 }, { "epoch": 2.9446744448626267, "grad_norm": 0.2534712255001068, "learning_rate": 9.488375637229285e-05, "loss": 0.269, "step": 1956 }, { "epoch": 2.9461799021452766, "grad_norm": 0.21708379685878754, "learning_rate": 9.487686268509242e-05, "loss": 0.3058, "step": 1957 }, { "epoch": 2.9476853594279264, "grad_norm": 0.23425090312957764, "learning_rate": 9.486996463701827e-05, "loss": 0.2529, "step": 1958 }, { "epoch": 2.949190816710576, "grad_norm": 0.22916091978549957, "learning_rate": 9.48630622288247e-05, "loss": 0.2917, "step": 1959 }, { "epoch": 2.950696273993225, "grad_norm": 0.21957950294017792, "learning_rate": 9.48561554612666e-05, "loss": 0.3584, "step": 1960 }, { "epoch": 2.952201731275875, "grad_norm": 0.2176668494939804, "learning_rate": 9.484924433509926e-05, "loss": 0.2761, "step": 1961 }, { "epoch": 2.953707188558525, "grad_norm": 0.22736169397830963, "learning_rate": 9.484232885107846e-05, "loss": 0.2489, "step": 1962 }, { "epoch": 2.9552126458411743, "grad_norm": 0.2446775734424591, "learning_rate": 9.483540900996049e-05, "loss": 0.3341, "step": 1963 }, { "epoch": 2.9567181031238237, "grad_norm": 0.3196597099304199, "learning_rate": 9.482848481250208e-05, "loss": 0.3454, "step": 1964 }, { "epoch": 2.9582235604064735, "grad_norm": 0.34248051047325134, "learning_rate": 9.482155625946044e-05, "loss": 0.292, "step": 1965 }, { "epoch": 2.9597290176891233, "grad_norm": 0.27206316590309143, "learning_rate": 9.481462335159329e-05, "loss": 0.2678, "step": 1966 }, { "epoch": 2.9612344749717727, "grad_norm": 0.2570916712284088, "learning_rate": 9.48076860896588e-05, "loss": 0.2318, "step": 1967 }, { "epoch": 2.962739932254422, "grad_norm": 0.2572825849056244, "learning_rate": 9.480074447441559e-05, "loss": 0.3171, "step": 1968 }, { "epoch": 2.964245389537072, "grad_norm": 0.27136391401290894, "learning_rate": 9.479379850662281e-05, "loss": 0.2838, "step": 1969 }, { "epoch": 2.9657508468197213, "grad_norm": 0.3134746551513672, "learning_rate": 9.478684818704006e-05, "loss": 0.2685, "step": 1970 }, { "epoch": 2.967256304102371, "grad_norm": 0.2242620289325714, "learning_rate": 9.477989351642741e-05, "loss": 0.2646, "step": 1971 }, { "epoch": 2.9687617613850206, "grad_norm": 0.25236791372299194, "learning_rate": 9.477293449554542e-05, "loss": 0.3373, "step": 1972 }, { "epoch": 2.9702672186676704, "grad_norm": 0.2282586544752121, "learning_rate": 9.47659711251551e-05, "loss": 0.295, "step": 1973 }, { "epoch": 2.97177267595032, "grad_norm": 0.35191360116004944, "learning_rate": 9.475900340601796e-05, "loss": 0.3537, "step": 1974 }, { "epoch": 2.9732781332329696, "grad_norm": 0.3736025393009186, "learning_rate": 9.4752031338896e-05, "loss": 0.3327, "step": 1975 }, { "epoch": 2.974783590515619, "grad_norm": 0.2638415992259979, "learning_rate": 9.474505492455163e-05, "loss": 0.253, "step": 1976 }, { "epoch": 2.976289047798269, "grad_norm": 0.2769251763820648, "learning_rate": 9.473807416374784e-05, "loss": 0.293, "step": 1977 }, { "epoch": 2.9777945050809183, "grad_norm": 0.2765183746814728, "learning_rate": 9.473108905724798e-05, "loss": 0.3749, "step": 1978 }, { "epoch": 2.979299962363568, "grad_norm": 0.2542422413825989, "learning_rate": 9.472409960581598e-05, "loss": 0.2282, "step": 1979 }, { "epoch": 2.9808054196462175, "grad_norm": 0.27029719948768616, "learning_rate": 9.471710581021616e-05, "loss": 0.3059, "step": 1980 }, { "epoch": 2.9823108769288673, "grad_norm": 0.3011026382446289, "learning_rate": 9.471010767121337e-05, "loss": 0.3757, "step": 1981 }, { "epoch": 2.9838163342115167, "grad_norm": 0.25609859824180603, "learning_rate": 9.47031051895729e-05, "loss": 0.2231, "step": 1982 }, { "epoch": 2.985321791494166, "grad_norm": 0.23495256900787354, "learning_rate": 9.469609836606055e-05, "loss": 0.3364, "step": 1983 }, { "epoch": 2.986827248776816, "grad_norm": 0.28152403235435486, "learning_rate": 9.468908720144255e-05, "loss": 0.289, "step": 1984 }, { "epoch": 2.9883327060594658, "grad_norm": 0.26677843928337097, "learning_rate": 9.468207169648565e-05, "loss": 0.2541, "step": 1985 }, { "epoch": 2.989838163342115, "grad_norm": 0.2580556571483612, "learning_rate": 9.467505185195705e-05, "loss": 0.2548, "step": 1986 }, { "epoch": 2.9913436206247646, "grad_norm": 0.21023263037204742, "learning_rate": 9.466802766862444e-05, "loss": 0.3412, "step": 1987 }, { "epoch": 2.9928490779074144, "grad_norm": 0.24182023108005524, "learning_rate": 9.466099914725594e-05, "loss": 0.3345, "step": 1988 }, { "epoch": 2.9943545351900642, "grad_norm": 0.2088068574666977, "learning_rate": 9.465396628862022e-05, "loss": 0.2995, "step": 1989 }, { "epoch": 2.9958599924727136, "grad_norm": 0.2179078757762909, "learning_rate": 9.464692909348637e-05, "loss": 0.2731, "step": 1990 }, { "epoch": 2.997365449755363, "grad_norm": 0.23260453343391418, "learning_rate": 9.463988756262397e-05, "loss": 0.2838, "step": 1991 }, { "epoch": 2.998870907038013, "grad_norm": 0.2530764043331146, "learning_rate": 9.463284169680305e-05, "loss": 0.343, "step": 1992 }, { "epoch": 3.0003763643206622, "grad_norm": 0.24543823301792145, "learning_rate": 9.462579149679414e-05, "loss": 0.3444, "step": 1993 }, { "epoch": 3.001881821603312, "grad_norm": 0.231407031416893, "learning_rate": 9.461873696336825e-05, "loss": 0.3459, "step": 1994 }, { "epoch": 3.0033872788859615, "grad_norm": 0.2584514915943146, "learning_rate": 9.461167809729687e-05, "loss": 0.2593, "step": 1995 }, { "epoch": 3.0048927361686113, "grad_norm": 0.20332786440849304, "learning_rate": 9.460461489935191e-05, "loss": 0.2518, "step": 1996 }, { "epoch": 3.0063981934512607, "grad_norm": 0.21230551600456238, "learning_rate": 9.459754737030582e-05, "loss": 0.3285, "step": 1997 }, { "epoch": 3.0079036507339105, "grad_norm": 0.2104586511850357, "learning_rate": 9.459047551093148e-05, "loss": 0.2891, "step": 1998 }, { "epoch": 3.00940910801656, "grad_norm": 0.24298396706581116, "learning_rate": 9.458339932200228e-05, "loss": 0.3171, "step": 1999 }, { "epoch": 3.0109145652992098, "grad_norm": 0.21872566640377045, "learning_rate": 9.4576318804292e-05, "loss": 0.2822, "step": 2000 }, { "epoch": 3.0109145652992098, "eval_loss": 0.29471513628959656, "eval_runtime": 546.2066, "eval_samples_per_second": 17.625, "eval_steps_per_second": 0.551, "step": 2000 }, { "epoch": 3.0015054572826494, "grad_norm": 0.2338768094778061, "learning_rate": 9.456923395857503e-05, "loss": 0.3389, "step": 2001 }, { "epoch": 3.0030109145652992, "grad_norm": 0.20870551466941833, "learning_rate": 9.456214478562612e-05, "loss": 0.2929, "step": 2002 }, { "epoch": 3.0045163718479486, "grad_norm": 0.26429077982902527, "learning_rate": 9.455505128622053e-05, "loss": 0.2071, "step": 2003 }, { "epoch": 3.0060218291305985, "grad_norm": 0.3100963234901428, "learning_rate": 9.454795346113402e-05, "loss": 0.283, "step": 2004 }, { "epoch": 3.007527286413248, "grad_norm": 0.33747994899749756, "learning_rate": 9.454085131114277e-05, "loss": 0.3085, "step": 2005 }, { "epoch": 3.0090327436958977, "grad_norm": 0.28824710845947266, "learning_rate": 9.453374483702346e-05, "loss": 0.3283, "step": 2006 }, { "epoch": 3.010538200978547, "grad_norm": 0.3769630193710327, "learning_rate": 9.452663403955325e-05, "loss": 0.2827, "step": 2007 }, { "epoch": 3.012043658261197, "grad_norm": 0.3129812777042389, "learning_rate": 9.451951891950979e-05, "loss": 0.2356, "step": 2008 }, { "epoch": 3.0135491155438463, "grad_norm": 0.2818363904953003, "learning_rate": 9.451239947767115e-05, "loss": 0.2321, "step": 2009 }, { "epoch": 3.015054572826496, "grad_norm": 0.26137682795524597, "learning_rate": 9.45052757148159e-05, "loss": 0.2874, "step": 2010 }, { "epoch": 3.0165600301091455, "grad_norm": 0.2854709029197693, "learning_rate": 9.44981476317231e-05, "loss": 0.3188, "step": 2011 }, { "epoch": 3.0180654873917954, "grad_norm": 0.22286219894886017, "learning_rate": 9.449101522917225e-05, "loss": 0.2993, "step": 2012 }, { "epoch": 3.0195709446744448, "grad_norm": 0.2325684130191803, "learning_rate": 9.448387850794336e-05, "loss": 0.2984, "step": 2013 }, { "epoch": 3.0210764019570946, "grad_norm": 0.23606903851032257, "learning_rate": 9.447673746881687e-05, "loss": 0.2753, "step": 2014 }, { "epoch": 3.022581859239744, "grad_norm": 0.22394119203090668, "learning_rate": 9.446959211257374e-05, "loss": 0.2562, "step": 2015 }, { "epoch": 3.024087316522394, "grad_norm": 0.21195128560066223, "learning_rate": 9.446244243999533e-05, "loss": 0.298, "step": 2016 }, { "epoch": 3.025592773805043, "grad_norm": 0.2254779040813446, "learning_rate": 9.445528845186357e-05, "loss": 0.2878, "step": 2017 }, { "epoch": 3.027098231087693, "grad_norm": 0.20118072628974915, "learning_rate": 9.444813014896077e-05, "loss": 0.2756, "step": 2018 }, { "epoch": 3.0286036883703424, "grad_norm": 0.20092236995697021, "learning_rate": 9.444096753206977e-05, "loss": 0.2671, "step": 2019 }, { "epoch": 3.0301091456529923, "grad_norm": 0.2172233909368515, "learning_rate": 9.443380060197387e-05, "loss": 0.2936, "step": 2020 }, { "epoch": 3.0316146029356417, "grad_norm": 0.19889965653419495, "learning_rate": 9.442662935945681e-05, "loss": 0.2882, "step": 2021 }, { "epoch": 3.0331200602182915, "grad_norm": 0.22388795018196106, "learning_rate": 9.441945380530284e-05, "loss": 0.2415, "step": 2022 }, { "epoch": 3.034625517500941, "grad_norm": 0.21196013689041138, "learning_rate": 9.441227394029668e-05, "loss": 0.2348, "step": 2023 }, { "epoch": 3.0361309747835907, "grad_norm": 0.2363329529762268, "learning_rate": 9.44050897652235e-05, "loss": 0.2727, "step": 2024 }, { "epoch": 3.03763643206624, "grad_norm": 0.2674897611141205, "learning_rate": 9.439790128086894e-05, "loss": 0.2288, "step": 2025 }, { "epoch": 3.0391418893488895, "grad_norm": 0.25517335534095764, "learning_rate": 9.439070848801912e-05, "loss": 0.3356, "step": 2026 }, { "epoch": 3.0406473466315393, "grad_norm": 0.3073602616786957, "learning_rate": 9.438351138746065e-05, "loss": 0.3012, "step": 2027 }, { "epoch": 3.0421528039141887, "grad_norm": 0.3077828884124756, "learning_rate": 9.437630997998059e-05, "loss": 0.2209, "step": 2028 }, { "epoch": 3.0436582611968386, "grad_norm": 0.28028249740600586, "learning_rate": 9.436910426636647e-05, "loss": 0.3445, "step": 2029 }, { "epoch": 3.045163718479488, "grad_norm": 0.2614697515964508, "learning_rate": 9.436189424740631e-05, "loss": 0.2798, "step": 2030 }, { "epoch": 3.046669175762138, "grad_norm": 0.19922059774398804, "learning_rate": 9.435467992388855e-05, "loss": 0.2896, "step": 2031 }, { "epoch": 3.048174633044787, "grad_norm": 0.2993840277194977, "learning_rate": 9.434746129660219e-05, "loss": 0.3135, "step": 2032 }, { "epoch": 3.049680090327437, "grad_norm": 0.2563455104827881, "learning_rate": 9.43402383663366e-05, "loss": 0.2886, "step": 2033 }, { "epoch": 3.0511855476100864, "grad_norm": 0.22472555935382843, "learning_rate": 9.43330111338817e-05, "loss": 0.2882, "step": 2034 }, { "epoch": 3.0526910048927363, "grad_norm": 0.23899298906326294, "learning_rate": 9.432577960002783e-05, "loss": 0.314, "step": 2035 }, { "epoch": 3.0541964621753857, "grad_norm": 0.24786598980426788, "learning_rate": 9.431854376556585e-05, "loss": 0.2771, "step": 2036 }, { "epoch": 3.0557019194580355, "grad_norm": 0.20342090725898743, "learning_rate": 9.431130363128702e-05, "loss": 0.2877, "step": 2037 }, { "epoch": 3.057207376740685, "grad_norm": 0.2187003493309021, "learning_rate": 9.430405919798311e-05, "loss": 0.2952, "step": 2038 }, { "epoch": 3.0587128340233347, "grad_norm": 0.20183509588241577, "learning_rate": 9.42968104664464e-05, "loss": 0.2336, "step": 2039 }, { "epoch": 3.060218291305984, "grad_norm": 0.23175501823425293, "learning_rate": 9.428955743746959e-05, "loss": 0.2714, "step": 2040 }, { "epoch": 3.061723748588634, "grad_norm": 0.23342719674110413, "learning_rate": 9.428230011184583e-05, "loss": 0.2413, "step": 2041 }, { "epoch": 3.0632292058712833, "grad_norm": 0.21883393824100494, "learning_rate": 9.427503849036881e-05, "loss": 0.306, "step": 2042 }, { "epoch": 3.064734663153933, "grad_norm": 0.2682899236679077, "learning_rate": 9.426777257383261e-05, "loss": 0.292, "step": 2043 }, { "epoch": 3.0662401204365826, "grad_norm": 0.24779494106769562, "learning_rate": 9.426050236303185e-05, "loss": 0.2592, "step": 2044 }, { "epoch": 3.0677455777192324, "grad_norm": 0.22206467390060425, "learning_rate": 9.425322785876158e-05, "loss": 0.1959, "step": 2045 }, { "epoch": 3.069251035001882, "grad_norm": 0.286878377199173, "learning_rate": 9.424594906181732e-05, "loss": 0.2697, "step": 2046 }, { "epoch": 3.0707564922845316, "grad_norm": 127.21088409423828, "learning_rate": 9.423866597299508e-05, "loss": 2.2073, "step": 2047 }, { "epoch": 3.072261949567181, "grad_norm": 0.474869966506958, "learning_rate": 9.423137859309132e-05, "loss": 0.3211, "step": 2048 }, { "epoch": 3.0737674068498304, "grad_norm": 0.45546114444732666, "learning_rate": 9.422408692290298e-05, "loss": 0.2782, "step": 2049 }, { "epoch": 3.0752728641324802, "grad_norm": 0.4875631034374237, "learning_rate": 9.421679096322747e-05, "loss": 0.2771, "step": 2050 }, { "epoch": 3.0767783214151296, "grad_norm": 0.46188420057296753, "learning_rate": 9.420949071486268e-05, "loss": 0.3036, "step": 2051 }, { "epoch": 3.0782837786977795, "grad_norm": 0.3415272533893585, "learning_rate": 9.420218617860692e-05, "loss": 0.2944, "step": 2052 }, { "epoch": 3.079789235980429, "grad_norm": 0.3419550359249115, "learning_rate": 9.419487735525901e-05, "loss": 0.2496, "step": 2053 }, { "epoch": 3.0812946932630787, "grad_norm": 0.43848034739494324, "learning_rate": 9.418756424561826e-05, "loss": 0.3089, "step": 2054 }, { "epoch": 3.082800150545728, "grad_norm": 0.32184189558029175, "learning_rate": 9.418024685048437e-05, "loss": 0.3442, "step": 2055 }, { "epoch": 3.084305607828378, "grad_norm": 0.2985520660877228, "learning_rate": 9.417292517065762e-05, "loss": 0.2413, "step": 2056 }, { "epoch": 3.0858110651110273, "grad_norm": 0.34436291456222534, "learning_rate": 9.416559920693866e-05, "loss": 0.3125, "step": 2057 }, { "epoch": 3.087316522393677, "grad_norm": 0.28087618947029114, "learning_rate": 9.415826896012865e-05, "loss": 0.306, "step": 2058 }, { "epoch": 3.0888219796763265, "grad_norm": 0.2941948473453522, "learning_rate": 9.415093443102924e-05, "loss": 0.2834, "step": 2059 }, { "epoch": 3.0903274369589764, "grad_norm": 0.23948541283607483, "learning_rate": 9.414359562044249e-05, "loss": 0.3324, "step": 2060 }, { "epoch": 3.0918328942416258, "grad_norm": 0.324191153049469, "learning_rate": 9.413625252917098e-05, "loss": 0.2783, "step": 2061 }, { "epoch": 3.0933383515242756, "grad_norm": 0.26295170187950134, "learning_rate": 9.412890515801772e-05, "loss": 0.3315, "step": 2062 }, { "epoch": 3.094843808806925, "grad_norm": 0.2504276633262634, "learning_rate": 9.412155350778622e-05, "loss": 0.3021, "step": 2063 }, { "epoch": 3.096349266089575, "grad_norm": 0.2864067256450653, "learning_rate": 9.411419757928047e-05, "loss": 0.315, "step": 2064 }, { "epoch": 3.0978547233722242, "grad_norm": 0.24398043751716614, "learning_rate": 9.410683737330486e-05, "loss": 0.3238, "step": 2065 }, { "epoch": 3.099360180654874, "grad_norm": 0.23197214305400848, "learning_rate": 9.409947289066431e-05, "loss": 0.3234, "step": 2066 }, { "epoch": 3.1008656379375235, "grad_norm": 0.24552564322948456, "learning_rate": 9.40921041321642e-05, "loss": 0.2874, "step": 2067 }, { "epoch": 3.1023710952201733, "grad_norm": 0.24605529010295868, "learning_rate": 9.408473109861035e-05, "loss": 0.2316, "step": 2068 }, { "epoch": 3.1038765525028227, "grad_norm": 0.2081204205751419, "learning_rate": 9.407735379080908e-05, "loss": 0.2579, "step": 2069 }, { "epoch": 3.1053820097854725, "grad_norm": 0.2510157823562622, "learning_rate": 9.406997220956713e-05, "loss": 0.2889, "step": 2070 }, { "epoch": 3.106887467068122, "grad_norm": 0.21894951164722443, "learning_rate": 9.406258635569179e-05, "loss": 0.2888, "step": 2071 }, { "epoch": 3.1083929243507717, "grad_norm": 0.23345515131950378, "learning_rate": 9.405519622999072e-05, "loss": 0.3495, "step": 2072 }, { "epoch": 3.109898381633421, "grad_norm": 0.2625279426574707, "learning_rate": 9.40478018332721e-05, "loss": 0.2927, "step": 2073 }, { "epoch": 3.111403838916071, "grad_norm": 0.1947033852338791, "learning_rate": 9.404040316634459e-05, "loss": 0.2808, "step": 2074 }, { "epoch": 3.1129092961987204, "grad_norm": 0.21634048223495483, "learning_rate": 9.403300023001728e-05, "loss": 0.2252, "step": 2075 }, { "epoch": 3.1144147534813698, "grad_norm": 0.2129380702972412, "learning_rate": 9.402559302509975e-05, "loss": 0.2933, "step": 2076 }, { "epoch": 3.1159202107640196, "grad_norm": 0.24320583045482635, "learning_rate": 9.401818155240205e-05, "loss": 0.3135, "step": 2077 }, { "epoch": 3.117425668046669, "grad_norm": 0.25105607509613037, "learning_rate": 9.401076581273468e-05, "loss": 0.2969, "step": 2078 }, { "epoch": 3.118931125329319, "grad_norm": 0.23987670242786407, "learning_rate": 9.400334580690862e-05, "loss": 0.2911, "step": 2079 }, { "epoch": 3.120436582611968, "grad_norm": 0.2231990247964859, "learning_rate": 9.399592153573528e-05, "loss": 0.3441, "step": 2080 }, { "epoch": 3.121942039894618, "grad_norm": 0.1885639876127243, "learning_rate": 9.39884930000266e-05, "loss": 0.2722, "step": 2081 }, { "epoch": 3.1234474971772674, "grad_norm": 0.20046372711658478, "learning_rate": 9.398106020059494e-05, "loss": 0.3027, "step": 2082 }, { "epoch": 3.1249529544599173, "grad_norm": 0.20489946007728577, "learning_rate": 9.397362313825315e-05, "loss": 0.3116, "step": 2083 }, { "epoch": 3.1264584117425667, "grad_norm": 0.19920475780963898, "learning_rate": 9.396618181381452e-05, "loss": 0.251, "step": 2084 }, { "epoch": 3.1279638690252165, "grad_norm": 0.1890491247177124, "learning_rate": 9.395873622809284e-05, "loss": 0.281, "step": 2085 }, { "epoch": 3.129469326307866, "grad_norm": 0.2113160640001297, "learning_rate": 9.395128638190233e-05, "loss": 0.2953, "step": 2086 }, { "epoch": 3.1309747835905157, "grad_norm": 0.2024983912706375, "learning_rate": 9.394383227605771e-05, "loss": 0.3112, "step": 2087 }, { "epoch": 3.132480240873165, "grad_norm": 0.22835834324359894, "learning_rate": 9.393637391137416e-05, "loss": 0.2689, "step": 2088 }, { "epoch": 3.133985698155815, "grad_norm": 0.2290593683719635, "learning_rate": 9.392891128866727e-05, "loss": 0.2468, "step": 2089 }, { "epoch": 3.1354911554384644, "grad_norm": 0.22271980345249176, "learning_rate": 9.392144440875319e-05, "loss": 0.2775, "step": 2090 }, { "epoch": 3.136996612721114, "grad_norm": 0.21081236004829407, "learning_rate": 9.391397327244847e-05, "loss": 0.2308, "step": 2091 }, { "epoch": 3.1385020700037636, "grad_norm": 0.2061239331960678, "learning_rate": 9.390649788057012e-05, "loss": 0.286, "step": 2092 }, { "epoch": 3.1400075272864134, "grad_norm": 0.19059549272060394, "learning_rate": 9.389901823393567e-05, "loss": 0.2218, "step": 2093 }, { "epoch": 3.141512984569063, "grad_norm": 0.17264050245285034, "learning_rate": 9.389153433336306e-05, "loss": 0.2398, "step": 2094 }, { "epoch": 3.1430184418517126, "grad_norm": 0.2155326008796692, "learning_rate": 9.388404617967075e-05, "loss": 0.2324, "step": 2095 }, { "epoch": 3.144523899134362, "grad_norm": 0.22854778170585632, "learning_rate": 9.387655377367758e-05, "loss": 0.2887, "step": 2096 }, { "epoch": 3.146029356417012, "grad_norm": 0.2519890367984772, "learning_rate": 9.386905711620298e-05, "loss": 0.2703, "step": 2097 }, { "epoch": 3.1475348136996613, "grad_norm": 0.26717743277549744, "learning_rate": 9.386155620806671e-05, "loss": 0.3119, "step": 2098 }, { "epoch": 3.1490402709823107, "grad_norm": 0.24558500945568085, "learning_rate": 9.385405105008907e-05, "loss": 0.299, "step": 2099 }, { "epoch": 3.1505457282649605, "grad_norm": 0.2226259857416153, "learning_rate": 9.384654164309083e-05, "loss": 0.2775, "step": 2100 }, { "epoch": 3.15205118554761, "grad_norm": 0.2497721165418625, "learning_rate": 9.38390279878932e-05, "loss": 0.3133, "step": 2101 }, { "epoch": 3.1535566428302597, "grad_norm": 0.23553670942783356, "learning_rate": 9.383151008531786e-05, "loss": 0.2344, "step": 2102 }, { "epoch": 3.155062100112909, "grad_norm": 0.22791370749473572, "learning_rate": 9.382398793618697e-05, "loss": 0.2652, "step": 2103 }, { "epoch": 3.156567557395559, "grad_norm": 0.2467755824327469, "learning_rate": 9.381646154132312e-05, "loss": 0.2755, "step": 2104 }, { "epoch": 3.1580730146782083, "grad_norm": 0.23020058870315552, "learning_rate": 9.38089309015494e-05, "loss": 0.2685, "step": 2105 }, { "epoch": 3.159578471960858, "grad_norm": 0.2927198112010956, "learning_rate": 9.380139601768935e-05, "loss": 0.3114, "step": 2106 }, { "epoch": 3.1610839292435076, "grad_norm": 0.32407069206237793, "learning_rate": 9.379385689056697e-05, "loss": 0.2329, "step": 2107 }, { "epoch": 3.1625893865261574, "grad_norm": 0.33144843578338623, "learning_rate": 9.378631352100672e-05, "loss": 0.2279, "step": 2108 }, { "epoch": 3.164094843808807, "grad_norm": 0.28598740696907043, "learning_rate": 9.377876590983353e-05, "loss": 0.3032, "step": 2109 }, { "epoch": 3.1656003010914566, "grad_norm": 0.2647744417190552, "learning_rate": 9.377121405787282e-05, "loss": 0.292, "step": 2110 }, { "epoch": 3.167105758374106, "grad_norm": 0.28939589858055115, "learning_rate": 9.376365796595042e-05, "loss": 0.2658, "step": 2111 }, { "epoch": 3.168611215656756, "grad_norm": 0.2714223563671112, "learning_rate": 9.375609763489269e-05, "loss": 0.3084, "step": 2112 }, { "epoch": 3.1701166729394052, "grad_norm": 0.2763046622276306, "learning_rate": 9.374853306552636e-05, "loss": 0.2452, "step": 2113 }, { "epoch": 3.171622130222055, "grad_norm": 0.23302146792411804, "learning_rate": 9.374096425867872e-05, "loss": 0.3041, "step": 2114 }, { "epoch": 3.1731275875047045, "grad_norm": 0.22006480395793915, "learning_rate": 9.373339121517747e-05, "loss": 0.3, "step": 2115 }, { "epoch": 3.1746330447873543, "grad_norm": 0.23028703033924103, "learning_rate": 9.372581393585081e-05, "loss": 0.2705, "step": 2116 }, { "epoch": 3.1761385020700037, "grad_norm": 0.2490130364894867, "learning_rate": 9.371823242152734e-05, "loss": 0.3027, "step": 2117 }, { "epoch": 3.1776439593526535, "grad_norm": 0.22314222157001495, "learning_rate": 9.371064667303619e-05, "loss": 0.2703, "step": 2118 }, { "epoch": 3.179149416635303, "grad_norm": 0.21281477808952332, "learning_rate": 9.370305669120693e-05, "loss": 0.2638, "step": 2119 }, { "epoch": 3.1806548739179528, "grad_norm": 0.2109440565109253, "learning_rate": 9.369546247686956e-05, "loss": 0.2643, "step": 2120 }, { "epoch": 3.182160331200602, "grad_norm": 0.20070607960224152, "learning_rate": 9.368786403085462e-05, "loss": 0.2546, "step": 2121 }, { "epoch": 3.183665788483252, "grad_norm": 0.2207813411951065, "learning_rate": 9.368026135399301e-05, "loss": 0.3242, "step": 2122 }, { "epoch": 3.1851712457659014, "grad_norm": 0.21428456902503967, "learning_rate": 9.367265444711619e-05, "loss": 0.2998, "step": 2123 }, { "epoch": 3.186676703048551, "grad_norm": 0.21756713092327118, "learning_rate": 9.366504331105601e-05, "loss": 0.2399, "step": 2124 }, { "epoch": 3.1881821603312006, "grad_norm": 0.21334369480609894, "learning_rate": 9.365742794664484e-05, "loss": 0.2748, "step": 2125 }, { "epoch": 3.18968761761385, "grad_norm": 0.2210208773612976, "learning_rate": 9.364980835471546e-05, "loss": 0.2572, "step": 2126 }, { "epoch": 3.1911930748965, "grad_norm": 0.20711103081703186, "learning_rate": 9.364218453610116e-05, "loss": 0.3077, "step": 2127 }, { "epoch": 3.1926985321791492, "grad_norm": 0.2025875300168991, "learning_rate": 9.363455649163564e-05, "loss": 0.3139, "step": 2128 }, { "epoch": 3.194203989461799, "grad_norm": 0.23864904046058655, "learning_rate": 9.362692422215312e-05, "loss": 0.2701, "step": 2129 }, { "epoch": 3.1957094467444485, "grad_norm": 0.23441794514656067, "learning_rate": 9.361928772848824e-05, "loss": 0.2517, "step": 2130 }, { "epoch": 3.1972149040270983, "grad_norm": 0.21461611986160278, "learning_rate": 9.361164701147612e-05, "loss": 0.2424, "step": 2131 }, { "epoch": 3.1987203613097477, "grad_norm": 0.19142460823059082, "learning_rate": 9.360400207195232e-05, "loss": 0.2643, "step": 2132 }, { "epoch": 3.2002258185923975, "grad_norm": 0.21436452865600586, "learning_rate": 9.35963529107529e-05, "loss": 0.2108, "step": 2133 }, { "epoch": 3.201731275875047, "grad_norm": 0.2164587676525116, "learning_rate": 9.358869952871436e-05, "loss": 0.247, "step": 2134 }, { "epoch": 3.2032367331576967, "grad_norm": 0.24673165380954742, "learning_rate": 9.358104192667365e-05, "loss": 0.2465, "step": 2135 }, { "epoch": 3.204742190440346, "grad_norm": 0.25427010655403137, "learning_rate": 9.357338010546821e-05, "loss": 0.2655, "step": 2136 }, { "epoch": 3.206247647722996, "grad_norm": 0.21653510630130768, "learning_rate": 9.35657140659359e-05, "loss": 0.2276, "step": 2137 }, { "epoch": 3.2077531050056454, "grad_norm": 0.18221063911914825, "learning_rate": 9.35580438089151e-05, "loss": 0.2857, "step": 2138 }, { "epoch": 3.209258562288295, "grad_norm": 0.22444050014019012, "learning_rate": 9.355036933524458e-05, "loss": 0.222, "step": 2139 }, { "epoch": 3.2107640195709446, "grad_norm": 0.25067338347435, "learning_rate": 9.354269064576366e-05, "loss": 0.2442, "step": 2140 }, { "epoch": 3.2122694768535944, "grad_norm": 0.25561362504959106, "learning_rate": 9.3535007741312e-05, "loss": 0.2811, "step": 2141 }, { "epoch": 3.213774934136244, "grad_norm": 0.2569461464881897, "learning_rate": 9.352732062272988e-05, "loss": 0.2992, "step": 2142 }, { "epoch": 3.2152803914188937, "grad_norm": 0.31719544529914856, "learning_rate": 9.351962929085786e-05, "loss": 0.2655, "step": 2143 }, { "epoch": 3.216785848701543, "grad_norm": 0.33576270937919617, "learning_rate": 9.35119337465371e-05, "loss": 0.3004, "step": 2144 }, { "epoch": 3.218291305984193, "grad_norm": 0.30985596776008606, "learning_rate": 9.350423399060917e-05, "loss": 0.2838, "step": 2145 }, { "epoch": 3.2197967632668423, "grad_norm": 0.2421116679906845, "learning_rate": 9.349653002391611e-05, "loss": 0.312, "step": 2146 }, { "epoch": 3.221302220549492, "grad_norm": 0.2167498618364334, "learning_rate": 9.34888218473004e-05, "loss": 0.2143, "step": 2147 }, { "epoch": 3.2228076778321415, "grad_norm": 0.25504907965660095, "learning_rate": 9.3481109461605e-05, "loss": 0.2998, "step": 2148 }, { "epoch": 3.224313135114791, "grad_norm": 0.23439273238182068, "learning_rate": 9.347339286767333e-05, "loss": 0.255, "step": 2149 }, { "epoch": 3.2258185923974407, "grad_norm": 0.2539646327495575, "learning_rate": 9.346567206634927e-05, "loss": 0.2563, "step": 2150 }, { "epoch": 3.22732404968009, "grad_norm": 0.21435190737247467, "learning_rate": 9.345794705847713e-05, "loss": 0.3054, "step": 2151 }, { "epoch": 3.22882950696274, "grad_norm": 0.2132420539855957, "learning_rate": 9.345021784490173e-05, "loss": 0.3111, "step": 2152 }, { "epoch": 3.2303349642453894, "grad_norm": 0.23544912040233612, "learning_rate": 9.344248442646829e-05, "loss": 0.3529, "step": 2153 }, { "epoch": 3.231840421528039, "grad_norm": 0.22490811347961426, "learning_rate": 9.343474680402255e-05, "loss": 0.3278, "step": 2154 }, { "epoch": 3.2333458788106886, "grad_norm": 0.22272677719593048, "learning_rate": 9.342700497841072e-05, "loss": 0.34, "step": 2155 }, { "epoch": 3.2348513360933384, "grad_norm": 0.21967609226703644, "learning_rate": 9.341925895047937e-05, "loss": 0.2444, "step": 2156 }, { "epoch": 3.236356793375988, "grad_norm": 0.20032066106796265, "learning_rate": 9.341150872107564e-05, "loss": 0.2307, "step": 2157 }, { "epoch": 3.2378622506586376, "grad_norm": 0.2055717408657074, "learning_rate": 9.340375429104706e-05, "loss": 0.2451, "step": 2158 }, { "epoch": 3.239367707941287, "grad_norm": 0.22072777152061462, "learning_rate": 9.339599566124166e-05, "loss": 0.2484, "step": 2159 }, { "epoch": 3.240873165223937, "grad_norm": 0.21022537350654602, "learning_rate": 9.338823283250788e-05, "loss": 0.2834, "step": 2160 }, { "epoch": 3.2423786225065863, "grad_norm": 0.1837294101715088, "learning_rate": 9.338046580569469e-05, "loss": 0.2258, "step": 2161 }, { "epoch": 3.243884079789236, "grad_norm": 0.20679596066474915, "learning_rate": 9.337269458165147e-05, "loss": 0.2439, "step": 2162 }, { "epoch": 3.2453895370718855, "grad_norm": 0.20045006275177002, "learning_rate": 9.336491916122806e-05, "loss": 0.2339, "step": 2163 }, { "epoch": 3.2468949943545353, "grad_norm": 0.19325141608715057, "learning_rate": 9.335713954527476e-05, "loss": 0.2834, "step": 2164 }, { "epoch": 3.2484004516371847, "grad_norm": 0.21076908707618713, "learning_rate": 9.334935573464236e-05, "loss": 0.2396, "step": 2165 }, { "epoch": 3.2499059089198346, "grad_norm": 0.21042612195014954, "learning_rate": 9.334156773018207e-05, "loss": 0.2514, "step": 2166 }, { "epoch": 3.251411366202484, "grad_norm": 0.2146611213684082, "learning_rate": 9.333377553274558e-05, "loss": 0.2748, "step": 2167 }, { "epoch": 3.252916823485134, "grad_norm": 0.2432810217142105, "learning_rate": 9.332597914318502e-05, "loss": 0.2758, "step": 2168 }, { "epoch": 3.254422280767783, "grad_norm": 0.26850271224975586, "learning_rate": 9.331817856235302e-05, "loss": 0.2276, "step": 2169 }, { "epoch": 3.255927738050433, "grad_norm": 0.28294724225997925, "learning_rate": 9.331037379110262e-05, "loss": 0.2754, "step": 2170 }, { "epoch": 3.2574331953330824, "grad_norm": 0.2673597037792206, "learning_rate": 9.330256483028733e-05, "loss": 0.3332, "step": 2171 }, { "epoch": 3.258938652615732, "grad_norm": 0.2230258285999298, "learning_rate": 9.329475168076114e-05, "loss": 0.258, "step": 2172 }, { "epoch": 3.2604441098983816, "grad_norm": 0.2468830645084381, "learning_rate": 9.328693434337849e-05, "loss": 0.2467, "step": 2173 }, { "epoch": 3.2619495671810315, "grad_norm": 0.2860652208328247, "learning_rate": 9.327911281899424e-05, "loss": 0.2178, "step": 2174 }, { "epoch": 3.263455024463681, "grad_norm": 0.2378043383359909, "learning_rate": 9.327128710846379e-05, "loss": 0.2243, "step": 2175 }, { "epoch": 3.2649604817463302, "grad_norm": 0.22885645925998688, "learning_rate": 9.326345721264293e-05, "loss": 0.2215, "step": 2176 }, { "epoch": 3.26646593902898, "grad_norm": 0.25959286093711853, "learning_rate": 9.32556231323879e-05, "loss": 0.2252, "step": 2177 }, { "epoch": 3.2679713963116295, "grad_norm": 0.22024399042129517, "learning_rate": 9.324778486855543e-05, "loss": 0.2662, "step": 2178 }, { "epoch": 3.2694768535942793, "grad_norm": 0.24454426765441895, "learning_rate": 9.323994242200273e-05, "loss": 0.3453, "step": 2179 }, { "epoch": 3.2709823108769287, "grad_norm": 0.2637886106967926, "learning_rate": 9.323209579358741e-05, "loss": 0.3057, "step": 2180 }, { "epoch": 3.2724877681595785, "grad_norm": 0.2350526601076126, "learning_rate": 9.322424498416757e-05, "loss": 0.2357, "step": 2181 }, { "epoch": 3.273993225442228, "grad_norm": 0.24418985843658447, "learning_rate": 9.321638999460178e-05, "loss": 0.2635, "step": 2182 }, { "epoch": 3.2754986827248778, "grad_norm": 0.31764572858810425, "learning_rate": 9.320853082574904e-05, "loss": 0.2511, "step": 2183 }, { "epoch": 3.277004140007527, "grad_norm": 0.28076693415641785, "learning_rate": 9.32006674784688e-05, "loss": 0.3084, "step": 2184 }, { "epoch": 3.278509597290177, "grad_norm": 0.18459546566009521, "learning_rate": 9.319279995362102e-05, "loss": 0.2157, "step": 2185 }, { "epoch": 3.2800150545728264, "grad_norm": 0.25364580750465393, "learning_rate": 9.318492825206604e-05, "loss": 0.3038, "step": 2186 }, { "epoch": 3.281520511855476, "grad_norm": 0.30582234263420105, "learning_rate": 9.317705237466472e-05, "loss": 0.2324, "step": 2187 }, { "epoch": 3.2830259691381256, "grad_norm": 0.27055060863494873, "learning_rate": 9.316917232227837e-05, "loss": 0.2488, "step": 2188 }, { "epoch": 3.2845314264207754, "grad_norm": 0.24911868572235107, "learning_rate": 9.316128809576869e-05, "loss": 0.2397, "step": 2189 }, { "epoch": 3.286036883703425, "grad_norm": 0.2800605893135071, "learning_rate": 9.315339969599793e-05, "loss": 0.2804, "step": 2190 }, { "epoch": 3.2875423409860747, "grad_norm": 0.2676287591457367, "learning_rate": 9.314550712382875e-05, "loss": 0.2655, "step": 2191 }, { "epoch": 3.289047798268724, "grad_norm": 0.2608090341091156, "learning_rate": 9.313761038012425e-05, "loss": 0.2265, "step": 2192 }, { "epoch": 3.290553255551374, "grad_norm": 0.23268365859985352, "learning_rate": 9.312970946574803e-05, "loss": 0.2447, "step": 2193 }, { "epoch": 3.2920587128340233, "grad_norm": 0.21427491307258606, "learning_rate": 9.31218043815641e-05, "loss": 0.2495, "step": 2194 }, { "epoch": 3.293564170116673, "grad_norm": 0.21554403007030487, "learning_rate": 9.311389512843696e-05, "loss": 0.2694, "step": 2195 }, { "epoch": 3.2950696273993225, "grad_norm": 0.19407711923122406, "learning_rate": 9.310598170723156e-05, "loss": 0.2709, "step": 2196 }, { "epoch": 3.2965750846819724, "grad_norm": 0.2527214288711548, "learning_rate": 9.30980641188133e-05, "loss": 0.3004, "step": 2197 }, { "epoch": 3.2980805419646217, "grad_norm": 0.265705406665802, "learning_rate": 9.3090142364048e-05, "loss": 0.2273, "step": 2198 }, { "epoch": 3.299585999247271, "grad_norm": 0.22899092733860016, "learning_rate": 9.308221644380202e-05, "loss": 0.2567, "step": 2199 }, { "epoch": 3.301091456529921, "grad_norm": 0.2285255789756775, "learning_rate": 9.30742863589421e-05, "loss": 0.2275, "step": 2200 }, { "epoch": 3.301091456529921, "eval_loss": 0.2774868309497833, "eval_runtime": 537.0773, "eval_samples_per_second": 17.925, "eval_steps_per_second": 0.56, "step": 2200 }, { "epoch": 3.302596913812571, "grad_norm": 0.23664170503616333, "learning_rate": 9.306635211033547e-05, "loss": 0.2462, "step": 2201 }, { "epoch": 3.30410237109522, "grad_norm": 0.2159542441368103, "learning_rate": 9.30584136988498e-05, "loss": 0.2953, "step": 2202 }, { "epoch": 3.3056078283778696, "grad_norm": 0.8537228107452393, "learning_rate": 9.305047112535322e-05, "loss": 0.2315, "step": 2203 }, { "epoch": 3.3071132856605194, "grad_norm": 0.19070936739444733, "learning_rate": 9.304252439071434e-05, "loss": 0.2367, "step": 2204 }, { "epoch": 3.308618742943169, "grad_norm": 0.17589005827903748, "learning_rate": 9.303457349580219e-05, "loss": 0.225, "step": 2205 }, { "epoch": 3.3101242002258187, "grad_norm": 0.22391721606254578, "learning_rate": 9.302661844148625e-05, "loss": 0.3617, "step": 2206 }, { "epoch": 3.311629657508468, "grad_norm": 1.3838443756103516, "learning_rate": 9.30186592286365e-05, "loss": 0.2597, "step": 2207 }, { "epoch": 3.313135114791118, "grad_norm": 0.24690832197666168, "learning_rate": 9.301069585812334e-05, "loss": 0.2923, "step": 2208 }, { "epoch": 3.3146405720737673, "grad_norm": 0.3301064372062683, "learning_rate": 9.300272833081763e-05, "loss": 0.2523, "step": 2209 }, { "epoch": 3.316146029356417, "grad_norm": 0.3520905673503876, "learning_rate": 9.299475664759069e-05, "loss": 0.221, "step": 2210 }, { "epoch": 3.3176514866390665, "grad_norm": 0.32862260937690735, "learning_rate": 9.298678080931427e-05, "loss": 0.2611, "step": 2211 }, { "epoch": 3.3191569439217163, "grad_norm": 0.2830688953399658, "learning_rate": 9.297880081686064e-05, "loss": 0.2794, "step": 2212 }, { "epoch": 3.3206624012043657, "grad_norm": 0.2369527667760849, "learning_rate": 9.29708166711024e-05, "loss": 0.2631, "step": 2213 }, { "epoch": 3.3221678584870156, "grad_norm": 0.22086350619792938, "learning_rate": 9.29628283729128e-05, "loss": 0.2499, "step": 2214 }, { "epoch": 3.323673315769665, "grad_norm": 0.2736642360687256, "learning_rate": 9.295483592316534e-05, "loss": 0.2832, "step": 2215 }, { "epoch": 3.325178773052315, "grad_norm": 0.29032406210899353, "learning_rate": 9.294683932273408e-05, "loss": 0.3126, "step": 2216 }, { "epoch": 3.326684230334964, "grad_norm": 0.26113560795783997, "learning_rate": 9.293883857249352e-05, "loss": 0.2384, "step": 2217 }, { "epoch": 3.328189687617614, "grad_norm": 0.2688741683959961, "learning_rate": 9.293083367331863e-05, "loss": 0.2796, "step": 2218 }, { "epoch": 3.3296951449002634, "grad_norm": 0.25695210695266724, "learning_rate": 9.292282462608479e-05, "loss": 0.286, "step": 2219 }, { "epoch": 3.3312006021829133, "grad_norm": 0.25207972526550293, "learning_rate": 9.291481143166785e-05, "loss": 0.232, "step": 2220 }, { "epoch": 3.3327060594655626, "grad_norm": 0.26540249586105347, "learning_rate": 9.290679409094417e-05, "loss": 0.3015, "step": 2221 }, { "epoch": 3.334211516748212, "grad_norm": 0.2557046711444855, "learning_rate": 9.289877260479046e-05, "loss": 0.2647, "step": 2222 }, { "epoch": 3.335716974030862, "grad_norm": 0.23135070502758026, "learning_rate": 9.289074697408396e-05, "loss": 0.2725, "step": 2223 }, { "epoch": 3.3372224313135117, "grad_norm": 0.2885662019252777, "learning_rate": 9.288271719970235e-05, "loss": 0.2474, "step": 2224 }, { "epoch": 3.338727888596161, "grad_norm": 0.35446441173553467, "learning_rate": 9.287468328252372e-05, "loss": 0.3591, "step": 2225 }, { "epoch": 3.3402333458788105, "grad_norm": 0.3576541841030121, "learning_rate": 9.286664522342668e-05, "loss": 0.2308, "step": 2226 }, { "epoch": 3.3417388031614603, "grad_norm": 0.34442999958992004, "learning_rate": 9.285860302329026e-05, "loss": 0.2407, "step": 2227 }, { "epoch": 3.3432442604441097, "grad_norm": 0.2799660265445709, "learning_rate": 9.285055668299391e-05, "loss": 0.2048, "step": 2228 }, { "epoch": 3.3447497177267596, "grad_norm": 0.2925272583961487, "learning_rate": 9.28425062034176e-05, "loss": 0.3006, "step": 2229 }, { "epoch": 3.346255175009409, "grad_norm": 0.27914899587631226, "learning_rate": 9.283445158544172e-05, "loss": 0.2218, "step": 2230 }, { "epoch": 3.347760632292059, "grad_norm": 0.28307899832725525, "learning_rate": 9.28263928299471e-05, "loss": 0.2745, "step": 2231 }, { "epoch": 3.349266089574708, "grad_norm": 0.2181001901626587, "learning_rate": 9.2818329937815e-05, "loss": 0.2423, "step": 2232 }, { "epoch": 3.350771546857358, "grad_norm": 0.2792724668979645, "learning_rate": 9.281026290992724e-05, "loss": 0.2691, "step": 2233 }, { "epoch": 3.3522770041400074, "grad_norm": 0.25680118799209595, "learning_rate": 9.280219174716594e-05, "loss": 0.2415, "step": 2234 }, { "epoch": 3.3537824614226572, "grad_norm": 0.25923022627830505, "learning_rate": 9.27941164504138e-05, "loss": 0.3673, "step": 2235 }, { "epoch": 3.3552879187053066, "grad_norm": 0.24250160157680511, "learning_rate": 9.27860370205539e-05, "loss": 0.2703, "step": 2236 }, { "epoch": 3.3567933759879565, "grad_norm": 0.2258729338645935, "learning_rate": 9.27779534584698e-05, "loss": 0.2827, "step": 2237 }, { "epoch": 3.358298833270606, "grad_norm": 0.2316340059041977, "learning_rate": 9.276986576504552e-05, "loss": 0.2925, "step": 2238 }, { "epoch": 3.3598042905532557, "grad_norm": 0.2223215401172638, "learning_rate": 9.27617739411655e-05, "loss": 0.288, "step": 2239 }, { "epoch": 3.361309747835905, "grad_norm": 0.24250739812850952, "learning_rate": 9.275367798771466e-05, "loss": 0.2404, "step": 2240 }, { "epoch": 3.362815205118555, "grad_norm": 0.2499026656150818, "learning_rate": 9.274557790557835e-05, "loss": 0.2814, "step": 2241 }, { "epoch": 3.3643206624012043, "grad_norm": 0.20979046821594238, "learning_rate": 9.27374736956424e-05, "loss": 0.211, "step": 2242 }, { "epoch": 3.365826119683854, "grad_norm": 0.2147216498851776, "learning_rate": 9.272936535879305e-05, "loss": 0.286, "step": 2243 }, { "epoch": 3.3673315769665035, "grad_norm": 0.22326993942260742, "learning_rate": 9.272125289591703e-05, "loss": 0.223, "step": 2244 }, { "epoch": 3.3688370342491534, "grad_norm": 0.23991763591766357, "learning_rate": 9.271313630790152e-05, "loss": 0.3065, "step": 2245 }, { "epoch": 3.3703424915318028, "grad_norm": 0.22484175860881805, "learning_rate": 9.270501559563412e-05, "loss": 0.2162, "step": 2246 }, { "epoch": 3.3718479488144526, "grad_norm": 0.19983436167240143, "learning_rate": 9.26968907600029e-05, "loss": 0.2975, "step": 2247 }, { "epoch": 3.373353406097102, "grad_norm": 0.19484102725982666, "learning_rate": 9.268876180189639e-05, "loss": 0.2363, "step": 2248 }, { "epoch": 3.3748588633797514, "grad_norm": 0.20803654193878174, "learning_rate": 9.268062872220353e-05, "loss": 0.2666, "step": 2249 }, { "epoch": 3.376364320662401, "grad_norm": 0.20304614305496216, "learning_rate": 9.267249152181379e-05, "loss": 0.2059, "step": 2250 }, { "epoch": 3.377869777945051, "grad_norm": 0.1864963322877884, "learning_rate": 9.2664350201617e-05, "loss": 0.2759, "step": 2251 }, { "epoch": 3.3793752352277004, "grad_norm": 0.19034595787525177, "learning_rate": 9.265620476250352e-05, "loss": 0.278, "step": 2252 }, { "epoch": 3.38088069251035, "grad_norm": 0.2279701828956604, "learning_rate": 9.26480552053641e-05, "loss": 0.192, "step": 2253 }, { "epoch": 3.3823861497929997, "grad_norm": 0.2256770133972168, "learning_rate": 9.263990153108994e-05, "loss": 0.2511, "step": 2254 }, { "epoch": 3.383891607075649, "grad_norm": 0.19693337380886078, "learning_rate": 9.263174374057276e-05, "loss": 0.2375, "step": 2255 }, { "epoch": 3.385397064358299, "grad_norm": 0.20853127539157867, "learning_rate": 9.262358183470467e-05, "loss": 0.2045, "step": 2256 }, { "epoch": 3.3869025216409483, "grad_norm": 0.19974280893802643, "learning_rate": 9.261541581437822e-05, "loss": 0.216, "step": 2257 }, { "epoch": 3.388407978923598, "grad_norm": 0.24009400606155396, "learning_rate": 9.260724568048644e-05, "loss": 0.3137, "step": 2258 }, { "epoch": 3.3899134362062475, "grad_norm": 0.26767289638519287, "learning_rate": 9.259907143392284e-05, "loss": 0.2196, "step": 2259 }, { "epoch": 3.3914188934888974, "grad_norm": 0.2110011726617813, "learning_rate": 9.259089307558131e-05, "loss": 0.2792, "step": 2260 }, { "epoch": 3.3929243507715467, "grad_norm": 0.2085544914007187, "learning_rate": 9.258271060635623e-05, "loss": 0.2282, "step": 2261 }, { "epoch": 3.3944298080541966, "grad_norm": 0.254657119512558, "learning_rate": 9.257452402714242e-05, "loss": 0.2844, "step": 2262 }, { "epoch": 3.395935265336846, "grad_norm": 0.23291650414466858, "learning_rate": 9.256633333883515e-05, "loss": 0.2326, "step": 2263 }, { "epoch": 3.397440722619496, "grad_norm": 0.22585265338420868, "learning_rate": 9.255813854233016e-05, "loss": 0.2285, "step": 2264 }, { "epoch": 3.398946179902145, "grad_norm": 0.2372978776693344, "learning_rate": 9.254993963852359e-05, "loss": 0.2522, "step": 2265 }, { "epoch": 3.400451637184795, "grad_norm": 0.1908617913722992, "learning_rate": 9.25417366283121e-05, "loss": 0.2501, "step": 2266 }, { "epoch": 3.4019570944674444, "grad_norm": 0.21522113680839539, "learning_rate": 9.253352951259271e-05, "loss": 0.2415, "step": 2267 }, { "epoch": 3.4034625517500943, "grad_norm": 0.25068914890289307, "learning_rate": 9.252531829226297e-05, "loss": 0.2923, "step": 2268 }, { "epoch": 3.4049680090327437, "grad_norm": 0.21462558209896088, "learning_rate": 9.251710296822085e-05, "loss": 0.2593, "step": 2269 }, { "epoch": 3.4064734663153935, "grad_norm": 0.20428472757339478, "learning_rate": 9.250888354136475e-05, "loss": 0.2813, "step": 2270 }, { "epoch": 3.407978923598043, "grad_norm": 0.248600572347641, "learning_rate": 9.250066001259353e-05, "loss": 0.2521, "step": 2271 }, { "epoch": 3.4094843808806923, "grad_norm": 0.24194197356700897, "learning_rate": 9.249243238280653e-05, "loss": 0.232, "step": 2272 }, { "epoch": 3.410989838163342, "grad_norm": 0.2444886416196823, "learning_rate": 9.248420065290348e-05, "loss": 0.2351, "step": 2273 }, { "epoch": 3.412495295445992, "grad_norm": 0.2682279348373413, "learning_rate": 9.247596482378461e-05, "loss": 0.2129, "step": 2274 }, { "epoch": 3.4140007527286413, "grad_norm": 0.24674159288406372, "learning_rate": 9.246772489635057e-05, "loss": 0.2757, "step": 2275 }, { "epoch": 3.4155062100112907, "grad_norm": 0.23118120431900024, "learning_rate": 9.245948087150245e-05, "loss": 0.2473, "step": 2276 }, { "epoch": 3.4170116672939406, "grad_norm": 0.21810011565685272, "learning_rate": 9.245123275014185e-05, "loss": 0.2309, "step": 2277 }, { "epoch": 3.41851712457659, "grad_norm": 0.2605838179588318, "learning_rate": 9.244298053317074e-05, "loss": 0.268, "step": 2278 }, { "epoch": 3.42002258185924, "grad_norm": 0.25071361660957336, "learning_rate": 9.243472422149155e-05, "loss": 0.238, "step": 2279 }, { "epoch": 3.421528039141889, "grad_norm": 0.2603414058685303, "learning_rate": 9.242646381600722e-05, "loss": 0.2477, "step": 2280 }, { "epoch": 3.423033496424539, "grad_norm": 0.2256433367729187, "learning_rate": 9.241819931762108e-05, "loss": 0.2634, "step": 2281 }, { "epoch": 3.4245389537071884, "grad_norm": 0.22150947153568268, "learning_rate": 9.240993072723691e-05, "loss": 0.3021, "step": 2282 }, { "epoch": 3.4260444109898383, "grad_norm": 0.18876038491725922, "learning_rate": 9.240165804575897e-05, "loss": 0.2559, "step": 2283 }, { "epoch": 3.4275498682724876, "grad_norm": 0.21343903243541718, "learning_rate": 9.239338127409192e-05, "loss": 0.2143, "step": 2284 }, { "epoch": 3.4290553255551375, "grad_norm": 0.20324377715587616, "learning_rate": 9.238510041314094e-05, "loss": 0.2485, "step": 2285 }, { "epoch": 3.430560782837787, "grad_norm": 0.28999316692352295, "learning_rate": 9.237681546381157e-05, "loss": 0.2499, "step": 2286 }, { "epoch": 3.4320662401204367, "grad_norm": 0.20795615017414093, "learning_rate": 9.236852642700987e-05, "loss": 0.2352, "step": 2287 }, { "epoch": 3.433571697403086, "grad_norm": 0.19723457098007202, "learning_rate": 9.236023330364229e-05, "loss": 0.2655, "step": 2288 }, { "epoch": 3.435077154685736, "grad_norm": 0.19562450051307678, "learning_rate": 9.235193609461576e-05, "loss": 0.2703, "step": 2289 }, { "epoch": 3.4365826119683853, "grad_norm": 0.18776129186153412, "learning_rate": 9.234363480083768e-05, "loss": 0.2513, "step": 2290 }, { "epoch": 3.438088069251035, "grad_norm": 0.20037762820720673, "learning_rate": 9.233532942321581e-05, "loss": 0.2089, "step": 2291 }, { "epoch": 3.4395935265336846, "grad_norm": 0.2306079864501953, "learning_rate": 9.232701996265846e-05, "loss": 0.275, "step": 2292 }, { "epoch": 3.4410989838163344, "grad_norm": 0.23031748831272125, "learning_rate": 9.231870642007434e-05, "loss": 0.202, "step": 2293 }, { "epoch": 3.442604441098984, "grad_norm": 0.21111053228378296, "learning_rate": 9.23103887963726e-05, "loss": 0.2237, "step": 2294 }, { "epoch": 3.444109898381633, "grad_norm": 0.23636825382709503, "learning_rate": 9.230206709246282e-05, "loss": 0.3213, "step": 2295 }, { "epoch": 3.445615355664283, "grad_norm": 0.2501446604728699, "learning_rate": 9.229374130925506e-05, "loss": 0.2872, "step": 2296 }, { "epoch": 3.447120812946933, "grad_norm": 0.2018512785434723, "learning_rate": 9.228541144765983e-05, "loss": 0.2794, "step": 2297 }, { "epoch": 3.4486262702295822, "grad_norm": 0.23005828261375427, "learning_rate": 9.227707750858806e-05, "loss": 0.2804, "step": 2298 }, { "epoch": 3.4501317275122316, "grad_norm": 0.19407042860984802, "learning_rate": 9.226873949295115e-05, "loss": 0.2979, "step": 2299 }, { "epoch": 3.4516371847948815, "grad_norm": 0.21120233833789825, "learning_rate": 9.226039740166091e-05, "loss": 0.216, "step": 2300 }, { "epoch": 3.4531426420775313, "grad_norm": 0.21220196783542633, "learning_rate": 9.225205123562963e-05, "loss": 0.2591, "step": 2301 }, { "epoch": 3.4546480993601807, "grad_norm": 0.21215936541557312, "learning_rate": 9.224370099577003e-05, "loss": 0.2584, "step": 2302 }, { "epoch": 3.45615355664283, "grad_norm": 0.2136453539133072, "learning_rate": 9.22353466829953e-05, "loss": 0.2377, "step": 2303 }, { "epoch": 3.45765901392548, "grad_norm": 0.30339518189430237, "learning_rate": 9.222698829821903e-05, "loss": 0.2385, "step": 2304 }, { "epoch": 3.4591644712081293, "grad_norm": 0.30075380206108093, "learning_rate": 9.221862584235528e-05, "loss": 0.292, "step": 2305 }, { "epoch": 3.460669928490779, "grad_norm": 0.2716374695301056, "learning_rate": 9.22102593163186e-05, "loss": 0.2724, "step": 2306 }, { "epoch": 3.4621753857734285, "grad_norm": 0.22254973649978638, "learning_rate": 9.220188872102386e-05, "loss": 0.3521, "step": 2307 }, { "epoch": 3.4636808430560784, "grad_norm": 0.2566329836845398, "learning_rate": 9.219351405738652e-05, "loss": 0.2466, "step": 2308 }, { "epoch": 3.4651863003387278, "grad_norm": 0.23604808747768402, "learning_rate": 9.218513532632241e-05, "loss": 0.1942, "step": 2309 }, { "epoch": 3.4666917576213776, "grad_norm": 0.21911808848381042, "learning_rate": 9.21767525287478e-05, "loss": 0.2507, "step": 2310 }, { "epoch": 3.468197214904027, "grad_norm": 0.21776080131530762, "learning_rate": 9.216836566557943e-05, "loss": 0.2248, "step": 2311 }, { "epoch": 3.469702672186677, "grad_norm": 0.19881467521190643, "learning_rate": 9.215997473773448e-05, "loss": 0.2544, "step": 2312 }, { "epoch": 3.471208129469326, "grad_norm": 0.2807597815990448, "learning_rate": 9.215157974613056e-05, "loss": 0.3354, "step": 2313 }, { "epoch": 3.472713586751976, "grad_norm": 0.2979535758495331, "learning_rate": 9.214318069168572e-05, "loss": 0.3098, "step": 2314 }, { "epoch": 3.4742190440346254, "grad_norm": 0.28362923860549927, "learning_rate": 9.213477757531851e-05, "loss": 0.3073, "step": 2315 }, { "epoch": 3.4757245013172753, "grad_norm": 0.29891788959503174, "learning_rate": 9.212637039794783e-05, "loss": 0.1855, "step": 2316 }, { "epoch": 3.4772299585999247, "grad_norm": 0.30084577202796936, "learning_rate": 9.211795916049311e-05, "loss": 0.2212, "step": 2317 }, { "epoch": 3.4787354158825745, "grad_norm": 0.26147451996803284, "learning_rate": 9.210954386387418e-05, "loss": 0.2245, "step": 2318 }, { "epoch": 3.480240873165224, "grad_norm": 0.2729434669017792, "learning_rate": 9.210112450901134e-05, "loss": 0.256, "step": 2319 }, { "epoch": 3.4817463304478737, "grad_norm": 0.21694418787956238, "learning_rate": 9.20927010968253e-05, "loss": 0.3122, "step": 2320 }, { "epoch": 3.483251787730523, "grad_norm": 0.3214438259601593, "learning_rate": 9.208427362823721e-05, "loss": 0.2848, "step": 2321 }, { "epoch": 3.4847572450131725, "grad_norm": 0.3125142753124237, "learning_rate": 9.207584210416875e-05, "loss": 0.284, "step": 2322 }, { "epoch": 3.4862627022958224, "grad_norm": 0.31013011932373047, "learning_rate": 9.206740652554192e-05, "loss": 0.2818, "step": 2323 }, { "epoch": 3.487768159578472, "grad_norm": 0.2569538950920105, "learning_rate": 9.205896689327923e-05, "loss": 0.2868, "step": 2324 }, { "epoch": 3.4892736168611216, "grad_norm": 0.2208337038755417, "learning_rate": 9.205052320830367e-05, "loss": 0.2431, "step": 2325 }, { "epoch": 3.490779074143771, "grad_norm": 0.2013891637325287, "learning_rate": 9.204207547153858e-05, "loss": 0.1705, "step": 2326 }, { "epoch": 3.492284531426421, "grad_norm": 0.22645525634288788, "learning_rate": 9.20336236839078e-05, "loss": 0.1872, "step": 2327 }, { "epoch": 3.49378998870907, "grad_norm": 0.20171897113323212, "learning_rate": 9.202516784633563e-05, "loss": 0.2893, "step": 2328 }, { "epoch": 3.49529544599172, "grad_norm": 0.22148820757865906, "learning_rate": 9.201670795974676e-05, "loss": 0.2694, "step": 2329 }, { "epoch": 3.4968009032743694, "grad_norm": 0.19719378650188446, "learning_rate": 9.200824402506635e-05, "loss": 0.2961, "step": 2330 }, { "epoch": 3.4983063605570193, "grad_norm": 0.24286948144435883, "learning_rate": 9.199977604322003e-05, "loss": 0.2693, "step": 2331 }, { "epoch": 3.4998118178396687, "grad_norm": 0.21391309797763824, "learning_rate": 9.199130401513382e-05, "loss": 0.2513, "step": 2332 }, { "epoch": 3.5013172751223185, "grad_norm": 0.19377321004867554, "learning_rate": 9.198282794173424e-05, "loss": 0.2502, "step": 2333 }, { "epoch": 3.502822732404968, "grad_norm": 0.19868344068527222, "learning_rate": 9.197434782394818e-05, "loss": 0.1971, "step": 2334 }, { "epoch": 3.5043281896876177, "grad_norm": 0.2045525461435318, "learning_rate": 9.196586366270303e-05, "loss": 0.2488, "step": 2335 }, { "epoch": 3.505833646970267, "grad_norm": 0.22748549282550812, "learning_rate": 9.195737545892662e-05, "loss": 0.2094, "step": 2336 }, { "epoch": 3.507339104252917, "grad_norm": 0.331819087266922, "learning_rate": 9.194888321354719e-05, "loss": 0.3226, "step": 2337 }, { "epoch": 3.5088445615355663, "grad_norm": 0.2183634340763092, "learning_rate": 9.194038692749345e-05, "loss": 0.2369, "step": 2338 }, { "epoch": 3.510350018818216, "grad_norm": 0.21164311468601227, "learning_rate": 9.193188660169451e-05, "loss": 0.1789, "step": 2339 }, { "epoch": 3.5118554761008656, "grad_norm": 0.21869608759880066, "learning_rate": 9.192338223708001e-05, "loss": 0.2347, "step": 2340 }, { "epoch": 3.5133609333835154, "grad_norm": 0.22966331243515015, "learning_rate": 9.191487383457993e-05, "loss": 0.2515, "step": 2341 }, { "epoch": 3.514866390666165, "grad_norm": 0.1902851164340973, "learning_rate": 9.190636139512473e-05, "loss": 0.2592, "step": 2342 }, { "epoch": 3.5163718479488146, "grad_norm": 0.19246506690979004, "learning_rate": 9.189784491964536e-05, "loss": 0.2325, "step": 2343 }, { "epoch": 3.517877305231464, "grad_norm": 0.20321273803710938, "learning_rate": 9.188932440907313e-05, "loss": 0.194, "step": 2344 }, { "epoch": 3.5193827625141134, "grad_norm": 0.20647484064102173, "learning_rate": 9.188079986433985e-05, "loss": 0.2572, "step": 2345 }, { "epoch": 3.5208882197967633, "grad_norm": 0.22349724173545837, "learning_rate": 9.187227128637775e-05, "loss": 0.2454, "step": 2346 }, { "epoch": 3.522393677079413, "grad_norm": 0.21588021516799927, "learning_rate": 9.18637386761195e-05, "loss": 0.2613, "step": 2347 }, { "epoch": 3.5238991343620625, "grad_norm": 0.20765367150306702, "learning_rate": 9.18552020344982e-05, "loss": 0.2365, "step": 2348 }, { "epoch": 3.525404591644712, "grad_norm": 0.20658466219902039, "learning_rate": 9.184666136244743e-05, "loss": 0.2603, "step": 2349 }, { "epoch": 3.5269100489273617, "grad_norm": 0.21476250886917114, "learning_rate": 9.183811666090118e-05, "loss": 0.3076, "step": 2350 }, { "epoch": 3.5284155062100115, "grad_norm": 0.19896043837070465, "learning_rate": 9.182956793079384e-05, "loss": 0.2719, "step": 2351 }, { "epoch": 3.529920963492661, "grad_norm": 0.19759847223758698, "learning_rate": 9.182101517306036e-05, "loss": 0.2383, "step": 2352 }, { "epoch": 3.5314264207753103, "grad_norm": 0.23982073366641998, "learning_rate": 9.1812458388636e-05, "loss": 0.2478, "step": 2353 }, { "epoch": 3.53293187805796, "grad_norm": 0.23283784091472626, "learning_rate": 9.180389757845655e-05, "loss": 0.2965, "step": 2354 }, { "epoch": 3.5344373353406096, "grad_norm": 0.20972107350826263, "learning_rate": 9.179533274345818e-05, "loss": 0.2946, "step": 2355 }, { "epoch": 3.5359427926232594, "grad_norm": 0.2107367217540741, "learning_rate": 9.178676388457756e-05, "loss": 0.3196, "step": 2356 }, { "epoch": 3.537448249905909, "grad_norm": 0.20468567311763763, "learning_rate": 9.177819100275173e-05, "loss": 0.2772, "step": 2357 }, { "epoch": 3.5389537071885586, "grad_norm": 0.2237575650215149, "learning_rate": 9.176961409891824e-05, "loss": 0.2763, "step": 2358 }, { "epoch": 3.540459164471208, "grad_norm": 0.24677707254886627, "learning_rate": 9.176103317401503e-05, "loss": 0.2176, "step": 2359 }, { "epoch": 3.541964621753858, "grad_norm": 0.31434720754623413, "learning_rate": 9.17524482289805e-05, "loss": 0.2555, "step": 2360 }, { "epoch": 3.5434700790365072, "grad_norm": 0.3846309185028076, "learning_rate": 9.17438592647535e-05, "loss": 0.3057, "step": 2361 }, { "epoch": 3.544975536319157, "grad_norm": 0.3164254128932953, "learning_rate": 9.173526628227329e-05, "loss": 0.2289, "step": 2362 }, { "epoch": 3.5464809936018065, "grad_norm": 0.24745045602321625, "learning_rate": 9.172666928247957e-05, "loss": 0.222, "step": 2363 }, { "epoch": 3.5479864508844563, "grad_norm": 0.29087385535240173, "learning_rate": 9.171806826631256e-05, "loss": 0.3005, "step": 2364 }, { "epoch": 3.5494919081671057, "grad_norm": 0.2675272822380066, "learning_rate": 9.170946323471275e-05, "loss": 0.2139, "step": 2365 }, { "epoch": 3.5509973654497555, "grad_norm": 0.2311275452375412, "learning_rate": 9.170085418862126e-05, "loss": 0.2921, "step": 2366 }, { "epoch": 3.552502822732405, "grad_norm": 0.23201517760753632, "learning_rate": 9.169224112897955e-05, "loss": 0.2115, "step": 2367 }, { "epoch": 3.5540082800150543, "grad_norm": 0.23115283250808716, "learning_rate": 9.16836240567295e-05, "loss": 0.2178, "step": 2368 }, { "epoch": 3.555513737297704, "grad_norm": 0.23303930461406708, "learning_rate": 9.167500297281348e-05, "loss": 0.3013, "step": 2369 }, { "epoch": 3.557019194580354, "grad_norm": 0.20422665774822235, "learning_rate": 9.166637787817427e-05, "loss": 0.3013, "step": 2370 }, { "epoch": 3.5585246518630034, "grad_norm": 0.2351183146238327, "learning_rate": 9.165774877375511e-05, "loss": 0.3052, "step": 2371 }, { "epoch": 3.5600301091456528, "grad_norm": 0.22972533106803894, "learning_rate": 9.164911566049967e-05, "loss": 0.2118, "step": 2372 }, { "epoch": 3.5615355664283026, "grad_norm": 0.20319728553295135, "learning_rate": 9.164047853935202e-05, "loss": 0.244, "step": 2373 }, { "epoch": 3.5630410237109524, "grad_norm": 0.24958436191082, "learning_rate": 9.163183741125673e-05, "loss": 0.3363, "step": 2374 }, { "epoch": 3.564546480993602, "grad_norm": 0.2299891710281372, "learning_rate": 9.162319227715878e-05, "loss": 0.2312, "step": 2375 }, { "epoch": 3.566051938276251, "grad_norm": 0.24185694754123688, "learning_rate": 9.161454313800357e-05, "loss": 0.2524, "step": 2376 }, { "epoch": 3.567557395558901, "grad_norm": 0.2188352793455124, "learning_rate": 9.1605889994737e-05, "loss": 0.2897, "step": 2377 }, { "epoch": 3.569062852841551, "grad_norm": 0.25205695629119873, "learning_rate": 9.159723284830532e-05, "loss": 0.2781, "step": 2378 }, { "epoch": 3.5705683101242003, "grad_norm": 0.23348328471183777, "learning_rate": 9.158857169965527e-05, "loss": 0.3283, "step": 2379 }, { "epoch": 3.5720737674068497, "grad_norm": 0.2013077735900879, "learning_rate": 9.157990654973406e-05, "loss": 0.2668, "step": 2380 }, { "epoch": 3.5735792246894995, "grad_norm": 0.21018226444721222, "learning_rate": 9.157123739948924e-05, "loss": 0.2892, "step": 2381 }, { "epoch": 3.575084681972149, "grad_norm": 0.23986713588237762, "learning_rate": 9.156256424986888e-05, "loss": 0.2722, "step": 2382 }, { "epoch": 3.5765901392547987, "grad_norm": 0.23024408519268036, "learning_rate": 9.155388710182147e-05, "loss": 0.2528, "step": 2383 }, { "epoch": 3.578095596537448, "grad_norm": 0.25154808163642883, "learning_rate": 9.154520595629593e-05, "loss": 0.2272, "step": 2384 }, { "epoch": 3.579601053820098, "grad_norm": 0.2296338826417923, "learning_rate": 9.15365208142416e-05, "loss": 0.3015, "step": 2385 }, { "epoch": 3.5811065111027474, "grad_norm": 0.24886339902877808, "learning_rate": 9.15278316766083e-05, "loss": 0.3519, "step": 2386 }, { "epoch": 3.582611968385397, "grad_norm": 0.19526012241840363, "learning_rate": 9.151913854434625e-05, "loss": 0.2068, "step": 2387 }, { "epoch": 3.5841174256680466, "grad_norm": 0.2524930238723755, "learning_rate": 9.15104414184061e-05, "loss": 0.2233, "step": 2388 }, { "epoch": 3.5856228829506964, "grad_norm": 0.30518051981925964, "learning_rate": 9.150174029973897e-05, "loss": 0.2931, "step": 2389 }, { "epoch": 3.587128340233346, "grad_norm": 0.23576851189136505, "learning_rate": 9.149303518929641e-05, "loss": 0.2333, "step": 2390 }, { "epoch": 3.5886337975159956, "grad_norm": 0.21634773910045624, "learning_rate": 9.148432608803038e-05, "loss": 0.2797, "step": 2391 }, { "epoch": 3.590139254798645, "grad_norm": 0.20784959197044373, "learning_rate": 9.14756129968933e-05, "loss": 0.3172, "step": 2392 }, { "epoch": 3.591644712081295, "grad_norm": 0.1889403760433197, "learning_rate": 9.146689591683803e-05, "loss": 0.2258, "step": 2393 }, { "epoch": 3.5931501693639443, "grad_norm": 0.21172557771205902, "learning_rate": 9.145817484881784e-05, "loss": 0.286, "step": 2394 }, { "epoch": 3.5946556266465937, "grad_norm": 0.18170814216136932, "learning_rate": 9.144944979378648e-05, "loss": 0.2365, "step": 2395 }, { "epoch": 3.5961610839292435, "grad_norm": 0.18098123371601105, "learning_rate": 9.144072075269809e-05, "loss": 0.2102, "step": 2396 }, { "epoch": 3.5976665412118933, "grad_norm": 0.18120457231998444, "learning_rate": 9.143198772650725e-05, "loss": 0.202, "step": 2397 }, { "epoch": 3.5991719984945427, "grad_norm": 0.21471697092056274, "learning_rate": 9.142325071616901e-05, "loss": 0.27, "step": 2398 }, { "epoch": 3.600677455777192, "grad_norm": 0.2074868232011795, "learning_rate": 9.141450972263886e-05, "loss": 0.2894, "step": 2399 }, { "epoch": 3.602182913059842, "grad_norm": 0.24738414585590363, "learning_rate": 9.140576474687264e-05, "loss": 0.2612, "step": 2400 }, { "epoch": 3.602182913059842, "eval_loss": 0.2638201415538788, "eval_runtime": 533.0985, "eval_samples_per_second": 18.059, "eval_steps_per_second": 0.565, "step": 2400 }, { "epoch": 3.603688370342492, "grad_norm": 0.27758631110191345, "learning_rate": 9.139701578982673e-05, "loss": 0.3133, "step": 2401 }, { "epoch": 3.605193827625141, "grad_norm": 0.3066558241844177, "learning_rate": 9.13882628524579e-05, "loss": 0.2128, "step": 2402 }, { "epoch": 3.6066992849077906, "grad_norm": 0.2950681746006012, "learning_rate": 9.137950593572335e-05, "loss": 0.251, "step": 2403 }, { "epoch": 3.6082047421904404, "grad_norm": 0.23322978615760803, "learning_rate": 9.137074504058074e-05, "loss": 0.2013, "step": 2404 }, { "epoch": 3.60971019947309, "grad_norm": 0.2217419147491455, "learning_rate": 9.136198016798812e-05, "loss": 0.1836, "step": 2405 }, { "epoch": 3.6112156567557396, "grad_norm": 0.3122643530368805, "learning_rate": 9.135321131890403e-05, "loss": 0.2888, "step": 2406 }, { "epoch": 3.612721114038389, "grad_norm": 0.3097941279411316, "learning_rate": 9.13444384942874e-05, "loss": 0.3153, "step": 2407 }, { "epoch": 3.614226571321039, "grad_norm": 0.21298295259475708, "learning_rate": 9.133566169509763e-05, "loss": 0.2494, "step": 2408 }, { "epoch": 3.6157320286036883, "grad_norm": 0.28386375308036804, "learning_rate": 9.132688092229451e-05, "loss": 0.2292, "step": 2409 }, { "epoch": 3.617237485886338, "grad_norm": 0.35607725381851196, "learning_rate": 9.131809617683833e-05, "loss": 0.3109, "step": 2410 }, { "epoch": 3.6187429431689875, "grad_norm": 0.2825411260128021, "learning_rate": 9.130930745968974e-05, "loss": 0.2716, "step": 2411 }, { "epoch": 3.6202484004516373, "grad_norm": 0.25123539566993713, "learning_rate": 9.130051477180988e-05, "loss": 0.2482, "step": 2412 }, { "epoch": 3.6217538577342867, "grad_norm": 0.250530481338501, "learning_rate": 9.129171811416029e-05, "loss": 0.3271, "step": 2413 }, { "epoch": 3.6232593150169365, "grad_norm": 0.2626771330833435, "learning_rate": 9.128291748770298e-05, "loss": 0.2507, "step": 2414 }, { "epoch": 3.624764772299586, "grad_norm": 0.2501090168952942, "learning_rate": 9.127411289340036e-05, "loss": 0.2867, "step": 2415 }, { "epoch": 3.6262702295822358, "grad_norm": 0.22102849185466766, "learning_rate": 9.126530433221531e-05, "loss": 0.2685, "step": 2416 }, { "epoch": 3.627775686864885, "grad_norm": 0.2662089467048645, "learning_rate": 9.125649180511106e-05, "loss": 0.2863, "step": 2417 }, { "epoch": 3.6292811441475346, "grad_norm": 0.23286357522010803, "learning_rate": 9.124767531305141e-05, "loss": 0.2674, "step": 2418 }, { "epoch": 3.6307866014301844, "grad_norm": 0.21553368866443634, "learning_rate": 9.123885485700049e-05, "loss": 0.2733, "step": 2419 }, { "epoch": 3.6322920587128342, "grad_norm": 0.23410509526729584, "learning_rate": 9.123003043792289e-05, "loss": 0.2804, "step": 2420 }, { "epoch": 3.6337975159954836, "grad_norm": 0.22810646891593933, "learning_rate": 9.12212020567836e-05, "loss": 0.2722, "step": 2421 }, { "epoch": 3.635302973278133, "grad_norm": 0.2052464634180069, "learning_rate": 9.121236971454814e-05, "loss": 0.2161, "step": 2422 }, { "epoch": 3.636808430560783, "grad_norm": 0.23633573949337006, "learning_rate": 9.120353341218237e-05, "loss": 0.2419, "step": 2423 }, { "epoch": 3.6383138878434327, "grad_norm": 0.23098774254322052, "learning_rate": 9.119469315065259e-05, "loss": 0.3259, "step": 2424 }, { "epoch": 3.639819345126082, "grad_norm": 0.20570620894432068, "learning_rate": 9.118584893092563e-05, "loss": 0.2931, "step": 2425 }, { "epoch": 3.6413248024087315, "grad_norm": 0.20658861100673676, "learning_rate": 9.11770007539686e-05, "loss": 0.2067, "step": 2426 }, { "epoch": 3.6428302596913813, "grad_norm": 0.1982201188802719, "learning_rate": 9.116814862074916e-05, "loss": 0.2093, "step": 2427 }, { "epoch": 3.644335716974031, "grad_norm": 0.19137972593307495, "learning_rate": 9.11592925322354e-05, "loss": 0.1967, "step": 2428 }, { "epoch": 3.6458411742566805, "grad_norm": 0.20419009029865265, "learning_rate": 9.115043248939573e-05, "loss": 0.2576, "step": 2429 }, { "epoch": 3.64734663153933, "grad_norm": 0.21623623371124268, "learning_rate": 9.114156849319913e-05, "loss": 0.2717, "step": 2430 }, { "epoch": 3.6488520888219798, "grad_norm": 0.2287861406803131, "learning_rate": 9.113270054461495e-05, "loss": 0.2235, "step": 2431 }, { "epoch": 3.650357546104629, "grad_norm": 0.2086826115846634, "learning_rate": 9.112382864461296e-05, "loss": 0.2447, "step": 2432 }, { "epoch": 3.651863003387279, "grad_norm": 0.19284233450889587, "learning_rate": 9.111495279416337e-05, "loss": 0.2103, "step": 2433 }, { "epoch": 3.6533684606699284, "grad_norm": 0.2058851420879364, "learning_rate": 9.110607299423684e-05, "loss": 0.2264, "step": 2434 }, { "epoch": 3.654873917952578, "grad_norm": 0.20334511995315552, "learning_rate": 9.109718924580446e-05, "loss": 0.2403, "step": 2435 }, { "epoch": 3.6563793752352276, "grad_norm": 0.223549947142601, "learning_rate": 9.108830154983773e-05, "loss": 0.2248, "step": 2436 }, { "epoch": 3.6578848325178774, "grad_norm": 0.2129141241312027, "learning_rate": 9.10794099073086e-05, "loss": 0.3144, "step": 2437 }, { "epoch": 3.659390289800527, "grad_norm": 0.20745067298412323, "learning_rate": 9.107051431918944e-05, "loss": 0.205, "step": 2438 }, { "epoch": 3.6608957470831767, "grad_norm": 0.20649725198745728, "learning_rate": 9.106161478645308e-05, "loss": 0.1818, "step": 2439 }, { "epoch": 3.662401204365826, "grad_norm": 0.21740995347499847, "learning_rate": 9.105271131007274e-05, "loss": 0.2214, "step": 2440 }, { "epoch": 3.6639066616484754, "grad_norm": 0.20959387719631195, "learning_rate": 9.104380389102211e-05, "loss": 0.2283, "step": 2441 }, { "epoch": 3.6654121189311253, "grad_norm": 0.20833687484264374, "learning_rate": 9.103489253027526e-05, "loss": 0.2088, "step": 2442 }, { "epoch": 3.666917576213775, "grad_norm": 0.1980445832014084, "learning_rate": 9.102597722880674e-05, "loss": 0.1908, "step": 2443 }, { "epoch": 3.6684230334964245, "grad_norm": 0.19926661252975464, "learning_rate": 9.101705798759151e-05, "loss": 0.2289, "step": 2444 }, { "epoch": 3.669928490779074, "grad_norm": 0.17840074002742767, "learning_rate": 9.100813480760499e-05, "loss": 0.1989, "step": 2445 }, { "epoch": 3.6714339480617237, "grad_norm": 0.190543532371521, "learning_rate": 9.099920768982297e-05, "loss": 0.2062, "step": 2446 }, { "epoch": 3.6729394053443736, "grad_norm": 0.19774086773395538, "learning_rate": 9.099027663522171e-05, "loss": 0.219, "step": 2447 }, { "epoch": 3.674444862627023, "grad_norm": 0.20061010122299194, "learning_rate": 9.098134164477791e-05, "loss": 0.2743, "step": 2448 }, { "epoch": 3.6759503199096724, "grad_norm": 0.21432572603225708, "learning_rate": 9.09724027194687e-05, "loss": 0.2572, "step": 2449 }, { "epoch": 3.677455777192322, "grad_norm": 0.22395387291908264, "learning_rate": 9.096345986027161e-05, "loss": 0.249, "step": 2450 }, { "epoch": 3.678961234474972, "grad_norm": 0.22213183343410492, "learning_rate": 9.095451306816462e-05, "loss": 0.2718, "step": 2451 }, { "epoch": 3.6804666917576214, "grad_norm": 0.2055930197238922, "learning_rate": 9.094556234412614e-05, "loss": 0.2657, "step": 2452 }, { "epoch": 3.681972149040271, "grad_norm": 0.21787837147712708, "learning_rate": 9.093660768913501e-05, "loss": 0.1709, "step": 2453 }, { "epoch": 3.6834776063229206, "grad_norm": 0.19790588319301605, "learning_rate": 9.092764910417047e-05, "loss": 0.1818, "step": 2454 }, { "epoch": 3.68498306360557, "grad_norm": 0.18933063745498657, "learning_rate": 9.091868659021227e-05, "loss": 0.2436, "step": 2455 }, { "epoch": 3.68648852088822, "grad_norm": 0.1953386664390564, "learning_rate": 9.090972014824049e-05, "loss": 0.2674, "step": 2456 }, { "epoch": 3.6879939781708693, "grad_norm": 0.22640269994735718, "learning_rate": 9.09007497792357e-05, "loss": 0.2686, "step": 2457 }, { "epoch": 3.689499435453519, "grad_norm": 0.17847293615341187, "learning_rate": 9.08917754841789e-05, "loss": 0.2457, "step": 2458 }, { "epoch": 3.6910048927361685, "grad_norm": 0.2120620757341385, "learning_rate": 9.088279726405148e-05, "loss": 0.2561, "step": 2459 }, { "epoch": 3.6925103500188183, "grad_norm": 0.22125066816806793, "learning_rate": 9.087381511983533e-05, "loss": 0.2577, "step": 2460 }, { "epoch": 3.6940158073014677, "grad_norm": 0.23033291101455688, "learning_rate": 9.086482905251267e-05, "loss": 0.2195, "step": 2461 }, { "epoch": 3.6955212645841176, "grad_norm": 0.253165602684021, "learning_rate": 9.085583906306623e-05, "loss": 0.2519, "step": 2462 }, { "epoch": 3.697026721866767, "grad_norm": 0.26090380549430847, "learning_rate": 9.084684515247913e-05, "loss": 0.2177, "step": 2463 }, { "epoch": 3.698532179149417, "grad_norm": 0.243301123380661, "learning_rate": 9.083784732173496e-05, "loss": 0.2241, "step": 2464 }, { "epoch": 3.700037636432066, "grad_norm": 0.2384059876203537, "learning_rate": 9.082884557181768e-05, "loss": 0.3117, "step": 2465 }, { "epoch": 3.701543093714716, "grad_norm": 0.29537805914878845, "learning_rate": 9.081983990371171e-05, "loss": 0.3175, "step": 2466 }, { "epoch": 3.7030485509973654, "grad_norm": 0.2563481628894806, "learning_rate": 9.08108303184019e-05, "loss": 0.2311, "step": 2467 }, { "epoch": 3.704554008280015, "grad_norm": 0.26385101675987244, "learning_rate": 9.080181681687354e-05, "loss": 0.308, "step": 2468 }, { "epoch": 3.7060594655626646, "grad_norm": 0.24886226654052734, "learning_rate": 9.079279940011232e-05, "loss": 0.2402, "step": 2469 }, { "epoch": 3.7075649228453145, "grad_norm": 0.2631031274795532, "learning_rate": 9.078377806910436e-05, "loss": 0.2475, "step": 2470 }, { "epoch": 3.709070380127964, "grad_norm": 0.2637470066547394, "learning_rate": 9.077475282483624e-05, "loss": 0.2208, "step": 2471 }, { "epoch": 3.7105758374106133, "grad_norm": 0.2180352807044983, "learning_rate": 9.076572366829493e-05, "loss": 0.3033, "step": 2472 }, { "epoch": 3.712081294693263, "grad_norm": 0.25879108905792236, "learning_rate": 9.075669060046785e-05, "loss": 0.2324, "step": 2473 }, { "epoch": 3.713586751975913, "grad_norm": 0.3026946783065796, "learning_rate": 9.074765362234286e-05, "loss": 0.2473, "step": 2474 }, { "epoch": 3.7150922092585623, "grad_norm": 0.34327423572540283, "learning_rate": 9.07386127349082e-05, "loss": 0.247, "step": 2475 }, { "epoch": 3.7165976665412117, "grad_norm": 0.3201792538166046, "learning_rate": 9.07295679391526e-05, "loss": 0.2089, "step": 2476 }, { "epoch": 3.7181031238238615, "grad_norm": 0.2229660153388977, "learning_rate": 9.072051923606515e-05, "loss": 0.2233, "step": 2477 }, { "epoch": 3.7196085811065114, "grad_norm": 0.22460316121578217, "learning_rate": 9.071146662663544e-05, "loss": 0.2164, "step": 2478 }, { "epoch": 3.7211140383891608, "grad_norm": 0.3118736743927002, "learning_rate": 9.070241011185343e-05, "loss": 0.2181, "step": 2479 }, { "epoch": 3.72261949567181, "grad_norm": 0.2935691177845001, "learning_rate": 9.069334969270952e-05, "loss": 0.2402, "step": 2480 }, { "epoch": 3.72412495295446, "grad_norm": 0.21931670606136322, "learning_rate": 9.068428537019454e-05, "loss": 0.2022, "step": 2481 }, { "epoch": 3.7256304102371094, "grad_norm": 0.2219967097043991, "learning_rate": 9.067521714529976e-05, "loss": 0.2028, "step": 2482 }, { "epoch": 3.7271358675197592, "grad_norm": 0.19425994157791138, "learning_rate": 9.06661450190169e-05, "loss": 0.2071, "step": 2483 }, { "epoch": 3.7286413248024086, "grad_norm": 0.2317718118429184, "learning_rate": 9.065706899233803e-05, "loss": 0.2029, "step": 2484 }, { "epoch": 3.7301467820850585, "grad_norm": 0.25592872500419617, "learning_rate": 9.06479890662557e-05, "loss": 0.2854, "step": 2485 }, { "epoch": 3.731652239367708, "grad_norm": 0.2718307375907898, "learning_rate": 9.063890524176288e-05, "loss": 0.2571, "step": 2486 }, { "epoch": 3.7331576966503577, "grad_norm": 0.20590956509113312, "learning_rate": 9.062981751985296e-05, "loss": 0.1569, "step": 2487 }, { "epoch": 3.734663153933007, "grad_norm": 0.23257674276828766, "learning_rate": 9.062072590151977e-05, "loss": 0.2924, "step": 2488 }, { "epoch": 3.736168611215657, "grad_norm": 0.2264743447303772, "learning_rate": 9.061163038775757e-05, "loss": 0.2804, "step": 2489 }, { "epoch": 3.7376740684983063, "grad_norm": 0.24960723519325256, "learning_rate": 9.060253097956099e-05, "loss": 0.2187, "step": 2490 }, { "epoch": 3.7391795257809557, "grad_norm": 0.18940657377243042, "learning_rate": 9.059342767792516e-05, "loss": 0.2005, "step": 2491 }, { "epoch": 3.7406849830636055, "grad_norm": 0.2251359075307846, "learning_rate": 9.058432048384558e-05, "loss": 0.338, "step": 2492 }, { "epoch": 3.7421904403462554, "grad_norm": 0.20414413511753082, "learning_rate": 9.057520939831824e-05, "loss": 0.2343, "step": 2493 }, { "epoch": 3.7436958976289048, "grad_norm": 0.23150566220283508, "learning_rate": 9.056609442233945e-05, "loss": 0.2039, "step": 2494 }, { "epoch": 3.745201354911554, "grad_norm": 0.21892328560352325, "learning_rate": 9.055697555690608e-05, "loss": 0.2751, "step": 2495 }, { "epoch": 3.746706812194204, "grad_norm": 0.20429515838623047, "learning_rate": 9.05478528030153e-05, "loss": 0.2662, "step": 2496 }, { "epoch": 3.748212269476854, "grad_norm": 0.21849025785923004, "learning_rate": 9.05387261616648e-05, "loss": 0.2149, "step": 2497 }, { "epoch": 3.749717726759503, "grad_norm": 0.2114565372467041, "learning_rate": 9.05295956338526e-05, "loss": 0.258, "step": 2498 }, { "epoch": 3.7512231840421526, "grad_norm": 0.19521421194076538, "learning_rate": 9.052046122057728e-05, "loss": 0.2211, "step": 2499 }, { "epoch": 3.7527286413248024, "grad_norm": 0.2018977254629135, "learning_rate": 9.051132292283771e-05, "loss": 0.2395, "step": 2500 }, { "epoch": 3.7542340986074523, "grad_norm": 0.21523688733577728, "learning_rate": 9.050218074163327e-05, "loss": 0.2365, "step": 2501 }, { "epoch": 3.7557395558901017, "grad_norm": 0.21863456070423126, "learning_rate": 9.049303467796371e-05, "loss": 0.2642, "step": 2502 }, { "epoch": 3.757245013172751, "grad_norm": 0.21972806751728058, "learning_rate": 9.048388473282924e-05, "loss": 0.2377, "step": 2503 }, { "epoch": 3.758750470455401, "grad_norm": 0.19176726043224335, "learning_rate": 9.047473090723049e-05, "loss": 0.3016, "step": 2504 }, { "epoch": 3.7602559277380503, "grad_norm": 0.2820799648761749, "learning_rate": 9.046557320216849e-05, "loss": 0.2838, "step": 2505 }, { "epoch": 3.7617613850207, "grad_norm": 0.25554734468460083, "learning_rate": 9.045641161864474e-05, "loss": 0.204, "step": 2506 }, { "epoch": 3.7632668423033495, "grad_norm": 0.24530784785747528, "learning_rate": 9.04472461576611e-05, "loss": 0.3489, "step": 2507 }, { "epoch": 3.7647722995859993, "grad_norm": 0.20009317994117737, "learning_rate": 9.043807682021993e-05, "loss": 0.2171, "step": 2508 }, { "epoch": 3.7662777568686487, "grad_norm": 0.24508115649223328, "learning_rate": 9.042890360732397e-05, "loss": 0.2694, "step": 2509 }, { "epoch": 3.7677832141512986, "grad_norm": 0.22399820387363434, "learning_rate": 9.041972651997637e-05, "loss": 0.205, "step": 2510 }, { "epoch": 3.769288671433948, "grad_norm": 0.2793135941028595, "learning_rate": 9.041054555918074e-05, "loss": 0.257, "step": 2511 }, { "epoch": 3.770794128716598, "grad_norm": 0.2839261591434479, "learning_rate": 9.040136072594107e-05, "loss": 0.2532, "step": 2512 }, { "epoch": 3.772299585999247, "grad_norm": 0.2244696319103241, "learning_rate": 9.039217202126182e-05, "loss": 0.1963, "step": 2513 }, { "epoch": 3.773805043281897, "grad_norm": 0.23658886551856995, "learning_rate": 9.038297944614785e-05, "loss": 0.2419, "step": 2514 }, { "epoch": 3.7753105005645464, "grad_norm": 0.228580042719841, "learning_rate": 9.037378300160446e-05, "loss": 0.2186, "step": 2515 }, { "epoch": 3.7768159578471963, "grad_norm": 0.2287733107805252, "learning_rate": 9.036458268863732e-05, "loss": 0.2262, "step": 2516 }, { "epoch": 3.7783214151298457, "grad_norm": 0.22565622627735138, "learning_rate": 9.035537850825261e-05, "loss": 0.2987, "step": 2517 }, { "epoch": 3.779826872412495, "grad_norm": 0.27345606684684753, "learning_rate": 9.034617046145683e-05, "loss": 0.2516, "step": 2518 }, { "epoch": 3.781332329695145, "grad_norm": 0.24360083043575287, "learning_rate": 9.033695854925703e-05, "loss": 0.2779, "step": 2519 }, { "epoch": 3.7828377869777947, "grad_norm": 0.23207171261310577, "learning_rate": 9.032774277266055e-05, "loss": 0.2232, "step": 2520 }, { "epoch": 3.784343244260444, "grad_norm": 0.2212066352367401, "learning_rate": 9.031852313267525e-05, "loss": 0.1953, "step": 2521 }, { "epoch": 3.7858487015430935, "grad_norm": 0.2252817451953888, "learning_rate": 9.030929963030933e-05, "loss": 0.2322, "step": 2522 }, { "epoch": 3.7873541588257433, "grad_norm": 0.2741125226020813, "learning_rate": 9.030007226657151e-05, "loss": 0.2282, "step": 2523 }, { "epoch": 3.788859616108393, "grad_norm": 0.23799727857112885, "learning_rate": 9.029084104247086e-05, "loss": 0.2581, "step": 2524 }, { "epoch": 3.7903650733910426, "grad_norm": 0.24499137699604034, "learning_rate": 9.028160595901689e-05, "loss": 0.2526, "step": 2525 }, { "epoch": 3.791870530673692, "grad_norm": 0.2244609296321869, "learning_rate": 9.027236701721953e-05, "loss": 0.2172, "step": 2526 }, { "epoch": 3.793375987956342, "grad_norm": 0.26282548904418945, "learning_rate": 9.026312421808916e-05, "loss": 0.2418, "step": 2527 }, { "epoch": 3.794881445238991, "grad_norm": 0.22334909439086914, "learning_rate": 9.025387756263654e-05, "loss": 0.2708, "step": 2528 }, { "epoch": 3.796386902521641, "grad_norm": 0.23068417608737946, "learning_rate": 9.024462705187287e-05, "loss": 0.2377, "step": 2529 }, { "epoch": 3.7978923598042904, "grad_norm": 0.20546114444732666, "learning_rate": 9.023537268680978e-05, "loss": 0.2439, "step": 2530 }, { "epoch": 3.7993978170869402, "grad_norm": 0.2112434208393097, "learning_rate": 9.022611446845929e-05, "loss": 0.2353, "step": 2531 }, { "epoch": 3.8009032743695896, "grad_norm": 0.20610584318637848, "learning_rate": 9.02168523978339e-05, "loss": 0.2265, "step": 2532 }, { "epoch": 3.8024087316522395, "grad_norm": 0.22001783549785614, "learning_rate": 9.020758647594646e-05, "loss": 0.2348, "step": 2533 }, { "epoch": 3.803914188934889, "grad_norm": 0.24286778271198273, "learning_rate": 9.019831670381032e-05, "loss": 0.3083, "step": 2534 }, { "epoch": 3.8054196462175387, "grad_norm": 0.22440175712108612, "learning_rate": 9.018904308243917e-05, "loss": 0.2619, "step": 2535 }, { "epoch": 3.806925103500188, "grad_norm": 0.2466883808374405, "learning_rate": 9.017976561284719e-05, "loss": 0.2327, "step": 2536 }, { "epoch": 3.808430560782838, "grad_norm": 0.2308112233877182, "learning_rate": 9.017048429604891e-05, "loss": 0.2761, "step": 2537 }, { "epoch": 3.8099360180654873, "grad_norm": 0.2791782319545746, "learning_rate": 9.016119913305939e-05, "loss": 0.3154, "step": 2538 }, { "epoch": 3.811441475348137, "grad_norm": 0.2961975038051605, "learning_rate": 9.015191012489396e-05, "loss": 0.177, "step": 2539 }, { "epoch": 3.8129469326307865, "grad_norm": 0.29661503434181213, "learning_rate": 9.014261727256849e-05, "loss": 0.2385, "step": 2540 }, { "epoch": 3.814452389913436, "grad_norm": 0.23387178778648376, "learning_rate": 9.013332057709924e-05, "loss": 0.1916, "step": 2541 }, { "epoch": 3.8159578471960858, "grad_norm": 0.22659169137477875, "learning_rate": 9.012402003950286e-05, "loss": 0.2327, "step": 2542 }, { "epoch": 3.8174633044787356, "grad_norm": 0.20759527385234833, "learning_rate": 9.011471566079648e-05, "loss": 0.2126, "step": 2543 }, { "epoch": 3.818968761761385, "grad_norm": 0.19535690546035767, "learning_rate": 9.010540744199759e-05, "loss": 0.2257, "step": 2544 }, { "epoch": 3.8204742190440344, "grad_norm": 0.2159937173128128, "learning_rate": 9.00960953841241e-05, "loss": 0.2274, "step": 2545 }, { "epoch": 3.8219796763266842, "grad_norm": 0.23001006245613098, "learning_rate": 9.00867794881944e-05, "loss": 0.3114, "step": 2546 }, { "epoch": 3.823485133609334, "grad_norm": 0.22288405895233154, "learning_rate": 9.007745975522723e-05, "loss": 0.2209, "step": 2547 }, { "epoch": 3.8249905908919835, "grad_norm": 0.19204328954219818, "learning_rate": 9.006813618624181e-05, "loss": 0.2048, "step": 2548 }, { "epoch": 3.826496048174633, "grad_norm": 0.17672166228294373, "learning_rate": 9.005880878225774e-05, "loss": 0.1723, "step": 2549 }, { "epoch": 3.8280015054572827, "grad_norm": 0.1971510499715805, "learning_rate": 9.004947754429507e-05, "loss": 0.2474, "step": 2550 }, { "epoch": 3.8295069627399325, "grad_norm": 0.20752061903476715, "learning_rate": 9.004014247337422e-05, "loss": 0.2333, "step": 2551 }, { "epoch": 3.831012420022582, "grad_norm": 0.21194815635681152, "learning_rate": 9.003080357051607e-05, "loss": 0.1793, "step": 2552 }, { "epoch": 3.8325178773052313, "grad_norm": 0.19489121437072754, "learning_rate": 9.002146083674189e-05, "loss": 0.291, "step": 2553 }, { "epoch": 3.834023334587881, "grad_norm": 0.2064177542924881, "learning_rate": 9.001211427307343e-05, "loss": 0.2424, "step": 2554 }, { "epoch": 3.8355287918705305, "grad_norm": 0.1879769116640091, "learning_rate": 9.000276388053279e-05, "loss": 0.2875, "step": 2555 }, { "epoch": 3.8370342491531804, "grad_norm": 0.19241654872894287, "learning_rate": 8.999340966014251e-05, "loss": 0.2399, "step": 2556 }, { "epoch": 3.8385397064358298, "grad_norm": 0.17926965653896332, "learning_rate": 8.998405161292557e-05, "loss": 0.1645, "step": 2557 }, { "epoch": 3.8400451637184796, "grad_norm": 0.19164101779460907, "learning_rate": 8.997468973990534e-05, "loss": 0.2811, "step": 2558 }, { "epoch": 3.841550621001129, "grad_norm": 0.19837048649787903, "learning_rate": 8.996532404210562e-05, "loss": 0.235, "step": 2559 }, { "epoch": 3.843056078283779, "grad_norm": 0.19081202149391174, "learning_rate": 8.995595452055063e-05, "loss": 0.2047, "step": 2560 }, { "epoch": 3.844561535566428, "grad_norm": 0.19086527824401855, "learning_rate": 8.994658117626503e-05, "loss": 0.2037, "step": 2561 }, { "epoch": 3.846066992849078, "grad_norm": 0.2171124368906021, "learning_rate": 8.993720401027383e-05, "loss": 0.2401, "step": 2562 }, { "epoch": 3.8475724501317274, "grad_norm": 0.2272588163614273, "learning_rate": 8.992782302360253e-05, "loss": 0.2339, "step": 2563 }, { "epoch": 3.8490779074143773, "grad_norm": 0.1794801652431488, "learning_rate": 8.991843821727703e-05, "loss": 0.2304, "step": 2564 }, { "epoch": 3.8505833646970267, "grad_norm": 0.21115709841251373, "learning_rate": 8.990904959232362e-05, "loss": 0.2953, "step": 2565 }, { "epoch": 3.8520888219796765, "grad_norm": 0.17643369734287262, "learning_rate": 8.989965714976902e-05, "loss": 0.2314, "step": 2566 }, { "epoch": 3.853594279262326, "grad_norm": 0.2014954686164856, "learning_rate": 8.989026089064041e-05, "loss": 0.1983, "step": 2567 }, { "epoch": 3.8550997365449753, "grad_norm": 0.18793179094791412, "learning_rate": 8.98808608159653e-05, "loss": 0.178, "step": 2568 }, { "epoch": 3.856605193827625, "grad_norm": 0.1760825216770172, "learning_rate": 8.987145692677171e-05, "loss": 0.1714, "step": 2569 }, { "epoch": 3.858110651110275, "grad_norm": 0.24124319851398468, "learning_rate": 8.986204922408801e-05, "loss": 0.3106, "step": 2570 }, { "epoch": 3.8596161083929243, "grad_norm": 0.21572449803352356, "learning_rate": 8.985263770894302e-05, "loss": 0.2195, "step": 2571 }, { "epoch": 3.8611215656755737, "grad_norm": 0.19488020241260529, "learning_rate": 8.984322238236598e-05, "loss": 0.2411, "step": 2572 }, { "epoch": 3.8626270229582236, "grad_norm": 0.2010011523962021, "learning_rate": 8.983380324538652e-05, "loss": 0.2093, "step": 2573 }, { "epoch": 3.8641324802408734, "grad_norm": 0.1918937861919403, "learning_rate": 8.982438029903471e-05, "loss": 0.2539, "step": 2574 }, { "epoch": 3.865637937523523, "grad_norm": 0.211927130818367, "learning_rate": 8.981495354434103e-05, "loss": 0.2631, "step": 2575 }, { "epoch": 3.867143394806172, "grad_norm": 0.22594419121742249, "learning_rate": 8.980552298233638e-05, "loss": 0.227, "step": 2576 }, { "epoch": 3.868648852088822, "grad_norm": 0.21631291508674622, "learning_rate": 8.979608861405206e-05, "loss": 0.1699, "step": 2577 }, { "epoch": 3.8701543093714714, "grad_norm": 0.22302784025669098, "learning_rate": 8.97866504405198e-05, "loss": 0.1931, "step": 2578 }, { "epoch": 3.8716597666541213, "grad_norm": 0.2683633863925934, "learning_rate": 8.977720846277175e-05, "loss": 0.2517, "step": 2579 }, { "epoch": 3.8731652239367707, "grad_norm": 0.2490830272436142, "learning_rate": 8.976776268184046e-05, "loss": 0.2567, "step": 2580 }, { "epoch": 3.8746706812194205, "grad_norm": 0.23242713510990143, "learning_rate": 8.975831309875893e-05, "loss": 0.1946, "step": 2581 }, { "epoch": 3.87617613850207, "grad_norm": 0.25940626859664917, "learning_rate": 8.974885971456052e-05, "loss": 0.2325, "step": 2582 }, { "epoch": 3.8776815957847197, "grad_norm": 0.22156129777431488, "learning_rate": 8.973940253027908e-05, "loss": 0.2469, "step": 2583 }, { "epoch": 3.879187053067369, "grad_norm": 0.23077180981636047, "learning_rate": 8.972994154694881e-05, "loss": 0.269, "step": 2584 }, { "epoch": 3.880692510350019, "grad_norm": 0.214949369430542, "learning_rate": 8.972047676560433e-05, "loss": 0.2411, "step": 2585 }, { "epoch": 3.8821979676326683, "grad_norm": 0.20024323463439941, "learning_rate": 8.971100818728072e-05, "loss": 0.2286, "step": 2586 }, { "epoch": 3.883703424915318, "grad_norm": 0.3021923899650574, "learning_rate": 8.970153581301344e-05, "loss": 0.2234, "step": 2587 }, { "epoch": 3.8852088821979676, "grad_norm": 0.2788175046443939, "learning_rate": 8.969205964383839e-05, "loss": 0.3593, "step": 2588 }, { "epoch": 3.8867143394806174, "grad_norm": 0.25186803936958313, "learning_rate": 8.968257968079184e-05, "loss": 0.225, "step": 2589 }, { "epoch": 3.888219796763267, "grad_norm": 0.2414349913597107, "learning_rate": 8.967309592491052e-05, "loss": 0.2108, "step": 2590 }, { "epoch": 3.889725254045916, "grad_norm": 0.24578924477100372, "learning_rate": 8.966360837723157e-05, "loss": 0.1724, "step": 2591 }, { "epoch": 3.891230711328566, "grad_norm": 0.21999205648899078, "learning_rate": 8.965411703879251e-05, "loss": 0.1986, "step": 2592 }, { "epoch": 3.892736168611216, "grad_norm": 0.27215293049812317, "learning_rate": 8.964462191063132e-05, "loss": 0.1559, "step": 2593 }, { "epoch": 3.8942416258938652, "grad_norm": 0.28607088327407837, "learning_rate": 8.963512299378636e-05, "loss": 0.2862, "step": 2594 }, { "epoch": 3.8957470831765146, "grad_norm": 0.2586335837841034, "learning_rate": 8.962562028929645e-05, "loss": 0.2466, "step": 2595 }, { "epoch": 3.8972525404591645, "grad_norm": 0.22391396760940552, "learning_rate": 8.961611379820072e-05, "loss": 0.2282, "step": 2596 }, { "epoch": 3.8987579977418143, "grad_norm": 0.2760674059391022, "learning_rate": 8.960660352153885e-05, "loss": 0.2145, "step": 2597 }, { "epoch": 3.9002634550244637, "grad_norm": 0.26900938153266907, "learning_rate": 8.959708946035087e-05, "loss": 0.2705, "step": 2598 }, { "epoch": 3.901768912307113, "grad_norm": 0.22425170242786407, "learning_rate": 8.958757161567716e-05, "loss": 0.2242, "step": 2599 }, { "epoch": 3.903274369589763, "grad_norm": 0.2481439858675003, "learning_rate": 8.957804998855866e-05, "loss": 0.2731, "step": 2600 }, { "epoch": 3.903274369589763, "eval_loss": 0.24935732781887054, "eval_runtime": 541.1505, "eval_samples_per_second": 17.79, "eval_steps_per_second": 0.556, "step": 2600 }, { "epoch": 3.9047798268724128, "grad_norm": 0.3116472363471985, "learning_rate": 8.956852458003659e-05, "loss": 0.2445, "step": 2601 }, { "epoch": 3.906285284155062, "grad_norm": 0.33569714426994324, "learning_rate": 8.955899539115264e-05, "loss": 0.2426, "step": 2602 }, { "epoch": 3.9077907414377115, "grad_norm": 0.2482239305973053, "learning_rate": 8.954946242294891e-05, "loss": 0.2572, "step": 2603 }, { "epoch": 3.9092961987203614, "grad_norm": 0.20993918180465698, "learning_rate": 8.953992567646792e-05, "loss": 0.222, "step": 2604 }, { "epoch": 3.9108016560030108, "grad_norm": 0.24962006509304047, "learning_rate": 8.953038515275258e-05, "loss": 0.18, "step": 2605 }, { "epoch": 3.9123071132856606, "grad_norm": 0.22597786784172058, "learning_rate": 8.952084085284622e-05, "loss": 0.2532, "step": 2606 }, { "epoch": 3.91381257056831, "grad_norm": 0.1960928738117218, "learning_rate": 8.951129277779263e-05, "loss": 0.2029, "step": 2607 }, { "epoch": 3.91531802785096, "grad_norm": 0.2516019940376282, "learning_rate": 8.950174092863596e-05, "loss": 0.2031, "step": 2608 }, { "epoch": 3.9168234851336092, "grad_norm": 0.21801748871803284, "learning_rate": 8.949218530642075e-05, "loss": 0.2606, "step": 2609 }, { "epoch": 3.918328942416259, "grad_norm": 0.21399681270122528, "learning_rate": 8.948262591219203e-05, "loss": 0.1722, "step": 2610 }, { "epoch": 3.9198343996989085, "grad_norm": 0.20963166654109955, "learning_rate": 8.947306274699516e-05, "loss": 0.161, "step": 2611 }, { "epoch": 3.9213398569815583, "grad_norm": 0.21257001161575317, "learning_rate": 8.946349581187599e-05, "loss": 0.2145, "step": 2612 }, { "epoch": 3.9228453142642077, "grad_norm": 0.21650531888008118, "learning_rate": 8.945392510788075e-05, "loss": 0.2188, "step": 2613 }, { "epoch": 3.9243507715468575, "grad_norm": 0.17235924303531647, "learning_rate": 8.944435063605604e-05, "loss": 0.2332, "step": 2614 }, { "epoch": 3.925856228829507, "grad_norm": 0.21935121715068817, "learning_rate": 8.943477239744892e-05, "loss": 0.235, "step": 2615 }, { "epoch": 3.9273616861121567, "grad_norm": 0.17776541411876678, "learning_rate": 8.94251903931069e-05, "loss": 0.1698, "step": 2616 }, { "epoch": 3.928867143394806, "grad_norm": 0.20738331973552704, "learning_rate": 8.941560462407778e-05, "loss": 0.256, "step": 2617 }, { "epoch": 3.9303726006774555, "grad_norm": 0.21276316046714783, "learning_rate": 8.940601509140991e-05, "loss": 0.2468, "step": 2618 }, { "epoch": 3.9318780579601054, "grad_norm": 0.5534018874168396, "learning_rate": 8.939642179615194e-05, "loss": 0.2208, "step": 2619 }, { "epoch": 3.933383515242755, "grad_norm": 0.39186134934425354, "learning_rate": 8.9386824739353e-05, "loss": 0.1876, "step": 2620 }, { "epoch": 3.9348889725254046, "grad_norm": 0.20762744545936584, "learning_rate": 8.937722392206261e-05, "loss": 0.2446, "step": 2621 }, { "epoch": 3.936394429808054, "grad_norm": 0.22030818462371826, "learning_rate": 8.93676193453307e-05, "loss": 0.2218, "step": 2622 }, { "epoch": 3.937899887090704, "grad_norm": 0.23639872670173645, "learning_rate": 8.93580110102076e-05, "loss": 0.2216, "step": 2623 }, { "epoch": 3.9394053443733537, "grad_norm": 0.21274472773075104, "learning_rate": 8.934839891774408e-05, "loss": 0.2732, "step": 2624 }, { "epoch": 3.940910801656003, "grad_norm": 0.180352121591568, "learning_rate": 8.93387830689913e-05, "loss": 0.2175, "step": 2625 }, { "epoch": 3.9424162589386524, "grad_norm": 0.19176195561885834, "learning_rate": 8.932916346500082e-05, "loss": 0.1683, "step": 2626 }, { "epoch": 3.9439217162213023, "grad_norm": 0.18878008425235748, "learning_rate": 8.931954010682464e-05, "loss": 0.2491, "step": 2627 }, { "epoch": 3.9454271735039517, "grad_norm": 0.1961105763912201, "learning_rate": 8.930991299551515e-05, "loss": 0.2234, "step": 2628 }, { "epoch": 3.9469326307866015, "grad_norm": 0.18719978630542755, "learning_rate": 8.930028213212517e-05, "loss": 0.2531, "step": 2629 }, { "epoch": 3.948438088069251, "grad_norm": 0.19737806916236877, "learning_rate": 8.929064751770789e-05, "loss": 0.2419, "step": 2630 }, { "epoch": 3.9499435453519007, "grad_norm": 0.1999458223581314, "learning_rate": 8.928100915331698e-05, "loss": 0.1997, "step": 2631 }, { "epoch": 3.95144900263455, "grad_norm": 0.1995745301246643, "learning_rate": 8.927136704000643e-05, "loss": 0.2384, "step": 2632 }, { "epoch": 3.9529544599172, "grad_norm": 0.1803729087114334, "learning_rate": 8.926172117883071e-05, "loss": 0.2375, "step": 2633 }, { "epoch": 3.9544599171998494, "grad_norm": 0.17044882476329803, "learning_rate": 8.925207157084466e-05, "loss": 0.1966, "step": 2634 }, { "epoch": 3.955965374482499, "grad_norm": 0.21615274250507355, "learning_rate": 8.924241821710358e-05, "loss": 0.2535, "step": 2635 }, { "epoch": 3.9574708317651486, "grad_norm": 0.20490120351314545, "learning_rate": 8.923276111866312e-05, "loss": 0.2473, "step": 2636 }, { "epoch": 3.9589762890477984, "grad_norm": 0.1844978779554367, "learning_rate": 8.922310027657937e-05, "loss": 0.159, "step": 2637 }, { "epoch": 3.960481746330448, "grad_norm": 0.20069366693496704, "learning_rate": 8.921343569190884e-05, "loss": 0.1669, "step": 2638 }, { "epoch": 3.9619872036130976, "grad_norm": 0.22136624157428741, "learning_rate": 8.920376736570839e-05, "loss": 0.2585, "step": 2639 }, { "epoch": 3.963492660895747, "grad_norm": 0.24107734858989716, "learning_rate": 8.91940952990354e-05, "loss": 0.2387, "step": 2640 }, { "epoch": 3.9649981181783964, "grad_norm": 0.2286645472049713, "learning_rate": 8.918441949294752e-05, "loss": 0.2632, "step": 2641 }, { "epoch": 3.9665035754610463, "grad_norm": 0.21570859849452972, "learning_rate": 8.917473994850295e-05, "loss": 0.22, "step": 2642 }, { "epoch": 3.968009032743696, "grad_norm": 0.23246878385543823, "learning_rate": 8.91650566667602e-05, "loss": 0.2144, "step": 2643 }, { "epoch": 3.9695144900263455, "grad_norm": 0.22824488580226898, "learning_rate": 8.91553696487782e-05, "loss": 0.1915, "step": 2644 }, { "epoch": 3.971019947308995, "grad_norm": 0.218861922621727, "learning_rate": 8.914567889561636e-05, "loss": 0.2042, "step": 2645 }, { "epoch": 3.9725254045916447, "grad_norm": 0.18152187764644623, "learning_rate": 8.913598440833438e-05, "loss": 0.1984, "step": 2646 }, { "epoch": 3.9740308618742946, "grad_norm": 0.19670970737934113, "learning_rate": 8.91262861879925e-05, "loss": 0.248, "step": 2647 }, { "epoch": 3.975536319156944, "grad_norm": 0.21129253506660461, "learning_rate": 8.911658423565125e-05, "loss": 0.2705, "step": 2648 }, { "epoch": 3.9770417764395933, "grad_norm": 0.2121150940656662, "learning_rate": 8.910687855237164e-05, "loss": 0.1594, "step": 2649 }, { "epoch": 3.978547233722243, "grad_norm": 0.20443111658096313, "learning_rate": 8.909716913921508e-05, "loss": 0.26, "step": 2650 }, { "epoch": 3.980052691004893, "grad_norm": 0.2029525637626648, "learning_rate": 8.908745599724335e-05, "loss": 0.2096, "step": 2651 }, { "epoch": 3.9815581482875424, "grad_norm": 0.19130145013332367, "learning_rate": 8.90777391275187e-05, "loss": 0.19, "step": 2652 }, { "epoch": 3.983063605570192, "grad_norm": 0.21155308187007904, "learning_rate": 8.906801853110373e-05, "loss": 0.24, "step": 2653 }, { "epoch": 3.9845690628528416, "grad_norm": 0.24413460493087769, "learning_rate": 8.905829420906145e-05, "loss": 0.2564, "step": 2654 }, { "epoch": 3.986074520135491, "grad_norm": 0.23777168989181519, "learning_rate": 8.904856616245534e-05, "loss": 0.2136, "step": 2655 }, { "epoch": 3.987579977418141, "grad_norm": 0.22174957394599915, "learning_rate": 8.903883439234924e-05, "loss": 0.222, "step": 2656 }, { "epoch": 3.9890854347007902, "grad_norm": 0.22809618711471558, "learning_rate": 8.902909889980737e-05, "loss": 0.2152, "step": 2657 }, { "epoch": 3.99059089198344, "grad_norm": 0.24964728951454163, "learning_rate": 8.901935968589443e-05, "loss": 0.2356, "step": 2658 }, { "epoch": 3.9920963492660895, "grad_norm": 0.2296449840068817, "learning_rate": 8.900961675167543e-05, "loss": 0.215, "step": 2659 }, { "epoch": 3.9936018065487393, "grad_norm": 0.19847434759140015, "learning_rate": 8.899987009821588e-05, "loss": 0.2197, "step": 2660 }, { "epoch": 3.9951072638313887, "grad_norm": 0.23093271255493164, "learning_rate": 8.899011972658166e-05, "loss": 0.2195, "step": 2661 }, { "epoch": 3.9966127211140385, "grad_norm": 0.1993289291858673, "learning_rate": 8.898036563783906e-05, "loss": 0.2015, "step": 2662 }, { "epoch": 3.998118178396688, "grad_norm": 0.20724251866340637, "learning_rate": 8.897060783305476e-05, "loss": 0.2525, "step": 2663 }, { "epoch": 3.9996236356793378, "grad_norm": 0.27835121750831604, "learning_rate": 8.896084631329584e-05, "loss": 0.3188, "step": 2664 }, { "epoch": 4.001129092961987, "grad_norm": 0.22121590375900269, "learning_rate": 8.895108107962985e-05, "loss": 0.2693, "step": 2665 }, { "epoch": 4.002634550244637, "grad_norm": 0.2303275614976883, "learning_rate": 8.894131213312467e-05, "loss": 0.2299, "step": 2666 }, { "epoch": 4.004140007527287, "grad_norm": 0.25738129019737244, "learning_rate": 8.893153947484863e-05, "loss": 0.2643, "step": 2667 }, { "epoch": 4.005645464809936, "grad_norm": 0.23314355313777924, "learning_rate": 8.892176310587044e-05, "loss": 0.2048, "step": 2668 }, { "epoch": 4.007150922092586, "grad_norm": 0.19431562721729279, "learning_rate": 8.891198302725925e-05, "loss": 0.1866, "step": 2669 }, { "epoch": 4.0086563793752354, "grad_norm": 0.2078378051519394, "learning_rate": 8.890219924008456e-05, "loss": 0.159, "step": 2670 }, { "epoch": 4.010161836657884, "grad_norm": 0.25362735986709595, "learning_rate": 8.889241174541636e-05, "loss": 0.1915, "step": 2671 }, { "epoch": 4.011667293940534, "grad_norm": 0.1992723047733307, "learning_rate": 8.888262054432496e-05, "loss": 0.2401, "step": 2672 }, { "epoch": 4.013172751223184, "grad_norm": 0.2133617401123047, "learning_rate": 8.88728256378811e-05, "loss": 0.2314, "step": 2673 }, { "epoch": 4.014678208505834, "grad_norm": 0.18629099428653717, "learning_rate": 8.886302702715598e-05, "loss": 0.2115, "step": 2674 }, { "epoch": 4.016183665788483, "grad_norm": 0.2308099865913391, "learning_rate": 8.885322471322112e-05, "loss": 0.2343, "step": 2675 }, { "epoch": 4.017689123071133, "grad_norm": 0.20399737358093262, "learning_rate": 8.88434186971485e-05, "loss": 0.2574, "step": 2676 }, { "epoch": 4.0191945803537825, "grad_norm": 0.20653481781482697, "learning_rate": 8.883360898001051e-05, "loss": 0.1888, "step": 2677 }, { "epoch": 4.020700037636432, "grad_norm": 0.1679053157567978, "learning_rate": 8.88237955628799e-05, "loss": 0.1725, "step": 2678 }, { "epoch": 4.022205494919081, "grad_norm": 0.20669300854206085, "learning_rate": 8.881397844682986e-05, "loss": 0.3019, "step": 2679 }, { "epoch": 4.023710952201731, "grad_norm": 0.20395563542842865, "learning_rate": 8.880415763293398e-05, "loss": 0.2131, "step": 2680 }, { "epoch": 4.025216409484381, "grad_norm": 0.22672021389007568, "learning_rate": 8.879433312226625e-05, "loss": 0.2345, "step": 2681 }, { "epoch": 4.026721866767031, "grad_norm": 0.2131263017654419, "learning_rate": 8.878450491590105e-05, "loss": 0.2445, "step": 2682 }, { "epoch": 4.02822732404968, "grad_norm": 0.1819937527179718, "learning_rate": 8.877467301491318e-05, "loss": 0.1889, "step": 2683 }, { "epoch": 4.02973278133233, "grad_norm": 0.19153432548046112, "learning_rate": 8.876483742037785e-05, "loss": 0.2022, "step": 2684 }, { "epoch": 4.031238238614979, "grad_norm": 0.2328711748123169, "learning_rate": 8.875499813337069e-05, "loss": 0.2363, "step": 2685 }, { "epoch": 4.032743695897629, "grad_norm": 0.2560531497001648, "learning_rate": 8.874515515496767e-05, "loss": 0.1926, "step": 2686 }, { "epoch": 4.034249153180278, "grad_norm": 0.3048231601715088, "learning_rate": 8.873530848624521e-05, "loss": 0.2127, "step": 2687 }, { "epoch": 4.035754610462928, "grad_norm": 0.3358861804008484, "learning_rate": 8.872545812828013e-05, "loss": 0.2026, "step": 2688 }, { "epoch": 4.037260067745578, "grad_norm": 0.2976076006889343, "learning_rate": 8.871560408214967e-05, "loss": 0.2797, "step": 2689 }, { "epoch": 4.038765525028228, "grad_norm": 0.21699094772338867, "learning_rate": 8.870574634893143e-05, "loss": 0.2099, "step": 2690 }, { "epoch": 4.040270982310877, "grad_norm": 0.24257102608680725, "learning_rate": 8.869588492970344e-05, "loss": 0.1948, "step": 2691 }, { "epoch": 4.0417764395935265, "grad_norm": 0.26527902483940125, "learning_rate": 8.868601982554413e-05, "loss": 0.2526, "step": 2692 }, { "epoch": 4.043281896876176, "grad_norm": 0.202656552195549, "learning_rate": 8.867615103753236e-05, "loss": 0.1821, "step": 2693 }, { "epoch": 4.044787354158826, "grad_norm": 0.2327696830034256, "learning_rate": 8.866627856674731e-05, "loss": 0.2341, "step": 2694 }, { "epoch": 4.046292811441475, "grad_norm": 0.24058781564235687, "learning_rate": 8.86564024142687e-05, "loss": 0.2014, "step": 2695 }, { "epoch": 4.047798268724125, "grad_norm": 0.23191601037979126, "learning_rate": 8.86465225811765e-05, "loss": 0.2338, "step": 2696 }, { "epoch": 4.049303726006775, "grad_norm": 0.24628575146198273, "learning_rate": 8.863663906855117e-05, "loss": 0.1799, "step": 2697 }, { "epoch": 4.050809183289424, "grad_norm": 0.2576132118701935, "learning_rate": 8.862675187747356e-05, "loss": 0.2733, "step": 2698 }, { "epoch": 4.052314640572074, "grad_norm": 0.2738482356071472, "learning_rate": 8.861686100902495e-05, "loss": 0.2586, "step": 2699 }, { "epoch": 4.053820097854723, "grad_norm": 0.22510217130184174, "learning_rate": 8.860696646428693e-05, "loss": 0.2118, "step": 2700 }, { "epoch": 4.055325555137373, "grad_norm": 0.20970791578292847, "learning_rate": 8.85970682443416e-05, "loss": 0.2176, "step": 2701 }, { "epoch": 4.056831012420022, "grad_norm": 0.25661996006965637, "learning_rate": 8.858716635027139e-05, "loss": 0.1834, "step": 2702 }, { "epoch": 4.058336469702672, "grad_norm": 0.24595408141613007, "learning_rate": 8.857726078315918e-05, "loss": 0.1895, "step": 2703 }, { "epoch": 4.059841926985322, "grad_norm": 0.21520452201366425, "learning_rate": 8.85673515440882e-05, "loss": 0.2173, "step": 2704 }, { "epoch": 4.061347384267972, "grad_norm": 0.22178715467453003, "learning_rate": 8.855743863414214e-05, "loss": 0.2323, "step": 2705 }, { "epoch": 4.062852841550621, "grad_norm": 0.19610169529914856, "learning_rate": 8.854752205440501e-05, "loss": 0.2173, "step": 2706 }, { "epoch": 4.0643582988332705, "grad_norm": 0.18256628513336182, "learning_rate": 8.853760180596134e-05, "loss": 0.1921, "step": 2707 }, { "epoch": 4.06586375611592, "grad_norm": 0.2198975831270218, "learning_rate": 8.852767788989594e-05, "loss": 0.2448, "step": 2708 }, { "epoch": 4.06736921339857, "grad_norm": 0.18200254440307617, "learning_rate": 8.851775030729411e-05, "loss": 0.195, "step": 2709 }, { "epoch": 4.068874670681219, "grad_norm": 0.22348785400390625, "learning_rate": 8.85078190592415e-05, "loss": 0.2189, "step": 2710 }, { "epoch": 4.070380127963869, "grad_norm": 0.21850018203258514, "learning_rate": 8.849788414682416e-05, "loss": 0.2407, "step": 2711 }, { "epoch": 4.071885585246519, "grad_norm": 0.19786790013313293, "learning_rate": 8.848794557112857e-05, "loss": 0.2709, "step": 2712 }, { "epoch": 4.073391042529169, "grad_norm": 0.19164007902145386, "learning_rate": 8.847800333324162e-05, "loss": 0.2016, "step": 2713 }, { "epoch": 4.074896499811818, "grad_norm": 0.19480003416538239, "learning_rate": 8.846805743425055e-05, "loss": 0.2804, "step": 2714 }, { "epoch": 4.076401957094467, "grad_norm": 0.1958816945552826, "learning_rate": 8.845810787524304e-05, "loss": 0.1678, "step": 2715 }, { "epoch": 4.077907414377117, "grad_norm": 0.19604083895683289, "learning_rate": 8.844815465730716e-05, "loss": 0.2059, "step": 2716 }, { "epoch": 4.079412871659767, "grad_norm": 0.17135240137577057, "learning_rate": 8.843819778153137e-05, "loss": 0.2178, "step": 2717 }, { "epoch": 4.080918328942416, "grad_norm": 0.2011253535747528, "learning_rate": 8.842823724900453e-05, "loss": 0.2322, "step": 2718 }, { "epoch": 4.082423786225066, "grad_norm": 0.23345261812210083, "learning_rate": 8.841827306081595e-05, "loss": 0.2175, "step": 2719 }, { "epoch": 4.083929243507716, "grad_norm": 0.26557233929634094, "learning_rate": 8.840830521805525e-05, "loss": 0.2218, "step": 2720 }, { "epoch": 4.085434700790365, "grad_norm": 0.25532618165016174, "learning_rate": 8.839833372181254e-05, "loss": 0.2438, "step": 2721 }, { "epoch": 4.0869401580730145, "grad_norm": 0.20371879637241364, "learning_rate": 8.838835857317825e-05, "loss": 0.2209, "step": 2722 }, { "epoch": 4.088445615355664, "grad_norm": 0.20543085038661957, "learning_rate": 8.837837977324328e-05, "loss": 0.1727, "step": 2723 }, { "epoch": 4.089951072638314, "grad_norm": 0.26026809215545654, "learning_rate": 8.836839732309887e-05, "loss": 0.2307, "step": 2724 }, { "epoch": 4.091456529920963, "grad_norm": 0.20358024537563324, "learning_rate": 8.83584112238367e-05, "loss": 0.2153, "step": 2725 }, { "epoch": 4.092961987203613, "grad_norm": 0.16862739622592926, "learning_rate": 8.834842147654883e-05, "loss": 0.1754, "step": 2726 }, { "epoch": 4.094467444486263, "grad_norm": 0.23010730743408203, "learning_rate": 8.833842808232773e-05, "loss": 0.2284, "step": 2727 }, { "epoch": 4.095972901768913, "grad_norm": 0.25758689641952515, "learning_rate": 8.832843104226625e-05, "loss": 0.2964, "step": 2728 }, { "epoch": 4.0974783590515615, "grad_norm": 0.24872715771198273, "learning_rate": 8.831843035745765e-05, "loss": 0.2875, "step": 2729 }, { "epoch": 4.098983816334211, "grad_norm": 0.20517683029174805, "learning_rate": 8.830842602899563e-05, "loss": 0.1596, "step": 2730 }, { "epoch": 4.100489273616861, "grad_norm": 0.22950759530067444, "learning_rate": 8.82984180579742e-05, "loss": 0.2672, "step": 2731 }, { "epoch": 4.101994730899511, "grad_norm": 0.2224380373954773, "learning_rate": 8.828840644548784e-05, "loss": 0.3095, "step": 2732 }, { "epoch": 4.10350018818216, "grad_norm": 0.21887889504432678, "learning_rate": 8.82783911926314e-05, "loss": 0.2235, "step": 2733 }, { "epoch": 4.10500564546481, "grad_norm": 0.22940893471240997, "learning_rate": 8.826837230050014e-05, "loss": 0.1869, "step": 2734 }, { "epoch": 4.10651110274746, "grad_norm": 0.207151859998703, "learning_rate": 8.825834977018968e-05, "loss": 0.2335, "step": 2735 }, { "epoch": 4.1080165600301095, "grad_norm": 0.24850021302700043, "learning_rate": 8.824832360279612e-05, "loss": 0.2502, "step": 2736 }, { "epoch": 4.1095220173127585, "grad_norm": 0.244556725025177, "learning_rate": 8.823829379941586e-05, "loss": 0.2799, "step": 2737 }, { "epoch": 4.111027474595408, "grad_norm": 0.21283917129039764, "learning_rate": 8.822826036114577e-05, "loss": 0.2329, "step": 2738 }, { "epoch": 4.112532931878058, "grad_norm": 0.19060036540031433, "learning_rate": 8.821822328908308e-05, "loss": 0.2007, "step": 2739 }, { "epoch": 4.114038389160708, "grad_norm": 0.17279846966266632, "learning_rate": 8.820818258432543e-05, "loss": 0.1705, "step": 2740 }, { "epoch": 4.115543846443357, "grad_norm": 0.1989525854587555, "learning_rate": 8.819813824797088e-05, "loss": 0.1871, "step": 2741 }, { "epoch": 4.117049303726007, "grad_norm": 0.22337159514427185, "learning_rate": 8.818809028111783e-05, "loss": 0.2431, "step": 2742 }, { "epoch": 4.118554761008657, "grad_norm": 0.23791033029556274, "learning_rate": 8.817803868486512e-05, "loss": 0.2908, "step": 2743 }, { "epoch": 4.120060218291306, "grad_norm": 0.21523727476596832, "learning_rate": 8.816798346031199e-05, "loss": 0.2298, "step": 2744 }, { "epoch": 4.121565675573955, "grad_norm": 0.19422776997089386, "learning_rate": 8.815792460855806e-05, "loss": 0.2005, "step": 2745 }, { "epoch": 4.123071132856605, "grad_norm": 0.19336199760437012, "learning_rate": 8.814786213070334e-05, "loss": 0.2803, "step": 2746 }, { "epoch": 4.124576590139255, "grad_norm": 0.21009941399097443, "learning_rate": 8.813779602784825e-05, "loss": 0.1434, "step": 2747 }, { "epoch": 4.126082047421904, "grad_norm": 0.2210082709789276, "learning_rate": 8.812772630109363e-05, "loss": 0.2188, "step": 2748 }, { "epoch": 4.127587504704554, "grad_norm": 0.1862749606370926, "learning_rate": 8.811765295154064e-05, "loss": 0.2246, "step": 2749 }, { "epoch": 4.129092961987204, "grad_norm": 0.18672747910022736, "learning_rate": 8.810757598029093e-05, "loss": 0.2121, "step": 2750 }, { "epoch": 4.1305984192698535, "grad_norm": 0.21503429114818573, "learning_rate": 8.809749538844648e-05, "loss": 0.2446, "step": 2751 }, { "epoch": 4.132103876552502, "grad_norm": 0.18072117865085602, "learning_rate": 8.80874111771097e-05, "loss": 0.2124, "step": 2752 }, { "epoch": 4.133609333835152, "grad_norm": 0.21462693810462952, "learning_rate": 8.807732334738338e-05, "loss": 0.2042, "step": 2753 }, { "epoch": 4.135114791117802, "grad_norm": 0.22279226779937744, "learning_rate": 8.806723190037071e-05, "loss": 0.2134, "step": 2754 }, { "epoch": 4.136620248400452, "grad_norm": 0.17466652393341064, "learning_rate": 8.805713683717527e-05, "loss": 0.2157, "step": 2755 }, { "epoch": 4.138125705683101, "grad_norm": 0.20544803142547607, "learning_rate": 8.804703815890105e-05, "loss": 0.2112, "step": 2756 }, { "epoch": 4.139631162965751, "grad_norm": 0.2074091136455536, "learning_rate": 8.803693586665244e-05, "loss": 0.2276, "step": 2757 }, { "epoch": 4.141136620248401, "grad_norm": 0.21451234817504883, "learning_rate": 8.802682996153418e-05, "loss": 0.2156, "step": 2758 }, { "epoch": 4.14264207753105, "grad_norm": 0.23977865278720856, "learning_rate": 8.801672044465144e-05, "loss": 0.2581, "step": 2759 }, { "epoch": 4.144147534813699, "grad_norm": 0.23143914341926575, "learning_rate": 8.800660731710981e-05, "loss": 0.2402, "step": 2760 }, { "epoch": 4.145652992096349, "grad_norm": 0.2675269842147827, "learning_rate": 8.799649058001521e-05, "loss": 0.2406, "step": 2761 }, { "epoch": 4.147158449378999, "grad_norm": 0.26960447430610657, "learning_rate": 8.798637023447401e-05, "loss": 0.2287, "step": 2762 }, { "epoch": 4.148663906661649, "grad_norm": 0.21294482052326202, "learning_rate": 8.797624628159296e-05, "loss": 0.1743, "step": 2763 }, { "epoch": 4.150169363944298, "grad_norm": 0.2252664715051651, "learning_rate": 8.796611872247921e-05, "loss": 0.2365, "step": 2764 }, { "epoch": 4.151674821226948, "grad_norm": 0.23333865404129028, "learning_rate": 8.795598755824026e-05, "loss": 0.2357, "step": 2765 }, { "epoch": 4.1531802785095975, "grad_norm": 0.21779775619506836, "learning_rate": 8.794585278998407e-05, "loss": 0.1837, "step": 2766 }, { "epoch": 4.154685735792247, "grad_norm": 0.21975177526474, "learning_rate": 8.793571441881896e-05, "loss": 0.214, "step": 2767 }, { "epoch": 4.156191193074896, "grad_norm": 0.2316526174545288, "learning_rate": 8.792557244585363e-05, "loss": 0.2106, "step": 2768 }, { "epoch": 4.157696650357546, "grad_norm": 0.21477894484996796, "learning_rate": 8.79154268721972e-05, "loss": 0.1872, "step": 2769 }, { "epoch": 4.159202107640196, "grad_norm": 0.20140749216079712, "learning_rate": 8.790527769895917e-05, "loss": 0.2154, "step": 2770 }, { "epoch": 4.160707564922845, "grad_norm": 0.20108355581760406, "learning_rate": 8.789512492724945e-05, "loss": 0.2211, "step": 2771 }, { "epoch": 4.162213022205495, "grad_norm": 0.20511029660701752, "learning_rate": 8.788496855817832e-05, "loss": 0.1767, "step": 2772 }, { "epoch": 4.1637184794881446, "grad_norm": 0.20304182171821594, "learning_rate": 8.787480859285648e-05, "loss": 0.2314, "step": 2773 }, { "epoch": 4.165223936770794, "grad_norm": 0.2239808589220047, "learning_rate": 8.7864645032395e-05, "loss": 0.206, "step": 2774 }, { "epoch": 4.166729394053443, "grad_norm": 0.1850244104862213, "learning_rate": 8.785447787790534e-05, "loss": 0.2065, "step": 2775 }, { "epoch": 4.168234851336093, "grad_norm": 0.19804197549819946, "learning_rate": 8.784430713049939e-05, "loss": 0.1937, "step": 2776 }, { "epoch": 4.169740308618743, "grad_norm": 0.20651300251483917, "learning_rate": 8.783413279128936e-05, "loss": 0.2034, "step": 2777 }, { "epoch": 4.171245765901393, "grad_norm": 0.21799713373184204, "learning_rate": 8.782395486138797e-05, "loss": 0.2179, "step": 2778 }, { "epoch": 4.172751223184042, "grad_norm": 0.24571563303470612, "learning_rate": 8.781377334190819e-05, "loss": 0.2739, "step": 2779 }, { "epoch": 4.174256680466692, "grad_norm": 0.1818808615207672, "learning_rate": 8.780358823396352e-05, "loss": 0.1629, "step": 2780 }, { "epoch": 4.1757621377493415, "grad_norm": 0.2129875123500824, "learning_rate": 8.779339953866777e-05, "loss": 0.1937, "step": 2781 }, { "epoch": 4.177267595031991, "grad_norm": 0.2364037185907364, "learning_rate": 8.778320725713512e-05, "loss": 0.1992, "step": 2782 }, { "epoch": 4.17877305231464, "grad_norm": 0.2184993475675583, "learning_rate": 8.777301139048025e-05, "loss": 0.2025, "step": 2783 }, { "epoch": 4.18027850959729, "grad_norm": 0.21871216595172882, "learning_rate": 8.776281193981809e-05, "loss": 0.2509, "step": 2784 }, { "epoch": 4.18178396687994, "grad_norm": 0.199310302734375, "learning_rate": 8.775260890626408e-05, "loss": 0.1547, "step": 2785 }, { "epoch": 4.18328942416259, "grad_norm": 0.21207790076732635, "learning_rate": 8.774240229093402e-05, "loss": 0.2921, "step": 2786 }, { "epoch": 4.184794881445239, "grad_norm": 0.22893567383289337, "learning_rate": 8.773219209494407e-05, "loss": 0.1982, "step": 2787 }, { "epoch": 4.1863003387278885, "grad_norm": 0.18111400306224823, "learning_rate": 8.772197831941079e-05, "loss": 0.2166, "step": 2788 }, { "epoch": 4.187805796010538, "grad_norm": 0.22967487573623657, "learning_rate": 8.771176096545116e-05, "loss": 0.2543, "step": 2789 }, { "epoch": 4.189311253293188, "grad_norm": 0.1996890753507614, "learning_rate": 8.770154003418254e-05, "loss": 0.2176, "step": 2790 }, { "epoch": 4.190816710575837, "grad_norm": 0.2217959314584732, "learning_rate": 8.769131552672267e-05, "loss": 0.2, "step": 2791 }, { "epoch": 4.192322167858487, "grad_norm": 0.17806179821491241, "learning_rate": 8.768108744418968e-05, "loss": 0.216, "step": 2792 }, { "epoch": 4.193827625141137, "grad_norm": 0.2329476922750473, "learning_rate": 8.767085578770212e-05, "loss": 0.1899, "step": 2793 }, { "epoch": 4.195333082423787, "grad_norm": 0.20178499817848206, "learning_rate": 8.766062055837886e-05, "loss": 0.207, "step": 2794 }, { "epoch": 4.196838539706436, "grad_norm": 0.2253638505935669, "learning_rate": 8.765038175733926e-05, "loss": 0.249, "step": 2795 }, { "epoch": 4.1983439969890854, "grad_norm": 0.2290458083152771, "learning_rate": 8.7640139385703e-05, "loss": 0.2005, "step": 2796 }, { "epoch": 4.199849454271735, "grad_norm": 0.21388152241706848, "learning_rate": 8.762989344459016e-05, "loss": 0.1942, "step": 2797 }, { "epoch": 4.201354911554384, "grad_norm": 0.23319561779499054, "learning_rate": 8.761964393512124e-05, "loss": 0.2066, "step": 2798 }, { "epoch": 4.202860368837034, "grad_norm": 0.23589573800563812, "learning_rate": 8.76093908584171e-05, "loss": 0.2371, "step": 2799 }, { "epoch": 4.204365826119684, "grad_norm": 0.26122379302978516, "learning_rate": 8.759913421559902e-05, "loss": 0.2087, "step": 2800 }, { "epoch": 4.204365826119684, "eval_loss": 0.23951560258865356, "eval_runtime": 533.0154, "eval_samples_per_second": 18.061, "eval_steps_per_second": 0.565, "step": 2800 }, { "epoch": 4.205871283402334, "grad_norm": 0.2823229432106018, "learning_rate": 8.758887400778862e-05, "loss": 0.2435, "step": 2801 }, { "epoch": 4.207376740684983, "grad_norm": 0.2742069363594055, "learning_rate": 8.757861023610794e-05, "loss": 0.1913, "step": 2802 }, { "epoch": 4.2088821979676325, "grad_norm": 0.2270815223455429, "learning_rate": 8.756834290167944e-05, "loss": 0.1564, "step": 2803 }, { "epoch": 4.210387655250282, "grad_norm": 0.2043285369873047, "learning_rate": 8.755807200562593e-05, "loss": 0.1823, "step": 2804 }, { "epoch": 4.211893112532932, "grad_norm": 0.2857407033443451, "learning_rate": 8.75477975490706e-05, "loss": 0.2142, "step": 2805 }, { "epoch": 4.213398569815581, "grad_norm": 0.259597510099411, "learning_rate": 8.753751953313708e-05, "loss": 0.142, "step": 2806 }, { "epoch": 4.214904027098231, "grad_norm": 0.20173980295658112, "learning_rate": 8.752723795894933e-05, "loss": 0.1944, "step": 2807 }, { "epoch": 4.216409484380881, "grad_norm": 0.303693950176239, "learning_rate": 8.751695282763174e-05, "loss": 0.3149, "step": 2808 }, { "epoch": 4.217914941663531, "grad_norm": 0.30634135007858276, "learning_rate": 8.750666414030909e-05, "loss": 0.2739, "step": 2809 }, { "epoch": 4.21942039894618, "grad_norm": 0.2030346691608429, "learning_rate": 8.749637189810654e-05, "loss": 0.2178, "step": 2810 }, { "epoch": 4.220925856228829, "grad_norm": 0.2794303894042969, "learning_rate": 8.748607610214959e-05, "loss": 0.2808, "step": 2811 }, { "epoch": 4.222431313511479, "grad_norm": 0.29936689138412476, "learning_rate": 8.74757767535642e-05, "loss": 0.2178, "step": 2812 }, { "epoch": 4.223936770794129, "grad_norm": 0.22351621091365814, "learning_rate": 8.74654738534767e-05, "loss": 0.2333, "step": 2813 }, { "epoch": 4.225442228076778, "grad_norm": 0.27688607573509216, "learning_rate": 8.745516740301378e-05, "loss": 0.2125, "step": 2814 }, { "epoch": 4.226947685359428, "grad_norm": 0.2606205642223358, "learning_rate": 8.744485740330256e-05, "loss": 0.2187, "step": 2815 }, { "epoch": 4.228453142642078, "grad_norm": 0.21558254957199097, "learning_rate": 8.743454385547052e-05, "loss": 0.2113, "step": 2816 }, { "epoch": 4.229958599924728, "grad_norm": 0.24724222719669342, "learning_rate": 8.742422676064551e-05, "loss": 0.1884, "step": 2817 }, { "epoch": 4.2314640572073765, "grad_norm": 0.22633914649486542, "learning_rate": 8.741390611995581e-05, "loss": 0.2519, "step": 2818 }, { "epoch": 4.232969514490026, "grad_norm": 0.21020342409610748, "learning_rate": 8.740358193453008e-05, "loss": 0.2278, "step": 2819 }, { "epoch": 4.234474971772676, "grad_norm": 0.23018747568130493, "learning_rate": 8.739325420549735e-05, "loss": 0.2582, "step": 2820 }, { "epoch": 4.235980429055326, "grad_norm": 0.2040175050497055, "learning_rate": 8.738292293398705e-05, "loss": 0.1651, "step": 2821 }, { "epoch": 4.237485886337975, "grad_norm": 0.2101309895515442, "learning_rate": 8.737258812112896e-05, "loss": 0.1693, "step": 2822 }, { "epoch": 4.238991343620625, "grad_norm": 0.2153545767068863, "learning_rate": 8.736224976805333e-05, "loss": 0.2168, "step": 2823 }, { "epoch": 4.240496800903275, "grad_norm": 0.21132367849349976, "learning_rate": 8.735190787589069e-05, "loss": 0.1892, "step": 2824 }, { "epoch": 4.242002258185924, "grad_norm": 0.18736021220684052, "learning_rate": 8.734156244577209e-05, "loss": 0.2344, "step": 2825 }, { "epoch": 4.243507715468573, "grad_norm": 0.1951134204864502, "learning_rate": 8.73312134788288e-05, "loss": 0.1913, "step": 2826 }, { "epoch": 4.245013172751223, "grad_norm": 0.22678838670253754, "learning_rate": 8.732086097619265e-05, "loss": 0.2131, "step": 2827 }, { "epoch": 4.246518630033873, "grad_norm": 0.2467191368341446, "learning_rate": 8.731050493899572e-05, "loss": 0.2347, "step": 2828 }, { "epoch": 4.248024087316522, "grad_norm": 0.19583705067634583, "learning_rate": 8.730014536837055e-05, "loss": 0.1876, "step": 2829 }, { "epoch": 4.249529544599172, "grad_norm": 0.19894061982631683, "learning_rate": 8.728978226545003e-05, "loss": 0.1979, "step": 2830 }, { "epoch": 4.251035001881822, "grad_norm": 0.20443131029605865, "learning_rate": 8.72794156313675e-05, "loss": 0.1841, "step": 2831 }, { "epoch": 4.2525404591644715, "grad_norm": 0.1975025236606598, "learning_rate": 8.726904546725658e-05, "loss": 0.1958, "step": 2832 }, { "epoch": 4.2540459164471205, "grad_norm": 0.186842679977417, "learning_rate": 8.725867177425138e-05, "loss": 0.2108, "step": 2833 }, { "epoch": 4.25555137372977, "grad_norm": 0.18825663626194, "learning_rate": 8.724829455348633e-05, "loss": 0.2256, "step": 2834 }, { "epoch": 4.25705683101242, "grad_norm": 0.17196719348430634, "learning_rate": 8.723791380609625e-05, "loss": 0.1997, "step": 2835 }, { "epoch": 4.25856228829507, "grad_norm": 0.2340603768825531, "learning_rate": 8.722752953321644e-05, "loss": 0.1974, "step": 2836 }, { "epoch": 4.260067745577719, "grad_norm": 0.1675722301006317, "learning_rate": 8.72171417359824e-05, "loss": 0.1755, "step": 2837 }, { "epoch": 4.261573202860369, "grad_norm": 0.19383513927459717, "learning_rate": 8.72067504155302e-05, "loss": 0.2628, "step": 2838 }, { "epoch": 4.263078660143019, "grad_norm": 0.24434000253677368, "learning_rate": 8.71963555729962e-05, "loss": 0.1875, "step": 2839 }, { "epoch": 4.2645841174256685, "grad_norm": 0.25466838479042053, "learning_rate": 8.718595720951716e-05, "loss": 0.1983, "step": 2840 }, { "epoch": 4.266089574708317, "grad_norm": 0.24605302512645721, "learning_rate": 8.717555532623022e-05, "loss": 0.2126, "step": 2841 }, { "epoch": 4.267595031990967, "grad_norm": 0.22921240329742432, "learning_rate": 8.716514992427293e-05, "loss": 0.1962, "step": 2842 }, { "epoch": 4.269100489273617, "grad_norm": 0.20927946269512177, "learning_rate": 8.715474100478321e-05, "loss": 0.2075, "step": 2843 }, { "epoch": 4.270605946556266, "grad_norm": 0.21074211597442627, "learning_rate": 8.714432856889935e-05, "loss": 0.1803, "step": 2844 }, { "epoch": 4.272111403838916, "grad_norm": 0.17868515849113464, "learning_rate": 8.713391261776004e-05, "loss": 0.1671, "step": 2845 }, { "epoch": 4.273616861121566, "grad_norm": 0.2040361762046814, "learning_rate": 8.712349315250436e-05, "loss": 0.2371, "step": 2846 }, { "epoch": 4.2751223184042155, "grad_norm": 0.21209068596363068, "learning_rate": 8.711307017427178e-05, "loss": 0.2192, "step": 2847 }, { "epoch": 4.2766277756868645, "grad_norm": 0.17593874037265778, "learning_rate": 8.710264368420212e-05, "loss": 0.1894, "step": 2848 }, { "epoch": 4.278133232969514, "grad_norm": 0.19093386828899384, "learning_rate": 8.709221368343562e-05, "loss": 0.2069, "step": 2849 }, { "epoch": 4.279638690252164, "grad_norm": 0.18128634989261627, "learning_rate": 8.708178017311287e-05, "loss": 0.2159, "step": 2850 }, { "epoch": 4.281144147534814, "grad_norm": 0.1897697150707245, "learning_rate": 8.707134315437489e-05, "loss": 0.1675, "step": 2851 }, { "epoch": 4.282649604817463, "grad_norm": 0.17529678344726562, "learning_rate": 8.706090262836301e-05, "loss": 0.2192, "step": 2852 }, { "epoch": 4.284155062100113, "grad_norm": 0.22420789301395416, "learning_rate": 8.705045859621905e-05, "loss": 0.2262, "step": 2853 }, { "epoch": 4.285660519382763, "grad_norm": 0.21908843517303467, "learning_rate": 8.704001105908512e-05, "loss": 0.2594, "step": 2854 }, { "epoch": 4.287165976665412, "grad_norm": 0.26821649074554443, "learning_rate": 8.702956001810375e-05, "loss": 0.2516, "step": 2855 }, { "epoch": 4.288671433948061, "grad_norm": 0.2979215681552887, "learning_rate": 8.701910547441786e-05, "loss": 0.2221, "step": 2856 }, { "epoch": 4.290176891230711, "grad_norm": 0.23300133645534515, "learning_rate": 8.700864742917072e-05, "loss": 0.1772, "step": 2857 }, { "epoch": 4.291682348513361, "grad_norm": 0.21919851005077362, "learning_rate": 8.699818588350601e-05, "loss": 0.206, "step": 2858 }, { "epoch": 4.293187805796011, "grad_norm": 0.21540357172489166, "learning_rate": 8.698772083856782e-05, "loss": 0.2338, "step": 2859 }, { "epoch": 4.29469326307866, "grad_norm": 0.22703789174556732, "learning_rate": 8.697725229550056e-05, "loss": 0.2124, "step": 2860 }, { "epoch": 4.29619872036131, "grad_norm": 0.2366381585597992, "learning_rate": 8.696678025544908e-05, "loss": 0.179, "step": 2861 }, { "epoch": 4.2977041776439595, "grad_norm": 0.2691628038883209, "learning_rate": 8.695630471955859e-05, "loss": 0.1774, "step": 2862 }, { "epoch": 4.299209634926609, "grad_norm": 0.17622381448745728, "learning_rate": 8.694582568897462e-05, "loss": 0.1644, "step": 2863 }, { "epoch": 4.300715092209258, "grad_norm": 0.1980699747800827, "learning_rate": 8.693534316484321e-05, "loss": 0.2097, "step": 2864 }, { "epoch": 4.302220549491908, "grad_norm": 0.23370076715946198, "learning_rate": 8.692485714831067e-05, "loss": 0.1969, "step": 2865 }, { "epoch": 4.303726006774558, "grad_norm": 0.22615362703800201, "learning_rate": 8.691436764052378e-05, "loss": 0.1972, "step": 2866 }, { "epoch": 4.305231464057208, "grad_norm": 0.21471597254276276, "learning_rate": 8.69038746426296e-05, "loss": 0.1747, "step": 2867 }, { "epoch": 4.306736921339857, "grad_norm": 0.20538710057735443, "learning_rate": 8.689337815577564e-05, "loss": 0.1352, "step": 2868 }, { "epoch": 4.308242378622507, "grad_norm": 0.2880326807498932, "learning_rate": 8.688287818110983e-05, "loss": 0.2314, "step": 2869 }, { "epoch": 4.309747835905156, "grad_norm": 0.2576182186603546, "learning_rate": 8.687237471978038e-05, "loss": 0.2136, "step": 2870 }, { "epoch": 4.311253293187805, "grad_norm": 0.22249527275562286, "learning_rate": 8.686186777293594e-05, "loss": 0.2152, "step": 2871 }, { "epoch": 4.312758750470455, "grad_norm": 0.20301170647144318, "learning_rate": 8.685135734172557e-05, "loss": 0.182, "step": 2872 }, { "epoch": 4.314264207753105, "grad_norm": 0.2339143007993698, "learning_rate": 8.684084342729864e-05, "loss": 0.1984, "step": 2873 }, { "epoch": 4.315769665035755, "grad_norm": 0.20869648456573486, "learning_rate": 8.683032603080494e-05, "loss": 0.1645, "step": 2874 }, { "epoch": 4.317275122318404, "grad_norm": 0.174856036901474, "learning_rate": 8.681980515339464e-05, "loss": 0.191, "step": 2875 }, { "epoch": 4.318780579601054, "grad_norm": 0.22066085040569305, "learning_rate": 8.68092807962183e-05, "loss": 0.2239, "step": 2876 }, { "epoch": 4.3202860368837035, "grad_norm": 0.21049214899539948, "learning_rate": 8.679875296042682e-05, "loss": 0.1867, "step": 2877 }, { "epoch": 4.321791494166353, "grad_norm": 0.19166381657123566, "learning_rate": 8.678822164717155e-05, "loss": 0.1713, "step": 2878 }, { "epoch": 4.323296951449002, "grad_norm": 0.21627908945083618, "learning_rate": 8.677768685760412e-05, "loss": 0.2321, "step": 2879 }, { "epoch": 4.324802408731652, "grad_norm": 0.21555939316749573, "learning_rate": 8.676714859287666e-05, "loss": 0.2046, "step": 2880 }, { "epoch": 4.326307866014302, "grad_norm": 0.19859610497951508, "learning_rate": 8.675660685414157e-05, "loss": 0.1933, "step": 2881 }, { "epoch": 4.327813323296952, "grad_norm": 0.18463802337646484, "learning_rate": 8.674606164255171e-05, "loss": 0.227, "step": 2882 }, { "epoch": 4.329318780579601, "grad_norm": 0.1980578750371933, "learning_rate": 8.673551295926028e-05, "loss": 0.1964, "step": 2883 }, { "epoch": 4.330824237862251, "grad_norm": 0.1831749528646469, "learning_rate": 8.672496080542087e-05, "loss": 0.1837, "step": 2884 }, { "epoch": 4.3323296951449, "grad_norm": 0.19255639612674713, "learning_rate": 8.671440518218744e-05, "loss": 0.1965, "step": 2885 }, { "epoch": 4.33383515242755, "grad_norm": 0.20438645780086517, "learning_rate": 8.670384609071435e-05, "loss": 0.1746, "step": 2886 }, { "epoch": 4.335340609710199, "grad_norm": 0.22714994847774506, "learning_rate": 8.66932835321563e-05, "loss": 0.2826, "step": 2887 }, { "epoch": 4.336846066992849, "grad_norm": 0.1973155438899994, "learning_rate": 8.668271750766843e-05, "loss": 0.1854, "step": 2888 }, { "epoch": 4.338351524275499, "grad_norm": 0.17053496837615967, "learning_rate": 8.667214801840619e-05, "loss": 0.1597, "step": 2889 }, { "epoch": 4.339856981558149, "grad_norm": 0.23782314360141754, "learning_rate": 8.666157506552547e-05, "loss": 0.2782, "step": 2890 }, { "epoch": 4.341362438840798, "grad_norm": 0.20079070329666138, "learning_rate": 8.66509986501825e-05, "loss": 0.1688, "step": 2891 }, { "epoch": 4.3428678961234475, "grad_norm": 0.2332342267036438, "learning_rate": 8.66404187735339e-05, "loss": 0.1817, "step": 2892 }, { "epoch": 4.344373353406097, "grad_norm": 0.22466860711574554, "learning_rate": 8.662983543673669e-05, "loss": 0.2421, "step": 2893 }, { "epoch": 4.345878810688747, "grad_norm": 0.20518988370895386, "learning_rate": 8.661924864094822e-05, "loss": 0.2085, "step": 2894 }, { "epoch": 4.347384267971396, "grad_norm": 0.20059159398078918, "learning_rate": 8.660865838732626e-05, "loss": 0.2614, "step": 2895 }, { "epoch": 4.348889725254046, "grad_norm": 0.20096753537654877, "learning_rate": 8.659806467702894e-05, "loss": 0.2288, "step": 2896 }, { "epoch": 4.350395182536696, "grad_norm": 0.22888346016407013, "learning_rate": 8.658746751121478e-05, "loss": 0.2449, "step": 2897 }, { "epoch": 4.351900639819345, "grad_norm": 0.18186995387077332, "learning_rate": 8.657686689104267e-05, "loss": 0.2128, "step": 2898 }, { "epoch": 4.3534060971019946, "grad_norm": 0.21781222522258759, "learning_rate": 8.656626281767184e-05, "loss": 0.2185, "step": 2899 }, { "epoch": 4.354911554384644, "grad_norm": 0.20299239456653595, "learning_rate": 8.655565529226198e-05, "loss": 0.218, "step": 2900 }, { "epoch": 4.356417011667294, "grad_norm": 0.22190813720226288, "learning_rate": 8.654504431597312e-05, "loss": 0.1758, "step": 2901 }, { "epoch": 4.357922468949943, "grad_norm": 0.19974347949028015, "learning_rate": 8.653442988996563e-05, "loss": 0.1904, "step": 2902 }, { "epoch": 4.359427926232593, "grad_norm": 0.2003534436225891, "learning_rate": 8.652381201540031e-05, "loss": 0.2457, "step": 2903 }, { "epoch": 4.360933383515243, "grad_norm": 0.21515397727489471, "learning_rate": 8.651319069343828e-05, "loss": 0.2337, "step": 2904 }, { "epoch": 4.362438840797893, "grad_norm": 0.18797598779201508, "learning_rate": 8.650256592524112e-05, "loss": 0.2371, "step": 2905 }, { "epoch": 4.363944298080542, "grad_norm": 0.18368829786777496, "learning_rate": 8.649193771197068e-05, "loss": 0.2147, "step": 2906 }, { "epoch": 4.3654497553631915, "grad_norm": 0.21790817379951477, "learning_rate": 8.648130605478931e-05, "loss": 0.2125, "step": 2907 }, { "epoch": 4.366955212645841, "grad_norm": 0.2204899787902832, "learning_rate": 8.647067095485963e-05, "loss": 0.2325, "step": 2908 }, { "epoch": 4.368460669928491, "grad_norm": 0.21290048956871033, "learning_rate": 8.646003241334468e-05, "loss": 0.2441, "step": 2909 }, { "epoch": 4.36996612721114, "grad_norm": 0.21268296241760254, "learning_rate": 8.64493904314079e-05, "loss": 0.1839, "step": 2910 }, { "epoch": 4.37147158449379, "grad_norm": 0.18950755894184113, "learning_rate": 8.643874501021307e-05, "loss": 0.1937, "step": 2911 }, { "epoch": 4.37297704177644, "grad_norm": 0.23578569293022156, "learning_rate": 8.642809615092435e-05, "loss": 0.188, "step": 2912 }, { "epoch": 4.37448249905909, "grad_norm": 3.8463876247406006, "learning_rate": 8.641744385470628e-05, "loss": 0.2831, "step": 2913 }, { "epoch": 4.3759879563417385, "grad_norm": 0.3210218846797943, "learning_rate": 8.640678812272378e-05, "loss": 0.2182, "step": 2914 }, { "epoch": 4.377493413624388, "grad_norm": 0.42736008763313293, "learning_rate": 8.639612895614216e-05, "loss": 0.2734, "step": 2915 }, { "epoch": 4.378998870907038, "grad_norm": 0.47212204337120056, "learning_rate": 8.638546635612708e-05, "loss": 0.2137, "step": 2916 }, { "epoch": 4.380504328189687, "grad_norm": 0.3636210858821869, "learning_rate": 8.637480032384459e-05, "loss": 0.2145, "step": 2917 }, { "epoch": 4.382009785472337, "grad_norm": 0.30441391468048096, "learning_rate": 8.636413086046109e-05, "loss": 0.2342, "step": 2918 }, { "epoch": 4.383515242754987, "grad_norm": 0.3911392390727997, "learning_rate": 8.63534579671434e-05, "loss": 0.2703, "step": 2919 }, { "epoch": 4.385020700037637, "grad_norm": 0.34245216846466064, "learning_rate": 8.63427816450587e-05, "loss": 0.2345, "step": 2920 }, { "epoch": 4.3865261573202865, "grad_norm": 0.32176655530929565, "learning_rate": 8.633210189537452e-05, "loss": 0.2812, "step": 2921 }, { "epoch": 4.3880316146029354, "grad_norm": 0.23261822760105133, "learning_rate": 8.632141871925877e-05, "loss": 0.2405, "step": 2922 }, { "epoch": 4.389537071885585, "grad_norm": 0.2706559896469116, "learning_rate": 8.631073211787978e-05, "loss": 0.289, "step": 2923 }, { "epoch": 4.391042529168235, "grad_norm": 0.2631830871105194, "learning_rate": 8.630004209240619e-05, "loss": 0.2018, "step": 2924 }, { "epoch": 4.392547986450884, "grad_norm": 0.22754915058612823, "learning_rate": 8.628934864400706e-05, "loss": 0.1977, "step": 2925 }, { "epoch": 4.394053443733534, "grad_norm": 0.23294663429260254, "learning_rate": 8.627865177385178e-05, "loss": 0.2298, "step": 2926 }, { "epoch": 4.395558901016184, "grad_norm": 0.1901208907365799, "learning_rate": 8.626795148311022e-05, "loss": 0.1747, "step": 2927 }, { "epoch": 4.397064358298834, "grad_norm": 0.20999588072299957, "learning_rate": 8.625724777295245e-05, "loss": 0.1718, "step": 2928 }, { "epoch": 4.3985698155814825, "grad_norm": 0.21329064667224884, "learning_rate": 8.624654064454907e-05, "loss": 0.1707, "step": 2929 }, { "epoch": 4.400075272864132, "grad_norm": 0.1908072978258133, "learning_rate": 8.623583009907099e-05, "loss": 0.1541, "step": 2930 }, { "epoch": 4.401580730146782, "grad_norm": 0.20116914808750153, "learning_rate": 8.62251161376895e-05, "loss": 0.2001, "step": 2931 }, { "epoch": 4.403086187429432, "grad_norm": 0.21612931787967682, "learning_rate": 8.621439876157622e-05, "loss": 0.223, "step": 2932 }, { "epoch": 4.404591644712081, "grad_norm": 0.2006808966398239, "learning_rate": 8.620367797190327e-05, "loss": 0.1914, "step": 2933 }, { "epoch": 4.406097101994731, "grad_norm": 0.17786982655525208, "learning_rate": 8.619295376984297e-05, "loss": 0.2142, "step": 2934 }, { "epoch": 4.407602559277381, "grad_norm": 0.24125505983829498, "learning_rate": 8.618222615656816e-05, "loss": 0.2605, "step": 2935 }, { "epoch": 4.4091080165600305, "grad_norm": 0.1870722472667694, "learning_rate": 8.617149513325198e-05, "loss": 0.2053, "step": 2936 }, { "epoch": 4.410613473842679, "grad_norm": 0.20805026590824127, "learning_rate": 8.616076070106796e-05, "loss": 0.2228, "step": 2937 }, { "epoch": 4.412118931125329, "grad_norm": 0.18511131405830383, "learning_rate": 8.615002286119e-05, "loss": 0.2468, "step": 2938 }, { "epoch": 4.413624388407979, "grad_norm": 0.19784891605377197, "learning_rate": 8.613928161479237e-05, "loss": 0.1962, "step": 2939 }, { "epoch": 4.415129845690629, "grad_norm": 0.18889009952545166, "learning_rate": 8.612853696304972e-05, "loss": 0.2273, "step": 2940 }, { "epoch": 4.416635302973278, "grad_norm": 0.21295589208602905, "learning_rate": 8.611778890713707e-05, "loss": 0.1579, "step": 2941 }, { "epoch": 4.418140760255928, "grad_norm": 0.17215490341186523, "learning_rate": 8.610703744822981e-05, "loss": 0.1873, "step": 2942 }, { "epoch": 4.419646217538578, "grad_norm": 0.20256756246089935, "learning_rate": 8.60962825875037e-05, "loss": 0.2104, "step": 2943 }, { "epoch": 4.4211516748212265, "grad_norm": 0.19657611846923828, "learning_rate": 8.608552432613488e-05, "loss": 0.1897, "step": 2944 }, { "epoch": 4.422657132103876, "grad_norm": 0.17982307076454163, "learning_rate": 8.607476266529987e-05, "loss": 0.1455, "step": 2945 }, { "epoch": 4.424162589386526, "grad_norm": 0.18841798603534698, "learning_rate": 8.606399760617552e-05, "loss": 0.2881, "step": 2946 }, { "epoch": 4.425668046669176, "grad_norm": 0.19870753586292267, "learning_rate": 8.605322914993909e-05, "loss": 0.2047, "step": 2947 }, { "epoch": 4.427173503951825, "grad_norm": 0.1879764348268509, "learning_rate": 8.604245729776822e-05, "loss": 0.1599, "step": 2948 }, { "epoch": 4.428678961234475, "grad_norm": 0.17604675889015198, "learning_rate": 8.60316820508409e-05, "loss": 0.2163, "step": 2949 }, { "epoch": 4.430184418517125, "grad_norm": 0.18975429236888885, "learning_rate": 8.602090341033547e-05, "loss": 0.2115, "step": 2950 }, { "epoch": 4.4316898757997745, "grad_norm": 0.18929138779640198, "learning_rate": 8.601012137743069e-05, "loss": 0.2389, "step": 2951 }, { "epoch": 4.433195333082423, "grad_norm": 0.17649726569652557, "learning_rate": 8.599933595330566e-05, "loss": 0.1937, "step": 2952 }, { "epoch": 4.434700790365073, "grad_norm": 0.1904294490814209, "learning_rate": 8.598854713913985e-05, "loss": 0.1938, "step": 2953 }, { "epoch": 4.436206247647723, "grad_norm": 0.19500333070755005, "learning_rate": 8.597775493611311e-05, "loss": 0.1753, "step": 2954 }, { "epoch": 4.437711704930373, "grad_norm": 0.17428529262542725, "learning_rate": 8.596695934540567e-05, "loss": 0.2198, "step": 2955 }, { "epoch": 4.439217162213022, "grad_norm": 0.17751406133174896, "learning_rate": 8.595616036819812e-05, "loss": 0.2108, "step": 2956 }, { "epoch": 4.440722619495672, "grad_norm": 0.2142602652311325, "learning_rate": 8.594535800567142e-05, "loss": 0.2399, "step": 2957 }, { "epoch": 4.4422280767783215, "grad_norm": 0.2130075842142105, "learning_rate": 8.593455225900688e-05, "loss": 0.1881, "step": 2958 }, { "epoch": 4.443733534060971, "grad_norm": 0.25636914372444153, "learning_rate": 8.592374312938623e-05, "loss": 0.2074, "step": 2959 }, { "epoch": 4.44523899134362, "grad_norm": 0.2875373661518097, "learning_rate": 8.591293061799151e-05, "loss": 0.2091, "step": 2960 }, { "epoch": 4.44674444862627, "grad_norm": 0.22072215378284454, "learning_rate": 8.590211472600518e-05, "loss": 0.1952, "step": 2961 }, { "epoch": 4.44824990590892, "grad_norm": 0.25798699259757996, "learning_rate": 8.589129545461003e-05, "loss": 0.2025, "step": 2962 }, { "epoch": 4.44975536319157, "grad_norm": 0.23596318066120148, "learning_rate": 8.588047280498926e-05, "loss": 0.1889, "step": 2963 }, { "epoch": 4.451260820474219, "grad_norm": 0.23367880284786224, "learning_rate": 8.586964677832643e-05, "loss": 0.173, "step": 2964 }, { "epoch": 4.452766277756869, "grad_norm": 0.2131187617778778, "learning_rate": 8.585881737580543e-05, "loss": 0.2755, "step": 2965 }, { "epoch": 4.4542717350395185, "grad_norm": 0.23786818981170654, "learning_rate": 8.584798459861055e-05, "loss": 0.1962, "step": 2966 }, { "epoch": 4.455777192322168, "grad_norm": 0.19237324595451355, "learning_rate": 8.583714844792646e-05, "loss": 0.2103, "step": 2967 }, { "epoch": 4.457282649604817, "grad_norm": 0.2074752151966095, "learning_rate": 8.582630892493818e-05, "loss": 0.1672, "step": 2968 }, { "epoch": 4.458788106887467, "grad_norm": 0.18948866426944733, "learning_rate": 8.58154660308311e-05, "loss": 0.1956, "step": 2969 }, { "epoch": 4.460293564170117, "grad_norm": 0.22502897679805756, "learning_rate": 8.5804619766791e-05, "loss": 0.1702, "step": 2970 }, { "epoch": 4.461799021452766, "grad_norm": 0.2659279406070709, "learning_rate": 8.579377013400398e-05, "loss": 0.1821, "step": 2971 }, { "epoch": 4.463304478735416, "grad_norm": 0.2448033094406128, "learning_rate": 8.578291713365656e-05, "loss": 0.1694, "step": 2972 }, { "epoch": 4.4648099360180655, "grad_norm": 0.24702733755111694, "learning_rate": 8.57720607669356e-05, "loss": 0.1891, "step": 2973 }, { "epoch": 4.466315393300715, "grad_norm": 0.2023964673280716, "learning_rate": 8.576120103502834e-05, "loss": 0.1427, "step": 2974 }, { "epoch": 4.467820850583364, "grad_norm": 0.19746951758861542, "learning_rate": 8.575033793912239e-05, "loss": 0.1786, "step": 2975 }, { "epoch": 4.469326307866014, "grad_norm": 0.21436265110969543, "learning_rate": 8.57394714804057e-05, "loss": 0.2536, "step": 2976 }, { "epoch": 4.470831765148664, "grad_norm": 0.22498951852321625, "learning_rate": 8.572860166006665e-05, "loss": 0.1914, "step": 2977 }, { "epoch": 4.472337222431314, "grad_norm": 0.20666764676570892, "learning_rate": 8.57177284792939e-05, "loss": 0.1795, "step": 2978 }, { "epoch": 4.473842679713963, "grad_norm": 0.18426603078842163, "learning_rate": 8.570685193927655e-05, "loss": 0.1854, "step": 2979 }, { "epoch": 4.475348136996613, "grad_norm": 0.20979231595993042, "learning_rate": 8.569597204120405e-05, "loss": 0.1987, "step": 2980 }, { "epoch": 4.476853594279262, "grad_norm": 0.16791684925556183, "learning_rate": 8.568508878626618e-05, "loss": 0.1846, "step": 2981 }, { "epoch": 4.478359051561912, "grad_norm": 0.19246931374073029, "learning_rate": 8.567420217565315e-05, "loss": 0.1666, "step": 2982 }, { "epoch": 4.479864508844561, "grad_norm": 0.20922601222991943, "learning_rate": 8.566331221055549e-05, "loss": 0.2199, "step": 2983 }, { "epoch": 4.481369966127211, "grad_norm": 0.19576333463191986, "learning_rate": 8.56524188921641e-05, "loss": 0.2162, "step": 2984 }, { "epoch": 4.482875423409861, "grad_norm": 0.18625321984291077, "learning_rate": 8.564152222167027e-05, "loss": 0.1849, "step": 2985 }, { "epoch": 4.484380880692511, "grad_norm": 0.17676690220832825, "learning_rate": 8.563062220026564e-05, "loss": 0.272, "step": 2986 }, { "epoch": 4.48588633797516, "grad_norm": 0.19836759567260742, "learning_rate": 8.561971882914223e-05, "loss": 0.2051, "step": 2987 }, { "epoch": 4.4873917952578095, "grad_norm": 0.22977298498153687, "learning_rate": 8.560881210949238e-05, "loss": 0.2448, "step": 2988 }, { "epoch": 4.488897252540459, "grad_norm": 0.21447022259235382, "learning_rate": 8.559790204250887e-05, "loss": 0.1461, "step": 2989 }, { "epoch": 4.490402709823109, "grad_norm": 0.23934486508369446, "learning_rate": 8.55869886293848e-05, "loss": 0.218, "step": 2990 }, { "epoch": 4.491908167105758, "grad_norm": 0.22613056004047394, "learning_rate": 8.557607187131364e-05, "loss": 0.1959, "step": 2991 }, { "epoch": 4.493413624388408, "grad_norm": 0.22175215184688568, "learning_rate": 8.55651517694892e-05, "loss": 0.1883, "step": 2992 }, { "epoch": 4.494919081671058, "grad_norm": 0.1957298368215561, "learning_rate": 8.555422832510576e-05, "loss": 0.1533, "step": 2993 }, { "epoch": 4.496424538953708, "grad_norm": 0.1956290453672409, "learning_rate": 8.554330153935782e-05, "loss": 0.2322, "step": 2994 }, { "epoch": 4.497929996236357, "grad_norm": 0.21157214045524597, "learning_rate": 8.553237141344035e-05, "loss": 0.2458, "step": 2995 }, { "epoch": 4.499435453519006, "grad_norm": 0.20362776517868042, "learning_rate": 8.552143794854865e-05, "loss": 0.1441, "step": 2996 }, { "epoch": 4.500940910801656, "grad_norm": 0.2155793309211731, "learning_rate": 8.55105011458784e-05, "loss": 0.2108, "step": 2997 }, { "epoch": 4.502446368084305, "grad_norm": 0.20505475997924805, "learning_rate": 8.54995610066256e-05, "loss": 0.2255, "step": 2998 }, { "epoch": 4.503951825366955, "grad_norm": 0.20709174871444702, "learning_rate": 8.548861753198665e-05, "loss": 0.2175, "step": 2999 }, { "epoch": 4.505457282649605, "grad_norm": 0.1992296576499939, "learning_rate": 8.547767072315835e-05, "loss": 0.2211, "step": 3000 }, { "epoch": 4.505457282649605, "eval_loss": 0.22689932584762573, "eval_runtime": 540.0713, "eval_samples_per_second": 17.825, "eval_steps_per_second": 0.557, "step": 3000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 151, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.620308197288234e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }