{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9927710843373494, "eval_steps": 500, "global_step": 621, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004819277108433735, "grad_norm": 6.955545902252197, "learning_rate": 1.5873015873015874e-07, "loss": 1.0435, "step": 1 }, { "epoch": 0.00963855421686747, "grad_norm": 6.793174743652344, "learning_rate": 3.174603174603175e-07, "loss": 1.0378, "step": 2 }, { "epoch": 0.014457831325301205, "grad_norm": 6.922445774078369, "learning_rate": 4.7619047619047623e-07, "loss": 1.0495, "step": 3 }, { "epoch": 0.01927710843373494, "grad_norm": 6.903090000152588, "learning_rate": 6.34920634920635e-07, "loss": 1.0422, "step": 4 }, { "epoch": 0.024096385542168676, "grad_norm": 6.824069499969482, "learning_rate": 7.936507936507937e-07, "loss": 1.0456, "step": 5 }, { "epoch": 0.02891566265060241, "grad_norm": 6.71932315826416, "learning_rate": 9.523809523809525e-07, "loss": 1.0416, "step": 6 }, { "epoch": 0.033734939759036145, "grad_norm": 6.264998912811279, "learning_rate": 1.111111111111111e-06, "loss": 1.0182, "step": 7 }, { "epoch": 0.03855421686746988, "grad_norm": 6.157326698303223, "learning_rate": 1.26984126984127e-06, "loss": 1.0122, "step": 8 }, { "epoch": 0.043373493975903614, "grad_norm": 5.046329498291016, "learning_rate": 1.4285714285714286e-06, "loss": 1.002, "step": 9 }, { "epoch": 0.04819277108433735, "grad_norm": 4.785087585449219, "learning_rate": 1.5873015873015873e-06, "loss": 0.9743, "step": 10 }, { "epoch": 0.05301204819277108, "grad_norm": 4.505481719970703, "learning_rate": 1.746031746031746e-06, "loss": 0.9795, "step": 11 }, { "epoch": 0.05783132530120482, "grad_norm": 3.0111618041992188, "learning_rate": 1.904761904761905e-06, "loss": 0.9396, "step": 12 }, { "epoch": 0.06265060240963856, "grad_norm": 2.865196466445923, "learning_rate": 2.0634920634920634e-06, "loss": 0.9502, "step": 13 }, { "epoch": 0.06746987951807229, "grad_norm": 2.732715368270874, "learning_rate": 2.222222222222222e-06, "loss": 0.9336, "step": 14 }, { "epoch": 0.07228915662650602, "grad_norm": 2.576692581176758, "learning_rate": 2.380952380952381e-06, "loss": 0.933, "step": 15 }, { "epoch": 0.07710843373493977, "grad_norm": 3.3130340576171875, "learning_rate": 2.53968253968254e-06, "loss": 0.9039, "step": 16 }, { "epoch": 0.0819277108433735, "grad_norm": 3.8548290729522705, "learning_rate": 2.6984126984126986e-06, "loss": 0.8975, "step": 17 }, { "epoch": 0.08674698795180723, "grad_norm": 3.933690071105957, "learning_rate": 2.8571428571428573e-06, "loss": 0.9065, "step": 18 }, { "epoch": 0.09156626506024096, "grad_norm": 3.6290793418884277, "learning_rate": 3.015873015873016e-06, "loss": 0.8772, "step": 19 }, { "epoch": 0.0963855421686747, "grad_norm": 3.045029878616333, "learning_rate": 3.1746031746031746e-06, "loss": 0.8738, "step": 20 }, { "epoch": 0.10120481927710843, "grad_norm": 2.285907506942749, "learning_rate": 3.3333333333333333e-06, "loss": 0.8443, "step": 21 }, { "epoch": 0.10602409638554217, "grad_norm": 1.6617300510406494, "learning_rate": 3.492063492063492e-06, "loss": 0.8385, "step": 22 }, { "epoch": 0.1108433734939759, "grad_norm": 1.595490574836731, "learning_rate": 3.6507936507936507e-06, "loss": 0.8201, "step": 23 }, { "epoch": 0.11566265060240964, "grad_norm": 1.3998520374298096, "learning_rate": 3.80952380952381e-06, "loss": 0.8115, "step": 24 }, { "epoch": 0.12048192771084337, "grad_norm": 1.24544358253479, "learning_rate": 3.968253968253968e-06, "loss": 0.8131, "step": 25 }, { "epoch": 0.12530120481927712, "grad_norm": 1.094412088394165, "learning_rate": 4.126984126984127e-06, "loss": 0.7769, "step": 26 }, { "epoch": 0.13012048192771083, "grad_norm": 1.0416014194488525, "learning_rate": 4.2857142857142855e-06, "loss": 0.7739, "step": 27 }, { "epoch": 0.13493975903614458, "grad_norm": 1.002666711807251, "learning_rate": 4.444444444444444e-06, "loss": 0.7797, "step": 28 }, { "epoch": 0.13975903614457832, "grad_norm": 0.9590147137641907, "learning_rate": 4.603174603174604e-06, "loss": 0.7592, "step": 29 }, { "epoch": 0.14457831325301204, "grad_norm": 0.8477379679679871, "learning_rate": 4.761904761904762e-06, "loss": 0.755, "step": 30 }, { "epoch": 0.1493975903614458, "grad_norm": 0.8571174740791321, "learning_rate": 4.920634920634921e-06, "loss": 0.7514, "step": 31 }, { "epoch": 0.15421686746987953, "grad_norm": 0.8876820802688599, "learning_rate": 5.07936507936508e-06, "loss": 0.7371, "step": 32 }, { "epoch": 0.15903614457831325, "grad_norm": 0.8745198249816895, "learning_rate": 5.2380952380952384e-06, "loss": 0.7375, "step": 33 }, { "epoch": 0.163855421686747, "grad_norm": 0.94056236743927, "learning_rate": 5.396825396825397e-06, "loss": 0.7282, "step": 34 }, { "epoch": 0.1686746987951807, "grad_norm": 0.7261127829551697, "learning_rate": 5.555555555555557e-06, "loss": 0.7217, "step": 35 }, { "epoch": 0.17349397590361446, "grad_norm": 0.7808772325515747, "learning_rate": 5.7142857142857145e-06, "loss": 0.7069, "step": 36 }, { "epoch": 0.1783132530120482, "grad_norm": 0.8928313255310059, "learning_rate": 5.873015873015874e-06, "loss": 0.7165, "step": 37 }, { "epoch": 0.18313253012048192, "grad_norm": 0.7088349461555481, "learning_rate": 6.031746031746032e-06, "loss": 0.7122, "step": 38 }, { "epoch": 0.18795180722891566, "grad_norm": 0.8545775413513184, "learning_rate": 6.1904761904761914e-06, "loss": 0.7151, "step": 39 }, { "epoch": 0.1927710843373494, "grad_norm": 0.8626333475112915, "learning_rate": 6.349206349206349e-06, "loss": 0.7127, "step": 40 }, { "epoch": 0.19759036144578312, "grad_norm": 0.7708625197410583, "learning_rate": 6.507936507936509e-06, "loss": 0.6916, "step": 41 }, { "epoch": 0.20240963855421687, "grad_norm": 0.6877345442771912, "learning_rate": 6.666666666666667e-06, "loss": 0.707, "step": 42 }, { "epoch": 0.20722891566265061, "grad_norm": 0.6759275197982788, "learning_rate": 6.825396825396826e-06, "loss": 0.6822, "step": 43 }, { "epoch": 0.21204819277108433, "grad_norm": 0.9545477032661438, "learning_rate": 6.984126984126984e-06, "loss": 0.6943, "step": 44 }, { "epoch": 0.21686746987951808, "grad_norm": 0.789354681968689, "learning_rate": 7.1428571428571436e-06, "loss": 0.6799, "step": 45 }, { "epoch": 0.2216867469879518, "grad_norm": 0.6081812381744385, "learning_rate": 7.301587301587301e-06, "loss": 0.6824, "step": 46 }, { "epoch": 0.22650602409638554, "grad_norm": 0.8076633214950562, "learning_rate": 7.460317460317461e-06, "loss": 0.6701, "step": 47 }, { "epoch": 0.23132530120481928, "grad_norm": 0.6392861604690552, "learning_rate": 7.61904761904762e-06, "loss": 0.6774, "step": 48 }, { "epoch": 0.236144578313253, "grad_norm": 0.6257482767105103, "learning_rate": 7.77777777777778e-06, "loss": 0.6908, "step": 49 }, { "epoch": 0.24096385542168675, "grad_norm": 0.6038609743118286, "learning_rate": 7.936507936507936e-06, "loss": 0.6784, "step": 50 }, { "epoch": 0.2457831325301205, "grad_norm": 0.5643861293792725, "learning_rate": 8.095238095238097e-06, "loss": 0.6705, "step": 51 }, { "epoch": 0.25060240963855424, "grad_norm": 0.6746996641159058, "learning_rate": 8.253968253968254e-06, "loss": 0.6674, "step": 52 }, { "epoch": 0.25542168674698795, "grad_norm": 0.6142429113388062, "learning_rate": 8.412698412698414e-06, "loss": 0.6798, "step": 53 }, { "epoch": 0.26024096385542167, "grad_norm": 0.48977726697921753, "learning_rate": 8.571428571428571e-06, "loss": 0.6678, "step": 54 }, { "epoch": 0.26506024096385544, "grad_norm": 0.6775186061859131, "learning_rate": 8.730158730158731e-06, "loss": 0.6777, "step": 55 }, { "epoch": 0.26987951807228916, "grad_norm": 0.6069589257240295, "learning_rate": 8.888888888888888e-06, "loss": 0.6571, "step": 56 }, { "epoch": 0.2746987951807229, "grad_norm": 0.5993375778198242, "learning_rate": 9.047619047619049e-06, "loss": 0.6659, "step": 57 }, { "epoch": 0.27951807228915665, "grad_norm": 0.5999571681022644, "learning_rate": 9.206349206349207e-06, "loss": 0.6586, "step": 58 }, { "epoch": 0.28433734939759037, "grad_norm": 0.6867564916610718, "learning_rate": 9.365079365079366e-06, "loss": 0.6662, "step": 59 }, { "epoch": 0.2891566265060241, "grad_norm": 0.6454688906669617, "learning_rate": 9.523809523809525e-06, "loss": 0.655, "step": 60 }, { "epoch": 0.29397590361445786, "grad_norm": 0.6191383600234985, "learning_rate": 9.682539682539683e-06, "loss": 0.6641, "step": 61 }, { "epoch": 0.2987951807228916, "grad_norm": 0.6252975463867188, "learning_rate": 9.841269841269842e-06, "loss": 0.6436, "step": 62 }, { "epoch": 0.3036144578313253, "grad_norm": 0.5347315669059753, "learning_rate": 1e-05, "loss": 0.6542, "step": 63 }, { "epoch": 0.30843373493975906, "grad_norm": 0.6288977861404419, "learning_rate": 9.999920755303033e-06, "loss": 0.6472, "step": 64 }, { "epoch": 0.3132530120481928, "grad_norm": 0.5932735204696655, "learning_rate": 9.999683023724021e-06, "loss": 0.6385, "step": 65 }, { "epoch": 0.3180722891566265, "grad_norm": 0.573313295841217, "learning_rate": 9.99928681279855e-06, "loss": 0.6504, "step": 66 }, { "epoch": 0.3228915662650602, "grad_norm": 0.5847601294517517, "learning_rate": 9.998732135085665e-06, "loss": 0.6469, "step": 67 }, { "epoch": 0.327710843373494, "grad_norm": 0.5990540385246277, "learning_rate": 9.998019008167476e-06, "loss": 0.6498, "step": 68 }, { "epoch": 0.3325301204819277, "grad_norm": 0.5455608367919922, "learning_rate": 9.99714745464859e-06, "loss": 0.6425, "step": 69 }, { "epoch": 0.3373493975903614, "grad_norm": 0.6302762627601624, "learning_rate": 9.99611750215541e-06, "loss": 0.6386, "step": 70 }, { "epoch": 0.3421686746987952, "grad_norm": 0.6006896495819092, "learning_rate": 9.994929183335237e-06, "loss": 0.6402, "step": 71 }, { "epoch": 0.3469879518072289, "grad_norm": 0.6036058664321899, "learning_rate": 9.993582535855265e-06, "loss": 0.66, "step": 72 }, { "epoch": 0.35180722891566263, "grad_norm": 0.5187335014343262, "learning_rate": 9.992077602401358e-06, "loss": 0.6471, "step": 73 }, { "epoch": 0.3566265060240964, "grad_norm": 0.6480401754379272, "learning_rate": 9.990414430676716e-06, "loss": 0.6432, "step": 74 }, { "epoch": 0.3614457831325301, "grad_norm": 0.5152376294136047, "learning_rate": 9.988593073400354e-06, "loss": 0.6351, "step": 75 }, { "epoch": 0.36626506024096384, "grad_norm": 0.5486935377120972, "learning_rate": 9.986613588305435e-06, "loss": 0.6307, "step": 76 }, { "epoch": 0.3710843373493976, "grad_norm": 0.6851136684417725, "learning_rate": 9.984476038137437e-06, "loss": 0.6366, "step": 77 }, { "epoch": 0.3759036144578313, "grad_norm": 0.5467675924301147, "learning_rate": 9.982180490652165e-06, "loss": 0.6366, "step": 78 }, { "epoch": 0.38072289156626504, "grad_norm": 0.7552728652954102, "learning_rate": 9.979727018613607e-06, "loss": 0.6286, "step": 79 }, { "epoch": 0.3855421686746988, "grad_norm": 0.651406466960907, "learning_rate": 9.977115699791622e-06, "loss": 0.6359, "step": 80 }, { "epoch": 0.39036144578313253, "grad_norm": 0.7135255336761475, "learning_rate": 9.974346616959476e-06, "loss": 0.6327, "step": 81 }, { "epoch": 0.39518072289156625, "grad_norm": 0.6827096343040466, "learning_rate": 9.971419857891223e-06, "loss": 0.6181, "step": 82 }, { "epoch": 0.4, "grad_norm": 0.6823281645774841, "learning_rate": 9.968335515358916e-06, "loss": 0.6377, "step": 83 }, { "epoch": 0.40481927710843374, "grad_norm": 0.5493991374969482, "learning_rate": 9.965093687129669e-06, "loss": 0.6391, "step": 84 }, { "epoch": 0.40963855421686746, "grad_norm": 0.6720536351203918, "learning_rate": 9.961694475962562e-06, "loss": 0.6331, "step": 85 }, { "epoch": 0.41445783132530123, "grad_norm": 0.6092298030853271, "learning_rate": 9.95813798960538e-06, "loss": 0.6341, "step": 86 }, { "epoch": 0.41927710843373495, "grad_norm": 0.5759454369544983, "learning_rate": 9.954424340791195e-06, "loss": 0.633, "step": 87 }, { "epoch": 0.42409638554216866, "grad_norm": 0.6757510900497437, "learning_rate": 9.950553647234798e-06, "loss": 0.6243, "step": 88 }, { "epoch": 0.42891566265060244, "grad_norm": 0.5293172597885132, "learning_rate": 9.94652603162896e-06, "loss": 0.6424, "step": 89 }, { "epoch": 0.43373493975903615, "grad_norm": 0.663861870765686, "learning_rate": 9.942341621640558e-06, "loss": 0.6409, "step": 90 }, { "epoch": 0.43855421686746987, "grad_norm": 0.5438668727874756, "learning_rate": 9.938000549906509e-06, "loss": 0.6391, "step": 91 }, { "epoch": 0.4433734939759036, "grad_norm": 0.6359225511550903, "learning_rate": 9.93350295402958e-06, "loss": 0.6288, "step": 92 }, { "epoch": 0.44819277108433736, "grad_norm": 0.5770338773727417, "learning_rate": 9.92884897657402e-06, "loss": 0.6114, "step": 93 }, { "epoch": 0.4530120481927711, "grad_norm": 0.5579841136932373, "learning_rate": 9.924038765061042e-06, "loss": 0.6137, "step": 94 }, { "epoch": 0.4578313253012048, "grad_norm": 0.597624659538269, "learning_rate": 9.919072471964146e-06, "loss": 0.6455, "step": 95 }, { "epoch": 0.46265060240963857, "grad_norm": 0.5436515212059021, "learning_rate": 9.913950254704291e-06, "loss": 0.6274, "step": 96 }, { "epoch": 0.4674698795180723, "grad_norm": 0.5492904782295227, "learning_rate": 9.908672275644898e-06, "loss": 0.6291, "step": 97 }, { "epoch": 0.472289156626506, "grad_norm": 0.5357149839401245, "learning_rate": 9.903238702086707e-06, "loss": 0.6422, "step": 98 }, { "epoch": 0.4771084337349398, "grad_norm": 0.5629865527153015, "learning_rate": 9.897649706262474e-06, "loss": 0.636, "step": 99 }, { "epoch": 0.4819277108433735, "grad_norm": 0.5739107728004456, "learning_rate": 9.89190546533151e-06, "loss": 0.6261, "step": 100 }, { "epoch": 0.4867469879518072, "grad_norm": 0.600075900554657, "learning_rate": 9.88600616137407e-06, "loss": 0.6206, "step": 101 }, { "epoch": 0.491566265060241, "grad_norm": 0.562242329120636, "learning_rate": 9.879951981385577e-06, "loss": 0.6263, "step": 102 }, { "epoch": 0.4963855421686747, "grad_norm": 0.5759828686714172, "learning_rate": 9.873743117270691e-06, "loss": 0.6287, "step": 103 }, { "epoch": 0.5012048192771085, "grad_norm": 0.7623122334480286, "learning_rate": 9.867379765837237e-06, "loss": 0.6112, "step": 104 }, { "epoch": 0.5060240963855421, "grad_norm": 0.5862762928009033, "learning_rate": 9.860862128789954e-06, "loss": 0.63, "step": 105 }, { "epoch": 0.5108433734939759, "grad_norm": 0.848891019821167, "learning_rate": 9.854190412724114e-06, "loss": 0.6322, "step": 106 }, { "epoch": 0.5156626506024097, "grad_norm": 0.5686174035072327, "learning_rate": 9.847364829118963e-06, "loss": 0.6148, "step": 107 }, { "epoch": 0.5204819277108433, "grad_norm": 0.772788941860199, "learning_rate": 9.840385594331022e-06, "loss": 0.62, "step": 108 }, { "epoch": 0.5253012048192771, "grad_norm": 0.7146121263504028, "learning_rate": 9.833252929587231e-06, "loss": 0.6251, "step": 109 }, { "epoch": 0.5301204819277109, "grad_norm": 0.6730661988258362, "learning_rate": 9.825967060977933e-06, "loss": 0.6231, "step": 110 }, { "epoch": 0.5349397590361445, "grad_norm": 0.6394177675247192, "learning_rate": 9.818528219449705e-06, "loss": 0.6188, "step": 111 }, { "epoch": 0.5397590361445783, "grad_norm": 0.6056435108184814, "learning_rate": 9.810936640798046e-06, "loss": 0.6376, "step": 112 }, { "epoch": 0.5445783132530121, "grad_norm": 0.5977922677993774, "learning_rate": 9.803192565659898e-06, "loss": 0.6179, "step": 113 }, { "epoch": 0.5493975903614458, "grad_norm": 0.6605879664421082, "learning_rate": 9.795296239506011e-06, "loss": 0.6337, "step": 114 }, { "epoch": 0.5542168674698795, "grad_norm": 0.5492590069770813, "learning_rate": 9.78724791263318e-06, "loss": 0.6205, "step": 115 }, { "epoch": 0.5590361445783133, "grad_norm": 0.6682798862457275, "learning_rate": 9.779047840156288e-06, "loss": 0.6102, "step": 116 }, { "epoch": 0.563855421686747, "grad_norm": 0.6884996294975281, "learning_rate": 9.770696282000245e-06, "loss": 0.6063, "step": 117 }, { "epoch": 0.5686746987951807, "grad_norm": 0.6539535522460938, "learning_rate": 9.762193502891726e-06, "loss": 0.6193, "step": 118 }, { "epoch": 0.5734939759036145, "grad_norm": 0.5900925397872925, "learning_rate": 9.753539772350792e-06, "loss": 0.6113, "step": 119 }, { "epoch": 0.5783132530120482, "grad_norm": 0.7695890069007874, "learning_rate": 9.744735364682347e-06, "loss": 0.6219, "step": 120 }, { "epoch": 0.5831325301204819, "grad_norm": 0.6271756887435913, "learning_rate": 9.735780558967434e-06, "loss": 0.603, "step": 121 }, { "epoch": 0.5879518072289157, "grad_norm": 0.667735755443573, "learning_rate": 9.726675639054403e-06, "loss": 0.616, "step": 122 }, { "epoch": 0.5927710843373494, "grad_norm": 0.5626581311225891, "learning_rate": 9.717420893549902e-06, "loss": 0.6251, "step": 123 }, { "epoch": 0.5975903614457831, "grad_norm": 0.5545538067817688, "learning_rate": 9.70801661580973e-06, "loss": 0.6165, "step": 124 }, { "epoch": 0.6024096385542169, "grad_norm": 0.5270641446113586, "learning_rate": 9.698463103929542e-06, "loss": 0.6012, "step": 125 }, { "epoch": 0.6072289156626506, "grad_norm": 0.6241425275802612, "learning_rate": 9.688760660735403e-06, "loss": 0.6072, "step": 126 }, { "epoch": 0.6120481927710844, "grad_norm": 0.5785877108573914, "learning_rate": 9.67890959377418e-06, "loss": 0.6165, "step": 127 }, { "epoch": 0.6168674698795181, "grad_norm": 0.6360320448875427, "learning_rate": 9.668910215303797e-06, "loss": 0.6112, "step": 128 }, { "epoch": 0.6216867469879518, "grad_norm": 0.5739641189575195, "learning_rate": 9.658762842283343e-06, "loss": 0.6101, "step": 129 }, { "epoch": 0.6265060240963856, "grad_norm": 0.698076605796814, "learning_rate": 9.648467796363019e-06, "loss": 0.6136, "step": 130 }, { "epoch": 0.6313253012048192, "grad_norm": 0.6175894141197205, "learning_rate": 9.638025403873939e-06, "loss": 0.6188, "step": 131 }, { "epoch": 0.636144578313253, "grad_norm": 0.5730419754981995, "learning_rate": 9.627435995817799e-06, "loss": 0.6235, "step": 132 }, { "epoch": 0.6409638554216868, "grad_norm": 0.6372531056404114, "learning_rate": 9.616699907856368e-06, "loss": 0.6054, "step": 133 }, { "epoch": 0.6457831325301204, "grad_norm": 0.5383384823799133, "learning_rate": 9.605817480300863e-06, "loss": 0.6066, "step": 134 }, { "epoch": 0.6506024096385542, "grad_norm": 0.6308354735374451, "learning_rate": 9.594789058101154e-06, "loss": 0.6084, "step": 135 }, { "epoch": 0.655421686746988, "grad_norm": 0.5583477020263672, "learning_rate": 9.58361499083483e-06, "loss": 0.6187, "step": 136 }, { "epoch": 0.6602409638554216, "grad_norm": 0.5494464039802551, "learning_rate": 9.57229563269612e-06, "loss": 0.6177, "step": 137 }, { "epoch": 0.6650602409638554, "grad_norm": 0.5109444856643677, "learning_rate": 9.560831342484668e-06, "loss": 0.6171, "step": 138 }, { "epoch": 0.6698795180722892, "grad_norm": 0.6080160737037659, "learning_rate": 9.549222483594154e-06, "loss": 0.6126, "step": 139 }, { "epoch": 0.6746987951807228, "grad_norm": 0.5663288235664368, "learning_rate": 9.53746942400078e-06, "loss": 0.616, "step": 140 }, { "epoch": 0.6795180722891566, "grad_norm": 0.5445743203163147, "learning_rate": 9.525572536251608e-06, "loss": 0.6093, "step": 141 }, { "epoch": 0.6843373493975904, "grad_norm": 0.6263227462768555, "learning_rate": 9.513532197452737e-06, "loss": 0.6118, "step": 142 }, { "epoch": 0.689156626506024, "grad_norm": 0.6303161978721619, "learning_rate": 9.501348789257373e-06, "loss": 0.6136, "step": 143 }, { "epoch": 0.6939759036144578, "grad_norm": 0.5429930686950684, "learning_rate": 9.48902269785371e-06, "loss": 0.6039, "step": 144 }, { "epoch": 0.6987951807228916, "grad_norm": 0.6205868124961853, "learning_rate": 9.476554313952697e-06, "loss": 0.6216, "step": 145 }, { "epoch": 0.7036144578313253, "grad_norm": 0.5463981032371521, "learning_rate": 9.46394403277566e-06, "loss": 0.613, "step": 146 }, { "epoch": 0.708433734939759, "grad_norm": 0.5573562979698181, "learning_rate": 9.451192254041759e-06, "loss": 0.6074, "step": 147 }, { "epoch": 0.7132530120481928, "grad_norm": 0.6575038433074951, "learning_rate": 9.438299381955333e-06, "loss": 0.6174, "step": 148 }, { "epoch": 0.7180722891566265, "grad_norm": 0.5069466233253479, "learning_rate": 9.425265825193077e-06, "loss": 0.5937, "step": 149 }, { "epoch": 0.7228915662650602, "grad_norm": 0.5574111938476562, "learning_rate": 9.412091996891097e-06, "loss": 0.6242, "step": 150 }, { "epoch": 0.727710843373494, "grad_norm": 0.5976607799530029, "learning_rate": 9.398778314631801e-06, "loss": 0.6035, "step": 151 }, { "epoch": 0.7325301204819277, "grad_norm": 0.5757405161857605, "learning_rate": 9.385325200430679e-06, "loss": 0.6024, "step": 152 }, { "epoch": 0.7373493975903614, "grad_norm": 0.5733328461647034, "learning_rate": 9.371733080722911e-06, "loss": 0.6242, "step": 153 }, { "epoch": 0.7421686746987952, "grad_norm": 0.600737452507019, "learning_rate": 9.358002386349862e-06, "loss": 0.6105, "step": 154 }, { "epoch": 0.7469879518072289, "grad_norm": 0.6255360841751099, "learning_rate": 9.34413355254542e-06, "loss": 0.6112, "step": 155 }, { "epoch": 0.7518072289156627, "grad_norm": 0.5567818880081177, "learning_rate": 9.330127018922195e-06, "loss": 0.5927, "step": 156 }, { "epoch": 0.7566265060240964, "grad_norm": 0.7106531262397766, "learning_rate": 9.31598322945759e-06, "loss": 0.6112, "step": 157 }, { "epoch": 0.7614457831325301, "grad_norm": 0.5977474451065063, "learning_rate": 9.301702632479734e-06, "loss": 0.6084, "step": 158 }, { "epoch": 0.7662650602409639, "grad_norm": 0.664481520652771, "learning_rate": 9.287285680653254e-06, "loss": 0.5982, "step": 159 }, { "epoch": 0.7710843373493976, "grad_norm": 0.6926606297492981, "learning_rate": 9.272732830964948e-06, "loss": 0.6083, "step": 160 }, { "epoch": 0.7759036144578313, "grad_norm": 0.49633654952049255, "learning_rate": 9.258044544709276e-06, "loss": 0.6157, "step": 161 }, { "epoch": 0.7807228915662651, "grad_norm": 0.6019198894500732, "learning_rate": 9.243221287473755e-06, "loss": 0.6017, "step": 162 }, { "epoch": 0.7855421686746988, "grad_norm": 0.5977044105529785, "learning_rate": 9.228263529124199e-06, "loss": 0.6164, "step": 163 }, { "epoch": 0.7903614457831325, "grad_norm": 0.6039947867393494, "learning_rate": 9.21317174378982e-06, "loss": 0.5977, "step": 164 }, { "epoch": 0.7951807228915663, "grad_norm": 0.5924364328384399, "learning_rate": 9.197946409848196e-06, "loss": 0.591, "step": 165 }, { "epoch": 0.8, "grad_norm": 0.5773972868919373, "learning_rate": 9.182588009910119e-06, "loss": 0.6025, "step": 166 }, { "epoch": 0.8048192771084337, "grad_norm": 0.5809570550918579, "learning_rate": 9.167097030804289e-06, "loss": 0.6091, "step": 167 }, { "epoch": 0.8096385542168675, "grad_norm": 0.6400211453437805, "learning_rate": 9.151473963561884e-06, "loss": 0.5995, "step": 168 }, { "epoch": 0.8144578313253013, "grad_norm": 0.5365077257156372, "learning_rate": 9.135719303400995e-06, "loss": 0.6015, "step": 169 }, { "epoch": 0.8192771084337349, "grad_norm": 0.5059504508972168, "learning_rate": 9.119833549710927e-06, "loss": 0.5981, "step": 170 }, { "epoch": 0.8240963855421687, "grad_norm": 0.6700984835624695, "learning_rate": 9.103817206036383e-06, "loss": 0.623, "step": 171 }, { "epoch": 0.8289156626506025, "grad_norm": 0.48179715871810913, "learning_rate": 9.087670780061477e-06, "loss": 0.6016, "step": 172 }, { "epoch": 0.8337349397590361, "grad_norm": 0.571553647518158, "learning_rate": 9.071394783593664e-06, "loss": 0.5971, "step": 173 }, { "epoch": 0.8385542168674699, "grad_norm": 0.5676701664924622, "learning_rate": 9.054989732547507e-06, "loss": 0.5975, "step": 174 }, { "epoch": 0.8433734939759037, "grad_norm": 0.5889990925788879, "learning_rate": 9.038456146928325e-06, "loss": 0.6008, "step": 175 }, { "epoch": 0.8481927710843373, "grad_norm": 0.5731596946716309, "learning_rate": 9.021794550815713e-06, "loss": 0.6061, "step": 176 }, { "epoch": 0.8530120481927711, "grad_norm": 0.5482961535453796, "learning_rate": 9.005005472346923e-06, "loss": 0.621, "step": 177 }, { "epoch": 0.8578313253012049, "grad_norm": 0.5580418109893799, "learning_rate": 8.988089443700131e-06, "loss": 0.5861, "step": 178 }, { "epoch": 0.8626506024096385, "grad_norm": 0.6586818099021912, "learning_rate": 8.971047001077561e-06, "loss": 0.5879, "step": 179 }, { "epoch": 0.8674698795180723, "grad_norm": 0.5529095530509949, "learning_rate": 8.953878684688492e-06, "loss": 0.5908, "step": 180 }, { "epoch": 0.8722891566265061, "grad_norm": 0.7042589783668518, "learning_rate": 8.936585038732143e-06, "loss": 0.6026, "step": 181 }, { "epoch": 0.8771084337349397, "grad_norm": 0.5728626251220703, "learning_rate": 8.919166611380397e-06, "loss": 0.5951, "step": 182 }, { "epoch": 0.8819277108433735, "grad_norm": 0.6059430837631226, "learning_rate": 8.90162395476046e-06, "loss": 0.6004, "step": 183 }, { "epoch": 0.8867469879518072, "grad_norm": 0.5582807660102844, "learning_rate": 8.883957624937333e-06, "loss": 0.5945, "step": 184 }, { "epoch": 0.891566265060241, "grad_norm": 0.5483564138412476, "learning_rate": 8.866168181896198e-06, "loss": 0.5853, "step": 185 }, { "epoch": 0.8963855421686747, "grad_norm": 0.5057897567749023, "learning_rate": 8.848256189524661e-06, "loss": 0.5998, "step": 186 }, { "epoch": 0.9012048192771084, "grad_norm": 0.5957621932029724, "learning_rate": 8.83022221559489e-06, "loss": 0.6038, "step": 187 }, { "epoch": 0.9060240963855422, "grad_norm": 0.6007516980171204, "learning_rate": 8.812066831745602e-06, "loss": 0.5897, "step": 188 }, { "epoch": 0.9108433734939759, "grad_norm": 0.603797435760498, "learning_rate": 8.793790613463956e-06, "loss": 0.6006, "step": 189 }, { "epoch": 0.9156626506024096, "grad_norm": 0.6564244627952576, "learning_rate": 8.775394140067299e-06, "loss": 0.597, "step": 190 }, { "epoch": 0.9204819277108434, "grad_norm": 0.7330297231674194, "learning_rate": 8.756877994684818e-06, "loss": 0.5946, "step": 191 }, { "epoch": 0.9253012048192771, "grad_norm": 0.509863555431366, "learning_rate": 8.738242764239046e-06, "loss": 0.6, "step": 192 }, { "epoch": 0.9301204819277108, "grad_norm": 0.6488967537879944, "learning_rate": 8.719489039427256e-06, "loss": 0.6097, "step": 193 }, { "epoch": 0.9349397590361446, "grad_norm": 0.5334682464599609, "learning_rate": 8.700617414702746e-06, "loss": 0.6196, "step": 194 }, { "epoch": 0.9397590361445783, "grad_norm": 0.4760792851448059, "learning_rate": 8.681628488255986e-06, "loss": 0.6079, "step": 195 }, { "epoch": 0.944578313253012, "grad_norm": 0.5883437395095825, "learning_rate": 8.66252286199567e-06, "loss": 0.6005, "step": 196 }, { "epoch": 0.9493975903614458, "grad_norm": 0.5796838998794556, "learning_rate": 8.643301141529619e-06, "loss": 0.5867, "step": 197 }, { "epoch": 0.9542168674698795, "grad_norm": 0.6529198884963989, "learning_rate": 8.6239639361456e-06, "loss": 0.5948, "step": 198 }, { "epoch": 0.9590361445783132, "grad_norm": 0.5081263184547424, "learning_rate": 8.604511858792006e-06, "loss": 0.6159, "step": 199 }, { "epoch": 0.963855421686747, "grad_norm": 0.5447860360145569, "learning_rate": 8.584945526058426e-06, "loss": 0.6027, "step": 200 }, { "epoch": 0.9686746987951808, "grad_norm": 0.5446210503578186, "learning_rate": 8.565265558156101e-06, "loss": 0.586, "step": 201 }, { "epoch": 0.9734939759036144, "grad_norm": 0.5749803781509399, "learning_rate": 8.545472578898276e-06, "loss": 0.6034, "step": 202 }, { "epoch": 0.9783132530120482, "grad_norm": 0.539688229560852, "learning_rate": 8.525567215680397e-06, "loss": 0.6044, "step": 203 }, { "epoch": 0.983132530120482, "grad_norm": 0.7089233994483948, "learning_rate": 8.505550099460264e-06, "loss": 0.6023, "step": 204 }, { "epoch": 0.9879518072289156, "grad_norm": 0.5110678672790527, "learning_rate": 8.485421864737997e-06, "loss": 0.5975, "step": 205 }, { "epoch": 0.9927710843373494, "grad_norm": 0.6963241696357727, "learning_rate": 8.465183149535939e-06, "loss": 0.5997, "step": 206 }, { "epoch": 0.9975903614457832, "grad_norm": 0.5199764370918274, "learning_rate": 8.444834595378434e-06, "loss": 0.5887, "step": 207 }, { "epoch": 1.002409638554217, "grad_norm": 1.0180178880691528, "learning_rate": 8.424376847271483e-06, "loss": 0.8314, "step": 208 }, { "epoch": 1.0072289156626506, "grad_norm": 0.6434845924377441, "learning_rate": 8.403810553682307e-06, "loss": 0.6377, "step": 209 }, { "epoch": 1.0120481927710843, "grad_norm": 0.5937601327896118, "learning_rate": 8.383136366518788e-06, "loss": 0.5636, "step": 210 }, { "epoch": 1.0168674698795181, "grad_norm": 0.5739064812660217, "learning_rate": 8.362354941108803e-06, "loss": 0.5732, "step": 211 }, { "epoch": 1.0216867469879518, "grad_norm": 0.6371908783912659, "learning_rate": 8.341466936179457e-06, "loss": 0.5808, "step": 212 }, { "epoch": 1.0265060240963855, "grad_norm": 0.5970191359519958, "learning_rate": 8.320473013836197e-06, "loss": 0.5793, "step": 213 }, { "epoch": 1.0313253012048194, "grad_norm": 0.6064131259918213, "learning_rate": 8.299373839541829e-06, "loss": 0.5324, "step": 214 }, { "epoch": 1.036144578313253, "grad_norm": 0.7945104837417603, "learning_rate": 8.278170082095422e-06, "loss": 0.5765, "step": 215 }, { "epoch": 1.0409638554216867, "grad_norm": 0.5729210376739502, "learning_rate": 8.256862413611113e-06, "loss": 0.578, "step": 216 }, { "epoch": 1.0457831325301206, "grad_norm": 0.6838613152503967, "learning_rate": 8.23545150949679e-06, "loss": 0.5703, "step": 217 }, { "epoch": 1.0506024096385542, "grad_norm": 0.5426240563392639, "learning_rate": 8.213938048432697e-06, "loss": 0.5667, "step": 218 }, { "epoch": 1.0554216867469879, "grad_norm": 0.6620539426803589, "learning_rate": 8.192322712349917e-06, "loss": 0.5592, "step": 219 }, { "epoch": 1.0602409638554218, "grad_norm": 0.6412476897239685, "learning_rate": 8.170606186408756e-06, "loss": 0.5855, "step": 220 }, { "epoch": 1.0650602409638554, "grad_norm": 0.6816362142562866, "learning_rate": 8.148789158977012e-06, "loss": 0.5883, "step": 221 }, { "epoch": 1.069879518072289, "grad_norm": 0.5668280124664307, "learning_rate": 8.126872321608185e-06, "loss": 0.5248, "step": 222 }, { "epoch": 1.074698795180723, "grad_norm": 0.5772892832756042, "learning_rate": 8.104856369019525e-06, "loss": 0.6062, "step": 223 }, { "epoch": 1.0795180722891566, "grad_norm": 0.7769073247909546, "learning_rate": 8.08274199907003e-06, "loss": 0.5695, "step": 224 }, { "epoch": 1.0843373493975903, "grad_norm": 0.5870208740234375, "learning_rate": 8.060529912738316e-06, "loss": 0.543, "step": 225 }, { "epoch": 1.0891566265060242, "grad_norm": 0.8082980513572693, "learning_rate": 8.038220814100403e-06, "loss": 0.5556, "step": 226 }, { "epoch": 1.0939759036144578, "grad_norm": 0.5669785141944885, "learning_rate": 8.0158154103074e-06, "loss": 0.5608, "step": 227 }, { "epoch": 1.0987951807228915, "grad_norm": 0.7154496908187866, "learning_rate": 7.993314411563075e-06, "loss": 0.5701, "step": 228 }, { "epoch": 1.1036144578313254, "grad_norm": 0.5299010276794434, "learning_rate": 7.970718531101365e-06, "loss": 0.5321, "step": 229 }, { "epoch": 1.108433734939759, "grad_norm": 0.5549928545951843, "learning_rate": 7.948028485163744e-06, "loss": 0.6037, "step": 230 }, { "epoch": 1.1132530120481927, "grad_norm": 0.5550109148025513, "learning_rate": 7.925244992976538e-06, "loss": 0.5405, "step": 231 }, { "epoch": 1.1180722891566266, "grad_norm": 0.5465673208236694, "learning_rate": 7.902368776728125e-06, "loss": 0.5652, "step": 232 }, { "epoch": 1.1228915662650603, "grad_norm": 0.5998708009719849, "learning_rate": 7.879400561546033e-06, "loss": 0.5237, "step": 233 }, { "epoch": 1.127710843373494, "grad_norm": 0.5261906981468201, "learning_rate": 7.856341075473963e-06, "loss": 0.5947, "step": 234 }, { "epoch": 1.1325301204819278, "grad_norm": 0.5877994894981384, "learning_rate": 7.833191049448706e-06, "loss": 0.5326, "step": 235 }, { "epoch": 1.1373493975903615, "grad_norm": 0.49554064869880676, "learning_rate": 7.809951217276986e-06, "loss": 0.5494, "step": 236 }, { "epoch": 1.1421686746987951, "grad_norm": 0.6176630258560181, "learning_rate": 7.786622315612182e-06, "loss": 0.6198, "step": 237 }, { "epoch": 1.146987951807229, "grad_norm": 0.5112401247024536, "learning_rate": 7.763205083930995e-06, "loss": 0.5629, "step": 238 }, { "epoch": 1.1518072289156627, "grad_norm": 0.5183048844337463, "learning_rate": 7.739700264509993e-06, "loss": 0.5139, "step": 239 }, { "epoch": 1.1566265060240963, "grad_norm": 0.5413950681686401, "learning_rate": 7.716108602402094e-06, "loss": 0.6041, "step": 240 }, { "epoch": 1.16144578313253, "grad_norm": 0.526046872138977, "learning_rate": 7.692430845412946e-06, "loss": 0.5592, "step": 241 }, { "epoch": 1.1662650602409639, "grad_norm": 0.5707272887229919, "learning_rate": 7.668667744077215e-06, "loss": 0.546, "step": 242 }, { "epoch": 1.1710843373493975, "grad_norm": 0.6153143048286438, "learning_rate": 7.644820051634813e-06, "loss": 0.561, "step": 243 }, { "epoch": 1.1759036144578312, "grad_norm": 0.5286562442779541, "learning_rate": 7.6208885240069995e-06, "loss": 0.5251, "step": 244 }, { "epoch": 1.180722891566265, "grad_norm": 0.5454757809638977, "learning_rate": 7.596873919772438e-06, "loss": 0.5976, "step": 245 }, { "epoch": 1.1855421686746987, "grad_norm": 0.6441790461540222, "learning_rate": 7.572777000143145e-06, "loss": 0.5747, "step": 246 }, { "epoch": 1.1903614457831324, "grad_norm": 0.5183297395706177, "learning_rate": 7.548598528940354e-06, "loss": 0.4837, "step": 247 }, { "epoch": 1.1951807228915663, "grad_norm": 0.5835433006286621, "learning_rate": 7.524339272570317e-06, "loss": 0.6416, "step": 248 }, { "epoch": 1.2, "grad_norm": 0.5134496092796326, "learning_rate": 7.500000000000001e-06, "loss": 0.54, "step": 249 }, { "epoch": 1.2048192771084336, "grad_norm": 0.5053899884223938, "learning_rate": 7.475581482732717e-06, "loss": 0.5513, "step": 250 }, { "epoch": 1.2096385542168675, "grad_norm": 0.5715299844741821, "learning_rate": 7.451084494783668e-06, "loss": 0.5604, "step": 251 }, { "epoch": 1.2144578313253012, "grad_norm": 0.5025492310523987, "learning_rate": 7.4265098126554065e-06, "loss": 0.5426, "step": 252 }, { "epoch": 1.2192771084337348, "grad_norm": 0.5954846739768982, "learning_rate": 7.401858215313228e-06, "loss": 0.5872, "step": 253 }, { "epoch": 1.2240963855421687, "grad_norm": 0.4413267970085144, "learning_rate": 7.3771304841604764e-06, "loss": 0.5803, "step": 254 }, { "epoch": 1.2289156626506024, "grad_norm": 0.5452926754951477, "learning_rate": 7.352327403013779e-06, "loss": 0.5996, "step": 255 }, { "epoch": 1.233734939759036, "grad_norm": 0.4635935127735138, "learning_rate": 7.327449758078194e-06, "loss": 0.5109, "step": 256 }, { "epoch": 1.23855421686747, "grad_norm": 0.49783962965011597, "learning_rate": 7.302498337922293e-06, "loss": 0.5357, "step": 257 }, { "epoch": 1.2433734939759036, "grad_norm": 0.5647582411766052, "learning_rate": 7.27747393345317e-06, "loss": 0.5724, "step": 258 }, { "epoch": 1.2481927710843372, "grad_norm": 0.49849966168403625, "learning_rate": 7.2523773378913655e-06, "loss": 0.5722, "step": 259 }, { "epoch": 1.2530120481927711, "grad_norm": 0.5430841445922852, "learning_rate": 7.2272093467457226e-06, "loss": 0.5995, "step": 260 }, { "epoch": 1.2578313253012048, "grad_norm": 0.583263099193573, "learning_rate": 7.201970757788172e-06, "loss": 0.5915, "step": 261 }, { "epoch": 1.2626506024096384, "grad_norm": 0.49482282996177673, "learning_rate": 7.17666237102845e-06, "loss": 0.5607, "step": 262 }, { "epoch": 1.2674698795180723, "grad_norm": 0.5552793741226196, "learning_rate": 7.151284988688731e-06, "loss": 0.5993, "step": 263 }, { "epoch": 1.272289156626506, "grad_norm": 0.5349662899971008, "learning_rate": 7.125839415178204e-06, "loss": 0.5268, "step": 264 }, { "epoch": 1.2771084337349397, "grad_norm": 0.4651695191860199, "learning_rate": 7.100326457067576e-06, "loss": 0.5822, "step": 265 }, { "epoch": 1.2819277108433735, "grad_norm": 0.5936465859413147, "learning_rate": 7.074746923063497e-06, "loss": 0.5578, "step": 266 }, { "epoch": 1.2867469879518072, "grad_norm": 0.5026842355728149, "learning_rate": 7.049101623982938e-06, "loss": 0.5922, "step": 267 }, { "epoch": 1.2915662650602409, "grad_norm": 0.5798065662384033, "learning_rate": 7.02339137272748e-06, "loss": 0.5598, "step": 268 }, { "epoch": 1.2963855421686747, "grad_norm": 0.5257403254508972, "learning_rate": 6.9976169842575526e-06, "loss": 0.5361, "step": 269 }, { "epoch": 1.3012048192771084, "grad_norm": 0.4787715971469879, "learning_rate": 6.971779275566593e-06, "loss": 0.5669, "step": 270 }, { "epoch": 1.306024096385542, "grad_norm": 0.5384206175804138, "learning_rate": 6.945879065655164e-06, "loss": 0.5447, "step": 271 }, { "epoch": 1.310843373493976, "grad_norm": 0.45355817675590515, "learning_rate": 6.919917175504978e-06, "loss": 0.6003, "step": 272 }, { "epoch": 1.3156626506024096, "grad_norm": 0.5099201798439026, "learning_rate": 6.893894428052881e-06, "loss": 0.5545, "step": 273 }, { "epoch": 1.3204819277108433, "grad_norm": 0.4605954587459564, "learning_rate": 6.867811648164769e-06, "loss": 0.5668, "step": 274 }, { "epoch": 1.3253012048192772, "grad_norm": 0.5192990899085999, "learning_rate": 6.841669662609437e-06, "loss": 0.577, "step": 275 }, { "epoch": 1.3301204819277108, "grad_norm": 0.48977819085121155, "learning_rate": 6.815469300032374e-06, "loss": 0.5341, "step": 276 }, { "epoch": 1.3349397590361445, "grad_norm": 0.5304272174835205, "learning_rate": 6.789211390929497e-06, "loss": 0.565, "step": 277 }, { "epoch": 1.3397590361445784, "grad_norm": 0.6027169823646545, "learning_rate": 6.762896767620827e-06, "loss": 0.5453, "step": 278 }, { "epoch": 1.344578313253012, "grad_norm": 0.531121551990509, "learning_rate": 6.736526264224101e-06, "loss": 0.5959, "step": 279 }, { "epoch": 1.3493975903614457, "grad_norm": 0.5106316208839417, "learning_rate": 6.710100716628345e-06, "loss": 0.5563, "step": 280 }, { "epoch": 1.3542168674698796, "grad_norm": 0.5311499238014221, "learning_rate": 6.6836209624673575e-06, "loss": 0.5956, "step": 281 }, { "epoch": 1.3590361445783132, "grad_norm": 0.4638383686542511, "learning_rate": 6.657087841093179e-06, "loss": 0.5184, "step": 282 }, { "epoch": 1.363855421686747, "grad_norm": 0.5220621824264526, "learning_rate": 6.6305021935494755e-06, "loss": 0.6286, "step": 283 }, { "epoch": 1.3686746987951808, "grad_norm": 0.4436984956264496, "learning_rate": 6.603864862544879e-06, "loss": 0.5357, "step": 284 }, { "epoch": 1.3734939759036144, "grad_norm": 0.46485018730163574, "learning_rate": 6.5771766924262795e-06, "loss": 0.5653, "step": 285 }, { "epoch": 1.378313253012048, "grad_norm": 0.5112223625183105, "learning_rate": 6.5504385291520554e-06, "loss": 0.5715, "step": 286 }, { "epoch": 1.383132530120482, "grad_norm": 0.4580565094947815, "learning_rate": 6.523651220265269e-06, "loss": 0.5742, "step": 287 }, { "epoch": 1.3879518072289156, "grad_norm": 0.43429890275001526, "learning_rate": 6.496815614866792e-06, "loss": 0.5597, "step": 288 }, { "epoch": 1.3927710843373493, "grad_norm": 0.4501931667327881, "learning_rate": 6.469932563588386e-06, "loss": 0.566, "step": 289 }, { "epoch": 1.3975903614457832, "grad_norm": 0.4644792377948761, "learning_rate": 6.443002918565754e-06, "loss": 0.5657, "step": 290 }, { "epoch": 1.4024096385542169, "grad_norm": 0.4781135618686676, "learning_rate": 6.41602753341152e-06, "loss": 0.5773, "step": 291 }, { "epoch": 1.4072289156626505, "grad_norm": 0.5058557391166687, "learning_rate": 6.389007263188176e-06, "loss": 0.5463, "step": 292 }, { "epoch": 1.4120481927710844, "grad_norm": 0.4932103753089905, "learning_rate": 6.361942964380967e-06, "loss": 0.5801, "step": 293 }, { "epoch": 1.416867469879518, "grad_norm": 0.5443869829177856, "learning_rate": 6.334835494870759e-06, "loss": 0.5264, "step": 294 }, { "epoch": 1.4216867469879517, "grad_norm": 0.5028607845306396, "learning_rate": 6.307685713906835e-06, "loss": 0.6111, "step": 295 }, { "epoch": 1.4265060240963856, "grad_norm": 0.4613734781742096, "learning_rate": 6.2804944820796596e-06, "loss": 0.5421, "step": 296 }, { "epoch": 1.4313253012048193, "grad_norm": 0.5405146479606628, "learning_rate": 6.2532626612936035e-06, "loss": 0.6173, "step": 297 }, { "epoch": 1.436144578313253, "grad_norm": 0.4562685787677765, "learning_rate": 6.225991114739622e-06, "loss": 0.534, "step": 298 }, { "epoch": 1.4409638554216868, "grad_norm": 0.5416148900985718, "learning_rate": 6.1986807068678926e-06, "loss": 0.5861, "step": 299 }, { "epoch": 1.4457831325301205, "grad_norm": 0.5628028512001038, "learning_rate": 6.171332303360411e-06, "loss": 0.5484, "step": 300 }, { "epoch": 1.4506024096385541, "grad_norm": 0.4369142949581146, "learning_rate": 6.143946771103561e-06, "loss": 0.5304, "step": 301 }, { "epoch": 1.455421686746988, "grad_norm": 0.5397925972938538, "learning_rate": 6.11652497816062e-06, "loss": 0.5407, "step": 302 }, { "epoch": 1.4602409638554217, "grad_norm": 0.4883024990558624, "learning_rate": 6.089067793744258e-06, "loss": 0.588, "step": 303 }, { "epoch": 1.4650602409638553, "grad_norm": 0.4950958490371704, "learning_rate": 6.061576088188981e-06, "loss": 0.5884, "step": 304 }, { "epoch": 1.4698795180722892, "grad_norm": 0.4288600981235504, "learning_rate": 6.034050732923538e-06, "loss": 0.555, "step": 305 }, { "epoch": 1.4746987951807229, "grad_norm": 0.5213388800621033, "learning_rate": 6.006492600443301e-06, "loss": 0.5619, "step": 306 }, { "epoch": 1.4795180722891565, "grad_norm": 0.4272007346153259, "learning_rate": 5.978902564282616e-06, "loss": 0.5384, "step": 307 }, { "epoch": 1.4843373493975904, "grad_norm": 0.5602285861968994, "learning_rate": 5.951281498987106e-06, "loss": 0.5827, "step": 308 }, { "epoch": 1.489156626506024, "grad_norm": 0.47353655099868774, "learning_rate": 5.923630280085948e-06, "loss": 0.541, "step": 309 }, { "epoch": 1.4939759036144578, "grad_norm": 0.5271124839782715, "learning_rate": 5.895949784064126e-06, "loss": 0.5503, "step": 310 }, { "epoch": 1.4987951807228916, "grad_norm": 0.5316128134727478, "learning_rate": 5.8682408883346535e-06, "loss": 0.5409, "step": 311 }, { "epoch": 1.5036144578313253, "grad_norm": 0.553626298904419, "learning_rate": 5.840504471210742e-06, "loss": 0.5777, "step": 312 }, { "epoch": 1.508433734939759, "grad_norm": 0.48189061880111694, "learning_rate": 5.8127414118779825e-06, "loss": 0.5784, "step": 313 }, { "epoch": 1.5132530120481928, "grad_norm": 0.6508918404579163, "learning_rate": 5.7849525903664636e-06, "loss": 0.5652, "step": 314 }, { "epoch": 1.5180722891566265, "grad_norm": 0.4616662859916687, "learning_rate": 5.757138887522884e-06, "loss": 0.5433, "step": 315 }, { "epoch": 1.5228915662650602, "grad_norm": 0.5459868311882019, "learning_rate": 5.729301184982622e-06, "loss": 0.5303, "step": 316 }, { "epoch": 1.527710843373494, "grad_norm": 0.5954388976097107, "learning_rate": 5.701440365141799e-06, "loss": 0.584, "step": 317 }, { "epoch": 1.5325301204819277, "grad_norm": 0.4622243344783783, "learning_rate": 5.673557311129306e-06, "loss": 0.5334, "step": 318 }, { "epoch": 1.5373493975903614, "grad_norm": 0.6150903701782227, "learning_rate": 5.645652906778808e-06, "loss": 0.5803, "step": 319 }, { "epoch": 1.5421686746987953, "grad_norm": 0.6536097526550293, "learning_rate": 5.617728036600734e-06, "loss": 0.6041, "step": 320 }, { "epoch": 1.546987951807229, "grad_norm": 0.48668476939201355, "learning_rate": 5.5897835857542315e-06, "loss": 0.5577, "step": 321 }, { "epoch": 1.5518072289156626, "grad_norm": 0.5237818956375122, "learning_rate": 5.561820440019117e-06, "loss": 0.5162, "step": 322 }, { "epoch": 1.5566265060240965, "grad_norm": 0.5246903896331787, "learning_rate": 5.533839485767795e-06, "loss": 0.5891, "step": 323 }, { "epoch": 1.5614457831325301, "grad_norm": 0.5068934559822083, "learning_rate": 5.505841609937162e-06, "loss": 0.5608, "step": 324 }, { "epoch": 1.5662650602409638, "grad_norm": 0.5139860510826111, "learning_rate": 5.477827700000492e-06, "loss": 0.563, "step": 325 }, { "epoch": 1.5710843373493977, "grad_norm": 0.49885353446006775, "learning_rate": 5.449798643939305e-06, "loss": 0.5472, "step": 326 }, { "epoch": 1.5759036144578313, "grad_norm": 0.5572274327278137, "learning_rate": 5.421755330215223e-06, "loss": 0.5805, "step": 327 }, { "epoch": 1.580722891566265, "grad_norm": 0.5297819972038269, "learning_rate": 5.39369864774181e-06, "loss": 0.5477, "step": 328 }, { "epoch": 1.5855421686746989, "grad_norm": 0.5140382647514343, "learning_rate": 5.365629485856381e-06, "loss": 0.5239, "step": 329 }, { "epoch": 1.5903614457831325, "grad_norm": 0.5845142006874084, "learning_rate": 5.337548734291827e-06, "loss": 0.5787, "step": 330 }, { "epoch": 1.5951807228915662, "grad_norm": 0.4414353668689728, "learning_rate": 5.30945728314841e-06, "loss": 0.5797, "step": 331 }, { "epoch": 1.6, "grad_norm": 0.49196693301200867, "learning_rate": 5.281356022865542e-06, "loss": 0.5767, "step": 332 }, { "epoch": 1.6048192771084338, "grad_norm": 0.5447960495948792, "learning_rate": 5.253245844193564e-06, "loss": 0.5502, "step": 333 }, { "epoch": 1.6096385542168674, "grad_norm": 0.4547137916088104, "learning_rate": 5.225127638165514e-06, "loss": 0.5579, "step": 334 }, { "epoch": 1.6144578313253013, "grad_norm": 0.5411679148674011, "learning_rate": 5.197002296068878e-06, "loss": 0.5754, "step": 335 }, { "epoch": 1.619277108433735, "grad_norm": 0.45455247163772583, "learning_rate": 5.168870709417342e-06, "loss": 0.5283, "step": 336 }, { "epoch": 1.6240963855421686, "grad_norm": 0.5107097029685974, "learning_rate": 5.140733769922525e-06, "loss": 0.5377, "step": 337 }, { "epoch": 1.6289156626506025, "grad_norm": 0.495237797498703, "learning_rate": 5.112592369465731e-06, "loss": 0.616, "step": 338 }, { "epoch": 1.6337349397590362, "grad_norm": 0.5090997219085693, "learning_rate": 5.084447400069656e-06, "loss": 0.5562, "step": 339 }, { "epoch": 1.6385542168674698, "grad_norm": 0.4626471996307373, "learning_rate": 5.0562997538701295e-06, "loss": 0.5302, "step": 340 }, { "epoch": 1.6433734939759037, "grad_norm": 0.5563095211982727, "learning_rate": 5.0281503230878304e-06, "loss": 0.5631, "step": 341 }, { "epoch": 1.6481927710843374, "grad_norm": 0.4692661166191101, "learning_rate": 5e-06, "loss": 0.609, "step": 342 }, { "epoch": 1.653012048192771, "grad_norm": 0.4281330108642578, "learning_rate": 4.971849676912172e-06, "loss": 0.5204, "step": 343 }, { "epoch": 1.657831325301205, "grad_norm": 0.5017001032829285, "learning_rate": 4.943700246129871e-06, "loss": 0.5618, "step": 344 }, { "epoch": 1.6626506024096386, "grad_norm": 0.47061556577682495, "learning_rate": 4.915552599930345e-06, "loss": 0.5413, "step": 345 }, { "epoch": 1.6674698795180722, "grad_norm": 0.5569798350334167, "learning_rate": 4.887407630534271e-06, "loss": 0.5217, "step": 346 }, { "epoch": 1.6722891566265061, "grad_norm": 0.4373623728752136, "learning_rate": 4.859266230077474e-06, "loss": 0.577, "step": 347 }, { "epoch": 1.6771084337349398, "grad_norm": 0.4877110719680786, "learning_rate": 4.83112929058266e-06, "loss": 0.586, "step": 348 }, { "epoch": 1.6819277108433734, "grad_norm": 0.5045443773269653, "learning_rate": 4.802997703931124e-06, "loss": 0.5214, "step": 349 }, { "epoch": 1.6867469879518073, "grad_norm": 0.4335879683494568, "learning_rate": 4.7748723618344865e-06, "loss": 0.5975, "step": 350 }, { "epoch": 1.691566265060241, "grad_norm": 0.47140806913375854, "learning_rate": 4.746754155806437e-06, "loss": 0.5399, "step": 351 }, { "epoch": 1.6963855421686747, "grad_norm": 0.44901373982429504, "learning_rate": 4.71864397713446e-06, "loss": 0.617, "step": 352 }, { "epoch": 1.7012048192771085, "grad_norm": 0.3893554210662842, "learning_rate": 4.6905427168515914e-06, "loss": 0.4837, "step": 353 }, { "epoch": 1.7060240963855422, "grad_norm": 0.5149268507957458, "learning_rate": 4.662451265708174e-06, "loss": 0.5979, "step": 354 }, { "epoch": 1.7108433734939759, "grad_norm": 0.3967888057231903, "learning_rate": 4.63437051414362e-06, "loss": 0.5654, "step": 355 }, { "epoch": 1.7156626506024097, "grad_norm": 0.4324769675731659, "learning_rate": 4.606301352258192e-06, "loss": 0.5548, "step": 356 }, { "epoch": 1.7204819277108434, "grad_norm": 0.44806748628616333, "learning_rate": 4.5782446697847775e-06, "loss": 0.5607, "step": 357 }, { "epoch": 1.725301204819277, "grad_norm": 0.457933634519577, "learning_rate": 4.550201356060695e-06, "loss": 0.5773, "step": 358 }, { "epoch": 1.730120481927711, "grad_norm": 0.41432875394821167, "learning_rate": 4.52217229999951e-06, "loss": 0.5119, "step": 359 }, { "epoch": 1.7349397590361446, "grad_norm": 0.46618443727493286, "learning_rate": 4.49415839006284e-06, "loss": 0.5282, "step": 360 }, { "epoch": 1.7397590361445783, "grad_norm": 0.4658416509628296, "learning_rate": 4.466160514232206e-06, "loss": 0.5911, "step": 361 }, { "epoch": 1.7445783132530122, "grad_norm": 0.45206916332244873, "learning_rate": 4.438179559980885e-06, "loss": 0.5368, "step": 362 }, { "epoch": 1.7493975903614458, "grad_norm": 0.5187458395957947, "learning_rate": 4.410216414245771e-06, "loss": 0.5681, "step": 363 }, { "epoch": 1.7542168674698795, "grad_norm": 0.44342824816703796, "learning_rate": 4.382271963399268e-06, "loss": 0.5654, "step": 364 }, { "epoch": 1.7590361445783134, "grad_norm": 0.45453277230262756, "learning_rate": 4.354347093221194e-06, "loss": 0.5477, "step": 365 }, { "epoch": 1.763855421686747, "grad_norm": 0.5282381176948547, "learning_rate": 4.326442688870697e-06, "loss": 0.5759, "step": 366 }, { "epoch": 1.7686746987951807, "grad_norm": 0.4427390694618225, "learning_rate": 4.298559634858202e-06, "loss": 0.5478, "step": 367 }, { "epoch": 1.7734939759036146, "grad_norm": 0.4339542090892792, "learning_rate": 4.270698815017379e-06, "loss": 0.5743, "step": 368 }, { "epoch": 1.7783132530120482, "grad_norm": 0.4502948522567749, "learning_rate": 4.2428611124771184e-06, "loss": 0.5666, "step": 369 }, { "epoch": 1.783132530120482, "grad_norm": 0.45227017998695374, "learning_rate": 4.2150474096335356e-06, "loss": 0.5561, "step": 370 }, { "epoch": 1.7879518072289158, "grad_norm": 0.41950294375419617, "learning_rate": 4.187258588122019e-06, "loss": 0.5617, "step": 371 }, { "epoch": 1.7927710843373494, "grad_norm": 0.4249022901058197, "learning_rate": 4.15949552878926e-06, "loss": 0.5506, "step": 372 }, { "epoch": 1.797590361445783, "grad_norm": 0.4221281111240387, "learning_rate": 4.131759111665349e-06, "loss": 0.5161, "step": 373 }, { "epoch": 1.802409638554217, "grad_norm": 0.48842665553092957, "learning_rate": 4.104050215935875e-06, "loss": 0.5988, "step": 374 }, { "epoch": 1.8072289156626506, "grad_norm": 0.39976850152015686, "learning_rate": 4.076369719914055e-06, "loss": 0.5246, "step": 375 }, { "epoch": 1.8120481927710843, "grad_norm": 0.4550154209136963, "learning_rate": 4.048718501012895e-06, "loss": 0.5871, "step": 376 }, { "epoch": 1.8168674698795182, "grad_norm": 0.44595929980278015, "learning_rate": 4.021097435717386e-06, "loss": 0.5507, "step": 377 }, { "epoch": 1.8216867469879519, "grad_norm": 0.407740980386734, "learning_rate": 3.993507399556699e-06, "loss": 0.5536, "step": 378 }, { "epoch": 1.8265060240963855, "grad_norm": 0.518549919128418, "learning_rate": 3.965949267076465e-06, "loss": 0.569, "step": 379 }, { "epoch": 1.8313253012048194, "grad_norm": 0.39872604608535767, "learning_rate": 3.938423911811021e-06, "loss": 0.5161, "step": 380 }, { "epoch": 1.836144578313253, "grad_norm": 0.4431675374507904, "learning_rate": 3.910932206255742e-06, "loss": 0.5988, "step": 381 }, { "epoch": 1.8409638554216867, "grad_norm": 0.43980199098587036, "learning_rate": 3.883475021839382e-06, "loss": 0.5529, "step": 382 }, { "epoch": 1.8457831325301206, "grad_norm": 0.43700751662254333, "learning_rate": 3.856053228896442e-06, "loss": 0.5552, "step": 383 }, { "epoch": 1.8506024096385543, "grad_norm": 0.4302417039871216, "learning_rate": 3.8286676966395895e-06, "loss": 0.532, "step": 384 }, { "epoch": 1.855421686746988, "grad_norm": 0.4162864089012146, "learning_rate": 3.8013192931321095e-06, "loss": 0.5583, "step": 385 }, { "epoch": 1.8602409638554218, "grad_norm": 0.43791574239730835, "learning_rate": 3.77400888526038e-06, "loss": 0.5247, "step": 386 }, { "epoch": 1.8650602409638555, "grad_norm": 0.4298801124095917, "learning_rate": 3.7467373387063973e-06, "loss": 0.5928, "step": 387 }, { "epoch": 1.8698795180722891, "grad_norm": 0.40685486793518066, "learning_rate": 3.719505517920342e-06, "loss": 0.5563, "step": 388 }, { "epoch": 1.874698795180723, "grad_norm": 0.3878326416015625, "learning_rate": 3.692314286093167e-06, "loss": 0.5353, "step": 389 }, { "epoch": 1.8795180722891565, "grad_norm": 0.4625506103038788, "learning_rate": 3.6651645051292415e-06, "loss": 0.5886, "step": 390 }, { "epoch": 1.8843373493975903, "grad_norm": 0.42777660489082336, "learning_rate": 3.6380570356190346e-06, "loss": 0.5705, "step": 391 }, { "epoch": 1.8891566265060242, "grad_norm": 0.40551841259002686, "learning_rate": 3.610992736811827e-06, "loss": 0.5354, "step": 392 }, { "epoch": 1.8939759036144577, "grad_norm": 0.42313259840011597, "learning_rate": 3.58397246658848e-06, "loss": 0.5534, "step": 393 }, { "epoch": 1.8987951807228916, "grad_norm": 0.4355023503303528, "learning_rate": 3.556997081434248e-06, "loss": 0.5733, "step": 394 }, { "epoch": 1.9036144578313254, "grad_norm": 0.42895275354385376, "learning_rate": 3.5300674364116173e-06, "loss": 0.5624, "step": 395 }, { "epoch": 1.9084337349397589, "grad_norm": 0.43042075634002686, "learning_rate": 3.5031843851332105e-06, "loss": 0.6029, "step": 396 }, { "epoch": 1.9132530120481928, "grad_norm": 0.37077224254608154, "learning_rate": 3.476348779734732e-06, "loss": 0.5392, "step": 397 }, { "epoch": 1.9180722891566266, "grad_norm": 0.4056945741176605, "learning_rate": 3.449561470847947e-06, "loss": 0.5322, "step": 398 }, { "epoch": 1.92289156626506, "grad_norm": 0.4261414706707001, "learning_rate": 3.4228233075737225e-06, "loss": 0.5752, "step": 399 }, { "epoch": 1.927710843373494, "grad_norm": 0.38658595085144043, "learning_rate": 3.3961351374551234e-06, "loss": 0.5601, "step": 400 }, { "epoch": 1.9325301204819278, "grad_norm": 0.4442770183086395, "learning_rate": 3.3694978064505258e-06, "loss": 0.6121, "step": 401 }, { "epoch": 1.9373493975903613, "grad_norm": 0.41190898418426514, "learning_rate": 3.3429121589068213e-06, "loss": 0.5003, "step": 402 }, { "epoch": 1.9421686746987952, "grad_norm": 0.3673941195011139, "learning_rate": 3.316379037532644e-06, "loss": 0.535, "step": 403 }, { "epoch": 1.946987951807229, "grad_norm": 0.4322197139263153, "learning_rate": 3.289899283371657e-06, "loss": 0.528, "step": 404 }, { "epoch": 1.9518072289156625, "grad_norm": 0.45587092638015747, "learning_rate": 3.2634737357758994e-06, "loss": 0.5608, "step": 405 }, { "epoch": 1.9566265060240964, "grad_norm": 0.4296489953994751, "learning_rate": 3.2371032323791757e-06, "loss": 0.6065, "step": 406 }, { "epoch": 1.9614457831325303, "grad_norm": 0.4287756085395813, "learning_rate": 3.2107886090705035e-06, "loss": 0.5215, "step": 407 }, { "epoch": 1.9662650602409637, "grad_norm": 0.4841082692146301, "learning_rate": 3.1845306999676274e-06, "loss": 0.5658, "step": 408 }, { "epoch": 1.9710843373493976, "grad_norm": 0.3839523196220398, "learning_rate": 3.158330337390565e-06, "loss": 0.5713, "step": 409 }, { "epoch": 1.9759036144578315, "grad_norm": 0.37823107838630676, "learning_rate": 3.132188351835232e-06, "loss": 0.5295, "step": 410 }, { "epoch": 1.980722891566265, "grad_norm": 0.4143429696559906, "learning_rate": 3.10610557194712e-06, "loss": 0.5654, "step": 411 }, { "epoch": 1.9855421686746988, "grad_norm": 0.3939684331417084, "learning_rate": 3.080082824495024e-06, "loss": 0.5429, "step": 412 }, { "epoch": 1.9903614457831327, "grad_norm": 0.4271552264690399, "learning_rate": 3.0541209343448373e-06, "loss": 0.5801, "step": 413 }, { "epoch": 1.9951807228915661, "grad_norm": 0.4314172565937042, "learning_rate": 3.0282207244334084e-06, "loss": 0.5558, "step": 414 }, { "epoch": 2.0, "grad_norm": 0.6961327195167542, "learning_rate": 3.0023830157424504e-06, "loss": 0.7905, "step": 415 }, { "epoch": 2.004819277108434, "grad_norm": 0.3929261863231659, "learning_rate": 2.97660862727252e-06, "loss": 0.5401, "step": 416 }, { "epoch": 2.0096385542168673, "grad_norm": 0.46623867750167847, "learning_rate": 2.950898376017064e-06, "loss": 0.5227, "step": 417 }, { "epoch": 2.014457831325301, "grad_norm": 0.40226995944976807, "learning_rate": 2.9252530769365053e-06, "loss": 0.533, "step": 418 }, { "epoch": 2.019277108433735, "grad_norm": 0.43160927295684814, "learning_rate": 2.8996735429324256e-06, "loss": 0.5335, "step": 419 }, { "epoch": 2.0240963855421685, "grad_norm": 0.42182430624961853, "learning_rate": 2.874160584821798e-06, "loss": 0.5219, "step": 420 }, { "epoch": 2.0289156626506024, "grad_norm": 0.4267122149467468, "learning_rate": 2.848715011311271e-06, "loss": 0.5259, "step": 421 }, { "epoch": 2.0337349397590363, "grad_norm": 0.41059398651123047, "learning_rate": 2.823337628971551e-06, "loss": 0.5302, "step": 422 }, { "epoch": 2.0385542168674697, "grad_norm": 0.3666737675666809, "learning_rate": 2.7980292422118282e-06, "loss": 0.525, "step": 423 }, { "epoch": 2.0433734939759036, "grad_norm": 0.460478276014328, "learning_rate": 2.7727906532542783e-06, "loss": 0.5416, "step": 424 }, { "epoch": 2.0481927710843375, "grad_norm": 0.45187464356422424, "learning_rate": 2.7476226621086354e-06, "loss": 0.5293, "step": 425 }, { "epoch": 2.053012048192771, "grad_norm": 0.40986311435699463, "learning_rate": 2.72252606654683e-06, "loss": 0.5429, "step": 426 }, { "epoch": 2.057831325301205, "grad_norm": 0.3846244812011719, "learning_rate": 2.697501662077707e-06, "loss": 0.5322, "step": 427 }, { "epoch": 2.0626506024096387, "grad_norm": 0.44859009981155396, "learning_rate": 2.6725502419218084e-06, "loss": 0.5246, "step": 428 }, { "epoch": 2.067469879518072, "grad_norm": 0.4478023648262024, "learning_rate": 2.6476725969862227e-06, "loss": 0.5453, "step": 429 }, { "epoch": 2.072289156626506, "grad_norm": 0.4618147015571594, "learning_rate": 2.622869515839524e-06, "loss": 0.5373, "step": 430 }, { "epoch": 2.07710843373494, "grad_norm": 0.3915225863456726, "learning_rate": 2.5981417846867753e-06, "loss": 0.5298, "step": 431 }, { "epoch": 2.0819277108433734, "grad_norm": 0.41270238161087036, "learning_rate": 2.573490187344596e-06, "loss": 0.5294, "step": 432 }, { "epoch": 2.0867469879518072, "grad_norm": 0.44997212290763855, "learning_rate": 2.548915505216333e-06, "loss": 0.5152, "step": 433 }, { "epoch": 2.091566265060241, "grad_norm": 0.445286363363266, "learning_rate": 2.524418517267283e-06, "loss": 0.5349, "step": 434 }, { "epoch": 2.0963855421686746, "grad_norm": 0.4325414001941681, "learning_rate": 2.5000000000000015e-06, "loss": 0.5424, "step": 435 }, { "epoch": 2.1012048192771084, "grad_norm": 0.4230119287967682, "learning_rate": 2.4756607274296844e-06, "loss": 0.5278, "step": 436 }, { "epoch": 2.1060240963855423, "grad_norm": 0.37730202078819275, "learning_rate": 2.4514014710596467e-06, "loss": 0.5133, "step": 437 }, { "epoch": 2.1108433734939758, "grad_norm": 0.41996341943740845, "learning_rate": 2.4272229998568576e-06, "loss": 0.5367, "step": 438 }, { "epoch": 2.1156626506024097, "grad_norm": 0.3746831715106964, "learning_rate": 2.4031260802275623e-06, "loss": 0.5228, "step": 439 }, { "epoch": 2.1204819277108435, "grad_norm": 0.39061239361763, "learning_rate": 2.3791114759930013e-06, "loss": 0.5354, "step": 440 }, { "epoch": 2.125301204819277, "grad_norm": 0.406981498003006, "learning_rate": 2.3551799483651894e-06, "loss": 0.5279, "step": 441 }, { "epoch": 2.130120481927711, "grad_norm": 0.4518032670021057, "learning_rate": 2.331332255922784e-06, "loss": 0.5394, "step": 442 }, { "epoch": 2.1349397590361447, "grad_norm": 0.40522995591163635, "learning_rate": 2.307569154587056e-06, "loss": 0.539, "step": 443 }, { "epoch": 2.139759036144578, "grad_norm": 0.3735210597515106, "learning_rate": 2.283891397597908e-06, "loss": 0.5265, "step": 444 }, { "epoch": 2.144578313253012, "grad_norm": 0.4129243791103363, "learning_rate": 2.2602997354900075e-06, "loss": 0.5319, "step": 445 }, { "epoch": 2.149397590361446, "grad_norm": 0.36963480710983276, "learning_rate": 2.236794916069007e-06, "loss": 0.5279, "step": 446 }, { "epoch": 2.1542168674698794, "grad_norm": 0.3678368926048279, "learning_rate": 2.2133776843878185e-06, "loss": 0.5317, "step": 447 }, { "epoch": 2.1590361445783133, "grad_norm": 0.37665683031082153, "learning_rate": 2.190048782723015e-06, "loss": 0.5339, "step": 448 }, { "epoch": 2.163855421686747, "grad_norm": 0.37703850865364075, "learning_rate": 2.166808950551296e-06, "loss": 0.5406, "step": 449 }, { "epoch": 2.1686746987951806, "grad_norm": 0.40551963448524475, "learning_rate": 2.1436589245260375e-06, "loss": 0.5373, "step": 450 }, { "epoch": 2.1734939759036145, "grad_norm": 0.35308101773262024, "learning_rate": 2.120599438453968e-06, "loss": 0.5154, "step": 451 }, { "epoch": 2.1783132530120484, "grad_norm": 0.35655319690704346, "learning_rate": 2.0976312232718763e-06, "loss": 0.5408, "step": 452 }, { "epoch": 2.183132530120482, "grad_norm": 0.3718145489692688, "learning_rate": 2.074755007023461e-06, "loss": 0.5486, "step": 453 }, { "epoch": 2.1879518072289157, "grad_norm": 0.40163010358810425, "learning_rate": 2.0519715148362585e-06, "loss": 0.5233, "step": 454 }, { "epoch": 2.1927710843373496, "grad_norm": 0.41173961758613586, "learning_rate": 2.0292814688986375e-06, "loss": 0.5135, "step": 455 }, { "epoch": 2.197590361445783, "grad_norm": 0.3652428686618805, "learning_rate": 2.0066855884369246e-06, "loss": 0.5151, "step": 456 }, { "epoch": 2.202409638554217, "grad_norm": 0.36961421370506287, "learning_rate": 1.9841845896926022e-06, "loss": 0.5446, "step": 457 }, { "epoch": 2.207228915662651, "grad_norm": 0.3552957773208618, "learning_rate": 1.961779185899597e-06, "loss": 0.5252, "step": 458 }, { "epoch": 2.212048192771084, "grad_norm": 0.3752812147140503, "learning_rate": 1.9394700872616856e-06, "loss": 0.5395, "step": 459 }, { "epoch": 2.216867469879518, "grad_norm": 0.38084596395492554, "learning_rate": 1.9172580009299735e-06, "loss": 0.5256, "step": 460 }, { "epoch": 2.221686746987952, "grad_norm": 0.3959069848060608, "learning_rate": 1.8951436309804766e-06, "loss": 0.5474, "step": 461 }, { "epoch": 2.2265060240963854, "grad_norm": 0.4074706435203552, "learning_rate": 1.8731276783918162e-06, "loss": 0.5334, "step": 462 }, { "epoch": 2.2313253012048193, "grad_norm": 0.37793341279029846, "learning_rate": 1.8512108410229878e-06, "loss": 0.5284, "step": 463 }, { "epoch": 2.236144578313253, "grad_norm": 0.369093656539917, "learning_rate": 1.8293938135912475e-06, "loss": 0.5312, "step": 464 }, { "epoch": 2.2409638554216866, "grad_norm": 0.4202345609664917, "learning_rate": 1.8076772876500831e-06, "loss": 0.5063, "step": 465 }, { "epoch": 2.2457831325301205, "grad_norm": 0.36291444301605225, "learning_rate": 1.7860619515673034e-06, "loss": 0.5294, "step": 466 }, { "epoch": 2.2506024096385544, "grad_norm": 0.3744347095489502, "learning_rate": 1.7645484905032129e-06, "loss": 0.5451, "step": 467 }, { "epoch": 2.255421686746988, "grad_norm": 0.41913270950317383, "learning_rate": 1.74313758638889e-06, "loss": 0.5137, "step": 468 }, { "epoch": 2.2602409638554217, "grad_norm": 0.3765053153038025, "learning_rate": 1.7218299179045789e-06, "loss": 0.5371, "step": 469 }, { "epoch": 2.2650602409638556, "grad_norm": 0.33764129877090454, "learning_rate": 1.7006261604581725e-06, "loss": 0.5358, "step": 470 }, { "epoch": 2.269879518072289, "grad_norm": 0.36633729934692383, "learning_rate": 1.6795269861638041e-06, "loss": 0.5336, "step": 471 }, { "epoch": 2.274698795180723, "grad_norm": 0.37890884280204773, "learning_rate": 1.6585330638205454e-06, "loss": 0.5236, "step": 472 }, { "epoch": 2.279518072289157, "grad_norm": 0.3956688344478607, "learning_rate": 1.6376450588911985e-06, "loss": 0.5505, "step": 473 }, { "epoch": 2.2843373493975903, "grad_norm": 0.3635486662387848, "learning_rate": 1.6168636334812126e-06, "loss": 0.5234, "step": 474 }, { "epoch": 2.289156626506024, "grad_norm": 0.3534378707408905, "learning_rate": 1.5961894463176942e-06, "loss": 0.5181, "step": 475 }, { "epoch": 2.293975903614458, "grad_norm": 0.34564465284347534, "learning_rate": 1.5756231527285181e-06, "loss": 0.5247, "step": 476 }, { "epoch": 2.2987951807228915, "grad_norm": 0.39127737283706665, "learning_rate": 1.555165404621567e-06, "loss": 0.5445, "step": 477 }, { "epoch": 2.3036144578313253, "grad_norm": 0.3836536407470703, "learning_rate": 1.5348168504640631e-06, "loss": 0.5235, "step": 478 }, { "epoch": 2.3084337349397592, "grad_norm": 0.361017107963562, "learning_rate": 1.5145781352620054e-06, "loss": 0.5248, "step": 479 }, { "epoch": 2.3132530120481927, "grad_norm": 0.3382739722728729, "learning_rate": 1.4944499005397372e-06, "loss": 0.5279, "step": 480 }, { "epoch": 2.3180722891566266, "grad_norm": 0.3852854073047638, "learning_rate": 1.4744327843196043e-06, "loss": 0.5159, "step": 481 }, { "epoch": 2.32289156626506, "grad_norm": 0.3639327883720398, "learning_rate": 1.4545274211017264e-06, "loss": 0.524, "step": 482 }, { "epoch": 2.327710843373494, "grad_norm": 0.3571608066558838, "learning_rate": 1.434734441843899e-06, "loss": 0.5219, "step": 483 }, { "epoch": 2.3325301204819278, "grad_norm": 0.3344271183013916, "learning_rate": 1.4150544739415755e-06, "loss": 0.5267, "step": 484 }, { "epoch": 2.337349397590361, "grad_norm": 0.35768088698387146, "learning_rate": 1.3954881412079945e-06, "loss": 0.5165, "step": 485 }, { "epoch": 2.342168674698795, "grad_norm": 0.36367958784103394, "learning_rate": 1.3760360638544012e-06, "loss": 0.5154, "step": 486 }, { "epoch": 2.346987951807229, "grad_norm": 0.36191970109939575, "learning_rate": 1.3566988584703817e-06, "loss": 0.5112, "step": 487 }, { "epoch": 2.3518072289156624, "grad_norm": 0.3416457772254944, "learning_rate": 1.3374771380043306e-06, "loss": 0.5161, "step": 488 }, { "epoch": 2.3566265060240963, "grad_norm": 0.3561415374279022, "learning_rate": 1.3183715117440143e-06, "loss": 0.5121, "step": 489 }, { "epoch": 2.36144578313253, "grad_norm": 0.35016006231307983, "learning_rate": 1.2993825852972559e-06, "loss": 0.5324, "step": 490 }, { "epoch": 2.3662650602409636, "grad_norm": 0.3487759530544281, "learning_rate": 1.280510960572745e-06, "loss": 0.5243, "step": 491 }, { "epoch": 2.3710843373493975, "grad_norm": 0.3596084713935852, "learning_rate": 1.2617572357609565e-06, "loss": 0.5138, "step": 492 }, { "epoch": 2.3759036144578314, "grad_norm": 0.3738745152950287, "learning_rate": 1.2431220053151832e-06, "loss": 0.5235, "step": 493 }, { "epoch": 2.380722891566265, "grad_norm": 0.3412809371948242, "learning_rate": 1.2246058599327021e-06, "loss": 0.5283, "step": 494 }, { "epoch": 2.3855421686746987, "grad_norm": 0.3288966119289398, "learning_rate": 1.2062093865360458e-06, "loss": 0.5332, "step": 495 }, { "epoch": 2.3903614457831326, "grad_norm": 0.3649263381958008, "learning_rate": 1.1879331682543972e-06, "loss": 0.5308, "step": 496 }, { "epoch": 2.395180722891566, "grad_norm": 0.34862828254699707, "learning_rate": 1.1697777844051105e-06, "loss": 0.5242, "step": 497 }, { "epoch": 2.4, "grad_norm": 0.33387741446495056, "learning_rate": 1.1517438104753386e-06, "loss": 0.5346, "step": 498 }, { "epoch": 2.404819277108434, "grad_norm": 0.3655238449573517, "learning_rate": 1.1338318181038037e-06, "loss": 0.5173, "step": 499 }, { "epoch": 2.4096385542168672, "grad_norm": 0.3874536454677582, "learning_rate": 1.1160423750626693e-06, "loss": 0.5382, "step": 500 }, { "epoch": 2.414457831325301, "grad_norm": 0.33767327666282654, "learning_rate": 1.0983760452395415e-06, "loss": 0.543, "step": 501 }, { "epoch": 2.419277108433735, "grad_norm": 0.33204296231269836, "learning_rate": 1.0808333886196038e-06, "loss": 0.5508, "step": 502 }, { "epoch": 2.4240963855421684, "grad_norm": 0.3084474802017212, "learning_rate": 1.063414961267859e-06, "loss": 0.5483, "step": 503 }, { "epoch": 2.4289156626506023, "grad_norm": 0.36713990569114685, "learning_rate": 1.046121315311508e-06, "loss": 0.5242, "step": 504 }, { "epoch": 2.433734939759036, "grad_norm": 0.3953552544116974, "learning_rate": 1.02895299892244e-06, "loss": 0.521, "step": 505 }, { "epoch": 2.4385542168674696, "grad_norm": 0.32059621810913086, "learning_rate": 1.01191055629987e-06, "loss": 0.5199, "step": 506 }, { "epoch": 2.4433734939759035, "grad_norm": 0.3375447392463684, "learning_rate": 9.949945276530782e-07, "loss": 0.5326, "step": 507 }, { "epoch": 2.4481927710843374, "grad_norm": 0.34036242961883545, "learning_rate": 9.782054491842879e-07, "loss": 0.5394, "step": 508 }, { "epoch": 2.453012048192771, "grad_norm": 0.35132986307144165, "learning_rate": 9.615438530716753e-07, "loss": 0.5458, "step": 509 }, { "epoch": 2.4578313253012047, "grad_norm": 0.36198896169662476, "learning_rate": 9.450102674524952e-07, "loss": 0.5061, "step": 510 }, { "epoch": 2.4626506024096386, "grad_norm": 0.3417421877384186, "learning_rate": 9.286052164063369e-07, "loss": 0.5279, "step": 511 }, { "epoch": 2.467469879518072, "grad_norm": 0.3652937412261963, "learning_rate": 9.123292199385247e-07, "loss": 0.5379, "step": 512 }, { "epoch": 2.472289156626506, "grad_norm": 0.3249654471874237, "learning_rate": 8.961827939636198e-07, "loss": 0.5483, "step": 513 }, { "epoch": 2.47710843373494, "grad_norm": 0.3499116003513336, "learning_rate": 8.801664502890722e-07, "loss": 0.5173, "step": 514 }, { "epoch": 2.4819277108433733, "grad_norm": 0.3960937559604645, "learning_rate": 8.64280696599008e-07, "loss": 0.5293, "step": 515 }, { "epoch": 2.486746987951807, "grad_norm": 0.3167719841003418, "learning_rate": 8.485260364381187e-07, "loss": 0.5362, "step": 516 }, { "epoch": 2.491566265060241, "grad_norm": 0.36978796124458313, "learning_rate": 8.329029691957124e-07, "loss": 0.5255, "step": 517 }, { "epoch": 2.4963855421686745, "grad_norm": 0.34822145104408264, "learning_rate": 8.17411990089883e-07, "loss": 0.5274, "step": 518 }, { "epoch": 2.5012048192771084, "grad_norm": 0.33780673146247864, "learning_rate": 8.02053590151805e-07, "loss": 0.5249, "step": 519 }, { "epoch": 2.5060240963855422, "grad_norm": 0.3500611484050751, "learning_rate": 7.868282562101819e-07, "loss": 0.5419, "step": 520 }, { "epoch": 2.5108433734939757, "grad_norm": 0.33515140414237976, "learning_rate": 7.717364708758024e-07, "loss": 0.5446, "step": 521 }, { "epoch": 2.5156626506024096, "grad_norm": 0.35610055923461914, "learning_rate": 7.567787125262449e-07, "loss": 0.5345, "step": 522 }, { "epoch": 2.5204819277108435, "grad_norm": 0.3281942903995514, "learning_rate": 7.41955455290726e-07, "loss": 0.5257, "step": 523 }, { "epoch": 2.525301204819277, "grad_norm": 0.3701113164424896, "learning_rate": 7.27267169035053e-07, "loss": 0.519, "step": 524 }, { "epoch": 2.5301204819277108, "grad_norm": 0.32227593660354614, "learning_rate": 7.127143193467445e-07, "loss": 0.53, "step": 525 }, { "epoch": 2.5349397590361447, "grad_norm": 0.34199750423431396, "learning_rate": 6.982973675202676e-07, "loss": 0.5174, "step": 526 }, { "epoch": 2.539759036144578, "grad_norm": 0.33478933572769165, "learning_rate": 6.840167705424106e-07, "loss": 0.5374, "step": 527 }, { "epoch": 2.544578313253012, "grad_norm": 0.37539225816726685, "learning_rate": 6.698729810778065e-07, "loss": 0.5273, "step": 528 }, { "epoch": 2.549397590361446, "grad_norm": 0.3374510705471039, "learning_rate": 6.558664474545817e-07, "loss": 0.5294, "step": 529 }, { "epoch": 2.5542168674698793, "grad_norm": 0.37236514687538147, "learning_rate": 6.419976136501377e-07, "loss": 0.5398, "step": 530 }, { "epoch": 2.559036144578313, "grad_norm": 0.35223281383514404, "learning_rate": 6.282669192770896e-07, "loss": 0.5034, "step": 531 }, { "epoch": 2.563855421686747, "grad_norm": 0.3377327024936676, "learning_rate": 6.146747995693225e-07, "loss": 0.5197, "step": 532 }, { "epoch": 2.5686746987951805, "grad_norm": 0.34087198972702026, "learning_rate": 6.012216853682001e-07, "loss": 0.5047, "step": 533 }, { "epoch": 2.5734939759036144, "grad_norm": 0.3489024043083191, "learning_rate": 5.879080031089047e-07, "loss": 0.5131, "step": 534 }, { "epoch": 2.5783132530120483, "grad_norm": 0.31479188799858093, "learning_rate": 5.747341748069229e-07, "loss": 0.5346, "step": 535 }, { "epoch": 2.5831325301204817, "grad_norm": 0.352466344833374, "learning_rate": 5.617006180446688e-07, "loss": 0.5279, "step": 536 }, { "epoch": 2.5879518072289156, "grad_norm": 0.34586963057518005, "learning_rate": 5.488077459582425e-07, "loss": 0.5321, "step": 537 }, { "epoch": 2.5927710843373495, "grad_norm": 0.35135793685913086, "learning_rate": 5.360559672243421e-07, "loss": 0.5376, "step": 538 }, { "epoch": 2.597590361445783, "grad_norm": 0.35118043422698975, "learning_rate": 5.234456860473042e-07, "loss": 0.5253, "step": 539 }, { "epoch": 2.602409638554217, "grad_norm": 0.4087206721305847, "learning_rate": 5.109773021462921e-07, "loss": 0.5263, "step": 540 }, { "epoch": 2.6072289156626507, "grad_norm": 0.3330649137496948, "learning_rate": 4.986512107426283e-07, "loss": 0.5325, "step": 541 }, { "epoch": 2.612048192771084, "grad_norm": 0.3389038145542145, "learning_rate": 4.864678025472635e-07, "loss": 0.5334, "step": 542 }, { "epoch": 2.616867469879518, "grad_norm": 0.35696545243263245, "learning_rate": 4.7442746374839363e-07, "loss": 0.5447, "step": 543 }, { "epoch": 2.621686746987952, "grad_norm": 0.35791394114494324, "learning_rate": 4.625305759992205e-07, "loss": 0.5125, "step": 544 }, { "epoch": 2.6265060240963853, "grad_norm": 0.3383076786994934, "learning_rate": 4.50777516405847e-07, "loss": 0.5353, "step": 545 }, { "epoch": 2.6313253012048192, "grad_norm": 0.32981517910957336, "learning_rate": 4.3916865751533313e-07, "loss": 0.532, "step": 546 }, { "epoch": 2.636144578313253, "grad_norm": 0.33359435200691223, "learning_rate": 4.2770436730388166e-07, "loss": 0.5177, "step": 547 }, { "epoch": 2.6409638554216865, "grad_norm": 0.3236476480960846, "learning_rate": 4.163850091651717e-07, "loss": 0.5314, "step": 548 }, { "epoch": 2.6457831325301204, "grad_norm": 0.31992560625076294, "learning_rate": 4.05210941898847e-07, "loss": 0.5489, "step": 549 }, { "epoch": 2.6506024096385543, "grad_norm": 0.3217684030532837, "learning_rate": 3.941825196991378e-07, "loss": 0.5393, "step": 550 }, { "epoch": 2.6554216867469878, "grad_norm": 0.3366377353668213, "learning_rate": 3.8330009214363197e-07, "loss": 0.5393, "step": 551 }, { "epoch": 2.6602409638554216, "grad_norm": 0.3715660572052002, "learning_rate": 3.725640041822026e-07, "loss": 0.5259, "step": 552 }, { "epoch": 2.6650602409638555, "grad_norm": 0.3347671329975128, "learning_rate": 3.619745961260623e-07, "loss": 0.5225, "step": 553 }, { "epoch": 2.669879518072289, "grad_norm": 0.3271038234233856, "learning_rate": 3.5153220363698225e-07, "loss": 0.5336, "step": 554 }, { "epoch": 2.674698795180723, "grad_norm": 0.36232107877731323, "learning_rate": 3.4123715771665786e-07, "loss": 0.5342, "step": 555 }, { "epoch": 2.6795180722891567, "grad_norm": 0.3338398337364197, "learning_rate": 3.310897846962041e-07, "loss": 0.5349, "step": 556 }, { "epoch": 2.68433734939759, "grad_norm": 0.33740854263305664, "learning_rate": 3.2109040622582186e-07, "loss": 0.5357, "step": 557 }, { "epoch": 2.689156626506024, "grad_norm": 0.33538562059402466, "learning_rate": 3.112393392645985e-07, "loss": 0.542, "step": 558 }, { "epoch": 2.693975903614458, "grad_norm": 0.3457934856414795, "learning_rate": 3.015368960704584e-07, "loss": 0.524, "step": 559 }, { "epoch": 2.6987951807228914, "grad_norm": 0.32423001527786255, "learning_rate": 2.919833841902714e-07, "loss": 0.5345, "step": 560 }, { "epoch": 2.7036144578313253, "grad_norm": 0.3165851831436157, "learning_rate": 2.8257910645009935e-07, "loss": 0.5491, "step": 561 }, { "epoch": 2.708433734939759, "grad_norm": 0.3321724236011505, "learning_rate": 2.733243609455971e-07, "loss": 0.5283, "step": 562 }, { "epoch": 2.7132530120481926, "grad_norm": 0.34521132707595825, "learning_rate": 2.6421944103256657e-07, "loss": 0.5348, "step": 563 }, { "epoch": 2.7180722891566265, "grad_norm": 0.34511709213256836, "learning_rate": 2.5526463531765467e-07, "loss": 0.5283, "step": 564 }, { "epoch": 2.7228915662650603, "grad_norm": 0.3328828513622284, "learning_rate": 2.4646022764920843e-07, "loss": 0.5213, "step": 565 }, { "epoch": 2.727710843373494, "grad_norm": 0.31696540117263794, "learning_rate": 2.3780649710827552e-07, "loss": 0.5261, "step": 566 }, { "epoch": 2.7325301204819277, "grad_norm": 0.31569039821624756, "learning_rate": 2.2930371799975593e-07, "loss": 0.5176, "step": 567 }, { "epoch": 2.7373493975903616, "grad_norm": 0.3491840660572052, "learning_rate": 2.20952159843712e-07, "loss": 0.5261, "step": 568 }, { "epoch": 2.742168674698795, "grad_norm": 0.3705739974975586, "learning_rate": 2.1275208736682262e-07, "loss": 0.5104, "step": 569 }, { "epoch": 2.746987951807229, "grad_norm": 0.34111452102661133, "learning_rate": 2.0470376049398944e-07, "loss": 0.5179, "step": 570 }, { "epoch": 2.7518072289156628, "grad_norm": 0.3375121057033539, "learning_rate": 1.9680743434010385e-07, "loss": 0.532, "step": 571 }, { "epoch": 2.756626506024096, "grad_norm": 0.34646302461624146, "learning_rate": 1.8906335920195418e-07, "loss": 0.5358, "step": 572 }, { "epoch": 2.76144578313253, "grad_norm": 0.3552838861942291, "learning_rate": 1.814717805502958e-07, "loss": 0.529, "step": 573 }, { "epoch": 2.766265060240964, "grad_norm": 0.344050794839859, "learning_rate": 1.7403293902206851e-07, "loss": 0.5296, "step": 574 }, { "epoch": 2.7710843373493974, "grad_norm": 0.3431393802165985, "learning_rate": 1.667470704127694e-07, "loss": 0.5234, "step": 575 }, { "epoch": 2.7759036144578313, "grad_norm": 0.33988475799560547, "learning_rate": 1.5961440566897913e-07, "loss": 0.5205, "step": 576 }, { "epoch": 2.780722891566265, "grad_norm": 0.3232595920562744, "learning_rate": 1.5263517088103862e-07, "loss": 0.521, "step": 577 }, { "epoch": 2.7855421686746986, "grad_norm": 0.32091447710990906, "learning_rate": 1.4580958727588746e-07, "loss": 0.5295, "step": 578 }, { "epoch": 2.7903614457831325, "grad_norm": 0.3785247206687927, "learning_rate": 1.3913787121004717e-07, "loss": 0.5105, "step": 579 }, { "epoch": 2.7951807228915664, "grad_norm": 0.3253907561302185, "learning_rate": 1.3262023416276414e-07, "loss": 0.5347, "step": 580 }, { "epoch": 2.8, "grad_norm": 0.3325127363204956, "learning_rate": 1.2625688272930925e-07, "loss": 0.5201, "step": 581 }, { "epoch": 2.8048192771084337, "grad_norm": 0.32310402393341064, "learning_rate": 1.2004801861442373e-07, "loss": 0.5328, "step": 582 }, { "epoch": 2.8096385542168676, "grad_norm": 0.3170939087867737, "learning_rate": 1.1399383862592928e-07, "loss": 0.536, "step": 583 }, { "epoch": 2.814457831325301, "grad_norm": 0.33208224177360535, "learning_rate": 1.0809453466849029e-07, "loss": 0.5444, "step": 584 }, { "epoch": 2.819277108433735, "grad_norm": 0.30856242775917053, "learning_rate": 1.0235029373752758e-07, "loss": 0.5357, "step": 585 }, { "epoch": 2.824096385542169, "grad_norm": 0.3055464029312134, "learning_rate": 9.676129791329481e-08, "loss": 0.5371, "step": 586 }, { "epoch": 2.8289156626506022, "grad_norm": 0.3335382342338562, "learning_rate": 9.132772435510362e-08, "loss": 0.5267, "step": 587 }, { "epoch": 2.833734939759036, "grad_norm": 0.33981144428253174, "learning_rate": 8.604974529571042e-08, "loss": 0.5368, "step": 588 }, { "epoch": 2.83855421686747, "grad_norm": 0.3252619802951813, "learning_rate": 8.092752803585513e-08, "loss": 0.5216, "step": 589 }, { "epoch": 2.8433734939759034, "grad_norm": 0.33591899275779724, "learning_rate": 7.59612349389599e-08, "loss": 0.5165, "step": 590 }, { "epoch": 2.8481927710843373, "grad_norm": 0.3571572005748749, "learning_rate": 7.115102342598101e-08, "loss": 0.5313, "step": 591 }, { "epoch": 2.853012048192771, "grad_norm": 0.3264915943145752, "learning_rate": 6.649704597042061e-08, "loss": 0.5343, "step": 592 }, { "epoch": 2.8578313253012047, "grad_norm": 0.3618753254413605, "learning_rate": 6.199945009349173e-08, "loss": 0.4944, "step": 593 }, { "epoch": 2.8626506024096385, "grad_norm": 0.33330121636390686, "learning_rate": 5.7658378359443104e-08, "loss": 0.547, "step": 594 }, { "epoch": 2.8674698795180724, "grad_norm": 0.3385322391986847, "learning_rate": 5.3473968371040575e-08, "loss": 0.5333, "step": 595 }, { "epoch": 2.872289156626506, "grad_norm": 0.33145639300346375, "learning_rate": 4.944635276520393e-08, "loss": 0.5362, "step": 596 }, { "epoch": 2.8771084337349397, "grad_norm": 0.33541208505630493, "learning_rate": 4.55756592088058e-08, "loss": 0.5307, "step": 597 }, { "epoch": 2.8819277108433736, "grad_norm": 0.32140061259269714, "learning_rate": 4.186201039462046e-08, "loss": 0.5274, "step": 598 }, { "epoch": 2.886746987951807, "grad_norm": 0.3215773403644562, "learning_rate": 3.8305524037438035e-08, "loss": 0.5363, "step": 599 }, { "epoch": 2.891566265060241, "grad_norm": 0.3285579979419708, "learning_rate": 3.4906312870331973e-08, "loss": 0.5268, "step": 600 }, { "epoch": 2.896385542168675, "grad_norm": 0.3298654854297638, "learning_rate": 3.166448464108629e-08, "loss": 0.5326, "step": 601 }, { "epoch": 2.9012048192771083, "grad_norm": 0.32575464248657227, "learning_rate": 2.8580142108778354e-08, "loss": 0.5243, "step": 602 }, { "epoch": 2.906024096385542, "grad_norm": 0.34302592277526855, "learning_rate": 2.5653383040524228e-08, "loss": 0.5192, "step": 603 }, { "epoch": 2.910843373493976, "grad_norm": 0.314879834651947, "learning_rate": 2.2884300208378395e-08, "loss": 0.5039, "step": 604 }, { "epoch": 2.9156626506024095, "grad_norm": 0.33504053950309753, "learning_rate": 2.0272981386393332e-08, "loss": 0.5245, "step": 605 }, { "epoch": 2.9204819277108434, "grad_norm": 0.3199070394039154, "learning_rate": 1.781950934783505e-08, "loss": 0.5315, "step": 606 }, { "epoch": 2.9253012048192772, "grad_norm": 0.3318592607975006, "learning_rate": 1.552396186256411e-08, "loss": 0.5318, "step": 607 }, { "epoch": 2.9301204819277107, "grad_norm": 0.30516648292541504, "learning_rate": 1.3386411694565894e-08, "loss": 0.5263, "step": 608 }, { "epoch": 2.9349397590361446, "grad_norm": 0.31260839104652405, "learning_rate": 1.1406926599646373e-08, "loss": 0.5299, "step": 609 }, { "epoch": 2.9397590361445785, "grad_norm": 0.334585040807724, "learning_rate": 9.585569323284915e-09, "loss": 0.5403, "step": 610 }, { "epoch": 2.944578313253012, "grad_norm": 0.343319296836853, "learning_rate": 7.922397598642551e-09, "loss": 0.5216, "step": 611 }, { "epoch": 2.9493975903614458, "grad_norm": 0.3292441964149475, "learning_rate": 6.417464144736208e-09, "loss": 0.5248, "step": 612 }, { "epoch": 2.9542168674698797, "grad_norm": 0.35108089447021484, "learning_rate": 5.0708166647628345e-09, "loss": 0.5206, "step": 613 }, { "epoch": 2.959036144578313, "grad_norm": 0.33806362748146057, "learning_rate": 3.88249784459227e-09, "loss": 0.5416, "step": 614 }, { "epoch": 2.963855421686747, "grad_norm": 0.304426908493042, "learning_rate": 2.8525453514099966e-09, "loss": 0.5359, "step": 615 }, { "epoch": 2.968674698795181, "grad_norm": 0.34446102380752563, "learning_rate": 1.980991832524759e-09, "loss": 0.5275, "step": 616 }, { "epoch": 2.9734939759036143, "grad_norm": 0.3511168956756592, "learning_rate": 1.2678649143349485e-09, "loss": 0.5181, "step": 617 }, { "epoch": 2.978313253012048, "grad_norm": 0.37222975492477417, "learning_rate": 7.131872014509711e-10, "loss": 0.5235, "step": 618 }, { "epoch": 2.983132530120482, "grad_norm": 0.3414348363876343, "learning_rate": 3.1697627597970794e-10, "loss": 0.5343, "step": 619 }, { "epoch": 2.9879518072289155, "grad_norm": 0.3310386836528778, "learning_rate": 7.924469696718451e-11, "loss": 0.5362, "step": 620 }, { "epoch": 2.9927710843373494, "grad_norm": 0.31892916560173035, "learning_rate": 0.0, "loss": 0.5334, "step": 621 }, { "epoch": 2.9927710843373494, "step": 621, "total_flos": 8.029368648088945e+17, "train_loss": 0.5880239765233272, "train_runtime": 15009.674, "train_samples_per_second": 3.978, "train_steps_per_second": 0.041 } ], "logging_steps": 1, "max_steps": 621, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.029368648088945e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }