{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008210180623973727, "grad_norm": 0.6582953929901123, "learning_rate": 0.0, "loss": 1.5398, "step": 1 }, { "epoch": 0.0016420361247947454, "grad_norm": 0.28089845180511475, "learning_rate": 3.5436764027111585e-06, "loss": 1.3121, "step": 2 }, { "epoch": 0.0024630541871921183, "grad_norm": 0.27971383929252625, "learning_rate": 5.61659421298763e-06, "loss": 1.3344, "step": 3 }, { "epoch": 0.003284072249589491, "grad_norm": 0.19216927886009216, "learning_rate": 7.087352805422317e-06, "loss": 1.2702, "step": 4 }, { "epoch": 0.004105090311986864, "grad_norm": 0.1607084572315216, "learning_rate": 8.228161798644422e-06, "loss": 1.2373, "step": 5 }, { "epoch": 0.0049261083743842365, "grad_norm": 0.22933775186538696, "learning_rate": 9.160270615698787e-06, "loss": 1.1523, "step": 6 }, { "epoch": 0.005747126436781609, "grad_norm": 0.10622569918632507, "learning_rate": 9.948357391330555e-06, "loss": 1.0738, "step": 7 }, { "epoch": 0.006568144499178982, "grad_norm": 0.12714657187461853, "learning_rate": 1.0631029208133474e-05, "loss": 1.0801, "step": 8 }, { "epoch": 0.007389162561576354, "grad_norm": 0.0601385124027729, "learning_rate": 1.123318842597526e-05, "loss": 1.0557, "step": 9 }, { "epoch": 0.008210180623973728, "grad_norm": 0.06311753392219543, "learning_rate": 1.1771838201355582e-05, "loss": 1.0616, "step": 10 }, { "epoch": 0.0090311986863711, "grad_norm": 0.06363195180892944, "learning_rate": 1.2259106193757859e-05, "loss": 0.9838, "step": 11 }, { "epoch": 0.009852216748768473, "grad_norm": 0.05629933625459671, "learning_rate": 1.2703947018409945e-05, "loss": 0.9668, "step": 12 }, { "epoch": 0.010673234811165846, "grad_norm": 0.05555826425552368, "learning_rate": 1.311316090883172e-05, "loss": 0.9613, "step": 13 }, { "epoch": 0.011494252873563218, "grad_norm": 0.048940908163785934, "learning_rate": 1.3492033794041713e-05, "loss": 0.936, "step": 14 }, { "epoch": 0.012315270935960592, "grad_norm": 0.046876948326826096, "learning_rate": 1.384475601163205e-05, "loss": 0.938, "step": 15 }, { "epoch": 0.013136288998357963, "grad_norm": 0.04091876372694969, "learning_rate": 1.4174705610844634e-05, "loss": 0.902, "step": 16 }, { "epoch": 0.013957307060755337, "grad_norm": 0.032738663256168365, "learning_rate": 1.4484645617497535e-05, "loss": 0.8972, "step": 17 }, { "epoch": 0.014778325123152709, "grad_norm": 0.03321428596973419, "learning_rate": 1.4776864828686414e-05, "loss": 0.8866, "step": 18 }, { "epoch": 0.015599343185550082, "grad_norm": 0.035821426659822464, "learning_rate": 1.505328048981752e-05, "loss": 0.8873, "step": 19 }, { "epoch": 0.016420361247947456, "grad_norm": 0.03367010876536369, "learning_rate": 1.5315514604066738e-05, "loss": 0.897, "step": 20 }, { "epoch": 0.017241379310344827, "grad_norm": 0.028361069038510323, "learning_rate": 1.5564951604318184e-05, "loss": 0.8919, "step": 21 }, { "epoch": 0.0180623973727422, "grad_norm": 0.02552921511232853, "learning_rate": 1.580278259646902e-05, "loss": 0.8935, "step": 22 }, { "epoch": 0.018883415435139574, "grad_norm": 0.025205889716744423, "learning_rate": 1.603003975988117e-05, "loss": 0.8757, "step": 23 }, { "epoch": 0.019704433497536946, "grad_norm": 0.027938349172472954, "learning_rate": 1.6247623421121105e-05, "loss": 0.8661, "step": 24 }, { "epoch": 0.020525451559934318, "grad_norm": 0.02539875917136669, "learning_rate": 1.6456323597288844e-05, "loss": 0.8442, "step": 25 }, { "epoch": 0.021346469622331693, "grad_norm": 0.02299790270626545, "learning_rate": 1.6656837311542876e-05, "loss": 0.8599, "step": 26 }, { "epoch": 0.022167487684729065, "grad_norm": 0.021373076364398003, "learning_rate": 1.6849782638962885e-05, "loss": 0.8818, "step": 27 }, { "epoch": 0.022988505747126436, "grad_norm": 0.02153177559375763, "learning_rate": 1.7035710196752873e-05, "loss": 0.8575, "step": 28 }, { "epoch": 0.023809523809523808, "grad_norm": 0.023905470967292786, "learning_rate": 1.7215112617252848e-05, "loss": 0.8415, "step": 29 }, { "epoch": 0.024630541871921183, "grad_norm": 0.02177165262401104, "learning_rate": 1.738843241434321e-05, "loss": 0.8751, "step": 30 }, { "epoch": 0.025451559934318555, "grad_norm": 0.022234134376049042, "learning_rate": 1.7556068559516658e-05, "loss": 0.855, "step": 31 }, { "epoch": 0.026272577996715927, "grad_norm": 0.023117555305361748, "learning_rate": 1.7718382013555794e-05, "loss": 0.8693, "step": 32 }, { "epoch": 0.027093596059113302, "grad_norm": 0.02077527530491352, "learning_rate": 1.7875700406745488e-05, "loss": 0.8267, "step": 33 }, { "epoch": 0.027914614121510674, "grad_norm": 0.020166542381048203, "learning_rate": 1.8028322020208693e-05, "loss": 0.8648, "step": 34 }, { "epoch": 0.028735632183908046, "grad_norm": 0.021614355966448784, "learning_rate": 1.817651918997498e-05, "loss": 0.8696, "step": 35 }, { "epoch": 0.029556650246305417, "grad_norm": 0.022183669731020927, "learning_rate": 1.8320541231397574e-05, "loss": 0.851, "step": 36 }, { "epoch": 0.030377668308702793, "grad_norm": 0.018353113904595375, "learning_rate": 1.8460616962803535e-05, "loss": 0.832, "step": 37 }, { "epoch": 0.031198686371100164, "grad_norm": 0.018161995336413383, "learning_rate": 1.859695689252868e-05, "loss": 0.8261, "step": 38 }, { "epoch": 0.03201970443349754, "grad_norm": 0.02032575011253357, "learning_rate": 1.872975512181935e-05, "loss": 0.8564, "step": 39 }, { "epoch": 0.03284072249589491, "grad_norm": 0.018219608813524246, "learning_rate": 1.8859191006777896e-05, "loss": 0.8292, "step": 40 }, { "epoch": 0.03366174055829228, "grad_norm": 0.018575625494122505, "learning_rate": 1.8985430615062968e-05, "loss": 0.8813, "step": 41 }, { "epoch": 0.034482758620689655, "grad_norm": 0.018095295876264572, "learning_rate": 1.9108628007029345e-05, "loss": 0.8824, "step": 42 }, { "epoch": 0.035303776683087026, "grad_norm": 0.018108762800693512, "learning_rate": 1.9228926366101076e-05, "loss": 0.8519, "step": 43 }, { "epoch": 0.0361247947454844, "grad_norm": 0.018117869272828102, "learning_rate": 1.9346458999180177e-05, "loss": 0.8221, "step": 44 }, { "epoch": 0.03694581280788178, "grad_norm": 0.016675056889653206, "learning_rate": 1.946135022461968e-05, "loss": 0.8224, "step": 45 }, { "epoch": 0.03776683087027915, "grad_norm": 0.018497612327337265, "learning_rate": 1.9573716162592327e-05, "loss": 0.8284, "step": 46 }, { "epoch": 0.03858784893267652, "grad_norm": 0.01725911535322666, "learning_rate": 1.9683665440452515e-05, "loss": 0.8224, "step": 47 }, { "epoch": 0.03940886699507389, "grad_norm": 0.01857002265751362, "learning_rate": 1.9791299823832263e-05, "loss": 0.8144, "step": 48 }, { "epoch": 0.040229885057471264, "grad_norm": 0.017686009407043457, "learning_rate": 1.989671478266111e-05, "loss": 0.8341, "step": 49 }, { "epoch": 0.041050903119868636, "grad_norm": 0.017901845276355743, "learning_rate": 2e-05, "loss": 0.8418, "step": 50 }, { "epoch": 0.04187192118226601, "grad_norm": 0.017756789922714233, "learning_rate": 1.9999991332669114e-05, "loss": 0.8406, "step": 51 }, { "epoch": 0.042692939244663386, "grad_norm": 0.01748877577483654, "learning_rate": 1.9999965330691477e-05, "loss": 0.8348, "step": 52 }, { "epoch": 0.04351395730706076, "grad_norm": 0.016478868201375008, "learning_rate": 1.9999921994112167e-05, "loss": 0.8261, "step": 53 }, { "epoch": 0.04433497536945813, "grad_norm": 0.0172673761844635, "learning_rate": 1.999986132300632e-05, "loss": 0.8104, "step": 54 }, { "epoch": 0.0451559934318555, "grad_norm": 0.01734466291964054, "learning_rate": 1.999978331747911e-05, "loss": 0.8411, "step": 55 }, { "epoch": 0.04597701149425287, "grad_norm": 0.017689796164631844, "learning_rate": 1.9999687977665772e-05, "loss": 0.818, "step": 56 }, { "epoch": 0.046798029556650245, "grad_norm": 0.01693199947476387, "learning_rate": 1.9999575303731594e-05, "loss": 0.8168, "step": 57 }, { "epoch": 0.047619047619047616, "grad_norm": 0.01733194850385189, "learning_rate": 1.9999445295871912e-05, "loss": 0.8335, "step": 58 }, { "epoch": 0.048440065681444995, "grad_norm": 0.01722332462668419, "learning_rate": 1.9999297954312115e-05, "loss": 0.8142, "step": 59 }, { "epoch": 0.04926108374384237, "grad_norm": 0.0171380452811718, "learning_rate": 1.999913327930763e-05, "loss": 0.8373, "step": 60 }, { "epoch": 0.05008210180623974, "grad_norm": 0.01710970140993595, "learning_rate": 1.9998951271143953e-05, "loss": 0.8168, "step": 61 }, { "epoch": 0.05090311986863711, "grad_norm": 0.01784527860581875, "learning_rate": 1.999875193013662e-05, "loss": 0.809, "step": 62 }, { "epoch": 0.05172413793103448, "grad_norm": 0.020665330812335014, "learning_rate": 1.9998535256631214e-05, "loss": 0.8716, "step": 63 }, { "epoch": 0.052545155993431854, "grad_norm": 0.018589649349451065, "learning_rate": 1.9998301251003368e-05, "loss": 0.8226, "step": 64 }, { "epoch": 0.053366174055829226, "grad_norm": 0.017995208501815796, "learning_rate": 1.9998049913658765e-05, "loss": 0.8002, "step": 65 }, { "epoch": 0.054187192118226604, "grad_norm": 0.017737260088324547, "learning_rate": 1.9997781245033135e-05, "loss": 0.8138, "step": 66 }, { "epoch": 0.055008210180623976, "grad_norm": 0.017538918182253838, "learning_rate": 1.9997495245592252e-05, "loss": 0.8192, "step": 67 }, { "epoch": 0.05582922824302135, "grad_norm": 0.01784035935997963, "learning_rate": 1.9997191915831932e-05, "loss": 0.8133, "step": 68 }, { "epoch": 0.05665024630541872, "grad_norm": 0.01769757829606533, "learning_rate": 1.9996871256278047e-05, "loss": 0.8089, "step": 69 }, { "epoch": 0.05747126436781609, "grad_norm": 0.3861883282661438, "learning_rate": 1.9996533267486494e-05, "loss": 0.8143, "step": 70 }, { "epoch": 0.05829228243021346, "grad_norm": 0.05268603190779686, "learning_rate": 1.9996177950043233e-05, "loss": 0.8226, "step": 71 }, { "epoch": 0.059113300492610835, "grad_norm": 0.09072672575712204, "learning_rate": 1.999580530456425e-05, "loss": 0.8232, "step": 72 }, { "epoch": 0.05993431855500821, "grad_norm": 0.018575428053736687, "learning_rate": 1.9995415331695588e-05, "loss": 0.8283, "step": 73 }, { "epoch": 0.060755336617405585, "grad_norm": 0.01753455400466919, "learning_rate": 1.999500803211331e-05, "loss": 0.8133, "step": 74 }, { "epoch": 0.06157635467980296, "grad_norm": 0.016476836055517197, "learning_rate": 1.9994583406523526e-05, "loss": 0.8066, "step": 75 }, { "epoch": 0.06239737274220033, "grad_norm": 0.0193136278539896, "learning_rate": 1.999414145566239e-05, "loss": 0.7894, "step": 76 }, { "epoch": 0.06321839080459771, "grad_norm": 0.026382092386484146, "learning_rate": 1.9993682180296084e-05, "loss": 0.8242, "step": 77 }, { "epoch": 0.06403940886699508, "grad_norm": 0.018766207620501518, "learning_rate": 1.9993205581220822e-05, "loss": 0.7967, "step": 78 }, { "epoch": 0.06486042692939245, "grad_norm": 0.01918818987905979, "learning_rate": 1.999271165926286e-05, "loss": 0.812, "step": 79 }, { "epoch": 0.06568144499178982, "grad_norm": 0.017409011721611023, "learning_rate": 1.9992200415278473e-05, "loss": 0.8105, "step": 80 }, { "epoch": 0.0665024630541872, "grad_norm": 0.019298365339636803, "learning_rate": 1.9991671850153987e-05, "loss": 0.8194, "step": 81 }, { "epoch": 0.06732348111658457, "grad_norm": 0.018301578238606453, "learning_rate": 1.999112596480573e-05, "loss": 0.8273, "step": 82 }, { "epoch": 0.06814449917898194, "grad_norm": 0.019984964281320572, "learning_rate": 1.9990562760180073e-05, "loss": 0.8108, "step": 83 }, { "epoch": 0.06896551724137931, "grad_norm": 0.020242325961589813, "learning_rate": 1.9989982237253418e-05, "loss": 0.7913, "step": 84 }, { "epoch": 0.06978653530377668, "grad_norm": 0.01941094361245632, "learning_rate": 1.9989384397032178e-05, "loss": 0.8071, "step": 85 }, { "epoch": 0.07060755336617405, "grad_norm": 0.018655262887477875, "learning_rate": 1.998876924055279e-05, "loss": 0.8088, "step": 86 }, { "epoch": 0.07142857142857142, "grad_norm": 0.020017502829432487, "learning_rate": 1.998813676888171e-05, "loss": 0.8026, "step": 87 }, { "epoch": 0.0722495894909688, "grad_norm": 0.028423313051462173, "learning_rate": 1.9987486983115427e-05, "loss": 0.8238, "step": 88 }, { "epoch": 0.07307060755336617, "grad_norm": 0.01810065284371376, "learning_rate": 1.9986819884380427e-05, "loss": 0.7931, "step": 89 }, { "epoch": 0.07389162561576355, "grad_norm": 0.02027701586484909, "learning_rate": 1.9986135473833223e-05, "loss": 0.7983, "step": 90 }, { "epoch": 0.07471264367816093, "grad_norm": 0.01899072341620922, "learning_rate": 1.998543375266033e-05, "loss": 0.7828, "step": 91 }, { "epoch": 0.0755336617405583, "grad_norm": 0.019647974520921707, "learning_rate": 1.9984714722078288e-05, "loss": 0.8059, "step": 92 }, { "epoch": 0.07635467980295567, "grad_norm": 0.02055743895471096, "learning_rate": 1.9983978383333634e-05, "loss": 0.8053, "step": 93 }, { "epoch": 0.07717569786535304, "grad_norm": 0.021487019956111908, "learning_rate": 1.998322473770291e-05, "loss": 0.7965, "step": 94 }, { "epoch": 0.07799671592775041, "grad_norm": 0.018697261810302734, "learning_rate": 1.998245378649267e-05, "loss": 0.7925, "step": 95 }, { "epoch": 0.07881773399014778, "grad_norm": 0.01994401216506958, "learning_rate": 1.998166553103946e-05, "loss": 0.7829, "step": 96 }, { "epoch": 0.07963875205254516, "grad_norm": 0.018602045252919197, "learning_rate": 1.998085997270984e-05, "loss": 0.8178, "step": 97 }, { "epoch": 0.08045977011494253, "grad_norm": 0.01834348775446415, "learning_rate": 1.998003711290035e-05, "loss": 0.8021, "step": 98 }, { "epoch": 0.0812807881773399, "grad_norm": 0.01896612159907818, "learning_rate": 1.9979196953037534e-05, "loss": 0.8292, "step": 99 }, { "epoch": 0.08210180623973727, "grad_norm": 0.02134113945066929, "learning_rate": 1.9978339494577928e-05, "loss": 0.8148, "step": 100 }, { "epoch": 0.08292282430213464, "grad_norm": 0.017580389976501465, "learning_rate": 1.9977464739008055e-05, "loss": 0.8124, "step": 101 }, { "epoch": 0.08374384236453201, "grad_norm": 0.018544457852840424, "learning_rate": 1.997657268784443e-05, "loss": 0.7745, "step": 102 }, { "epoch": 0.08456486042692939, "grad_norm": 0.020207742229104042, "learning_rate": 1.997566334263354e-05, "loss": 0.7876, "step": 103 }, { "epoch": 0.08538587848932677, "grad_norm": 0.018654216080904007, "learning_rate": 1.9974736704951867e-05, "loss": 0.7995, "step": 104 }, { "epoch": 0.08620689655172414, "grad_norm": 0.01848197728395462, "learning_rate": 1.9973792776405866e-05, "loss": 0.7934, "step": 105 }, { "epoch": 0.08702791461412152, "grad_norm": 0.019450314342975616, "learning_rate": 1.9972831558631965e-05, "loss": 0.7761, "step": 106 }, { "epoch": 0.08784893267651889, "grad_norm": 0.017349369823932648, "learning_rate": 1.9971853053296576e-05, "loss": 0.7595, "step": 107 }, { "epoch": 0.08866995073891626, "grad_norm": 0.019032703712582588, "learning_rate": 1.9970857262096075e-05, "loss": 0.7897, "step": 108 }, { "epoch": 0.08949096880131363, "grad_norm": 0.02083767019212246, "learning_rate": 1.9969844186756796e-05, "loss": 0.7897, "step": 109 }, { "epoch": 0.090311986863711, "grad_norm": 0.02040976472198963, "learning_rate": 1.9968813829035056e-05, "loss": 0.7883, "step": 110 }, { "epoch": 0.09113300492610837, "grad_norm": 0.019137177616357803, "learning_rate": 1.9967766190717114e-05, "loss": 0.7971, "step": 111 }, { "epoch": 0.09195402298850575, "grad_norm": 0.019424546509981155, "learning_rate": 1.9966701273619207e-05, "loss": 0.7615, "step": 112 }, { "epoch": 0.09277504105090312, "grad_norm": 0.018387768417596817, "learning_rate": 1.996561907958751e-05, "loss": 0.8069, "step": 113 }, { "epoch": 0.09359605911330049, "grad_norm": 0.02053052745759487, "learning_rate": 1.9964519610498167e-05, "loss": 0.7922, "step": 114 }, { "epoch": 0.09441707717569786, "grad_norm": 0.01887485757470131, "learning_rate": 1.9963402868257246e-05, "loss": 0.7831, "step": 115 }, { "epoch": 0.09523809523809523, "grad_norm": 0.017095383256673813, "learning_rate": 1.996226885480079e-05, "loss": 0.7849, "step": 116 }, { "epoch": 0.0960591133004926, "grad_norm": 0.019223198294639587, "learning_rate": 1.9961117572094764e-05, "loss": 0.8316, "step": 117 }, { "epoch": 0.09688013136288999, "grad_norm": 0.01948581077158451, "learning_rate": 1.995994902213507e-05, "loss": 0.7983, "step": 118 }, { "epoch": 0.09770114942528736, "grad_norm": 0.01796896941959858, "learning_rate": 1.995876320694757e-05, "loss": 0.8077, "step": 119 }, { "epoch": 0.09852216748768473, "grad_norm": 0.018026575446128845, "learning_rate": 1.995756012858802e-05, "loss": 0.7956, "step": 120 }, { "epoch": 0.0993431855500821, "grad_norm": 0.020010560750961304, "learning_rate": 1.995633978914214e-05, "loss": 0.7787, "step": 121 }, { "epoch": 0.10016420361247948, "grad_norm": 0.01902727782726288, "learning_rate": 1.9955102190725552e-05, "loss": 0.8253, "step": 122 }, { "epoch": 0.10098522167487685, "grad_norm": 0.018272224813699722, "learning_rate": 1.99538473354838e-05, "loss": 0.7899, "step": 123 }, { "epoch": 0.10180623973727422, "grad_norm": 0.019500743597745895, "learning_rate": 1.9952575225592363e-05, "loss": 0.7838, "step": 124 }, { "epoch": 0.10262725779967159, "grad_norm": 0.017104769125580788, "learning_rate": 1.9951285863256612e-05, "loss": 0.7918, "step": 125 }, { "epoch": 0.10344827586206896, "grad_norm": 0.018713781610131264, "learning_rate": 1.9949979250711842e-05, "loss": 0.7732, "step": 126 }, { "epoch": 0.10426929392446634, "grad_norm": 0.017736734822392464, "learning_rate": 1.994865539022324e-05, "loss": 0.8286, "step": 127 }, { "epoch": 0.10509031198686371, "grad_norm": 0.01878439076244831, "learning_rate": 1.9947314284085916e-05, "loss": 0.7924, "step": 128 }, { "epoch": 0.10591133004926108, "grad_norm": 0.01853361539542675, "learning_rate": 1.994595593462485e-05, "loss": 0.7731, "step": 129 }, { "epoch": 0.10673234811165845, "grad_norm": 0.018976392224431038, "learning_rate": 1.9944580344194936e-05, "loss": 0.7645, "step": 130 }, { "epoch": 0.10755336617405582, "grad_norm": 0.017582492902874947, "learning_rate": 1.994318751518096e-05, "loss": 0.793, "step": 131 }, { "epoch": 0.10837438423645321, "grad_norm": 0.01757206581532955, "learning_rate": 1.9941777449997573e-05, "loss": 0.7817, "step": 132 }, { "epoch": 0.10919540229885058, "grad_norm": 0.018101617693901062, "learning_rate": 1.994035015108933e-05, "loss": 0.7807, "step": 133 }, { "epoch": 0.11001642036124795, "grad_norm": 0.018215730786323547, "learning_rate": 1.9938905620930645e-05, "loss": 0.7759, "step": 134 }, { "epoch": 0.11083743842364532, "grad_norm": 0.018368102610111237, "learning_rate": 1.9937443862025818e-05, "loss": 0.7559, "step": 135 }, { "epoch": 0.1116584564860427, "grad_norm": 0.01753397285938263, "learning_rate": 1.9935964876909007e-05, "loss": 0.8079, "step": 136 }, { "epoch": 0.11247947454844007, "grad_norm": 0.03522775322198868, "learning_rate": 1.9934468668144244e-05, "loss": 0.782, "step": 137 }, { "epoch": 0.11330049261083744, "grad_norm": 0.017528459429740906, "learning_rate": 1.9932955238325412e-05, "loss": 0.7803, "step": 138 }, { "epoch": 0.11412151067323481, "grad_norm": 0.019998522475361824, "learning_rate": 1.9931424590076256e-05, "loss": 0.7997, "step": 139 }, { "epoch": 0.11494252873563218, "grad_norm": 0.021649999544024467, "learning_rate": 1.9929876726050365e-05, "loss": 0.8062, "step": 140 }, { "epoch": 0.11576354679802955, "grad_norm": 0.017345383763313293, "learning_rate": 1.9928311648931182e-05, "loss": 0.786, "step": 141 }, { "epoch": 0.11658456486042693, "grad_norm": 0.02030874788761139, "learning_rate": 1.992672936143198e-05, "loss": 0.799, "step": 142 }, { "epoch": 0.1174055829228243, "grad_norm": 0.018863452598452568, "learning_rate": 1.9925129866295882e-05, "loss": 0.7719, "step": 143 }, { "epoch": 0.11822660098522167, "grad_norm": 0.018443239852786064, "learning_rate": 1.992351316629583e-05, "loss": 0.7792, "step": 144 }, { "epoch": 0.11904761904761904, "grad_norm": 0.01949496753513813, "learning_rate": 1.9921879264234604e-05, "loss": 0.7812, "step": 145 }, { "epoch": 0.11986863711001643, "grad_norm": 0.017172547057271004, "learning_rate": 1.9920228162944798e-05, "loss": 0.7696, "step": 146 }, { "epoch": 0.1206896551724138, "grad_norm": 0.01942411996424198, "learning_rate": 1.9918559865288825e-05, "loss": 0.7878, "step": 147 }, { "epoch": 0.12151067323481117, "grad_norm": 0.01829535886645317, "learning_rate": 1.991687437415892e-05, "loss": 0.7557, "step": 148 }, { "epoch": 0.12233169129720854, "grad_norm": 0.019312867894768715, "learning_rate": 1.9915171692477108e-05, "loss": 0.7985, "step": 149 }, { "epoch": 0.12315270935960591, "grad_norm": 0.017465777695178986, "learning_rate": 1.9913451823195225e-05, "loss": 0.7688, "step": 150 }, { "epoch": 0.12397372742200329, "grad_norm": 0.01828722096979618, "learning_rate": 1.9911714769294916e-05, "loss": 0.7935, "step": 151 }, { "epoch": 0.12479474548440066, "grad_norm": 0.02119818516075611, "learning_rate": 1.990996053378759e-05, "loss": 0.7757, "step": 152 }, { "epoch": 0.12561576354679804, "grad_norm": 0.018074961379170418, "learning_rate": 1.9908189119714473e-05, "loss": 0.7932, "step": 153 }, { "epoch": 0.12643678160919541, "grad_norm": 0.021184042096138, "learning_rate": 1.990640053014655e-05, "loss": 0.7887, "step": 154 }, { "epoch": 0.1272577996715928, "grad_norm": 0.020672885701060295, "learning_rate": 1.9904594768184598e-05, "loss": 0.7893, "step": 155 }, { "epoch": 0.12807881773399016, "grad_norm": 0.06383033841848373, "learning_rate": 1.9902771836959153e-05, "loss": 0.7899, "step": 156 }, { "epoch": 0.12889983579638753, "grad_norm": 0.01920251175761223, "learning_rate": 1.9900931739630514e-05, "loss": 0.771, "step": 157 }, { "epoch": 0.1297208538587849, "grad_norm": 0.018562542274594307, "learning_rate": 1.989907447938876e-05, "loss": 0.7822, "step": 158 }, { "epoch": 0.13054187192118227, "grad_norm": 0.0168649572879076, "learning_rate": 1.9897200059453696e-05, "loss": 0.7752, "step": 159 }, { "epoch": 0.13136288998357964, "grad_norm": 0.017897505313158035, "learning_rate": 1.9895308483074905e-05, "loss": 0.7623, "step": 160 }, { "epoch": 0.13218390804597702, "grad_norm": 0.01833323948085308, "learning_rate": 1.989339975353169e-05, "loss": 0.7648, "step": 161 }, { "epoch": 0.1330049261083744, "grad_norm": 0.019092628732323647, "learning_rate": 1.98914738741331e-05, "loss": 0.7672, "step": 162 }, { "epoch": 0.13382594417077176, "grad_norm": 0.018991252407431602, "learning_rate": 1.9889530848217918e-05, "loss": 0.7771, "step": 163 }, { "epoch": 0.13464696223316913, "grad_norm": 0.01823980174958706, "learning_rate": 1.988757067915465e-05, "loss": 0.7833, "step": 164 }, { "epoch": 0.1354679802955665, "grad_norm": 0.018265975639224052, "learning_rate": 1.9885593370341526e-05, "loss": 0.7853, "step": 165 }, { "epoch": 0.13628899835796388, "grad_norm": 0.01876806654036045, "learning_rate": 1.988359892520648e-05, "loss": 0.7739, "step": 166 }, { "epoch": 0.13711001642036125, "grad_norm": 0.018582550808787346, "learning_rate": 1.9881587347207164e-05, "loss": 0.7672, "step": 167 }, { "epoch": 0.13793103448275862, "grad_norm": 0.02045062929391861, "learning_rate": 1.9879558639830935e-05, "loss": 0.7749, "step": 168 }, { "epoch": 0.138752052545156, "grad_norm": 0.02357109636068344, "learning_rate": 1.987751280659483e-05, "loss": 0.7544, "step": 169 }, { "epoch": 0.13957307060755336, "grad_norm": 0.01908900961279869, "learning_rate": 1.9875449851045593e-05, "loss": 0.7892, "step": 170 }, { "epoch": 0.14039408866995073, "grad_norm": 0.018463164567947388, "learning_rate": 1.9873369776759642e-05, "loss": 0.7742, "step": 171 }, { "epoch": 0.1412151067323481, "grad_norm": 0.02121536247432232, "learning_rate": 1.987127258734308e-05, "loss": 0.7823, "step": 172 }, { "epoch": 0.14203612479474548, "grad_norm": 0.019161978736519814, "learning_rate": 1.986915828643167e-05, "loss": 0.7803, "step": 173 }, { "epoch": 0.14285714285714285, "grad_norm": 0.018574459478259087, "learning_rate": 1.9867026877690857e-05, "loss": 0.7623, "step": 174 }, { "epoch": 0.14367816091954022, "grad_norm": 0.01913372240960598, "learning_rate": 1.9864878364815732e-05, "loss": 0.7881, "step": 175 }, { "epoch": 0.1444991789819376, "grad_norm": 0.018835924565792084, "learning_rate": 1.986271275153104e-05, "loss": 0.7919, "step": 176 }, { "epoch": 0.14532019704433496, "grad_norm": 0.022191274911165237, "learning_rate": 1.9860530041591174e-05, "loss": 0.8043, "step": 177 }, { "epoch": 0.14614121510673234, "grad_norm": 0.01955992542207241, "learning_rate": 1.985833023878017e-05, "loss": 0.7843, "step": 178 }, { "epoch": 0.1469622331691297, "grad_norm": 0.01837492361664772, "learning_rate": 1.9856113346911688e-05, "loss": 0.8037, "step": 179 }, { "epoch": 0.1477832512315271, "grad_norm": 0.019290300086140633, "learning_rate": 1.9853879369829024e-05, "loss": 0.769, "step": 180 }, { "epoch": 0.14860426929392448, "grad_norm": 0.017271889373660088, "learning_rate": 1.9851628311405085e-05, "loss": 0.7887, "step": 181 }, { "epoch": 0.14942528735632185, "grad_norm": 0.018982931971549988, "learning_rate": 1.98493601755424e-05, "loss": 0.7821, "step": 182 }, { "epoch": 0.15024630541871922, "grad_norm": 0.01820552349090576, "learning_rate": 1.984707496617309e-05, "loss": 0.7571, "step": 183 }, { "epoch": 0.1510673234811166, "grad_norm": 0.02312224917113781, "learning_rate": 1.9844772687258895e-05, "loss": 0.7641, "step": 184 }, { "epoch": 0.15188834154351397, "grad_norm": 0.019761936739087105, "learning_rate": 1.9842453342791127e-05, "loss": 0.7664, "step": 185 }, { "epoch": 0.15270935960591134, "grad_norm": 0.020412521436810493, "learning_rate": 1.98401169367907e-05, "loss": 0.7887, "step": 186 }, { "epoch": 0.1535303776683087, "grad_norm": 0.019694669172167778, "learning_rate": 1.98377634733081e-05, "loss": 0.7708, "step": 187 }, { "epoch": 0.15435139573070608, "grad_norm": 0.020171768963336945, "learning_rate": 1.983539295642338e-05, "loss": 0.7465, "step": 188 }, { "epoch": 0.15517241379310345, "grad_norm": 0.019384175539016724, "learning_rate": 1.9833005390246166e-05, "loss": 0.7618, "step": 189 }, { "epoch": 0.15599343185550082, "grad_norm": 0.01909920573234558, "learning_rate": 1.983060077891563e-05, "loss": 0.7708, "step": 190 }, { "epoch": 0.1568144499178982, "grad_norm": 0.021261801943182945, "learning_rate": 1.9828179126600513e-05, "loss": 0.7812, "step": 191 }, { "epoch": 0.15763546798029557, "grad_norm": 0.018536368384957314, "learning_rate": 1.982574043749908e-05, "loss": 0.7728, "step": 192 }, { "epoch": 0.15845648604269294, "grad_norm": 0.01783687435090542, "learning_rate": 1.9823284715839135e-05, "loss": 0.7683, "step": 193 }, { "epoch": 0.1592775041050903, "grad_norm": 0.019637102261185646, "learning_rate": 1.9820811965878024e-05, "loss": 0.7671, "step": 194 }, { "epoch": 0.16009852216748768, "grad_norm": 0.01893172413110733, "learning_rate": 1.9818322191902592e-05, "loss": 0.7763, "step": 195 }, { "epoch": 0.16091954022988506, "grad_norm": 0.019183963537216187, "learning_rate": 1.9815815398229222e-05, "loss": 0.7646, "step": 196 }, { "epoch": 0.16174055829228243, "grad_norm": 0.02789168246090412, "learning_rate": 1.9813291589203786e-05, "loss": 0.7683, "step": 197 }, { "epoch": 0.1625615763546798, "grad_norm": 0.019427109509706497, "learning_rate": 1.9810750769201655e-05, "loss": 0.7722, "step": 198 }, { "epoch": 0.16338259441707717, "grad_norm": 0.01943468302488327, "learning_rate": 1.98081929426277e-05, "loss": 0.7807, "step": 199 }, { "epoch": 0.16420361247947454, "grad_norm": 0.018427614122629166, "learning_rate": 1.9805618113916267e-05, "loss": 0.7328, "step": 200 }, { "epoch": 0.16502463054187191, "grad_norm": 0.01850549317896366, "learning_rate": 1.9803026287531183e-05, "loss": 0.7608, "step": 201 }, { "epoch": 0.16584564860426929, "grad_norm": 0.018799586221575737, "learning_rate": 1.980041746796574e-05, "loss": 0.7586, "step": 202 }, { "epoch": 0.16666666666666666, "grad_norm": 0.016580747440457344, "learning_rate": 1.979779165974269e-05, "loss": 0.7956, "step": 203 }, { "epoch": 0.16748768472906403, "grad_norm": 0.01716994307935238, "learning_rate": 1.979514886741424e-05, "loss": 0.7628, "step": 204 }, { "epoch": 0.1683087027914614, "grad_norm": 0.017782388255000114, "learning_rate": 1.9792489095562037e-05, "loss": 0.7913, "step": 205 }, { "epoch": 0.16912972085385877, "grad_norm": 0.017343951389193535, "learning_rate": 1.9789812348797167e-05, "loss": 0.7655, "step": 206 }, { "epoch": 0.16995073891625614, "grad_norm": 0.01779538206756115, "learning_rate": 1.978711863176015e-05, "loss": 0.7898, "step": 207 }, { "epoch": 0.17077175697865354, "grad_norm": 0.01693977788090706, "learning_rate": 1.9784407949120908e-05, "loss": 0.7773, "step": 208 }, { "epoch": 0.17159277504105092, "grad_norm": 0.01755443774163723, "learning_rate": 1.97816803055788e-05, "loss": 0.7689, "step": 209 }, { "epoch": 0.1724137931034483, "grad_norm": 0.017623629420995712, "learning_rate": 1.977893570586257e-05, "loss": 0.7652, "step": 210 }, { "epoch": 0.17323481116584566, "grad_norm": 0.017354611307382584, "learning_rate": 1.9776174154730362e-05, "loss": 0.7521, "step": 211 }, { "epoch": 0.17405582922824303, "grad_norm": 0.018976280465722084, "learning_rate": 1.977339565696972e-05, "loss": 0.7882, "step": 212 }, { "epoch": 0.1748768472906404, "grad_norm": 0.017396554350852966, "learning_rate": 1.9770600217397552e-05, "loss": 0.7597, "step": 213 }, { "epoch": 0.17569786535303777, "grad_norm": 0.017672264948487282, "learning_rate": 1.976778784086014e-05, "loss": 0.7653, "step": 214 }, { "epoch": 0.17651888341543515, "grad_norm": 0.01836102455854416, "learning_rate": 1.9764958532233132e-05, "loss": 0.7683, "step": 215 }, { "epoch": 0.17733990147783252, "grad_norm": 0.018145913258194923, "learning_rate": 1.9762112296421533e-05, "loss": 0.7409, "step": 216 }, { "epoch": 0.1781609195402299, "grad_norm": 0.017084160819649696, "learning_rate": 1.9759249138359687e-05, "loss": 0.7435, "step": 217 }, { "epoch": 0.17898193760262726, "grad_norm": 0.017151299864053726, "learning_rate": 1.9756369063011277e-05, "loss": 0.7532, "step": 218 }, { "epoch": 0.17980295566502463, "grad_norm": 0.018525032326579094, "learning_rate": 1.975347207536932e-05, "loss": 0.7658, "step": 219 }, { "epoch": 0.180623973727422, "grad_norm": 0.01792103610932827, "learning_rate": 1.9750558180456146e-05, "loss": 0.7532, "step": 220 }, { "epoch": 0.18144499178981938, "grad_norm": 0.017167169600725174, "learning_rate": 1.97476273833234e-05, "loss": 0.7678, "step": 221 }, { "epoch": 0.18226600985221675, "grad_norm": 0.01852918043732643, "learning_rate": 1.974467968905202e-05, "loss": 0.7467, "step": 222 }, { "epoch": 0.18308702791461412, "grad_norm": 0.01723858341574669, "learning_rate": 1.9741715102752254e-05, "loss": 0.7641, "step": 223 }, { "epoch": 0.1839080459770115, "grad_norm": 0.0166771300137043, "learning_rate": 1.9738733629563623e-05, "loss": 0.7581, "step": 224 }, { "epoch": 0.18472906403940886, "grad_norm": 0.017830727621912956, "learning_rate": 1.973573527465493e-05, "loss": 0.7514, "step": 225 }, { "epoch": 0.18555008210180624, "grad_norm": 0.01702488772571087, "learning_rate": 1.9732720043224237e-05, "loss": 0.7605, "step": 226 }, { "epoch": 0.1863711001642036, "grad_norm": 0.01843581534922123, "learning_rate": 1.972968794049887e-05, "loss": 0.7472, "step": 227 }, { "epoch": 0.18719211822660098, "grad_norm": 0.01715726964175701, "learning_rate": 1.9726638971735403e-05, "loss": 0.781, "step": 228 }, { "epoch": 0.18801313628899835, "grad_norm": 0.017639420926570892, "learning_rate": 1.972357314221965e-05, "loss": 0.7383, "step": 229 }, { "epoch": 0.18883415435139572, "grad_norm": 0.01681230403482914, "learning_rate": 1.972049045726665e-05, "loss": 0.7352, "step": 230 }, { "epoch": 0.1896551724137931, "grad_norm": 0.0180879607796669, "learning_rate": 1.971739092222067e-05, "loss": 0.7558, "step": 231 }, { "epoch": 0.19047619047619047, "grad_norm": 0.01861249841749668, "learning_rate": 1.9714274542455184e-05, "loss": 0.7644, "step": 232 }, { "epoch": 0.19129720853858784, "grad_norm": 0.01822645403444767, "learning_rate": 1.9711141323372877e-05, "loss": 0.7626, "step": 233 }, { "epoch": 0.1921182266009852, "grad_norm": 0.01956329122185707, "learning_rate": 1.9707991270405616e-05, "loss": 0.7721, "step": 234 }, { "epoch": 0.19293924466338258, "grad_norm": 0.017874253913760185, "learning_rate": 1.9704824389014464e-05, "loss": 0.7383, "step": 235 }, { "epoch": 0.19376026272577998, "grad_norm": 0.02042638696730137, "learning_rate": 1.9701640684689644e-05, "loss": 0.7734, "step": 236 }, { "epoch": 0.19458128078817735, "grad_norm": 0.018782762810587883, "learning_rate": 1.9698440162950556e-05, "loss": 0.743, "step": 237 }, { "epoch": 0.19540229885057472, "grad_norm": 0.019306477159261703, "learning_rate": 1.969522282934575e-05, "loss": 0.7382, "step": 238 }, { "epoch": 0.1962233169129721, "grad_norm": 0.019744873046875, "learning_rate": 1.9691988689452924e-05, "loss": 0.7603, "step": 239 }, { "epoch": 0.19704433497536947, "grad_norm": 0.020885009318590164, "learning_rate": 1.9688737748878906e-05, "loss": 0.7705, "step": 240 }, { "epoch": 0.19786535303776684, "grad_norm": 0.018523551523685455, "learning_rate": 1.9685470013259665e-05, "loss": 0.752, "step": 241 }, { "epoch": 0.1986863711001642, "grad_norm": 0.020956575870513916, "learning_rate": 1.9682185488260265e-05, "loss": 0.7519, "step": 242 }, { "epoch": 0.19950738916256158, "grad_norm": 0.020481063053011894, "learning_rate": 1.9678884179574897e-05, "loss": 0.7511, "step": 243 }, { "epoch": 0.20032840722495895, "grad_norm": 0.01720617711544037, "learning_rate": 1.9675566092926835e-05, "loss": 0.7423, "step": 244 }, { "epoch": 0.20114942528735633, "grad_norm": 0.05303420498967171, "learning_rate": 1.9672231234068447e-05, "loss": 0.7554, "step": 245 }, { "epoch": 0.2019704433497537, "grad_norm": 0.017836442217230797, "learning_rate": 1.966887960878118e-05, "loss": 0.7828, "step": 246 }, { "epoch": 0.20279146141215107, "grad_norm": 0.016690334305167198, "learning_rate": 1.9665511222875534e-05, "loss": 0.7283, "step": 247 }, { "epoch": 0.20361247947454844, "grad_norm": 0.0185385849326849, "learning_rate": 1.966212608219109e-05, "loss": 0.7821, "step": 248 }, { "epoch": 0.2044334975369458, "grad_norm": 0.01794321835041046, "learning_rate": 1.9658724192596455e-05, "loss": 0.7783, "step": 249 }, { "epoch": 0.20525451559934318, "grad_norm": 0.02011408470571041, "learning_rate": 1.9655305559989272e-05, "loss": 0.7449, "step": 250 }, { "epoch": 0.20607553366174056, "grad_norm": 0.017536552622914314, "learning_rate": 1.965187019029623e-05, "loss": 0.7579, "step": 251 }, { "epoch": 0.20689655172413793, "grad_norm": 0.018185539171099663, "learning_rate": 1.9648418089473014e-05, "loss": 0.7258, "step": 252 }, { "epoch": 0.2077175697865353, "grad_norm": 0.017615322023630142, "learning_rate": 1.9644949263504323e-05, "loss": 0.7568, "step": 253 }, { "epoch": 0.20853858784893267, "grad_norm": 0.017303621396422386, "learning_rate": 1.9641463718403856e-05, "loss": 0.7706, "step": 254 }, { "epoch": 0.20935960591133004, "grad_norm": 0.017102226614952087, "learning_rate": 1.9637961460214285e-05, "loss": 0.7614, "step": 255 }, { "epoch": 0.21018062397372742, "grad_norm": 0.016364838927984238, "learning_rate": 1.963444249500727e-05, "loss": 0.754, "step": 256 }, { "epoch": 0.2110016420361248, "grad_norm": 0.017316818237304688, "learning_rate": 1.963090682888342e-05, "loss": 0.7364, "step": 257 }, { "epoch": 0.21182266009852216, "grad_norm": 0.01698889397084713, "learning_rate": 1.9627354467972315e-05, "loss": 0.7256, "step": 258 }, { "epoch": 0.21264367816091953, "grad_norm": 0.017812803387641907, "learning_rate": 1.9623785418432462e-05, "loss": 0.7531, "step": 259 }, { "epoch": 0.2134646962233169, "grad_norm": 0.018814606592059135, "learning_rate": 1.962019968645131e-05, "loss": 0.7545, "step": 260 }, { "epoch": 0.21428571428571427, "grad_norm": 0.01808818429708481, "learning_rate": 1.961659727824522e-05, "loss": 0.7572, "step": 261 }, { "epoch": 0.21510673234811165, "grad_norm": 0.017670316621661186, "learning_rate": 1.961297820005948e-05, "loss": 0.7473, "step": 262 }, { "epoch": 0.21592775041050905, "grad_norm": 0.018272770568728447, "learning_rate": 1.9609342458168253e-05, "loss": 0.7364, "step": 263 }, { "epoch": 0.21674876847290642, "grad_norm": 0.018436534330248833, "learning_rate": 1.9605690058874614e-05, "loss": 0.7597, "step": 264 }, { "epoch": 0.2175697865353038, "grad_norm": 0.020006582140922546, "learning_rate": 1.9602021008510506e-05, "loss": 0.7471, "step": 265 }, { "epoch": 0.21839080459770116, "grad_norm": 0.017969874665141106, "learning_rate": 1.959833531343674e-05, "loss": 0.7522, "step": 266 }, { "epoch": 0.21921182266009853, "grad_norm": 0.019225740805268288, "learning_rate": 1.9594632980042983e-05, "loss": 0.7621, "step": 267 }, { "epoch": 0.2200328407224959, "grad_norm": 0.01800500974059105, "learning_rate": 1.9590914014747742e-05, "loss": 0.7601, "step": 268 }, { "epoch": 0.22085385878489328, "grad_norm": 0.017222722992300987, "learning_rate": 1.958717842399837e-05, "loss": 0.7546, "step": 269 }, { "epoch": 0.22167487684729065, "grad_norm": 0.018958937376737595, "learning_rate": 1.9583426214271022e-05, "loss": 0.7611, "step": 270 }, { "epoch": 0.22249589490968802, "grad_norm": 0.01793014258146286, "learning_rate": 1.957965739207069e-05, "loss": 0.7415, "step": 271 }, { "epoch": 0.2233169129720854, "grad_norm": 0.017088264226913452, "learning_rate": 1.957587196393115e-05, "loss": 0.7456, "step": 272 }, { "epoch": 0.22413793103448276, "grad_norm": 0.01903565786778927, "learning_rate": 1.9572069936414964e-05, "loss": 0.7541, "step": 273 }, { "epoch": 0.22495894909688013, "grad_norm": 0.01736553944647312, "learning_rate": 1.9568251316113485e-05, "loss": 0.7411, "step": 274 }, { "epoch": 0.2257799671592775, "grad_norm": 0.018346663564443588, "learning_rate": 1.956441610964682e-05, "loss": 0.7721, "step": 275 }, { "epoch": 0.22660098522167488, "grad_norm": 0.017558882012963295, "learning_rate": 1.9560564323663837e-05, "loss": 0.7456, "step": 276 }, { "epoch": 0.22742200328407225, "grad_norm": 0.01856360025703907, "learning_rate": 1.9556695964842144e-05, "loss": 0.7683, "step": 277 }, { "epoch": 0.22824302134646962, "grad_norm": 0.018104780465364456, "learning_rate": 1.9552811039888076e-05, "loss": 0.7438, "step": 278 }, { "epoch": 0.229064039408867, "grad_norm": 0.018308473750948906, "learning_rate": 1.95489095555367e-05, "loss": 0.7311, "step": 279 }, { "epoch": 0.22988505747126436, "grad_norm": 0.01785454899072647, "learning_rate": 1.9544991518551777e-05, "loss": 0.7901, "step": 280 }, { "epoch": 0.23070607553366174, "grad_norm": 0.017216375097632408, "learning_rate": 1.9541056935725772e-05, "loss": 0.7612, "step": 281 }, { "epoch": 0.2315270935960591, "grad_norm": 0.018095433712005615, "learning_rate": 1.9537105813879837e-05, "loss": 0.74, "step": 282 }, { "epoch": 0.23234811165845648, "grad_norm": 0.01828164793550968, "learning_rate": 1.9533138159863793e-05, "loss": 0.7408, "step": 283 }, { "epoch": 0.23316912972085385, "grad_norm": 0.017393935471773148, "learning_rate": 1.9529153980556118e-05, "loss": 0.734, "step": 284 }, { "epoch": 0.23399014778325122, "grad_norm": 0.01744520664215088, "learning_rate": 1.9525153282863944e-05, "loss": 0.7597, "step": 285 }, { "epoch": 0.2348111658456486, "grad_norm": 0.018121015280485153, "learning_rate": 1.9521136073723044e-05, "loss": 0.7211, "step": 286 }, { "epoch": 0.23563218390804597, "grad_norm": 0.017998313531279564, "learning_rate": 1.95171023600978e-05, "loss": 0.7354, "step": 287 }, { "epoch": 0.23645320197044334, "grad_norm": 0.018213143572211266, "learning_rate": 1.9513052148981227e-05, "loss": 0.7184, "step": 288 }, { "epoch": 0.2372742200328407, "grad_norm": 0.020882774144411087, "learning_rate": 1.9508985447394927e-05, "loss": 0.7233, "step": 289 }, { "epoch": 0.23809523809523808, "grad_norm": 0.01900722272694111, "learning_rate": 1.9504902262389095e-05, "loss": 0.7418, "step": 290 }, { "epoch": 0.23891625615763548, "grad_norm": 0.01815676875412464, "learning_rate": 1.9500802601042504e-05, "loss": 0.7538, "step": 291 }, { "epoch": 0.23973727422003285, "grad_norm": 0.018912343308329582, "learning_rate": 1.9496686470462486e-05, "loss": 0.771, "step": 292 }, { "epoch": 0.24055829228243022, "grad_norm": 0.018609460443258286, "learning_rate": 1.949255387778493e-05, "loss": 0.7588, "step": 293 }, { "epoch": 0.2413793103448276, "grad_norm": 0.01930108666419983, "learning_rate": 1.948840483017426e-05, "loss": 0.7277, "step": 294 }, { "epoch": 0.24220032840722497, "grad_norm": 0.020578257739543915, "learning_rate": 1.948423933482343e-05, "loss": 0.7471, "step": 295 }, { "epoch": 0.24302134646962234, "grad_norm": 0.017388805747032166, "learning_rate": 1.948005739895391e-05, "loss": 0.743, "step": 296 }, { "epoch": 0.2438423645320197, "grad_norm": 0.019881686195731163, "learning_rate": 1.947585902981566e-05, "loss": 0.7568, "step": 297 }, { "epoch": 0.24466338259441708, "grad_norm": 0.017406996339559555, "learning_rate": 1.9471644234687153e-05, "loss": 0.7277, "step": 298 }, { "epoch": 0.24548440065681446, "grad_norm": 0.018607793375849724, "learning_rate": 1.9467413020875315e-05, "loss": 0.7701, "step": 299 }, { "epoch": 0.24630541871921183, "grad_norm": 0.017961794510483742, "learning_rate": 1.9463165395715546e-05, "loss": 0.7442, "step": 300 }, { "epoch": 0.2471264367816092, "grad_norm": 0.018925359472632408, "learning_rate": 1.94589013665717e-05, "loss": 0.7786, "step": 301 }, { "epoch": 0.24794745484400657, "grad_norm": 0.01819562539458275, "learning_rate": 1.9454620940836066e-05, "loss": 0.7373, "step": 302 }, { "epoch": 0.24876847290640394, "grad_norm": 0.01819402165710926, "learning_rate": 1.945032412592936e-05, "loss": 0.7662, "step": 303 }, { "epoch": 0.24958949096880131, "grad_norm": 0.018124252557754517, "learning_rate": 1.9446010929300704e-05, "loss": 0.7447, "step": 304 }, { "epoch": 0.2504105090311987, "grad_norm": 0.018326397985219955, "learning_rate": 1.9441681358427635e-05, "loss": 0.7411, "step": 305 }, { "epoch": 0.2512315270935961, "grad_norm": 0.01862204633653164, "learning_rate": 1.9437335420816065e-05, "loss": 0.735, "step": 306 }, { "epoch": 0.25205254515599346, "grad_norm": 0.017028551548719406, "learning_rate": 1.943297312400028e-05, "loss": 0.7223, "step": 307 }, { "epoch": 0.25287356321839083, "grad_norm": 0.017043277621269226, "learning_rate": 1.9428594475542934e-05, "loss": 0.757, "step": 308 }, { "epoch": 0.2536945812807882, "grad_norm": 0.017166845500469208, "learning_rate": 1.9424199483035026e-05, "loss": 0.739, "step": 309 }, { "epoch": 0.2545155993431856, "grad_norm": 0.01831471361219883, "learning_rate": 1.941978815409588e-05, "loss": 0.7408, "step": 310 }, { "epoch": 0.25533661740558294, "grad_norm": 0.01768493466079235, "learning_rate": 1.9415360496373163e-05, "loss": 0.7577, "step": 311 }, { "epoch": 0.2561576354679803, "grad_norm": 0.07712123543024063, "learning_rate": 1.9410916517542827e-05, "loss": 0.7267, "step": 312 }, { "epoch": 0.2569786535303777, "grad_norm": 0.017748819664120674, "learning_rate": 1.9406456225309142e-05, "loss": 0.7272, "step": 313 }, { "epoch": 0.25779967159277506, "grad_norm": 0.02894209511578083, "learning_rate": 1.9401979627404633e-05, "loss": 0.7253, "step": 314 }, { "epoch": 0.25862068965517243, "grad_norm": 0.016417548060417175, "learning_rate": 1.9397486731590112e-05, "loss": 0.7303, "step": 315 }, { "epoch": 0.2594417077175698, "grad_norm": 0.02018548734486103, "learning_rate": 1.9392977545654643e-05, "loss": 0.7287, "step": 316 }, { "epoch": 0.2602627257799672, "grad_norm": 0.018340161070227623, "learning_rate": 1.938845207741553e-05, "loss": 0.7467, "step": 317 }, { "epoch": 0.26108374384236455, "grad_norm": 0.017918815836310387, "learning_rate": 1.93839103347183e-05, "loss": 0.7723, "step": 318 }, { "epoch": 0.2619047619047619, "grad_norm": 0.01907137595117092, "learning_rate": 1.93793523254367e-05, "loss": 0.7453, "step": 319 }, { "epoch": 0.2627257799671593, "grad_norm": 0.017367763444781303, "learning_rate": 1.9374778057472676e-05, "loss": 0.7435, "step": 320 }, { "epoch": 0.26354679802955666, "grad_norm": 0.01824215054512024, "learning_rate": 1.9370187538756354e-05, "loss": 0.6991, "step": 321 }, { "epoch": 0.26436781609195403, "grad_norm": 0.01871606521308422, "learning_rate": 1.9365580777246043e-05, "loss": 0.7024, "step": 322 }, { "epoch": 0.2651888341543514, "grad_norm": 0.01713935285806656, "learning_rate": 1.936095778092821e-05, "loss": 0.7511, "step": 323 }, { "epoch": 0.2660098522167488, "grad_norm": 0.020226337015628815, "learning_rate": 1.9356318557817463e-05, "loss": 0.7469, "step": 324 }, { "epoch": 0.26683087027914615, "grad_norm": 0.02122541144490242, "learning_rate": 1.935166311595654e-05, "loss": 0.7221, "step": 325 }, { "epoch": 0.2676518883415435, "grad_norm": 0.018967485055327415, "learning_rate": 1.9346991463416294e-05, "loss": 0.7167, "step": 326 }, { "epoch": 0.2684729064039409, "grad_norm": 0.018821313977241516, "learning_rate": 1.9342303608295696e-05, "loss": 0.7111, "step": 327 }, { "epoch": 0.26929392446633826, "grad_norm": 0.016692837700247765, "learning_rate": 1.9337599558721797e-05, "loss": 0.7394, "step": 328 }, { "epoch": 0.27011494252873564, "grad_norm": 0.018485937267541885, "learning_rate": 1.9332879322849717e-05, "loss": 0.7298, "step": 329 }, { "epoch": 0.270935960591133, "grad_norm": 0.01673051156103611, "learning_rate": 1.9328142908862643e-05, "loss": 0.7314, "step": 330 }, { "epoch": 0.2717569786535304, "grad_norm": 0.0210958831012249, "learning_rate": 1.9323390324971818e-05, "loss": 0.7545, "step": 331 }, { "epoch": 0.27257799671592775, "grad_norm": 0.017795182764530182, "learning_rate": 1.9318621579416502e-05, "loss": 0.7234, "step": 332 }, { "epoch": 0.2733990147783251, "grad_norm": 0.01925070397555828, "learning_rate": 1.9313836680463986e-05, "loss": 0.7328, "step": 333 }, { "epoch": 0.2742200328407225, "grad_norm": 0.018927501514554024, "learning_rate": 1.930903563640955e-05, "loss": 0.7337, "step": 334 }, { "epoch": 0.27504105090311987, "grad_norm": 0.0188999492675066, "learning_rate": 1.9304218455576488e-05, "loss": 0.7335, "step": 335 }, { "epoch": 0.27586206896551724, "grad_norm": 0.020573224872350693, "learning_rate": 1.9299385146316047e-05, "loss": 0.7235, "step": 336 }, { "epoch": 0.2766830870279146, "grad_norm": 0.02095266804099083, "learning_rate": 1.9294535717007442e-05, "loss": 0.7237, "step": 337 }, { "epoch": 0.277504105090312, "grad_norm": 0.02014998160302639, "learning_rate": 1.928967017605784e-05, "loss": 0.7174, "step": 338 }, { "epoch": 0.27832512315270935, "grad_norm": 0.02009834349155426, "learning_rate": 1.928478853190233e-05, "loss": 0.7492, "step": 339 }, { "epoch": 0.2791461412151067, "grad_norm": 0.021052444353699684, "learning_rate": 1.9279890793003932e-05, "loss": 0.7116, "step": 340 }, { "epoch": 0.2799671592775041, "grad_norm": 0.02818828821182251, "learning_rate": 1.9274976967853546e-05, "loss": 0.758, "step": 341 }, { "epoch": 0.28078817733990147, "grad_norm": 0.021207420155405998, "learning_rate": 1.9270047064969987e-05, "loss": 0.7054, "step": 342 }, { "epoch": 0.28160919540229884, "grad_norm": 0.019864482805132866, "learning_rate": 1.926510109289992e-05, "loss": 0.7002, "step": 343 }, { "epoch": 0.2824302134646962, "grad_norm": 0.02149053104221821, "learning_rate": 1.9260139060217885e-05, "loss": 0.7298, "step": 344 }, { "epoch": 0.2832512315270936, "grad_norm": 0.01982991024851799, "learning_rate": 1.9255160975526255e-05, "loss": 0.7253, "step": 345 }, { "epoch": 0.28407224958949095, "grad_norm": 0.02043965272605419, "learning_rate": 1.925016684745523e-05, "loss": 0.7121, "step": 346 }, { "epoch": 0.2848932676518883, "grad_norm": 0.018458595499396324, "learning_rate": 1.9245156684662836e-05, "loss": 0.739, "step": 347 }, { "epoch": 0.2857142857142857, "grad_norm": 0.022648988291621208, "learning_rate": 1.9240130495834884e-05, "loss": 0.7353, "step": 348 }, { "epoch": 0.28653530377668307, "grad_norm": 0.019444739446043968, "learning_rate": 1.923508828968498e-05, "loss": 0.7542, "step": 349 }, { "epoch": 0.28735632183908044, "grad_norm": 0.019751805812120438, "learning_rate": 1.9230030074954483e-05, "loss": 0.7387, "step": 350 }, { "epoch": 0.2881773399014778, "grad_norm": 0.017509428784251213, "learning_rate": 1.9224955860412517e-05, "loss": 0.7264, "step": 351 }, { "epoch": 0.2889983579638752, "grad_norm": 0.018732164055109024, "learning_rate": 1.9219865654855947e-05, "loss": 0.7429, "step": 352 }, { "epoch": 0.28981937602627256, "grad_norm": 0.06331244856119156, "learning_rate": 1.921475946710935e-05, "loss": 0.7085, "step": 353 }, { "epoch": 0.29064039408866993, "grad_norm": 0.01749587431550026, "learning_rate": 1.920963730602501e-05, "loss": 0.7103, "step": 354 }, { "epoch": 0.2914614121510673, "grad_norm": 0.016905365511775017, "learning_rate": 1.9204499180482913e-05, "loss": 0.7302, "step": 355 }, { "epoch": 0.2922824302134647, "grad_norm": 0.018366722390055656, "learning_rate": 1.919934509939072e-05, "loss": 0.7188, "step": 356 }, { "epoch": 0.29310344827586204, "grad_norm": 0.01711583510041237, "learning_rate": 1.9194175071683746e-05, "loss": 0.7231, "step": 357 }, { "epoch": 0.2939244663382594, "grad_norm": 0.018634028732776642, "learning_rate": 1.9188989106324955e-05, "loss": 0.7357, "step": 358 }, { "epoch": 0.2947454844006568, "grad_norm": 0.018125787377357483, "learning_rate": 1.9183787212304938e-05, "loss": 0.7033, "step": 359 }, { "epoch": 0.2955665024630542, "grad_norm": 0.04068536311388016, "learning_rate": 1.9178569398641916e-05, "loss": 0.7284, "step": 360 }, { "epoch": 0.2963875205254516, "grad_norm": 0.01987866871058941, "learning_rate": 1.9173335674381687e-05, "loss": 0.7385, "step": 361 }, { "epoch": 0.29720853858784896, "grad_norm": 0.017811084166169167, "learning_rate": 1.916808604859764e-05, "loss": 0.6942, "step": 362 }, { "epoch": 0.29802955665024633, "grad_norm": 0.01769954152405262, "learning_rate": 1.9162820530390744e-05, "loss": 0.7279, "step": 363 }, { "epoch": 0.2988505747126437, "grad_norm": 0.017293747514486313, "learning_rate": 1.9157539128889504e-05, "loss": 0.7037, "step": 364 }, { "epoch": 0.2996715927750411, "grad_norm": 0.018431762233376503, "learning_rate": 1.9152241853249964e-05, "loss": 0.7241, "step": 365 }, { "epoch": 0.30049261083743845, "grad_norm": 0.017725232988595963, "learning_rate": 1.9146928712655698e-05, "loss": 0.734, "step": 366 }, { "epoch": 0.3013136288998358, "grad_norm": 0.01857619546353817, "learning_rate": 1.914159971631777e-05, "loss": 0.7142, "step": 367 }, { "epoch": 0.3021346469622332, "grad_norm": 0.01750655099749565, "learning_rate": 1.913625487347474e-05, "loss": 0.7151, "step": 368 }, { "epoch": 0.30295566502463056, "grad_norm": 0.01792227104306221, "learning_rate": 1.913089419339264e-05, "loss": 0.743, "step": 369 }, { "epoch": 0.30377668308702793, "grad_norm": 0.018382202833890915, "learning_rate": 1.9125517685364954e-05, "loss": 0.7242, "step": 370 }, { "epoch": 0.3045977011494253, "grad_norm": 0.030260898172855377, "learning_rate": 1.9120125358712617e-05, "loss": 0.7075, "step": 371 }, { "epoch": 0.3054187192118227, "grad_norm": 0.01646268367767334, "learning_rate": 1.9114717222783968e-05, "loss": 0.696, "step": 372 }, { "epoch": 0.30623973727422005, "grad_norm": 0.017564821988344193, "learning_rate": 1.9109293286954777e-05, "loss": 0.7495, "step": 373 }, { "epoch": 0.3070607553366174, "grad_norm": 0.017239045351743698, "learning_rate": 1.9103853560628184e-05, "loss": 0.7376, "step": 374 }, { "epoch": 0.3078817733990148, "grad_norm": 0.018933456391096115, "learning_rate": 1.9098398053234717e-05, "loss": 0.744, "step": 375 }, { "epoch": 0.30870279146141216, "grad_norm": 0.017166370525956154, "learning_rate": 1.9092926774232267e-05, "loss": 0.7177, "step": 376 }, { "epoch": 0.30952380952380953, "grad_norm": 0.01894722692668438, "learning_rate": 1.9087439733106046e-05, "loss": 0.7383, "step": 377 }, { "epoch": 0.3103448275862069, "grad_norm": 0.01632995344698429, "learning_rate": 1.9081936939368615e-05, "loss": 0.7047, "step": 378 }, { "epoch": 0.3111658456486043, "grad_norm": 0.041332852095365524, "learning_rate": 1.907641840255983e-05, "loss": 0.7403, "step": 379 }, { "epoch": 0.31198686371100165, "grad_norm": 0.018109457567334175, "learning_rate": 1.9070884132246848e-05, "loss": 0.7602, "step": 380 }, { "epoch": 0.312807881773399, "grad_norm": 0.01651688478887081, "learning_rate": 1.9065334138024098e-05, "loss": 0.699, "step": 381 }, { "epoch": 0.3136288998357964, "grad_norm": 0.01825406402349472, "learning_rate": 1.905976842951327e-05, "loss": 0.7283, "step": 382 }, { "epoch": 0.31444991789819376, "grad_norm": 0.01836152747273445, "learning_rate": 1.9054187016363293e-05, "loss": 0.7403, "step": 383 }, { "epoch": 0.31527093596059114, "grad_norm": 0.017918312922120094, "learning_rate": 1.9048589908250325e-05, "loss": 0.749, "step": 384 }, { "epoch": 0.3160919540229885, "grad_norm": 0.018564963713288307, "learning_rate": 1.904297711487774e-05, "loss": 0.7295, "step": 385 }, { "epoch": 0.3169129720853859, "grad_norm": 0.018025949597358704, "learning_rate": 1.90373486459761e-05, "loss": 0.7079, "step": 386 }, { "epoch": 0.31773399014778325, "grad_norm": 0.018403654918074608, "learning_rate": 1.903170451130313e-05, "loss": 0.7185, "step": 387 }, { "epoch": 0.3185550082101806, "grad_norm": 0.016878578811883926, "learning_rate": 1.9026044720643733e-05, "loss": 0.7269, "step": 388 }, { "epoch": 0.319376026272578, "grad_norm": 0.01893213763833046, "learning_rate": 1.9020369283809946e-05, "loss": 0.7414, "step": 389 }, { "epoch": 0.32019704433497537, "grad_norm": 0.019158078357577324, "learning_rate": 1.901467821064093e-05, "loss": 0.7209, "step": 390 }, { "epoch": 0.32101806239737274, "grad_norm": 0.017097778618335724, "learning_rate": 1.900897151100295e-05, "loss": 0.7406, "step": 391 }, { "epoch": 0.3218390804597701, "grad_norm": 0.01854608580470085, "learning_rate": 1.9003249194789374e-05, "loss": 0.7182, "step": 392 }, { "epoch": 0.3226600985221675, "grad_norm": 0.019103560596704483, "learning_rate": 1.8997511271920625e-05, "loss": 0.7024, "step": 393 }, { "epoch": 0.32348111658456485, "grad_norm": 0.018994774669408798, "learning_rate": 1.8991757752344204e-05, "loss": 0.7465, "step": 394 }, { "epoch": 0.3243021346469622, "grad_norm": 0.01828889362514019, "learning_rate": 1.8985988646034636e-05, "loss": 0.7135, "step": 395 }, { "epoch": 0.3251231527093596, "grad_norm": 0.01696670427918434, "learning_rate": 1.898020396299347e-05, "loss": 0.7343, "step": 396 }, { "epoch": 0.32594417077175697, "grad_norm": 0.017366508021950722, "learning_rate": 1.897440371324926e-05, "loss": 0.6983, "step": 397 }, { "epoch": 0.32676518883415434, "grad_norm": 0.017792167142033577, "learning_rate": 1.8968587906857554e-05, "loss": 0.7114, "step": 398 }, { "epoch": 0.3275862068965517, "grad_norm": 0.018296966329216957, "learning_rate": 1.8962756553900862e-05, "loss": 0.7196, "step": 399 }, { "epoch": 0.3284072249589491, "grad_norm": 0.017503276467323303, "learning_rate": 1.8956909664488642e-05, "loss": 0.7134, "step": 400 }, { "epoch": 0.32922824302134646, "grad_norm": 0.018446452915668488, "learning_rate": 1.89510472487573e-05, "loss": 0.7232, "step": 401 }, { "epoch": 0.33004926108374383, "grad_norm": 0.017988353967666626, "learning_rate": 1.8945169316870155e-05, "loss": 0.7195, "step": 402 }, { "epoch": 0.3308702791461412, "grad_norm": 0.016692044213414192, "learning_rate": 1.8939275879017408e-05, "loss": 0.7426, "step": 403 }, { "epoch": 0.33169129720853857, "grad_norm": 0.01764729805290699, "learning_rate": 1.8933366945416167e-05, "loss": 0.7116, "step": 404 }, { "epoch": 0.33251231527093594, "grad_norm": 0.017176272347569466, "learning_rate": 1.8927442526310392e-05, "loss": 0.7034, "step": 405 }, { "epoch": 0.3333333333333333, "grad_norm": 0.019005002453923225, "learning_rate": 1.8921502631970886e-05, "loss": 0.7074, "step": 406 }, { "epoch": 0.3341543513957307, "grad_norm": 0.01888890378177166, "learning_rate": 1.891554727269529e-05, "loss": 0.6997, "step": 407 }, { "epoch": 0.33497536945812806, "grad_norm": 0.01684563420712948, "learning_rate": 1.8909576458808044e-05, "loss": 0.7105, "step": 408 }, { "epoch": 0.33579638752052543, "grad_norm": 0.018028918653726578, "learning_rate": 1.8903590200660393e-05, "loss": 0.7048, "step": 409 }, { "epoch": 0.3366174055829228, "grad_norm": 0.01777043007314205, "learning_rate": 1.8897588508630346e-05, "loss": 0.7243, "step": 410 }, { "epoch": 0.3374384236453202, "grad_norm": 0.017991749569773674, "learning_rate": 1.889157139312268e-05, "loss": 0.6918, "step": 411 }, { "epoch": 0.33825944170771755, "grad_norm": 0.017815804108977318, "learning_rate": 1.8885538864568896e-05, "loss": 0.7373, "step": 412 }, { "epoch": 0.3390804597701149, "grad_norm": 1.5692856311798096, "learning_rate": 1.887949093342723e-05, "loss": 0.7339, "step": 413 }, { "epoch": 0.3399014778325123, "grad_norm": 0.018212290480732918, "learning_rate": 1.887342761018261e-05, "loss": 0.6992, "step": 414 }, { "epoch": 0.34072249589490966, "grad_norm": 0.018956903368234634, "learning_rate": 1.8867348905346662e-05, "loss": 0.7224, "step": 415 }, { "epoch": 0.3415435139573071, "grad_norm": 0.0180122759193182, "learning_rate": 1.8861254829457657e-05, "loss": 0.7289, "step": 416 }, { "epoch": 0.34236453201970446, "grad_norm": 0.017314651980996132, "learning_rate": 1.8855145393080535e-05, "loss": 0.7027, "step": 417 }, { "epoch": 0.34318555008210183, "grad_norm": 0.0186073686927557, "learning_rate": 1.8849020606806855e-05, "loss": 0.7226, "step": 418 }, { "epoch": 0.3440065681444992, "grad_norm": 0.016628842800855637, "learning_rate": 1.8842880481254784e-05, "loss": 0.7086, "step": 419 }, { "epoch": 0.3448275862068966, "grad_norm": 0.01828058250248432, "learning_rate": 1.883672502706909e-05, "loss": 0.732, "step": 420 }, { "epoch": 0.34564860426929395, "grad_norm": 0.016988927498459816, "learning_rate": 1.883055425492111e-05, "loss": 0.7089, "step": 421 }, { "epoch": 0.3464696223316913, "grad_norm": 0.017838910222053528, "learning_rate": 1.882436817550874e-05, "loss": 0.7083, "step": 422 }, { "epoch": 0.3472906403940887, "grad_norm": 0.018007835373282433, "learning_rate": 1.8818166799556414e-05, "loss": 0.6958, "step": 423 }, { "epoch": 0.34811165845648606, "grad_norm": 0.016725176945328712, "learning_rate": 1.8811950137815078e-05, "loss": 0.7337, "step": 424 }, { "epoch": 0.34893267651888343, "grad_norm": 0.01811240054666996, "learning_rate": 1.8805718201062187e-05, "loss": 0.7038, "step": 425 }, { "epoch": 0.3497536945812808, "grad_norm": 0.016960307955741882, "learning_rate": 1.8799471000101674e-05, "loss": 0.6906, "step": 426 }, { "epoch": 0.3505747126436782, "grad_norm": 0.01762058399617672, "learning_rate": 1.879320854576392e-05, "loss": 0.6943, "step": 427 }, { "epoch": 0.35139573070607555, "grad_norm": 0.020590325817465782, "learning_rate": 1.878693084890578e-05, "loss": 0.6897, "step": 428 }, { "epoch": 0.3522167487684729, "grad_norm": 0.01673879101872444, "learning_rate": 1.8780637920410513e-05, "loss": 0.7357, "step": 429 }, { "epoch": 0.3530377668308703, "grad_norm": 0.019154410809278488, "learning_rate": 1.8774329771187788e-05, "loss": 0.7133, "step": 430 }, { "epoch": 0.35385878489326766, "grad_norm": 0.01742209866642952, "learning_rate": 1.8768006412173656e-05, "loss": 0.7282, "step": 431 }, { "epoch": 0.35467980295566504, "grad_norm": 0.017900466918945312, "learning_rate": 1.8761667854330553e-05, "loss": 0.72, "step": 432 }, { "epoch": 0.3555008210180624, "grad_norm": 0.017765605822205544, "learning_rate": 1.8755314108647244e-05, "loss": 0.7104, "step": 433 }, { "epoch": 0.3563218390804598, "grad_norm": 0.01766427978873253, "learning_rate": 1.8748945186138837e-05, "loss": 0.7027, "step": 434 }, { "epoch": 0.35714285714285715, "grad_norm": 0.019235454499721527, "learning_rate": 1.874256109784675e-05, "loss": 0.7162, "step": 435 }, { "epoch": 0.3579638752052545, "grad_norm": 0.016227075830101967, "learning_rate": 1.873616185483869e-05, "loss": 0.7153, "step": 436 }, { "epoch": 0.3587848932676519, "grad_norm": 0.018999559804797173, "learning_rate": 1.8729747468208635e-05, "loss": 0.7374, "step": 437 }, { "epoch": 0.35960591133004927, "grad_norm": 0.01700562983751297, "learning_rate": 1.8723317949076823e-05, "loss": 0.6881, "step": 438 }, { "epoch": 0.36042692939244664, "grad_norm": 0.018006272614002228, "learning_rate": 1.8716873308589722e-05, "loss": 0.6931, "step": 439 }, { "epoch": 0.361247947454844, "grad_norm": 0.016939446330070496, "learning_rate": 1.8710413557920008e-05, "loss": 0.7084, "step": 440 }, { "epoch": 0.3620689655172414, "grad_norm": 0.019490262493491173, "learning_rate": 1.8703938708266575e-05, "loss": 0.7437, "step": 441 }, { "epoch": 0.36288998357963875, "grad_norm": 0.016849994659423828, "learning_rate": 1.8697448770854466e-05, "loss": 0.7049, "step": 442 }, { "epoch": 0.3637110016420361, "grad_norm": 0.01825053058564663, "learning_rate": 1.8690943756934898e-05, "loss": 0.6998, "step": 443 }, { "epoch": 0.3645320197044335, "grad_norm": 0.01664423570036888, "learning_rate": 1.8684423677785217e-05, "loss": 0.7266, "step": 444 }, { "epoch": 0.36535303776683087, "grad_norm": 0.017769193276762962, "learning_rate": 1.8677888544708894e-05, "loss": 0.7063, "step": 445 }, { "epoch": 0.36617405582922824, "grad_norm": 0.016835421323776245, "learning_rate": 1.8671338369035493e-05, "loss": 0.7062, "step": 446 }, { "epoch": 0.3669950738916256, "grad_norm": 0.018016424030065536, "learning_rate": 1.8664773162120657e-05, "loss": 0.7063, "step": 447 }, { "epoch": 0.367816091954023, "grad_norm": 0.017350900918245316, "learning_rate": 1.8658192935346088e-05, "loss": 0.7011, "step": 448 }, { "epoch": 0.36863711001642036, "grad_norm": 0.016105959191918373, "learning_rate": 1.865159770011953e-05, "loss": 0.6992, "step": 449 }, { "epoch": 0.3694581280788177, "grad_norm": 0.01851324737071991, "learning_rate": 1.864498746787474e-05, "loss": 0.7058, "step": 450 }, { "epoch": 0.3702791461412151, "grad_norm": 0.01893213577568531, "learning_rate": 1.863836225007148e-05, "loss": 0.7216, "step": 451 }, { "epoch": 0.37110016420361247, "grad_norm": 0.017081711441278458, "learning_rate": 1.863172205819549e-05, "loss": 0.6967, "step": 452 }, { "epoch": 0.37192118226600984, "grad_norm": 0.019305530935525894, "learning_rate": 1.8625066903758465e-05, "loss": 0.7272, "step": 453 }, { "epoch": 0.3727422003284072, "grad_norm": 0.01726316288113594, "learning_rate": 1.861839679829805e-05, "loss": 0.71, "step": 454 }, { "epoch": 0.3735632183908046, "grad_norm": 0.019272828474640846, "learning_rate": 1.86117117533778e-05, "loss": 0.7017, "step": 455 }, { "epoch": 0.37438423645320196, "grad_norm": 0.01670173369348049, "learning_rate": 1.8605011780587176e-05, "loss": 0.696, "step": 456 }, { "epoch": 0.37520525451559933, "grad_norm": 0.016480784863233566, "learning_rate": 1.8598296891541507e-05, "loss": 0.7011, "step": 457 }, { "epoch": 0.3760262725779967, "grad_norm": 0.016828998923301697, "learning_rate": 1.8591567097882002e-05, "loss": 0.7165, "step": 458 }, { "epoch": 0.3768472906403941, "grad_norm": 0.018435994163155556, "learning_rate": 1.8584822411275688e-05, "loss": 0.7338, "step": 459 }, { "epoch": 0.37766830870279144, "grad_norm": 0.01648596301674843, "learning_rate": 1.857806284341543e-05, "loss": 0.7268, "step": 460 }, { "epoch": 0.3784893267651888, "grad_norm": 0.017949406057596207, "learning_rate": 1.8571288406019873e-05, "loss": 0.6979, "step": 461 }, { "epoch": 0.3793103448275862, "grad_norm": 0.016292806714773178, "learning_rate": 1.8564499110833457e-05, "loss": 0.6755, "step": 462 }, { "epoch": 0.38013136288998356, "grad_norm": 0.018525758758187294, "learning_rate": 1.8557694969626364e-05, "loss": 0.6975, "step": 463 }, { "epoch": 0.38095238095238093, "grad_norm": 0.029960710555315018, "learning_rate": 1.855087599419453e-05, "loss": 0.7016, "step": 464 }, { "epoch": 0.3817733990147783, "grad_norm": 0.020171433687210083, "learning_rate": 1.8544042196359593e-05, "loss": 0.7387, "step": 465 }, { "epoch": 0.3825944170771757, "grad_norm": 0.018705761060118675, "learning_rate": 1.8537193587968906e-05, "loss": 0.7173, "step": 466 }, { "epoch": 0.38341543513957305, "grad_norm": 0.017162490636110306, "learning_rate": 1.853033018089548e-05, "loss": 0.6939, "step": 467 }, { "epoch": 0.3842364532019704, "grad_norm": 0.017675748094916344, "learning_rate": 1.8523451987037985e-05, "loss": 0.7326, "step": 468 }, { "epoch": 0.3850574712643678, "grad_norm": 0.01910443976521492, "learning_rate": 1.8516559018320743e-05, "loss": 0.7218, "step": 469 }, { "epoch": 0.38587848932676516, "grad_norm": 0.01660807989537716, "learning_rate": 1.850965128669366e-05, "loss": 0.6793, "step": 470 }, { "epoch": 0.3866995073891626, "grad_norm": 0.01925588585436344, "learning_rate": 1.8502728804132265e-05, "loss": 0.7142, "step": 471 }, { "epoch": 0.38752052545155996, "grad_norm": 0.01841765083372593, "learning_rate": 1.8495791582637646e-05, "loss": 0.6736, "step": 472 }, { "epoch": 0.38834154351395733, "grad_norm": 0.017147410660982132, "learning_rate": 1.8488839634236436e-05, "loss": 0.7105, "step": 473 }, { "epoch": 0.3891625615763547, "grad_norm": 0.10528808832168579, "learning_rate": 1.848187297098082e-05, "loss": 0.7187, "step": 474 }, { "epoch": 0.3899835796387521, "grad_norm": 0.016844168305397034, "learning_rate": 1.8474891604948475e-05, "loss": 0.6879, "step": 475 }, { "epoch": 0.39080459770114945, "grad_norm": 0.016894148662686348, "learning_rate": 1.8467895548242573e-05, "loss": 0.7101, "step": 476 }, { "epoch": 0.3916256157635468, "grad_norm": 0.016782978549599648, "learning_rate": 1.8460884812991755e-05, "loss": 0.7358, "step": 477 }, { "epoch": 0.3924466338259442, "grad_norm": 0.017482850700616837, "learning_rate": 1.8453859411350107e-05, "loss": 0.6979, "step": 478 }, { "epoch": 0.39326765188834156, "grad_norm": 0.017027592286467552, "learning_rate": 1.8446819355497146e-05, "loss": 0.7281, "step": 479 }, { "epoch": 0.39408866995073893, "grad_norm": 0.017223374918103218, "learning_rate": 1.8439764657637787e-05, "loss": 0.6953, "step": 480 }, { "epoch": 0.3949096880131363, "grad_norm": 0.01746206358075142, "learning_rate": 1.8432695330002336e-05, "loss": 0.7192, "step": 481 }, { "epoch": 0.3957307060755337, "grad_norm": 0.017568418756127357, "learning_rate": 1.8425611384846467e-05, "loss": 0.7144, "step": 482 }, { "epoch": 0.39655172413793105, "grad_norm": 0.015782440081238747, "learning_rate": 1.841851283445117e-05, "loss": 0.7005, "step": 483 }, { "epoch": 0.3973727422003284, "grad_norm": 0.01822992041707039, "learning_rate": 1.8411399691122786e-05, "loss": 0.7303, "step": 484 }, { "epoch": 0.3981937602627258, "grad_norm": 0.01773962564766407, "learning_rate": 1.8404271967192936e-05, "loss": 0.7131, "step": 485 }, { "epoch": 0.39901477832512317, "grad_norm": 0.017363429069519043, "learning_rate": 1.839712967501853e-05, "loss": 0.6988, "step": 486 }, { "epoch": 0.39983579638752054, "grad_norm": 0.016964895650744438, "learning_rate": 1.838997282698172e-05, "loss": 0.7063, "step": 487 }, { "epoch": 0.4006568144499179, "grad_norm": 0.017347659915685654, "learning_rate": 1.8382801435489903e-05, "loss": 0.6928, "step": 488 }, { "epoch": 0.4014778325123153, "grad_norm": 0.016964338719844818, "learning_rate": 1.8375615512975694e-05, "loss": 0.6841, "step": 489 }, { "epoch": 0.40229885057471265, "grad_norm": 0.01606549508869648, "learning_rate": 1.836841507189688e-05, "loss": 0.6756, "step": 490 }, { "epoch": 0.40311986863711, "grad_norm": 0.01882852055132389, "learning_rate": 1.8361200124736443e-05, "loss": 0.7281, "step": 491 }, { "epoch": 0.4039408866995074, "grad_norm": 0.016065390780568123, "learning_rate": 1.8353970684002492e-05, "loss": 0.7075, "step": 492 }, { "epoch": 0.40476190476190477, "grad_norm": 0.01775914430618286, "learning_rate": 1.8346726762228276e-05, "loss": 0.7217, "step": 493 }, { "epoch": 0.40558292282430214, "grad_norm": 0.01623908244073391, "learning_rate": 1.833946837197214e-05, "loss": 0.7067, "step": 494 }, { "epoch": 0.4064039408866995, "grad_norm": 0.01873624511063099, "learning_rate": 1.8332195525817518e-05, "loss": 0.7143, "step": 495 }, { "epoch": 0.4072249589490969, "grad_norm": 0.01659097708761692, "learning_rate": 1.832490823637291e-05, "loss": 0.6933, "step": 496 }, { "epoch": 0.40804597701149425, "grad_norm": 0.01750343292951584, "learning_rate": 1.831760651627184e-05, "loss": 0.7071, "step": 497 }, { "epoch": 0.4088669950738916, "grad_norm": 0.017589328810572624, "learning_rate": 1.831029037817286e-05, "loss": 0.6811, "step": 498 }, { "epoch": 0.409688013136289, "grad_norm": 0.016336020082235336, "learning_rate": 1.830295983475952e-05, "loss": 0.6851, "step": 499 }, { "epoch": 0.41050903119868637, "grad_norm": 0.016994768753647804, "learning_rate": 1.829561489874034e-05, "loss": 0.6991, "step": 500 }, { "epoch": 0.41133004926108374, "grad_norm": 0.01879769191145897, "learning_rate": 1.8288255582848788e-05, "loss": 0.7426, "step": 501 }, { "epoch": 0.4121510673234811, "grad_norm": 0.018339132890105247, "learning_rate": 1.828088189984327e-05, "loss": 0.682, "step": 502 }, { "epoch": 0.4129720853858785, "grad_norm": 0.017647089436650276, "learning_rate": 1.827349386250709e-05, "loss": 0.7086, "step": 503 }, { "epoch": 0.41379310344827586, "grad_norm": 0.016644716262817383, "learning_rate": 1.826609148364844e-05, "loss": 0.7003, "step": 504 }, { "epoch": 0.41461412151067323, "grad_norm": 0.016854122281074524, "learning_rate": 1.8258674776100386e-05, "loss": 0.6993, "step": 505 }, { "epoch": 0.4154351395730706, "grad_norm": 0.01683785393834114, "learning_rate": 1.8251243752720816e-05, "loss": 0.6717, "step": 506 }, { "epoch": 0.41625615763546797, "grad_norm": 0.01630851812660694, "learning_rate": 1.824379842639245e-05, "loss": 0.6876, "step": 507 }, { "epoch": 0.41707717569786534, "grad_norm": 0.01651563122868538, "learning_rate": 1.8236338810022796e-05, "loss": 0.6827, "step": 508 }, { "epoch": 0.4178981937602627, "grad_norm": 0.03996599465608597, "learning_rate": 1.8228864916544146e-05, "loss": 0.6789, "step": 509 }, { "epoch": 0.4187192118226601, "grad_norm": 0.016969434916973114, "learning_rate": 1.8221376758913534e-05, "loss": 0.7074, "step": 510 }, { "epoch": 0.41954022988505746, "grad_norm": 0.017690205946564674, "learning_rate": 1.8213874350112725e-05, "loss": 0.6873, "step": 511 }, { "epoch": 0.42036124794745483, "grad_norm": 0.018253302201628685, "learning_rate": 1.8206357703148197e-05, "loss": 0.72, "step": 512 }, { "epoch": 0.4211822660098522, "grad_norm": 0.018245898187160492, "learning_rate": 1.81988268310511e-05, "loss": 0.692, "step": 513 }, { "epoch": 0.4220032840722496, "grad_norm": 0.018002424389123917, "learning_rate": 1.8191281746877255e-05, "loss": 0.6911, "step": 514 }, { "epoch": 0.42282430213464695, "grad_norm": 0.017526011914014816, "learning_rate": 1.8183722463707117e-05, "loss": 0.6982, "step": 515 }, { "epoch": 0.4236453201970443, "grad_norm": 0.017052462324500084, "learning_rate": 1.817614899464576e-05, "loss": 0.6995, "step": 516 }, { "epoch": 0.4244663382594417, "grad_norm": 0.015839073807001114, "learning_rate": 1.8168561352822847e-05, "loss": 0.6726, "step": 517 }, { "epoch": 0.42528735632183906, "grad_norm": 0.018590344116091728, "learning_rate": 1.8160959551392616e-05, "loss": 0.6964, "step": 518 }, { "epoch": 0.42610837438423643, "grad_norm": 0.01979670487344265, "learning_rate": 1.8153343603533847e-05, "loss": 0.6831, "step": 519 }, { "epoch": 0.4269293924466338, "grad_norm": 0.016640229150652885, "learning_rate": 1.814571352244985e-05, "loss": 0.7, "step": 520 }, { "epoch": 0.4277504105090312, "grad_norm": 0.02792847901582718, "learning_rate": 1.8138069321368444e-05, "loss": 0.7112, "step": 521 }, { "epoch": 0.42857142857142855, "grad_norm": 0.0167517252266407, "learning_rate": 1.8130411013541907e-05, "loss": 0.7009, "step": 522 }, { "epoch": 0.4293924466338259, "grad_norm": 0.017193689942359924, "learning_rate": 1.8122738612246987e-05, "loss": 0.7171, "step": 523 }, { "epoch": 0.4302134646962233, "grad_norm": 0.018655825406312943, "learning_rate": 1.8115052130784863e-05, "loss": 0.7084, "step": 524 }, { "epoch": 0.43103448275862066, "grad_norm": 0.016838306561112404, "learning_rate": 1.8107351582481124e-05, "loss": 0.6995, "step": 525 }, { "epoch": 0.4318555008210181, "grad_norm": 0.01882016658782959, "learning_rate": 1.809963698068574e-05, "loss": 0.66, "step": 526 }, { "epoch": 0.43267651888341546, "grad_norm": 0.01801176927983761, "learning_rate": 1.8091908338773057e-05, "loss": 0.701, "step": 527 }, { "epoch": 0.43349753694581283, "grad_norm": 0.017072102054953575, "learning_rate": 1.808416567014175e-05, "loss": 0.6854, "step": 528 }, { "epoch": 0.4343185550082102, "grad_norm": 0.017353670671582222, "learning_rate": 1.807640898821482e-05, "loss": 0.6892, "step": 529 }, { "epoch": 0.4351395730706076, "grad_norm": 0.0168203916400671, "learning_rate": 1.8068638306439555e-05, "loss": 0.6811, "step": 530 }, { "epoch": 0.43596059113300495, "grad_norm": 0.020089950412511826, "learning_rate": 1.8060853638287508e-05, "loss": 0.7134, "step": 531 }, { "epoch": 0.4367816091954023, "grad_norm": 0.018472637981176376, "learning_rate": 1.80530549972545e-05, "loss": 0.7162, "step": 532 }, { "epoch": 0.4376026272577997, "grad_norm": 0.024471227079629898, "learning_rate": 1.804524239686056e-05, "loss": 0.6962, "step": 533 }, { "epoch": 0.43842364532019706, "grad_norm": 0.017592880874872208, "learning_rate": 1.803741585064992e-05, "loss": 0.707, "step": 534 }, { "epoch": 0.43924466338259444, "grad_norm": 0.019170664250850677, "learning_rate": 1.8029575372190986e-05, "loss": 0.7191, "step": 535 }, { "epoch": 0.4400656814449918, "grad_norm": 0.018689261749386787, "learning_rate": 1.8021720975076328e-05, "loss": 0.6869, "step": 536 }, { "epoch": 0.4408866995073892, "grad_norm": 0.017529118806123734, "learning_rate": 1.801385267292263e-05, "loss": 0.6695, "step": 537 }, { "epoch": 0.44170771756978655, "grad_norm": 0.019137078896164894, "learning_rate": 1.80059704793707e-05, "loss": 0.7058, "step": 538 }, { "epoch": 0.4425287356321839, "grad_norm": 0.016290688887238503, "learning_rate": 1.7998074408085426e-05, "loss": 0.707, "step": 539 }, { "epoch": 0.4433497536945813, "grad_norm": 0.01769746094942093, "learning_rate": 1.799016447275573e-05, "loss": 0.6811, "step": 540 }, { "epoch": 0.44417077175697867, "grad_norm": 0.016966434195637703, "learning_rate": 1.798224068709461e-05, "loss": 0.697, "step": 541 }, { "epoch": 0.44499178981937604, "grad_norm": 0.01857050508260727, "learning_rate": 1.797430306483904e-05, "loss": 0.7012, "step": 542 }, { "epoch": 0.4458128078817734, "grad_norm": 0.01686185784637928, "learning_rate": 1.7966351619750003e-05, "loss": 0.6949, "step": 543 }, { "epoch": 0.4466338259441708, "grad_norm": 0.018403999507427216, "learning_rate": 1.7958386365612433e-05, "loss": 0.6962, "step": 544 }, { "epoch": 0.44745484400656815, "grad_norm": 0.017617959529161453, "learning_rate": 1.7950407316235213e-05, "loss": 0.7138, "step": 545 }, { "epoch": 0.4482758620689655, "grad_norm": 0.01721654087305069, "learning_rate": 1.794241448545114e-05, "loss": 0.6838, "step": 546 }, { "epoch": 0.4490968801313629, "grad_norm": 0.017514709383249283, "learning_rate": 1.79344078871169e-05, "loss": 0.6918, "step": 547 }, { "epoch": 0.44991789819376027, "grad_norm": 0.018308190628886223, "learning_rate": 1.7926387535113044e-05, "loss": 0.6968, "step": 548 }, { "epoch": 0.45073891625615764, "grad_norm": 0.05773812532424927, "learning_rate": 1.791835344334398e-05, "loss": 0.7299, "step": 549 }, { "epoch": 0.451559934318555, "grad_norm": 0.018153799697756767, "learning_rate": 1.791030562573792e-05, "loss": 0.665, "step": 550 }, { "epoch": 0.4523809523809524, "grad_norm": 0.017418669536709785, "learning_rate": 1.790224409624688e-05, "loss": 0.7091, "step": 551 }, { "epoch": 0.45320197044334976, "grad_norm": 0.017961759120225906, "learning_rate": 1.789416886884665e-05, "loss": 0.6635, "step": 552 }, { "epoch": 0.4540229885057471, "grad_norm": 0.017309408634901047, "learning_rate": 1.788607995753676e-05, "loss": 0.6621, "step": 553 }, { "epoch": 0.4548440065681445, "grad_norm": 0.016804173588752747, "learning_rate": 1.787797737634047e-05, "loss": 0.6832, "step": 554 }, { "epoch": 0.45566502463054187, "grad_norm": 0.018586277961730957, "learning_rate": 1.7869861139304732e-05, "loss": 0.6937, "step": 555 }, { "epoch": 0.45648604269293924, "grad_norm": 0.017219744622707367, "learning_rate": 1.7861731260500175e-05, "loss": 0.6887, "step": 556 }, { "epoch": 0.4573070607553366, "grad_norm": 0.017284739762544632, "learning_rate": 1.785358775402108e-05, "loss": 0.6927, "step": 557 }, { "epoch": 0.458128078817734, "grad_norm": 0.016925275325775146, "learning_rate": 1.784543063398535e-05, "loss": 0.7073, "step": 558 }, { "epoch": 0.45894909688013136, "grad_norm": 0.01839420758187771, "learning_rate": 1.7837259914534498e-05, "loss": 0.7005, "step": 559 }, { "epoch": 0.45977011494252873, "grad_norm": 0.019194522872567177, "learning_rate": 1.782907560983359e-05, "loss": 0.673, "step": 560 }, { "epoch": 0.4605911330049261, "grad_norm": 0.017680633813142776, "learning_rate": 1.782087773407128e-05, "loss": 0.6837, "step": 561 }, { "epoch": 0.4614121510673235, "grad_norm": 0.020238593220710754, "learning_rate": 1.7812666301459717e-05, "loss": 0.7075, "step": 562 }, { "epoch": 0.46223316912972084, "grad_norm": 0.018868079409003258, "learning_rate": 1.780444132623457e-05, "loss": 0.6613, "step": 563 }, { "epoch": 0.4630541871921182, "grad_norm": 0.017627542838454247, "learning_rate": 1.7796202822654983e-05, "loss": 0.6939, "step": 564 }, { "epoch": 0.4638752052545156, "grad_norm": 0.017295856028795242, "learning_rate": 1.7787950805003545e-05, "loss": 0.6653, "step": 565 }, { "epoch": 0.46469622331691296, "grad_norm": 0.01726636290550232, "learning_rate": 1.7779685287586284e-05, "loss": 0.7069, "step": 566 }, { "epoch": 0.46551724137931033, "grad_norm": 0.01734928973019123, "learning_rate": 1.7771406284732626e-05, "loss": 0.6758, "step": 567 }, { "epoch": 0.4663382594417077, "grad_norm": 0.016457127407193184, "learning_rate": 1.7763113810795382e-05, "loss": 0.6695, "step": 568 }, { "epoch": 0.4671592775041051, "grad_norm": 0.016592755913734436, "learning_rate": 1.775480788015071e-05, "loss": 0.6836, "step": 569 }, { "epoch": 0.46798029556650245, "grad_norm": 0.018676698207855225, "learning_rate": 1.7746488507198095e-05, "loss": 0.6604, "step": 570 }, { "epoch": 0.4688013136288998, "grad_norm": 0.016720978543162346, "learning_rate": 1.7738155706360344e-05, "loss": 0.6696, "step": 571 }, { "epoch": 0.4696223316912972, "grad_norm": 0.017305318266153336, "learning_rate": 1.7729809492083515e-05, "loss": 0.6781, "step": 572 }, { "epoch": 0.47044334975369456, "grad_norm": 0.024956095963716507, "learning_rate": 1.7721449878836944e-05, "loss": 0.675, "step": 573 }, { "epoch": 0.47126436781609193, "grad_norm": 0.017627490684390068, "learning_rate": 1.7713076881113185e-05, "loss": 0.6832, "step": 574 }, { "epoch": 0.4720853858784893, "grad_norm": 0.017559083178639412, "learning_rate": 1.7704690513427997e-05, "loss": 0.6851, "step": 575 }, { "epoch": 0.4729064039408867, "grad_norm": 0.017515765503048897, "learning_rate": 1.769629079032032e-05, "loss": 0.6843, "step": 576 }, { "epoch": 0.47372742200328405, "grad_norm": 0.016924096271395683, "learning_rate": 1.7687877726352244e-05, "loss": 0.6676, "step": 577 }, { "epoch": 0.4745484400656814, "grad_norm": 0.01729103922843933, "learning_rate": 1.7679451336108994e-05, "loss": 0.6709, "step": 578 }, { "epoch": 0.4753694581280788, "grad_norm": 0.1233232319355011, "learning_rate": 1.7671011634198888e-05, "loss": 0.7197, "step": 579 }, { "epoch": 0.47619047619047616, "grad_norm": 0.018454348668456078, "learning_rate": 1.766255863525333e-05, "loss": 0.691, "step": 580 }, { "epoch": 0.47701149425287354, "grad_norm": 0.01805203966796398, "learning_rate": 1.765409235392677e-05, "loss": 0.6668, "step": 581 }, { "epoch": 0.47783251231527096, "grad_norm": 0.018456457182765007, "learning_rate": 1.7645612804896692e-05, "loss": 0.6712, "step": 582 }, { "epoch": 0.47865353037766833, "grad_norm": 0.01743881218135357, "learning_rate": 1.7637120002863577e-05, "loss": 0.6726, "step": 583 }, { "epoch": 0.4794745484400657, "grad_norm": 0.021646324545145035, "learning_rate": 1.762861396255088e-05, "loss": 0.6953, "step": 584 }, { "epoch": 0.4802955665024631, "grad_norm": 0.016764746978878975, "learning_rate": 1.762009469870501e-05, "loss": 0.6883, "step": 585 }, { "epoch": 0.48111658456486045, "grad_norm": 0.019434746354818344, "learning_rate": 1.76115622260953e-05, "loss": 0.6958, "step": 586 }, { "epoch": 0.4819376026272578, "grad_norm": 0.018379854038357735, "learning_rate": 1.7603016559513984e-05, "loss": 0.7081, "step": 587 }, { "epoch": 0.4827586206896552, "grad_norm": 0.01935613714158535, "learning_rate": 1.7594457713776163e-05, "loss": 0.6797, "step": 588 }, { "epoch": 0.48357963875205257, "grad_norm": 0.01850968785583973, "learning_rate": 1.7585885703719793e-05, "loss": 0.6869, "step": 589 }, { "epoch": 0.48440065681444994, "grad_norm": 0.020868735387921333, "learning_rate": 1.757730054420565e-05, "loss": 0.7021, "step": 590 }, { "epoch": 0.4852216748768473, "grad_norm": 0.018456505611538887, "learning_rate": 1.7568702250117305e-05, "loss": 0.6849, "step": 591 }, { "epoch": 0.4860426929392447, "grad_norm": 0.01909414306282997, "learning_rate": 1.7560090836361102e-05, "loss": 0.6428, "step": 592 }, { "epoch": 0.48686371100164205, "grad_norm": 0.018443796783685684, "learning_rate": 1.7551466317866125e-05, "loss": 0.6798, "step": 593 }, { "epoch": 0.4876847290640394, "grad_norm": 0.019238535314798355, "learning_rate": 1.754282870958418e-05, "loss": 0.6789, "step": 594 }, { "epoch": 0.4885057471264368, "grad_norm": 0.017626311630010605, "learning_rate": 1.7534178026489775e-05, "loss": 0.6913, "step": 595 }, { "epoch": 0.48932676518883417, "grad_norm": 0.017543040215969086, "learning_rate": 1.7525514283580066e-05, "loss": 0.7049, "step": 596 }, { "epoch": 0.49014778325123154, "grad_norm": 0.01849355734884739, "learning_rate": 1.7516837495874866e-05, "loss": 0.7137, "step": 597 }, { "epoch": 0.4909688013136289, "grad_norm": 0.017363345250487328, "learning_rate": 1.7508147678416597e-05, "loss": 0.7075, "step": 598 }, { "epoch": 0.4917898193760263, "grad_norm": 0.019062954932451248, "learning_rate": 1.7499444846270266e-05, "loss": 0.689, "step": 599 }, { "epoch": 0.49261083743842365, "grad_norm": 0.017589613795280457, "learning_rate": 1.749072901452345e-05, "loss": 0.6751, "step": 600 }, { "epoch": 0.493431855500821, "grad_norm": 0.0181493628770113, "learning_rate": 1.7482000198286262e-05, "loss": 0.6587, "step": 601 }, { "epoch": 0.4942528735632184, "grad_norm": 0.01762031577527523, "learning_rate": 1.7473258412691317e-05, "loss": 0.6791, "step": 602 }, { "epoch": 0.49507389162561577, "grad_norm": 0.018445096909999847, "learning_rate": 1.7464503672893727e-05, "loss": 0.6883, "step": 603 }, { "epoch": 0.49589490968801314, "grad_norm": 0.01709083281457424, "learning_rate": 1.745573599407105e-05, "loss": 0.7009, "step": 604 }, { "epoch": 0.4967159277504105, "grad_norm": 0.01673067919909954, "learning_rate": 1.7446955391423277e-05, "loss": 0.6847, "step": 605 }, { "epoch": 0.4975369458128079, "grad_norm": 0.016538165509700775, "learning_rate": 1.743816188017282e-05, "loss": 0.64, "step": 606 }, { "epoch": 0.49835796387520526, "grad_norm": 0.017870107665657997, "learning_rate": 1.7429355475564446e-05, "loss": 0.7066, "step": 607 }, { "epoch": 0.49917898193760263, "grad_norm": 0.018343057483434677, "learning_rate": 1.7420536192865294e-05, "loss": 0.6396, "step": 608 }, { "epoch": 0.5, "grad_norm": 0.017366474494338036, "learning_rate": 1.741170404736482e-05, "loss": 0.6765, "step": 609 }, { "epoch": 0.5008210180623974, "grad_norm": 0.016345709562301636, "learning_rate": 1.7402859054374775e-05, "loss": 0.6679, "step": 610 }, { "epoch": 0.5016420361247947, "grad_norm": 0.015820622444152832, "learning_rate": 1.73940012292292e-05, "loss": 0.6652, "step": 611 }, { "epoch": 0.5024630541871922, "grad_norm": 0.01616581715643406, "learning_rate": 1.7385130587284357e-05, "loss": 0.6536, "step": 612 }, { "epoch": 0.5032840722495895, "grad_norm": 0.01662985049188137, "learning_rate": 1.7376247143918748e-05, "loss": 0.6881, "step": 613 }, { "epoch": 0.5041050903119869, "grad_norm": 0.017209812998771667, "learning_rate": 1.7367350914533063e-05, "loss": 0.6554, "step": 614 }, { "epoch": 0.5049261083743842, "grad_norm": 0.016647804528474808, "learning_rate": 1.735844191455016e-05, "loss": 0.6688, "step": 615 }, { "epoch": 0.5057471264367817, "grad_norm": 0.017030401155352592, "learning_rate": 1.7349520159415025e-05, "loss": 0.6573, "step": 616 }, { "epoch": 0.506568144499179, "grad_norm": 0.01632831245660782, "learning_rate": 1.734058566459477e-05, "loss": 0.6666, "step": 617 }, { "epoch": 0.5073891625615764, "grad_norm": 0.016468718647956848, "learning_rate": 1.733163844557859e-05, "loss": 0.7041, "step": 618 }, { "epoch": 0.5082101806239737, "grad_norm": 0.017008313909173012, "learning_rate": 1.732267851787774e-05, "loss": 0.6768, "step": 619 }, { "epoch": 0.5090311986863711, "grad_norm": 0.016277499496936798, "learning_rate": 1.7313705897025496e-05, "loss": 0.6935, "step": 620 }, { "epoch": 0.5098522167487685, "grad_norm": 0.016870250925421715, "learning_rate": 1.7304720598577152e-05, "loss": 0.6962, "step": 621 }, { "epoch": 0.5106732348111659, "grad_norm": 0.017975609749555588, "learning_rate": 1.7295722638109985e-05, "loss": 0.6603, "step": 622 }, { "epoch": 0.5114942528735632, "grad_norm": 0.01659955456852913, "learning_rate": 1.7286712031223204e-05, "loss": 0.6755, "step": 623 }, { "epoch": 0.5123152709359606, "grad_norm": 0.016961120069026947, "learning_rate": 1.7277688793537957e-05, "loss": 0.6787, "step": 624 }, { "epoch": 0.513136288998358, "grad_norm": 0.016218258067965508, "learning_rate": 1.726865294069729e-05, "loss": 0.6471, "step": 625 }, { "epoch": 0.5139573070607554, "grad_norm": 0.01686953939497471, "learning_rate": 1.7259604488366106e-05, "loss": 0.6732, "step": 626 }, { "epoch": 0.5147783251231527, "grad_norm": 0.01660560816526413, "learning_rate": 1.7250543452231166e-05, "loss": 0.6873, "step": 627 }, { "epoch": 0.5155993431855501, "grad_norm": 0.016757989302277565, "learning_rate": 1.724146984800104e-05, "loss": 0.6435, "step": 628 }, { "epoch": 0.5164203612479474, "grad_norm": 0.019561342895030975, "learning_rate": 1.7232383691406088e-05, "loss": 0.6823, "step": 629 }, { "epoch": 0.5172413793103449, "grad_norm": 0.016868341714143753, "learning_rate": 1.722328499819843e-05, "loss": 0.6849, "step": 630 }, { "epoch": 0.5180623973727422, "grad_norm": 0.015701819211244583, "learning_rate": 1.7214173784151916e-05, "loss": 0.6711, "step": 631 }, { "epoch": 0.5188834154351396, "grad_norm": 0.016820194199681282, "learning_rate": 1.720505006506211e-05, "loss": 0.6964, "step": 632 }, { "epoch": 0.5197044334975369, "grad_norm": 0.01761906035244465, "learning_rate": 1.7195913856746255e-05, "loss": 0.6606, "step": 633 }, { "epoch": 0.5205254515599343, "grad_norm": 0.017528265714645386, "learning_rate": 1.718676517504324e-05, "loss": 0.6484, "step": 634 }, { "epoch": 0.5213464696223317, "grad_norm": 0.01697097159922123, "learning_rate": 1.717760403581358e-05, "loss": 0.6682, "step": 635 }, { "epoch": 0.5221674876847291, "grad_norm": 0.018297243863344193, "learning_rate": 1.7168430454939392e-05, "loss": 0.6738, "step": 636 }, { "epoch": 0.5229885057471264, "grad_norm": 0.016424909234046936, "learning_rate": 1.715924444832435e-05, "loss": 0.6517, "step": 637 }, { "epoch": 0.5238095238095238, "grad_norm": 0.01824391447007656, "learning_rate": 1.715004603189369e-05, "loss": 0.6681, "step": 638 }, { "epoch": 0.5246305418719212, "grad_norm": 0.017483752220869064, "learning_rate": 1.7140835221594142e-05, "loss": 0.6595, "step": 639 }, { "epoch": 0.5254515599343186, "grad_norm": 0.019100680947303772, "learning_rate": 1.7131612033393942e-05, "loss": 0.6672, "step": 640 }, { "epoch": 0.5262725779967159, "grad_norm": 0.017237763851881027, "learning_rate": 1.7122376483282757e-05, "loss": 0.6551, "step": 641 }, { "epoch": 0.5270935960591133, "grad_norm": 0.017968932166695595, "learning_rate": 1.7113128587271714e-05, "loss": 0.6403, "step": 642 }, { "epoch": 0.5279146141215106, "grad_norm": 0.015886008739471436, "learning_rate": 1.7103868361393325e-05, "loss": 0.6576, "step": 643 }, { "epoch": 0.5287356321839081, "grad_norm": 0.019075507298111916, "learning_rate": 1.709459582170149e-05, "loss": 0.6683, "step": 644 }, { "epoch": 0.5295566502463054, "grad_norm": 0.016744088381528854, "learning_rate": 1.708531098427145e-05, "loss": 0.674, "step": 645 }, { "epoch": 0.5303776683087028, "grad_norm": 0.01836806908249855, "learning_rate": 1.7076013865199754e-05, "loss": 0.6705, "step": 646 }, { "epoch": 0.5311986863711001, "grad_norm": 0.018107879906892776, "learning_rate": 1.7066704480604273e-05, "loss": 0.648, "step": 647 }, { "epoch": 0.5320197044334976, "grad_norm": 0.019784096628427505, "learning_rate": 1.705738284662411e-05, "loss": 0.6784, "step": 648 }, { "epoch": 0.5328407224958949, "grad_norm": 0.01731187291443348, "learning_rate": 1.7048048979419628e-05, "loss": 0.6616, "step": 649 }, { "epoch": 0.5336617405582923, "grad_norm": 0.0180632583796978, "learning_rate": 1.7038702895172383e-05, "loss": 0.6321, "step": 650 }, { "epoch": 0.5344827586206896, "grad_norm": 0.019297795370221138, "learning_rate": 1.7029344610085116e-05, "loss": 0.6763, "step": 651 }, { "epoch": 0.535303776683087, "grad_norm": 0.017113182693719864, "learning_rate": 1.7019974140381727e-05, "loss": 0.6692, "step": 652 }, { "epoch": 0.5361247947454844, "grad_norm": 0.0177195742726326, "learning_rate": 1.7010591502307222e-05, "loss": 0.6844, "step": 653 }, { "epoch": 0.5369458128078818, "grad_norm": 0.017182065173983574, "learning_rate": 1.7001196712127722e-05, "loss": 0.672, "step": 654 }, { "epoch": 0.5377668308702791, "grad_norm": 0.01781817153096199, "learning_rate": 1.69917897861304e-05, "loss": 0.667, "step": 655 }, { "epoch": 0.5385878489326765, "grad_norm": 0.017118120566010475, "learning_rate": 1.698237074062348e-05, "loss": 0.6561, "step": 656 }, { "epoch": 0.5394088669950738, "grad_norm": 0.017036570236086845, "learning_rate": 1.697293959193619e-05, "loss": 0.6913, "step": 657 }, { "epoch": 0.5402298850574713, "grad_norm": 0.016578413546085358, "learning_rate": 1.6963496356418745e-05, "loss": 0.6644, "step": 658 }, { "epoch": 0.5410509031198686, "grad_norm": 0.01715630106627941, "learning_rate": 1.6954041050442306e-05, "loss": 0.6764, "step": 659 }, { "epoch": 0.541871921182266, "grad_norm": 0.016607729718089104, "learning_rate": 1.694457369039897e-05, "loss": 0.648, "step": 660 }, { "epoch": 0.5426929392446633, "grad_norm": 0.017757434397935867, "learning_rate": 1.6935094292701725e-05, "loss": 0.651, "step": 661 }, { "epoch": 0.5435139573070608, "grad_norm": 0.015876375138759613, "learning_rate": 1.692560287378443e-05, "loss": 0.647, "step": 662 }, { "epoch": 0.5443349753694581, "grad_norm": 0.017393162474036217, "learning_rate": 1.6916099450101778e-05, "loss": 0.7026, "step": 663 }, { "epoch": 0.5451559934318555, "grad_norm": 0.016414165496826172, "learning_rate": 1.690658403812929e-05, "loss": 0.641, "step": 664 }, { "epoch": 0.5459770114942529, "grad_norm": 0.017111750319600105, "learning_rate": 1.6897056654363263e-05, "loss": 0.6685, "step": 665 }, { "epoch": 0.5467980295566502, "grad_norm": 0.016760293394327164, "learning_rate": 1.688751731532073e-05, "loss": 0.6825, "step": 666 }, { "epoch": 0.5476190476190477, "grad_norm": 0.016746580600738525, "learning_rate": 1.687796603753948e-05, "loss": 0.6909, "step": 667 }, { "epoch": 0.548440065681445, "grad_norm": 0.018176497891545296, "learning_rate": 1.686840283757798e-05, "loss": 0.666, "step": 668 }, { "epoch": 0.5492610837438424, "grad_norm": 0.01722828485071659, "learning_rate": 1.6858827732015376e-05, "loss": 0.6665, "step": 669 }, { "epoch": 0.5500821018062397, "grad_norm": 0.019629109650850296, "learning_rate": 1.6849240737451445e-05, "loss": 0.6681, "step": 670 }, { "epoch": 0.5509031198686372, "grad_norm": 0.016900289803743362, "learning_rate": 1.683964187050658e-05, "loss": 0.6576, "step": 671 }, { "epoch": 0.5517241379310345, "grad_norm": 0.01912236399948597, "learning_rate": 1.6830031147821765e-05, "loss": 0.699, "step": 672 }, { "epoch": 0.5525451559934319, "grad_norm": 0.016731834039092064, "learning_rate": 1.6820408586058513e-05, "loss": 0.6551, "step": 673 }, { "epoch": 0.5533661740558292, "grad_norm": 0.017752835527062416, "learning_rate": 1.6810774201898892e-05, "loss": 0.6705, "step": 674 }, { "epoch": 0.5541871921182266, "grad_norm": 0.01697412319481373, "learning_rate": 1.6801128012045446e-05, "loss": 0.6779, "step": 675 }, { "epoch": 0.555008210180624, "grad_norm": 0.017596764490008354, "learning_rate": 1.679147003322119e-05, "loss": 0.6712, "step": 676 }, { "epoch": 0.5558292282430214, "grad_norm": 0.01809236966073513, "learning_rate": 1.6781800282169578e-05, "loss": 0.6678, "step": 677 }, { "epoch": 0.5566502463054187, "grad_norm": 0.017593486234545708, "learning_rate": 1.6772118775654474e-05, "loss": 0.6778, "step": 678 }, { "epoch": 0.5574712643678161, "grad_norm": 0.017885543406009674, "learning_rate": 1.676242553046012e-05, "loss": 0.6445, "step": 679 }, { "epoch": 0.5582922824302134, "grad_norm": 0.10626628249883652, "learning_rate": 1.6752720563391108e-05, "loss": 0.6579, "step": 680 }, { "epoch": 0.5591133004926109, "grad_norm": 0.018731504678726196, "learning_rate": 1.674300389127236e-05, "loss": 0.6475, "step": 681 }, { "epoch": 0.5599343185550082, "grad_norm": 0.017796343192458153, "learning_rate": 1.6733275530949074e-05, "loss": 0.6539, "step": 682 }, { "epoch": 0.5607553366174056, "grad_norm": 0.017240945249795914, "learning_rate": 1.672353549928672e-05, "loss": 0.6655, "step": 683 }, { "epoch": 0.5615763546798029, "grad_norm": 0.018114123493433, "learning_rate": 1.671378381317101e-05, "loss": 0.6718, "step": 684 }, { "epoch": 0.5623973727422004, "grad_norm": 0.018826628103852272, "learning_rate": 1.6704020489507844e-05, "loss": 0.6607, "step": 685 }, { "epoch": 0.5632183908045977, "grad_norm": 0.017946327105164528, "learning_rate": 1.6694245545223314e-05, "loss": 0.6668, "step": 686 }, { "epoch": 0.5640394088669951, "grad_norm": 0.022967630997300148, "learning_rate": 1.668445899726364e-05, "loss": 0.6289, "step": 687 }, { "epoch": 0.5648604269293924, "grad_norm": 0.018239399418234825, "learning_rate": 1.667466086259518e-05, "loss": 0.6799, "step": 688 }, { "epoch": 0.5656814449917899, "grad_norm": 0.017382318153977394, "learning_rate": 1.666485115820436e-05, "loss": 0.6745, "step": 689 }, { "epoch": 0.5665024630541872, "grad_norm": 0.017755093052983284, "learning_rate": 1.6655029901097673e-05, "loss": 0.6787, "step": 690 }, { "epoch": 0.5673234811165846, "grad_norm": 0.016991199925541878, "learning_rate": 1.664519710830164e-05, "loss": 0.6877, "step": 691 }, { "epoch": 0.5681444991789819, "grad_norm": 0.01736479252576828, "learning_rate": 1.6635352796862778e-05, "loss": 0.6701, "step": 692 }, { "epoch": 0.5689655172413793, "grad_norm": 0.017156532034277916, "learning_rate": 1.6625496983847573e-05, "loss": 0.6629, "step": 693 }, { "epoch": 0.5697865353037767, "grad_norm": 0.027409827336668968, "learning_rate": 1.661562968634246e-05, "loss": 0.6758, "step": 694 }, { "epoch": 0.5706075533661741, "grad_norm": 0.018333178013563156, "learning_rate": 1.6605750921453763e-05, "loss": 0.6495, "step": 695 }, { "epoch": 0.5714285714285714, "grad_norm": 0.01716003566980362, "learning_rate": 1.6595860706307712e-05, "loss": 0.6748, "step": 696 }, { "epoch": 0.5722495894909688, "grad_norm": 0.018700186163187027, "learning_rate": 1.6585959058050368e-05, "loss": 0.6439, "step": 697 }, { "epoch": 0.5730706075533661, "grad_norm": 0.016825927421450615, "learning_rate": 1.657604599384762e-05, "loss": 0.6632, "step": 698 }, { "epoch": 0.5738916256157636, "grad_norm": 0.01781759411096573, "learning_rate": 1.656612153088515e-05, "loss": 0.6861, "step": 699 }, { "epoch": 0.5747126436781609, "grad_norm": 0.016946712508797646, "learning_rate": 1.6556185686368403e-05, "loss": 0.6775, "step": 700 }, { "epoch": 0.5755336617405583, "grad_norm": 0.015819542109966278, "learning_rate": 1.6546238477522547e-05, "loss": 0.6501, "step": 701 }, { "epoch": 0.5763546798029556, "grad_norm": 0.018469011411070824, "learning_rate": 1.653627992159246e-05, "loss": 0.6583, "step": 702 }, { "epoch": 0.577175697865353, "grad_norm": 0.01635398156940937, "learning_rate": 1.652631003584268e-05, "loss": 0.6589, "step": 703 }, { "epoch": 0.5779967159277504, "grad_norm": 0.017141584306955338, "learning_rate": 1.6516328837557406e-05, "loss": 0.6709, "step": 704 }, { "epoch": 0.5788177339901478, "grad_norm": 0.030548566952347755, "learning_rate": 1.650633634404043e-05, "loss": 0.6342, "step": 705 }, { "epoch": 0.5796387520525451, "grad_norm": 0.01782088167965412, "learning_rate": 1.649633257261514e-05, "loss": 0.6594, "step": 706 }, { "epoch": 0.5804597701149425, "grad_norm": 0.016215698793530464, "learning_rate": 1.6486317540624465e-05, "loss": 0.6546, "step": 707 }, { "epoch": 0.5812807881773399, "grad_norm": 0.017641177400946617, "learning_rate": 1.6476291265430865e-05, "loss": 0.6743, "step": 708 }, { "epoch": 0.5821018062397373, "grad_norm": 0.01702251099050045, "learning_rate": 1.6466253764416284e-05, "loss": 0.6525, "step": 709 }, { "epoch": 0.5829228243021346, "grad_norm": 0.017801588401198387, "learning_rate": 1.645620505498213e-05, "loss": 0.6561, "step": 710 }, { "epoch": 0.583743842364532, "grad_norm": 0.01731495000422001, "learning_rate": 1.6446145154549243e-05, "loss": 0.6819, "step": 711 }, { "epoch": 0.5845648604269293, "grad_norm": 0.018705395981669426, "learning_rate": 1.6436074080557864e-05, "loss": 0.6515, "step": 712 }, { "epoch": 0.5853858784893268, "grad_norm": 0.01701977476477623, "learning_rate": 1.6425991850467612e-05, "loss": 0.6664, "step": 713 }, { "epoch": 0.5862068965517241, "grad_norm": 0.0191891398280859, "learning_rate": 1.641589848175743e-05, "loss": 0.6608, "step": 714 }, { "epoch": 0.5870279146141215, "grad_norm": 0.015958420932292938, "learning_rate": 1.6405793991925586e-05, "loss": 0.6498, "step": 715 }, { "epoch": 0.5878489326765188, "grad_norm": 0.018721166998147964, "learning_rate": 1.6395678398489624e-05, "loss": 0.6716, "step": 716 }, { "epoch": 0.5886699507389163, "grad_norm": 0.01588141918182373, "learning_rate": 1.6385551718986333e-05, "loss": 0.6486, "step": 717 }, { "epoch": 0.5894909688013136, "grad_norm": 0.016987184062600136, "learning_rate": 1.6375413970971728e-05, "loss": 0.6534, "step": 718 }, { "epoch": 0.590311986863711, "grad_norm": 0.017321176826953888, "learning_rate": 1.636526517202101e-05, "loss": 0.6915, "step": 719 }, { "epoch": 0.5911330049261084, "grad_norm": 0.017296040430665016, "learning_rate": 1.6355105339728536e-05, "loss": 0.656, "step": 720 }, { "epoch": 0.5919540229885057, "grad_norm": 0.01609204150736332, "learning_rate": 1.6344934491707792e-05, "loss": 0.6418, "step": 721 }, { "epoch": 0.5927750410509032, "grad_norm": 0.0592983178794384, "learning_rate": 1.633475264559137e-05, "loss": 0.6452, "step": 722 }, { "epoch": 0.5935960591133005, "grad_norm": 0.09712128341197968, "learning_rate": 1.632455981903091e-05, "loss": 0.6654, "step": 723 }, { "epoch": 0.5944170771756979, "grad_norm": 0.017508788034319878, "learning_rate": 1.6314356029697108e-05, "loss": 0.6452, "step": 724 }, { "epoch": 0.5952380952380952, "grad_norm": 0.01589817740023136, "learning_rate": 1.630414129527965e-05, "loss": 0.6552, "step": 725 }, { "epoch": 0.5960591133004927, "grad_norm": 0.018241778016090393, "learning_rate": 1.629391563348721e-05, "loss": 0.6387, "step": 726 }, { "epoch": 0.59688013136289, "grad_norm": 0.016658680513501167, "learning_rate": 1.6283679062047397e-05, "loss": 0.6482, "step": 727 }, { "epoch": 0.5977011494252874, "grad_norm": 0.01660832017660141, "learning_rate": 1.6273431598706733e-05, "loss": 0.6509, "step": 728 }, { "epoch": 0.5985221674876847, "grad_norm": 0.01633264124393463, "learning_rate": 1.626317326123063e-05, "loss": 0.6769, "step": 729 }, { "epoch": 0.5993431855500821, "grad_norm": 0.015908334404230118, "learning_rate": 1.6252904067403344e-05, "loss": 0.6338, "step": 730 }, { "epoch": 0.6001642036124795, "grad_norm": 0.018177775666117668, "learning_rate": 1.6242624035027958e-05, "loss": 0.6695, "step": 731 }, { "epoch": 0.6009852216748769, "grad_norm": 0.08955889940261841, "learning_rate": 1.623233318192634e-05, "loss": 0.6367, "step": 732 }, { "epoch": 0.6018062397372742, "grad_norm": 0.019481049850583076, "learning_rate": 1.6222031525939125e-05, "loss": 0.6473, "step": 733 }, { "epoch": 0.6026272577996716, "grad_norm": 0.02426612377166748, "learning_rate": 1.6211719084925663e-05, "loss": 0.6812, "step": 734 }, { "epoch": 0.603448275862069, "grad_norm": 0.019465740770101547, "learning_rate": 1.6201395876764017e-05, "loss": 0.6535, "step": 735 }, { "epoch": 0.6042692939244664, "grad_norm": 0.01721380278468132, "learning_rate": 1.6191061919350904e-05, "loss": 0.6606, "step": 736 }, { "epoch": 0.6050903119868637, "grad_norm": 0.019002821296453476, "learning_rate": 1.6180717230601686e-05, "loss": 0.67, "step": 737 }, { "epoch": 0.6059113300492611, "grad_norm": 0.018227092921733856, "learning_rate": 1.6170361828450322e-05, "loss": 0.6773, "step": 738 }, { "epoch": 0.6067323481116584, "grad_norm": 0.017974140122532845, "learning_rate": 1.615999573084935e-05, "loss": 0.6532, "step": 739 }, { "epoch": 0.6075533661740559, "grad_norm": 0.016617247834801674, "learning_rate": 1.614961895576983e-05, "loss": 0.6541, "step": 740 }, { "epoch": 0.6083743842364532, "grad_norm": 0.01667429320514202, "learning_rate": 1.6139231521201373e-05, "loss": 0.6386, "step": 741 }, { "epoch": 0.6091954022988506, "grad_norm": 0.017655611038208008, "learning_rate": 1.6128833445152034e-05, "loss": 0.6546, "step": 742 }, { "epoch": 0.6100164203612479, "grad_norm": 0.01637181080877781, "learning_rate": 1.6118424745648332e-05, "loss": 0.6482, "step": 743 }, { "epoch": 0.6108374384236454, "grad_norm": 0.020269248634576797, "learning_rate": 1.6108005440735196e-05, "loss": 0.6487, "step": 744 }, { "epoch": 0.6116584564860427, "grad_norm": 0.016422634944319725, "learning_rate": 1.609757554847595e-05, "loss": 0.6237, "step": 745 }, { "epoch": 0.6124794745484401, "grad_norm": 0.016804195940494537, "learning_rate": 1.6087135086952265e-05, "loss": 0.6603, "step": 746 }, { "epoch": 0.6133004926108374, "grad_norm": 0.01696902886033058, "learning_rate": 1.6076684074264136e-05, "loss": 0.6515, "step": 747 }, { "epoch": 0.6141215106732348, "grad_norm": 0.017994213849306107, "learning_rate": 1.6066222528529856e-05, "loss": 0.6373, "step": 748 }, { "epoch": 0.6149425287356322, "grad_norm": 0.0313229002058506, "learning_rate": 1.6055750467885973e-05, "loss": 0.647, "step": 749 }, { "epoch": 0.6157635467980296, "grad_norm": 0.01704541966319084, "learning_rate": 1.6045267910487265e-05, "loss": 0.6294, "step": 750 }, { "epoch": 0.6165845648604269, "grad_norm": 0.016514983028173447, "learning_rate": 1.6034774874506705e-05, "loss": 0.6575, "step": 751 }, { "epoch": 0.6174055829228243, "grad_norm": 0.0168316587805748, "learning_rate": 1.6024271378135442e-05, "loss": 0.6277, "step": 752 }, { "epoch": 0.6182266009852216, "grad_norm": 0.016462627798318863, "learning_rate": 1.6013757439582745e-05, "loss": 0.6646, "step": 753 }, { "epoch": 0.6190476190476191, "grad_norm": 0.016826754435896873, "learning_rate": 1.6003233077076003e-05, "loss": 0.6386, "step": 754 }, { "epoch": 0.6198686371100164, "grad_norm": 0.01635928265750408, "learning_rate": 1.5992698308860655e-05, "loss": 0.6342, "step": 755 }, { "epoch": 0.6206896551724138, "grad_norm": 0.016330720856785774, "learning_rate": 1.5982153153200202e-05, "loss": 0.637, "step": 756 }, { "epoch": 0.6215106732348111, "grad_norm": 0.01696993224322796, "learning_rate": 1.5971597628376135e-05, "loss": 0.63, "step": 757 }, { "epoch": 0.6223316912972086, "grad_norm": 0.01697859913110733, "learning_rate": 1.5961031752687937e-05, "loss": 0.6502, "step": 758 }, { "epoch": 0.6231527093596059, "grad_norm": 0.015890169888734818, "learning_rate": 1.5950455544453022e-05, "loss": 0.6367, "step": 759 }, { "epoch": 0.6239737274220033, "grad_norm": 0.017149457708001137, "learning_rate": 1.5939869022006724e-05, "loss": 0.6658, "step": 760 }, { "epoch": 0.6247947454844006, "grad_norm": 0.017428738996386528, "learning_rate": 1.592927220370226e-05, "loss": 0.636, "step": 761 }, { "epoch": 0.625615763546798, "grad_norm": 0.017064033076167107, "learning_rate": 1.5918665107910688e-05, "loss": 0.6276, "step": 762 }, { "epoch": 0.6264367816091954, "grad_norm": 0.017896663397550583, "learning_rate": 1.5908047753020895e-05, "loss": 0.6691, "step": 763 }, { "epoch": 0.6272577996715928, "grad_norm": 0.016910003498196602, "learning_rate": 1.589742015743954e-05, "loss": 0.6459, "step": 764 }, { "epoch": 0.6280788177339901, "grad_norm": 0.01705314964056015, "learning_rate": 1.5886782339591046e-05, "loss": 0.61, "step": 765 }, { "epoch": 0.6288998357963875, "grad_norm": 0.028002379462122917, "learning_rate": 1.5876134317917562e-05, "loss": 0.6416, "step": 766 }, { "epoch": 0.6297208538587848, "grad_norm": 0.03389204293489456, "learning_rate": 1.5865476110878907e-05, "loss": 0.6528, "step": 767 }, { "epoch": 0.6305418719211823, "grad_norm": 0.017483940348029137, "learning_rate": 1.5854807736952578e-05, "loss": 0.6431, "step": 768 }, { "epoch": 0.6313628899835796, "grad_norm": 0.016572022810578346, "learning_rate": 1.584412921463369e-05, "loss": 0.638, "step": 769 }, { "epoch": 0.632183908045977, "grad_norm": 0.018050888553261757, "learning_rate": 1.583344056243494e-05, "loss": 0.649, "step": 770 }, { "epoch": 0.6330049261083743, "grad_norm": 0.040694188326597214, "learning_rate": 1.582274179888662e-05, "loss": 0.6118, "step": 771 }, { "epoch": 0.6338259441707718, "grad_norm": 0.01901972107589245, "learning_rate": 1.581203294253651e-05, "loss": 0.6824, "step": 772 }, { "epoch": 0.6346469622331691, "grad_norm": 0.017053663730621338, "learning_rate": 1.5801314011949916e-05, "loss": 0.6645, "step": 773 }, { "epoch": 0.6354679802955665, "grad_norm": 0.021172018721699715, "learning_rate": 1.5790585025709594e-05, "loss": 0.6847, "step": 774 }, { "epoch": 0.6362889983579638, "grad_norm": 0.01936919242143631, "learning_rate": 1.5779846002415745e-05, "loss": 0.6514, "step": 775 }, { "epoch": 0.6371100164203612, "grad_norm": 0.019375218078494072, "learning_rate": 1.5769096960685965e-05, "loss": 0.6477, "step": 776 }, { "epoch": 0.6379310344827587, "grad_norm": 0.016533993184566498, "learning_rate": 1.575833791915521e-05, "loss": 0.6659, "step": 777 }, { "epoch": 0.638752052545156, "grad_norm": 0.019741157069802284, "learning_rate": 1.574756889647579e-05, "loss": 0.6546, "step": 778 }, { "epoch": 0.6395730706075534, "grad_norm": 0.02154269441962242, "learning_rate": 1.5736789911317298e-05, "loss": 0.6046, "step": 779 }, { "epoch": 0.6403940886699507, "grad_norm": 0.03421539440751076, "learning_rate": 1.5726000982366615e-05, "loss": 0.6786, "step": 780 }, { "epoch": 0.6412151067323482, "grad_norm": 0.017170744016766548, "learning_rate": 1.5715202128327857e-05, "loss": 0.6412, "step": 781 }, { "epoch": 0.6420361247947455, "grad_norm": 0.01603604294359684, "learning_rate": 1.570439336792234e-05, "loss": 0.6441, "step": 782 }, { "epoch": 0.6428571428571429, "grad_norm": 0.01808689720928669, "learning_rate": 1.5693574719888556e-05, "loss": 0.6375, "step": 783 }, { "epoch": 0.6436781609195402, "grad_norm": 0.01811482571065426, "learning_rate": 1.568274620298215e-05, "loss": 0.6313, "step": 784 }, { "epoch": 0.6444991789819376, "grad_norm": 0.015886131674051285, "learning_rate": 1.5671907835975858e-05, "loss": 0.6398, "step": 785 }, { "epoch": 0.645320197044335, "grad_norm": 0.017862917855381966, "learning_rate": 1.5661059637659504e-05, "loss": 0.6198, "step": 786 }, { "epoch": 0.6461412151067324, "grad_norm": 0.0158719252794981, "learning_rate": 1.5650201626839957e-05, "loss": 0.6381, "step": 787 }, { "epoch": 0.6469622331691297, "grad_norm": 0.01917172409594059, "learning_rate": 1.563933382234109e-05, "loss": 0.661, "step": 788 }, { "epoch": 0.6477832512315271, "grad_norm": 0.01746811904013157, "learning_rate": 1.5628456243003762e-05, "loss": 0.644, "step": 789 }, { "epoch": 0.6486042692939245, "grad_norm": 0.01733383722603321, "learning_rate": 1.5617568907685775e-05, "loss": 0.6534, "step": 790 }, { "epoch": 0.6494252873563219, "grad_norm": 0.017757508903741837, "learning_rate": 1.560667183526184e-05, "loss": 0.6525, "step": 791 }, { "epoch": 0.6502463054187192, "grad_norm": 0.016880834475159645, "learning_rate": 1.5595765044623554e-05, "loss": 0.6472, "step": 792 }, { "epoch": 0.6510673234811166, "grad_norm": 0.01704881154000759, "learning_rate": 1.558484855467936e-05, "loss": 0.6513, "step": 793 }, { "epoch": 0.6518883415435139, "grad_norm": 0.016723327338695526, "learning_rate": 1.5573922384354522e-05, "loss": 0.6512, "step": 794 }, { "epoch": 0.6527093596059114, "grad_norm": 0.018262261524796486, "learning_rate": 1.5562986552591076e-05, "loss": 0.6479, "step": 795 }, { "epoch": 0.6535303776683087, "grad_norm": 0.016721932217478752, "learning_rate": 1.5552041078347812e-05, "loss": 0.651, "step": 796 }, { "epoch": 0.6543513957307061, "grad_norm": 0.02023465931415558, "learning_rate": 1.5541085980600236e-05, "loss": 0.6338, "step": 797 }, { "epoch": 0.6551724137931034, "grad_norm": 0.016276651993393898, "learning_rate": 1.5530121278340545e-05, "loss": 0.6503, "step": 798 }, { "epoch": 0.6559934318555009, "grad_norm": 0.017510132864117622, "learning_rate": 1.5519146990577572e-05, "loss": 0.6291, "step": 799 }, { "epoch": 0.6568144499178982, "grad_norm": 0.016233589500188828, "learning_rate": 1.5508163136336784e-05, "loss": 0.6546, "step": 800 }, { "epoch": 0.6576354679802956, "grad_norm": 0.017920156940817833, "learning_rate": 1.5497169734660217e-05, "loss": 0.6383, "step": 801 }, { "epoch": 0.6584564860426929, "grad_norm": 0.01819434016942978, "learning_rate": 1.5486166804606474e-05, "loss": 0.6341, "step": 802 }, { "epoch": 0.6592775041050903, "grad_norm": 0.018081059679389, "learning_rate": 1.5475154365250668e-05, "loss": 0.6388, "step": 803 }, { "epoch": 0.6600985221674877, "grad_norm": 0.016950352117419243, "learning_rate": 1.5464132435684397e-05, "loss": 0.6446, "step": 804 }, { "epoch": 0.6609195402298851, "grad_norm": 0.017352592200040817, "learning_rate": 1.545310103501571e-05, "loss": 0.623, "step": 805 }, { "epoch": 0.6617405582922824, "grad_norm": 0.016468903049826622, "learning_rate": 1.5442060182369093e-05, "loss": 0.6651, "step": 806 }, { "epoch": 0.6625615763546798, "grad_norm": 0.01620594412088394, "learning_rate": 1.543100989688539e-05, "loss": 0.6082, "step": 807 }, { "epoch": 0.6633825944170771, "grad_norm": 0.01641901768743992, "learning_rate": 1.541995019772182e-05, "loss": 0.6388, "step": 808 }, { "epoch": 0.6642036124794746, "grad_norm": 0.016606735065579414, "learning_rate": 1.5408881104051915e-05, "loss": 0.6262, "step": 809 }, { "epoch": 0.6650246305418719, "grad_norm": 0.01608430966734886, "learning_rate": 1.5397802635065492e-05, "loss": 0.609, "step": 810 }, { "epoch": 0.6658456486042693, "grad_norm": 0.0166269950568676, "learning_rate": 1.5386714809968626e-05, "loss": 0.632, "step": 811 }, { "epoch": 0.6666666666666666, "grad_norm": 0.0584440752863884, "learning_rate": 1.5375617647983602e-05, "loss": 0.6487, "step": 812 }, { "epoch": 0.6674876847290641, "grad_norm": 0.016656601801514626, "learning_rate": 1.5364511168348905e-05, "loss": 0.6362, "step": 813 }, { "epoch": 0.6683087027914614, "grad_norm": 0.020423348993062973, "learning_rate": 1.5353395390319163e-05, "loss": 0.6724, "step": 814 }, { "epoch": 0.6691297208538588, "grad_norm": 0.016124868765473366, "learning_rate": 1.5342270333165134e-05, "loss": 0.6269, "step": 815 }, { "epoch": 0.6699507389162561, "grad_norm": 0.016632523387670517, "learning_rate": 1.5331136016173652e-05, "loss": 0.6447, "step": 816 }, { "epoch": 0.6707717569786535, "grad_norm": 0.016391877084970474, "learning_rate": 1.531999245864761e-05, "loss": 0.6171, "step": 817 }, { "epoch": 0.6715927750410509, "grad_norm": 0.017254671081900597, "learning_rate": 1.530883967990592e-05, "loss": 0.6773, "step": 818 }, { "epoch": 0.6724137931034483, "grad_norm": 0.015901075676083565, "learning_rate": 1.529767769928348e-05, "loss": 0.632, "step": 819 }, { "epoch": 0.6732348111658456, "grad_norm": 0.018830275163054466, "learning_rate": 1.5286506536131144e-05, "loss": 0.6401, "step": 820 }, { "epoch": 0.674055829228243, "grad_norm": 0.01770174875855446, "learning_rate": 1.5275326209815682e-05, "loss": 0.6558, "step": 821 }, { "epoch": 0.6748768472906403, "grad_norm": 0.03130536898970604, "learning_rate": 1.5264136739719745e-05, "loss": 0.6564, "step": 822 }, { "epoch": 0.6756978653530378, "grad_norm": 0.017460044473409653, "learning_rate": 1.5252938145241847e-05, "loss": 0.6377, "step": 823 }, { "epoch": 0.6765188834154351, "grad_norm": 0.0170414000749588, "learning_rate": 1.5241730445796307e-05, "loss": 0.6481, "step": 824 }, { "epoch": 0.6773399014778325, "grad_norm": 0.016636524349451065, "learning_rate": 1.523051366081324e-05, "loss": 0.6124, "step": 825 }, { "epoch": 0.6781609195402298, "grad_norm": 0.01738659478724003, "learning_rate": 1.5219287809738511e-05, "loss": 0.6553, "step": 826 }, { "epoch": 0.6789819376026273, "grad_norm": 0.02054508589208126, "learning_rate": 1.5208052912033695e-05, "loss": 0.6167, "step": 827 }, { "epoch": 0.6798029556650246, "grad_norm": 0.016683876514434814, "learning_rate": 1.5196808987176055e-05, "loss": 0.6241, "step": 828 }, { "epoch": 0.680623973727422, "grad_norm": 0.01573900505900383, "learning_rate": 1.5185556054658503e-05, "loss": 0.6347, "step": 829 }, { "epoch": 0.6814449917898193, "grad_norm": 0.016305560246109962, "learning_rate": 1.5174294133989574e-05, "loss": 0.6279, "step": 830 }, { "epoch": 0.6822660098522167, "grad_norm": 0.01631125807762146, "learning_rate": 1.5163023244693366e-05, "loss": 0.6285, "step": 831 }, { "epoch": 0.6830870279146142, "grad_norm": 0.01709560863673687, "learning_rate": 1.5151743406309555e-05, "loss": 0.6312, "step": 832 }, { "epoch": 0.6839080459770115, "grad_norm": 0.015759270638227463, "learning_rate": 1.5140454638393307e-05, "loss": 0.656, "step": 833 }, { "epoch": 0.6847290640394089, "grad_norm": 0.018081584945321083, "learning_rate": 1.5129156960515274e-05, "loss": 0.6301, "step": 834 }, { "epoch": 0.6855500821018062, "grad_norm": 0.016283875331282616, "learning_rate": 1.5117850392261555e-05, "loss": 0.6575, "step": 835 }, { "epoch": 0.6863711001642037, "grad_norm": 0.018057288601994514, "learning_rate": 1.5106534953233674e-05, "loss": 0.638, "step": 836 }, { "epoch": 0.687192118226601, "grad_norm": 0.016846930608153343, "learning_rate": 1.5095210663048516e-05, "loss": 0.6163, "step": 837 }, { "epoch": 0.6880131362889984, "grad_norm": 0.018576467409729958, "learning_rate": 1.5083877541338321e-05, "loss": 0.6376, "step": 838 }, { "epoch": 0.6888341543513957, "grad_norm": 0.016794437542557716, "learning_rate": 1.507253560775063e-05, "loss": 0.6031, "step": 839 }, { "epoch": 0.6896551724137931, "grad_norm": 0.018154611811041832, "learning_rate": 1.506118488194828e-05, "loss": 0.6554, "step": 840 }, { "epoch": 0.6904761904761905, "grad_norm": 0.01581154204905033, "learning_rate": 1.5049825383609324e-05, "loss": 0.6051, "step": 841 }, { "epoch": 0.6912972085385879, "grad_norm": 0.017936043441295624, "learning_rate": 1.503845713242705e-05, "loss": 0.6253, "step": 842 }, { "epoch": 0.6921182266009852, "grad_norm": 0.01620936021208763, "learning_rate": 1.5027080148109903e-05, "loss": 0.626, "step": 843 }, { "epoch": 0.6929392446633826, "grad_norm": 0.017497023567557335, "learning_rate": 1.501569445038147e-05, "loss": 0.6001, "step": 844 }, { "epoch": 0.69376026272578, "grad_norm": 0.016865573823451996, "learning_rate": 1.5004300058980452e-05, "loss": 0.6233, "step": 845 }, { "epoch": 0.6945812807881774, "grad_norm": 0.018129603937268257, "learning_rate": 1.4992896993660612e-05, "loss": 0.6483, "step": 846 }, { "epoch": 0.6954022988505747, "grad_norm": 0.018073352053761482, "learning_rate": 1.4981485274190755e-05, "loss": 0.652, "step": 847 }, { "epoch": 0.6962233169129721, "grad_norm": 0.016122665256261826, "learning_rate": 1.4970064920354695e-05, "loss": 0.6141, "step": 848 }, { "epoch": 0.6970443349753694, "grad_norm": 0.017163028940558434, "learning_rate": 1.4958635951951202e-05, "loss": 0.6243, "step": 849 }, { "epoch": 0.6978653530377669, "grad_norm": 0.0178022813051939, "learning_rate": 1.494719838879399e-05, "loss": 0.6323, "step": 850 }, { "epoch": 0.6986863711001642, "grad_norm": 0.01701164059340954, "learning_rate": 1.4935752250711673e-05, "loss": 0.6236, "step": 851 }, { "epoch": 0.6995073891625616, "grad_norm": 0.017102673649787903, "learning_rate": 1.4924297557547725e-05, "loss": 0.6374, "step": 852 }, { "epoch": 0.7003284072249589, "grad_norm": 0.017293235287070274, "learning_rate": 1.4912834329160454e-05, "loss": 0.6427, "step": 853 }, { "epoch": 0.7011494252873564, "grad_norm": 0.017602037638425827, "learning_rate": 1.4901362585422976e-05, "loss": 0.5935, "step": 854 }, { "epoch": 0.7019704433497537, "grad_norm": 0.016376588493585587, "learning_rate": 1.4889882346223145e-05, "loss": 0.6288, "step": 855 }, { "epoch": 0.7027914614121511, "grad_norm": 0.01779436506330967, "learning_rate": 1.4878393631463571e-05, "loss": 0.6244, "step": 856 }, { "epoch": 0.7036124794745484, "grad_norm": 0.017282649874687195, "learning_rate": 1.4866896461061532e-05, "loss": 0.6233, "step": 857 }, { "epoch": 0.7044334975369458, "grad_norm": 0.01643366552889347, "learning_rate": 1.4855390854948988e-05, "loss": 0.6209, "step": 858 }, { "epoch": 0.7052545155993432, "grad_norm": 0.018126333132386208, "learning_rate": 1.4843876833072506e-05, "loss": 0.6674, "step": 859 }, { "epoch": 0.7060755336617406, "grad_norm": 0.01577799953520298, "learning_rate": 1.4832354415393251e-05, "loss": 0.6134, "step": 860 }, { "epoch": 0.7068965517241379, "grad_norm": 0.017869843170046806, "learning_rate": 1.4820823621886941e-05, "loss": 0.6145, "step": 861 }, { "epoch": 0.7077175697865353, "grad_norm": 0.016958175227046013, "learning_rate": 1.4809284472543822e-05, "loss": 0.6089, "step": 862 }, { "epoch": 0.7085385878489326, "grad_norm": 0.016220971941947937, "learning_rate": 1.4797736987368611e-05, "loss": 0.5971, "step": 863 }, { "epoch": 0.7093596059113301, "grad_norm": 0.01639181189239025, "learning_rate": 1.4786181186380488e-05, "loss": 0.613, "step": 864 }, { "epoch": 0.7101806239737274, "grad_norm": 0.01860785484313965, "learning_rate": 1.4774617089613047e-05, "loss": 0.6179, "step": 865 }, { "epoch": 0.7110016420361248, "grad_norm": 0.017657579854130745, "learning_rate": 1.4763044717114266e-05, "loss": 0.6362, "step": 866 }, { "epoch": 0.7118226600985221, "grad_norm": 0.018048148602247238, "learning_rate": 1.4751464088946463e-05, "loss": 0.6184, "step": 867 }, { "epoch": 0.7126436781609196, "grad_norm": 0.016867440193891525, "learning_rate": 1.4739875225186275e-05, "loss": 0.6243, "step": 868 }, { "epoch": 0.7134646962233169, "grad_norm": 0.016798043623566628, "learning_rate": 1.4728278145924614e-05, "loss": 0.6181, "step": 869 }, { "epoch": 0.7142857142857143, "grad_norm": 0.016896909102797508, "learning_rate": 1.4716672871266639e-05, "loss": 0.6328, "step": 870 }, { "epoch": 0.7151067323481116, "grad_norm": 0.016999246552586555, "learning_rate": 1.4705059421331707e-05, "loss": 0.62, "step": 871 }, { "epoch": 0.715927750410509, "grad_norm": 0.01710176095366478, "learning_rate": 1.4693437816253356e-05, "loss": 0.6366, "step": 872 }, { "epoch": 0.7167487684729064, "grad_norm": 0.01640375889837742, "learning_rate": 1.468180807617926e-05, "loss": 0.6411, "step": 873 }, { "epoch": 0.7175697865353038, "grad_norm": 0.017630917951464653, "learning_rate": 1.46701702212712e-05, "loss": 0.6496, "step": 874 }, { "epoch": 0.7183908045977011, "grad_norm": 0.016706744208931923, "learning_rate": 1.4658524271705013e-05, "loss": 0.6121, "step": 875 }, { "epoch": 0.7192118226600985, "grad_norm": 0.018220271915197372, "learning_rate": 1.464687024767059e-05, "loss": 0.6425, "step": 876 }, { "epoch": 0.7200328407224958, "grad_norm": 0.016324257478117943, "learning_rate": 1.4635208169371795e-05, "loss": 0.6011, "step": 877 }, { "epoch": 0.7208538587848933, "grad_norm": 0.017675945535302162, "learning_rate": 1.4623538057026478e-05, "loss": 0.6252, "step": 878 }, { "epoch": 0.7216748768472906, "grad_norm": 0.01581117883324623, "learning_rate": 1.4611859930866401e-05, "loss": 0.6319, "step": 879 }, { "epoch": 0.722495894909688, "grad_norm": 0.01705995760858059, "learning_rate": 1.460017381113723e-05, "loss": 0.623, "step": 880 }, { "epoch": 0.7233169129720853, "grad_norm": 0.016270287334918976, "learning_rate": 1.4588479718098478e-05, "loss": 0.6125, "step": 881 }, { "epoch": 0.7241379310344828, "grad_norm": 0.019416503608226776, "learning_rate": 1.4576777672023494e-05, "loss": 0.639, "step": 882 }, { "epoch": 0.7249589490968801, "grad_norm": 0.016306547448039055, "learning_rate": 1.4565067693199401e-05, "loss": 0.6358, "step": 883 }, { "epoch": 0.7257799671592775, "grad_norm": 0.01899714022874832, "learning_rate": 1.455334980192709e-05, "loss": 0.601, "step": 884 }, { "epoch": 0.7266009852216748, "grad_norm": 0.016664952039718628, "learning_rate": 1.4541624018521152e-05, "loss": 0.6639, "step": 885 }, { "epoch": 0.7274220032840722, "grad_norm": 0.01648729108273983, "learning_rate": 1.4529890363309874e-05, "loss": 0.6089, "step": 886 }, { "epoch": 0.7282430213464697, "grad_norm": 0.017305802553892136, "learning_rate": 1.4518148856635181e-05, "loss": 0.6362, "step": 887 }, { "epoch": 0.729064039408867, "grad_norm": 0.016178695484995842, "learning_rate": 1.4506399518852618e-05, "loss": 0.6466, "step": 888 }, { "epoch": 0.7298850574712644, "grad_norm": 0.016331100836396217, "learning_rate": 1.4494642370331297e-05, "loss": 0.5978, "step": 889 }, { "epoch": 0.7307060755336617, "grad_norm": 0.016934026032686234, "learning_rate": 1.4482877431453879e-05, "loss": 0.6533, "step": 890 }, { "epoch": 0.7315270935960592, "grad_norm": 0.01635129749774933, "learning_rate": 1.4471104722616525e-05, "loss": 0.6387, "step": 891 }, { "epoch": 0.7323481116584565, "grad_norm": 0.016492590308189392, "learning_rate": 1.445932426422887e-05, "loss": 0.593, "step": 892 }, { "epoch": 0.7331691297208539, "grad_norm": 0.016522688791155815, "learning_rate": 1.4447536076713977e-05, "loss": 0.6323, "step": 893 }, { "epoch": 0.7339901477832512, "grad_norm": 0.017636509612202644, "learning_rate": 1.4435740180508322e-05, "loss": 0.6136, "step": 894 }, { "epoch": 0.7348111658456487, "grad_norm": 0.017724452540278435, "learning_rate": 1.4423936596061736e-05, "loss": 0.6216, "step": 895 }, { "epoch": 0.735632183908046, "grad_norm": 0.01759430766105652, "learning_rate": 1.4412125343837376e-05, "loss": 0.6203, "step": 896 }, { "epoch": 0.7364532019704434, "grad_norm": 0.05851398780941963, "learning_rate": 1.4400306444311696e-05, "loss": 0.633, "step": 897 }, { "epoch": 0.7372742200328407, "grad_norm": 0.016317086294293404, "learning_rate": 1.438847991797441e-05, "loss": 0.6094, "step": 898 }, { "epoch": 0.7380952380952381, "grad_norm": 0.01803850196301937, "learning_rate": 1.437664578532845e-05, "loss": 0.6549, "step": 899 }, { "epoch": 0.7389162561576355, "grad_norm": 0.01744643785059452, "learning_rate": 1.4364804066889939e-05, "loss": 0.6149, "step": 900 }, { "epoch": 0.7397372742200329, "grad_norm": 0.01748671568930149, "learning_rate": 1.4352954783188144e-05, "loss": 0.6234, "step": 901 }, { "epoch": 0.7405582922824302, "grad_norm": 0.015901271253824234, "learning_rate": 1.4341097954765452e-05, "loss": 0.6212, "step": 902 }, { "epoch": 0.7413793103448276, "grad_norm": 0.018575701862573624, "learning_rate": 1.4329233602177339e-05, "loss": 0.6346, "step": 903 }, { "epoch": 0.7422003284072249, "grad_norm": 0.016744717955589294, "learning_rate": 1.4317361745992299e-05, "loss": 0.6175, "step": 904 }, { "epoch": 0.7430213464696224, "grad_norm": 0.01740286312997341, "learning_rate": 1.4305482406791869e-05, "loss": 0.6188, "step": 905 }, { "epoch": 0.7438423645320197, "grad_norm": 0.01676209270954132, "learning_rate": 1.4293595605170529e-05, "loss": 0.5824, "step": 906 }, { "epoch": 0.7446633825944171, "grad_norm": 0.017688285559415817, "learning_rate": 1.4281701361735713e-05, "loss": 0.6045, "step": 907 }, { "epoch": 0.7454844006568144, "grad_norm": 0.01647118665277958, "learning_rate": 1.4269799697107747e-05, "loss": 0.642, "step": 908 }, { "epoch": 0.7463054187192119, "grad_norm": 0.01801016367971897, "learning_rate": 1.4257890631919837e-05, "loss": 0.627, "step": 909 }, { "epoch": 0.7471264367816092, "grad_norm": 0.01761813275516033, "learning_rate": 1.4245974186818002e-05, "loss": 0.6678, "step": 910 }, { "epoch": 0.7479474548440066, "grad_norm": 0.017913799732923508, "learning_rate": 1.4234050382461064e-05, "loss": 0.6457, "step": 911 }, { "epoch": 0.7487684729064039, "grad_norm": 0.01804761402308941, "learning_rate": 1.42221192395206e-05, "loss": 0.6157, "step": 912 }, { "epoch": 0.7495894909688013, "grad_norm": 0.01641744375228882, "learning_rate": 1.4210180778680916e-05, "loss": 0.637, "step": 913 }, { "epoch": 0.7504105090311987, "grad_norm": 0.01633932627737522, "learning_rate": 1.4198235020638993e-05, "loss": 0.6119, "step": 914 }, { "epoch": 0.7512315270935961, "grad_norm": 0.016509121283888817, "learning_rate": 1.4186281986104476e-05, "loss": 0.6304, "step": 915 }, { "epoch": 0.7520525451559934, "grad_norm": 0.01679670810699463, "learning_rate": 1.4174321695799614e-05, "loss": 0.6301, "step": 916 }, { "epoch": 0.7528735632183908, "grad_norm": 0.016446808353066444, "learning_rate": 1.4162354170459242e-05, "loss": 0.621, "step": 917 }, { "epoch": 0.7536945812807881, "grad_norm": 0.017387673258781433, "learning_rate": 1.4150379430830732e-05, "loss": 0.6097, "step": 918 }, { "epoch": 0.7545155993431856, "grad_norm": 0.017584918066859245, "learning_rate": 1.413839749767397e-05, "loss": 0.621, "step": 919 }, { "epoch": 0.7553366174055829, "grad_norm": 0.017944592982530594, "learning_rate": 1.4126408391761306e-05, "loss": 0.6253, "step": 920 }, { "epoch": 0.7561576354679803, "grad_norm": 0.0180913507938385, "learning_rate": 1.4114412133877531e-05, "loss": 0.6252, "step": 921 }, { "epoch": 0.7569786535303776, "grad_norm": 0.016817530617117882, "learning_rate": 1.4102408744819829e-05, "loss": 0.6192, "step": 922 }, { "epoch": 0.7577996715927751, "grad_norm": 0.016722172498703003, "learning_rate": 1.409039824539775e-05, "loss": 0.6237, "step": 923 }, { "epoch": 0.7586206896551724, "grad_norm": 0.02107631042599678, "learning_rate": 1.4078380656433173e-05, "loss": 0.6422, "step": 924 }, { "epoch": 0.7594417077175698, "grad_norm": 0.019069626927375793, "learning_rate": 1.4066355998760267e-05, "loss": 0.6089, "step": 925 }, { "epoch": 0.7602627257799671, "grad_norm": 0.01889723725616932, "learning_rate": 1.4054324293225445e-05, "loss": 0.6089, "step": 926 }, { "epoch": 0.7610837438423645, "grad_norm": 0.018626421689987183, "learning_rate": 1.4042285560687359e-05, "loss": 0.6098, "step": 927 }, { "epoch": 0.7619047619047619, "grad_norm": 0.022432615980505943, "learning_rate": 1.4030239822016822e-05, "loss": 0.6169, "step": 928 }, { "epoch": 0.7627257799671593, "grad_norm": 0.01817944087088108, "learning_rate": 1.4018187098096814e-05, "loss": 0.6321, "step": 929 }, { "epoch": 0.7635467980295566, "grad_norm": 0.021306542679667473, "learning_rate": 1.4006127409822399e-05, "loss": 0.6419, "step": 930 }, { "epoch": 0.764367816091954, "grad_norm": 0.016988292336463928, "learning_rate": 1.3994060778100741e-05, "loss": 0.6344, "step": 931 }, { "epoch": 0.7651888341543513, "grad_norm": 0.019709765911102295, "learning_rate": 1.3981987223851028e-05, "loss": 0.6485, "step": 932 }, { "epoch": 0.7660098522167488, "grad_norm": 0.016837650910019875, "learning_rate": 1.3969906768004451e-05, "loss": 0.6114, "step": 933 }, { "epoch": 0.7668308702791461, "grad_norm": 0.01814124919474125, "learning_rate": 1.3957819431504158e-05, "loss": 0.6371, "step": 934 }, { "epoch": 0.7676518883415435, "grad_norm": 0.017289316281676292, "learning_rate": 1.3945725235305247e-05, "loss": 0.5972, "step": 935 }, { "epoch": 0.7684729064039408, "grad_norm": 0.01648302935063839, "learning_rate": 1.3933624200374685e-05, "loss": 0.5897, "step": 936 }, { "epoch": 0.7692939244663383, "grad_norm": 0.04432398080825806, "learning_rate": 1.3921516347691307e-05, "loss": 0.6091, "step": 937 }, { "epoch": 0.7701149425287356, "grad_norm": 0.017205022275447845, "learning_rate": 1.3909401698245764e-05, "loss": 0.6232, "step": 938 }, { "epoch": 0.770935960591133, "grad_norm": 0.017611496150493622, "learning_rate": 1.3897280273040496e-05, "loss": 0.6145, "step": 939 }, { "epoch": 0.7717569786535303, "grad_norm": 0.016692543402314186, "learning_rate": 1.388515209308968e-05, "loss": 0.6254, "step": 940 }, { "epoch": 0.7725779967159278, "grad_norm": 0.016858136281371117, "learning_rate": 1.387301717941921e-05, "loss": 0.6275, "step": 941 }, { "epoch": 0.7733990147783252, "grad_norm": 0.016933023929595947, "learning_rate": 1.3860875553066652e-05, "loss": 0.6369, "step": 942 }, { "epoch": 0.7742200328407225, "grad_norm": 0.016707437112927437, "learning_rate": 1.3848727235081212e-05, "loss": 0.6327, "step": 943 }, { "epoch": 0.7750410509031199, "grad_norm": 0.03393053263425827, "learning_rate": 1.3836572246523692e-05, "loss": 0.6431, "step": 944 }, { "epoch": 0.7758620689655172, "grad_norm": 0.016521049663424492, "learning_rate": 1.3824410608466458e-05, "loss": 0.6239, "step": 945 }, { "epoch": 0.7766830870279147, "grad_norm": 0.017098890617489815, "learning_rate": 1.3812242341993411e-05, "loss": 0.6107, "step": 946 }, { "epoch": 0.777504105090312, "grad_norm": 0.017305495217442513, "learning_rate": 1.3800067468199938e-05, "loss": 0.6228, "step": 947 }, { "epoch": 0.7783251231527094, "grad_norm": 0.017134590074419975, "learning_rate": 1.3787886008192881e-05, "loss": 0.5876, "step": 948 }, { "epoch": 0.7791461412151067, "grad_norm": 0.01760588027536869, "learning_rate": 1.3775697983090504e-05, "loss": 0.6101, "step": 949 }, { "epoch": 0.7799671592775042, "grad_norm": 0.016805877909064293, "learning_rate": 1.3763503414022443e-05, "loss": 0.6268, "step": 950 }, { "epoch": 0.7807881773399015, "grad_norm": 0.018829908221960068, "learning_rate": 1.375130232212969e-05, "loss": 0.6228, "step": 951 }, { "epoch": 0.7816091954022989, "grad_norm": 0.017044035717844963, "learning_rate": 1.3739094728564536e-05, "loss": 0.5963, "step": 952 }, { "epoch": 0.7824302134646962, "grad_norm": 0.019606253132224083, "learning_rate": 1.3726880654490553e-05, "loss": 0.6084, "step": 953 }, { "epoch": 0.7832512315270936, "grad_norm": 0.016177969053387642, "learning_rate": 1.3714660121082541e-05, "loss": 0.6232, "step": 954 }, { "epoch": 0.784072249589491, "grad_norm": 0.019419146701693535, "learning_rate": 1.3702433149526499e-05, "loss": 0.6173, "step": 955 }, { "epoch": 0.7848932676518884, "grad_norm": 0.01680677942931652, "learning_rate": 1.369019976101959e-05, "loss": 0.6062, "step": 956 }, { "epoch": 0.7857142857142857, "grad_norm": 0.01873822696506977, "learning_rate": 1.3677959976770097e-05, "loss": 0.6003, "step": 957 }, { "epoch": 0.7865353037766831, "grad_norm": 0.017157303169369698, "learning_rate": 1.3665713817997397e-05, "loss": 0.6048, "step": 958 }, { "epoch": 0.7873563218390804, "grad_norm": 0.01878327503800392, "learning_rate": 1.365346130593191e-05, "loss": 0.618, "step": 959 }, { "epoch": 0.7881773399014779, "grad_norm": 0.01705792546272278, "learning_rate": 1.3641202461815085e-05, "loss": 0.6217, "step": 960 }, { "epoch": 0.7889983579638752, "grad_norm": 0.01645771786570549, "learning_rate": 1.3628937306899328e-05, "loss": 0.6131, "step": 961 }, { "epoch": 0.7898193760262726, "grad_norm": 0.01813456229865551, "learning_rate": 1.3616665862448007e-05, "loss": 0.6306, "step": 962 }, { "epoch": 0.7906403940886699, "grad_norm": 0.016792239621281624, "learning_rate": 1.3604388149735371e-05, "loss": 0.6272, "step": 963 }, { "epoch": 0.7914614121510674, "grad_norm": 0.017783569172024727, "learning_rate": 1.3592104190046562e-05, "loss": 0.6139, "step": 964 }, { "epoch": 0.7922824302134647, "grad_norm": 0.016002994030714035, "learning_rate": 1.3579814004677528e-05, "loss": 0.5964, "step": 965 }, { "epoch": 0.7931034482758621, "grad_norm": 0.017107916995882988, "learning_rate": 1.3567517614935027e-05, "loss": 0.6103, "step": 966 }, { "epoch": 0.7939244663382594, "grad_norm": 0.017043696716427803, "learning_rate": 1.3555215042136557e-05, "loss": 0.6076, "step": 967 }, { "epoch": 0.7947454844006568, "grad_norm": 0.017891036346554756, "learning_rate": 1.3542906307610356e-05, "loss": 0.6251, "step": 968 }, { "epoch": 0.7955665024630542, "grad_norm": 0.016679411754012108, "learning_rate": 1.3530591432695325e-05, "loss": 0.6026, "step": 969 }, { "epoch": 0.7963875205254516, "grad_norm": 0.016589375212788582, "learning_rate": 1.3518270438741024e-05, "loss": 0.6226, "step": 970 }, { "epoch": 0.7972085385878489, "grad_norm": 0.01671524904668331, "learning_rate": 1.3505943347107608e-05, "loss": 0.6056, "step": 971 }, { "epoch": 0.7980295566502463, "grad_norm": 0.01591823808848858, "learning_rate": 1.3493610179165822e-05, "loss": 0.6082, "step": 972 }, { "epoch": 0.7988505747126436, "grad_norm": 0.017678476870059967, "learning_rate": 1.348127095629692e-05, "loss": 0.6201, "step": 973 }, { "epoch": 0.7996715927750411, "grad_norm": 0.01651569828391075, "learning_rate": 1.3468925699892673e-05, "loss": 0.6046, "step": 974 }, { "epoch": 0.8004926108374384, "grad_norm": 0.017295341938734055, "learning_rate": 1.3456574431355308e-05, "loss": 0.6348, "step": 975 }, { "epoch": 0.8013136288998358, "grad_norm": 0.015982462093234062, "learning_rate": 1.344421717209747e-05, "loss": 0.6212, "step": 976 }, { "epoch": 0.8021346469622331, "grad_norm": 0.01634635031223297, "learning_rate": 1.3431853943542193e-05, "loss": 0.6494, "step": 977 }, { "epoch": 0.8029556650246306, "grad_norm": 0.01806807518005371, "learning_rate": 1.3419484767122857e-05, "loss": 0.5817, "step": 978 }, { "epoch": 0.8037766830870279, "grad_norm": 0.016531284898519516, "learning_rate": 1.3407109664283153e-05, "loss": 0.5885, "step": 979 }, { "epoch": 0.8045977011494253, "grad_norm": 0.018238496035337448, "learning_rate": 1.3394728656477059e-05, "loss": 0.6144, "step": 980 }, { "epoch": 0.8054187192118226, "grad_norm": 0.01688590832054615, "learning_rate": 1.3382341765168767e-05, "loss": 0.6175, "step": 981 }, { "epoch": 0.80623973727422, "grad_norm": 0.01779519021511078, "learning_rate": 1.3369949011832694e-05, "loss": 0.6073, "step": 982 }, { "epoch": 0.8070607553366174, "grad_norm": 0.016382494941353798, "learning_rate": 1.3357550417953397e-05, "loss": 0.6039, "step": 983 }, { "epoch": 0.8078817733990148, "grad_norm": 0.017475098371505737, "learning_rate": 1.3345146005025575e-05, "loss": 0.6175, "step": 984 }, { "epoch": 0.8087027914614121, "grad_norm": 0.016528092324733734, "learning_rate": 1.3332735794554002e-05, "loss": 0.6117, "step": 985 }, { "epoch": 0.8095238095238095, "grad_norm": 0.017134059220552444, "learning_rate": 1.3320319808053516e-05, "loss": 0.6028, "step": 986 }, { "epoch": 0.8103448275862069, "grad_norm": 0.016690775752067566, "learning_rate": 1.3307898067048961e-05, "loss": 0.5999, "step": 987 }, { "epoch": 0.8111658456486043, "grad_norm": 0.018070189282298088, "learning_rate": 1.3295470593075158e-05, "loss": 0.608, "step": 988 }, { "epoch": 0.8119868637110016, "grad_norm": 0.018059300258755684, "learning_rate": 1.328303740767686e-05, "loss": 0.6245, "step": 989 }, { "epoch": 0.812807881773399, "grad_norm": 0.016609426587820053, "learning_rate": 1.3270598532408743e-05, "loss": 0.6185, "step": 990 }, { "epoch": 0.8136288998357963, "grad_norm": 0.016524409875273705, "learning_rate": 1.325815398883532e-05, "loss": 0.5933, "step": 991 }, { "epoch": 0.8144499178981938, "grad_norm": 0.01702118292450905, "learning_rate": 1.324570379853095e-05, "loss": 0.6381, "step": 992 }, { "epoch": 0.8152709359605911, "grad_norm": 0.01645691692829132, "learning_rate": 1.3233247983079779e-05, "loss": 0.6304, "step": 993 }, { "epoch": 0.8160919540229885, "grad_norm": 0.016796709969639778, "learning_rate": 1.3220786564075691e-05, "loss": 0.6038, "step": 994 }, { "epoch": 0.8169129720853858, "grad_norm": 0.016751611605286598, "learning_rate": 1.3208319563122305e-05, "loss": 0.6284, "step": 995 }, { "epoch": 0.8177339901477833, "grad_norm": 0.017269333824515343, "learning_rate": 1.31958470018329e-05, "loss": 0.6229, "step": 996 }, { "epoch": 0.8185550082101807, "grad_norm": 0.020196860656142235, "learning_rate": 1.3183368901830403e-05, "loss": 0.5998, "step": 997 }, { "epoch": 0.819376026272578, "grad_norm": 0.01640750840306282, "learning_rate": 1.317088528474734e-05, "loss": 0.6144, "step": 998 }, { "epoch": 0.8201970443349754, "grad_norm": 0.01798519678413868, "learning_rate": 1.3158396172225812e-05, "loss": 0.6183, "step": 999 }, { "epoch": 0.8210180623973727, "grad_norm": 0.016412656754255295, "learning_rate": 1.3145901585917425e-05, "loss": 0.6344, "step": 1000 }, { "epoch": 0.8218390804597702, "grad_norm": 0.018009468913078308, "learning_rate": 1.3133401547483298e-05, "loss": 0.5974, "step": 1001 }, { "epoch": 0.8226600985221675, "grad_norm": 0.015752628445625305, "learning_rate": 1.3120896078593987e-05, "loss": 0.5662, "step": 1002 }, { "epoch": 0.8234811165845649, "grad_norm": 0.01654975302517414, "learning_rate": 1.3108385200929468e-05, "loss": 0.5931, "step": 1003 }, { "epoch": 0.8243021346469622, "grad_norm": 0.01852544955909252, "learning_rate": 1.3095868936179098e-05, "loss": 0.5915, "step": 1004 }, { "epoch": 0.8251231527093597, "grad_norm": 0.016900241374969482, "learning_rate": 1.3083347306041558e-05, "loss": 0.6208, "step": 1005 }, { "epoch": 0.825944170771757, "grad_norm": 0.01703634299337864, "learning_rate": 1.3070820332224852e-05, "loss": 0.6041, "step": 1006 }, { "epoch": 0.8267651888341544, "grad_norm": 0.0157708078622818, "learning_rate": 1.3058288036446232e-05, "loss": 0.5969, "step": 1007 }, { "epoch": 0.8275862068965517, "grad_norm": 0.01635306142270565, "learning_rate": 1.3045750440432184e-05, "loss": 0.6214, "step": 1008 }, { "epoch": 0.8284072249589491, "grad_norm": 0.01632867008447647, "learning_rate": 1.3033207565918375e-05, "loss": 0.619, "step": 1009 }, { "epoch": 0.8292282430213465, "grad_norm": 0.016301007941365242, "learning_rate": 1.3020659434649632e-05, "loss": 0.622, "step": 1010 }, { "epoch": 0.8300492610837439, "grad_norm": 0.017222441732883453, "learning_rate": 1.3008106068379894e-05, "loss": 0.6152, "step": 1011 }, { "epoch": 0.8308702791461412, "grad_norm": 0.017338763922452927, "learning_rate": 1.2995547488872168e-05, "loss": 0.6063, "step": 1012 }, { "epoch": 0.8316912972085386, "grad_norm": 0.016133887693285942, "learning_rate": 1.298298371789851e-05, "loss": 0.6093, "step": 1013 }, { "epoch": 0.8325123152709359, "grad_norm": 0.01702342927455902, "learning_rate": 1.2970414777239962e-05, "loss": 0.6007, "step": 1014 }, { "epoch": 0.8333333333333334, "grad_norm": 0.016478441655635834, "learning_rate": 1.2957840688686542e-05, "loss": 0.6187, "step": 1015 }, { "epoch": 0.8341543513957307, "grad_norm": 0.01781783625483513, "learning_rate": 1.2945261474037186e-05, "loss": 0.5987, "step": 1016 }, { "epoch": 0.8349753694581281, "grad_norm": 0.016546953469514847, "learning_rate": 1.2932677155099721e-05, "loss": 0.6155, "step": 1017 }, { "epoch": 0.8357963875205254, "grad_norm": 0.017237583175301552, "learning_rate": 1.2920087753690816e-05, "loss": 0.6168, "step": 1018 }, { "epoch": 0.8366174055829229, "grad_norm": 0.016434988006949425, "learning_rate": 1.2907493291635953e-05, "loss": 0.6173, "step": 1019 }, { "epoch": 0.8374384236453202, "grad_norm": 0.017002368345856667, "learning_rate": 1.2894893790769395e-05, "loss": 0.6118, "step": 1020 }, { "epoch": 0.8382594417077176, "grad_norm": 0.01626773178577423, "learning_rate": 1.2882289272934131e-05, "loss": 0.6019, "step": 1021 }, { "epoch": 0.8390804597701149, "grad_norm": 0.01619119569659233, "learning_rate": 1.2869679759981854e-05, "loss": 0.6211, "step": 1022 }, { "epoch": 0.8399014778325123, "grad_norm": 0.016273194923996925, "learning_rate": 1.2857065273772916e-05, "loss": 0.631, "step": 1023 }, { "epoch": 0.8407224958949097, "grad_norm": 0.01661028154194355, "learning_rate": 1.2844445836176282e-05, "loss": 0.5888, "step": 1024 }, { "epoch": 0.8415435139573071, "grad_norm": 0.03758163005113602, "learning_rate": 1.2831821469069515e-05, "loss": 0.6046, "step": 1025 }, { "epoch": 0.8423645320197044, "grad_norm": 0.01648324728012085, "learning_rate": 1.2819192194338712e-05, "loss": 0.6085, "step": 1026 }, { "epoch": 0.8431855500821018, "grad_norm": 0.01601887308061123, "learning_rate": 1.2806558033878488e-05, "loss": 0.6084, "step": 1027 }, { "epoch": 0.8440065681444991, "grad_norm": 0.016316618770360947, "learning_rate": 1.2793919009591915e-05, "loss": 0.6243, "step": 1028 }, { "epoch": 0.8448275862068966, "grad_norm": 0.016127334907650948, "learning_rate": 1.2781275143390517e-05, "loss": 0.6166, "step": 1029 }, { "epoch": 0.8456486042692939, "grad_norm": 0.01705724000930786, "learning_rate": 1.2768626457194191e-05, "loss": 0.5945, "step": 1030 }, { "epoch": 0.8464696223316913, "grad_norm": 0.017162678763270378, "learning_rate": 1.2755972972931206e-05, "loss": 0.6145, "step": 1031 }, { "epoch": 0.8472906403940886, "grad_norm": 0.017686357721686363, "learning_rate": 1.2743314712538132e-05, "loss": 0.6072, "step": 1032 }, { "epoch": 0.8481116584564861, "grad_norm": 0.018112773075699806, "learning_rate": 1.2730651697959844e-05, "loss": 0.6046, "step": 1033 }, { "epoch": 0.8489326765188834, "grad_norm": 0.01700584590435028, "learning_rate": 1.2717983951149435e-05, "loss": 0.6088, "step": 1034 }, { "epoch": 0.8497536945812808, "grad_norm": 0.017205242067575455, "learning_rate": 1.2705311494068218e-05, "loss": 0.5879, "step": 1035 }, { "epoch": 0.8505747126436781, "grad_norm": 0.017123989760875702, "learning_rate": 1.2692634348685657e-05, "loss": 0.5965, "step": 1036 }, { "epoch": 0.8513957307060755, "grad_norm": 0.01642029546201229, "learning_rate": 1.267995253697936e-05, "loss": 0.5941, "step": 1037 }, { "epoch": 0.8522167487684729, "grad_norm": 0.01908668503165245, "learning_rate": 1.2667266080935015e-05, "loss": 0.6025, "step": 1038 }, { "epoch": 0.8530377668308703, "grad_norm": 0.015867702662944794, "learning_rate": 1.265457500254637e-05, "loss": 0.6337, "step": 1039 }, { "epoch": 0.8538587848932676, "grad_norm": 0.017576858401298523, "learning_rate": 1.2641879323815166e-05, "loss": 0.6339, "step": 1040 }, { "epoch": 0.854679802955665, "grad_norm": 0.02694839797914028, "learning_rate": 1.2629179066751145e-05, "loss": 0.6025, "step": 1041 }, { "epoch": 0.8555008210180624, "grad_norm": 0.01769500970840454, "learning_rate": 1.2616474253371969e-05, "loss": 0.6277, "step": 1042 }, { "epoch": 0.8563218390804598, "grad_norm": 0.01764015480875969, "learning_rate": 1.260376490570321e-05, "loss": 0.6049, "step": 1043 }, { "epoch": 0.8571428571428571, "grad_norm": 0.01800648495554924, "learning_rate": 1.2591051045778293e-05, "loss": 0.6237, "step": 1044 }, { "epoch": 0.8579638752052545, "grad_norm": 0.01659088395535946, "learning_rate": 1.2578332695638471e-05, "loss": 0.6061, "step": 1045 }, { "epoch": 0.8587848932676518, "grad_norm": 0.01720474660396576, "learning_rate": 1.2565609877332774e-05, "loss": 0.6092, "step": 1046 }, { "epoch": 0.8596059113300493, "grad_norm": 0.017576538026332855, "learning_rate": 1.2552882612917984e-05, "loss": 0.594, "step": 1047 }, { "epoch": 0.8604269293924466, "grad_norm": 0.017050081863999367, "learning_rate": 1.2540150924458591e-05, "loss": 0.5862, "step": 1048 }, { "epoch": 0.861247947454844, "grad_norm": 0.016937939450144768, "learning_rate": 1.2527414834026756e-05, "loss": 0.6146, "step": 1049 }, { "epoch": 0.8620689655172413, "grad_norm": 0.016709407791495323, "learning_rate": 1.2514674363702266e-05, "loss": 0.6068, "step": 1050 }, { "epoch": 0.8628899835796388, "grad_norm": 0.02509489096701145, "learning_rate": 1.2501929535572504e-05, "loss": 0.595, "step": 1051 }, { "epoch": 0.8637110016420362, "grad_norm": 0.015810824930667877, "learning_rate": 1.2489180371732406e-05, "loss": 0.5966, "step": 1052 }, { "epoch": 0.8645320197044335, "grad_norm": 0.016731521114706993, "learning_rate": 1.2476426894284435e-05, "loss": 0.6069, "step": 1053 }, { "epoch": 0.8653530377668309, "grad_norm": 0.016808783635497093, "learning_rate": 1.2463669125338512e-05, "loss": 0.5829, "step": 1054 }, { "epoch": 0.8661740558292282, "grad_norm": 0.01691477932035923, "learning_rate": 1.2450907087012019e-05, "loss": 0.6032, "step": 1055 }, { "epoch": 0.8669950738916257, "grad_norm": 0.01657267101109028, "learning_rate": 1.2438140801429725e-05, "loss": 0.6071, "step": 1056 }, { "epoch": 0.867816091954023, "grad_norm": 0.017749957740306854, "learning_rate": 1.2425370290723772e-05, "loss": 0.6184, "step": 1057 }, { "epoch": 0.8686371100164204, "grad_norm": 0.016059761866927147, "learning_rate": 1.2412595577033614e-05, "loss": 0.5821, "step": 1058 }, { "epoch": 0.8694581280788177, "grad_norm": 0.018719235435128212, "learning_rate": 1.2399816682506015e-05, "loss": 0.6131, "step": 1059 }, { "epoch": 0.8702791461412152, "grad_norm": 0.016032414510846138, "learning_rate": 1.2387033629294953e-05, "loss": 0.5928, "step": 1060 }, { "epoch": 0.8711001642036125, "grad_norm": 0.016728779301047325, "learning_rate": 1.2374246439561653e-05, "loss": 0.5952, "step": 1061 }, { "epoch": 0.8719211822660099, "grad_norm": 0.016605909913778305, "learning_rate": 1.2361455135474475e-05, "loss": 0.5948, "step": 1062 }, { "epoch": 0.8727422003284072, "grad_norm": 0.016966061666607857, "learning_rate": 1.2348659739208943e-05, "loss": 0.6173, "step": 1063 }, { "epoch": 0.8735632183908046, "grad_norm": 0.017854535952210426, "learning_rate": 1.2335860272947655e-05, "loss": 0.6001, "step": 1064 }, { "epoch": 0.874384236453202, "grad_norm": 0.016883203759789467, "learning_rate": 1.2323056758880277e-05, "loss": 0.6072, "step": 1065 }, { "epoch": 0.8752052545155994, "grad_norm": 0.017957771196961403, "learning_rate": 1.231024921920348e-05, "loss": 0.5891, "step": 1066 }, { "epoch": 0.8760262725779967, "grad_norm": 0.017160510644316673, "learning_rate": 1.229743767612093e-05, "loss": 0.5928, "step": 1067 }, { "epoch": 0.8768472906403941, "grad_norm": 0.017925189808011055, "learning_rate": 1.2284622151843217e-05, "loss": 0.6056, "step": 1068 }, { "epoch": 0.8776683087027914, "grad_norm": 0.01872408390045166, "learning_rate": 1.2271802668587842e-05, "loss": 0.5921, "step": 1069 }, { "epoch": 0.8784893267651889, "grad_norm": 0.016280528157949448, "learning_rate": 1.2258979248579172e-05, "loss": 0.5777, "step": 1070 }, { "epoch": 0.8793103448275862, "grad_norm": 0.018613051623106003, "learning_rate": 1.2246151914048394e-05, "loss": 0.6027, "step": 1071 }, { "epoch": 0.8801313628899836, "grad_norm": 0.01829327642917633, "learning_rate": 1.2233320687233482e-05, "loss": 0.5919, "step": 1072 }, { "epoch": 0.8809523809523809, "grad_norm": 0.018061555922031403, "learning_rate": 1.2220485590379154e-05, "loss": 0.5993, "step": 1073 }, { "epoch": 0.8817733990147784, "grad_norm": 0.017241371795535088, "learning_rate": 1.2207646645736844e-05, "loss": 0.579, "step": 1074 }, { "epoch": 0.8825944170771757, "grad_norm": 0.017253262922167778, "learning_rate": 1.2194803875564656e-05, "loss": 0.6038, "step": 1075 }, { "epoch": 0.8834154351395731, "grad_norm": 0.01626117341220379, "learning_rate": 1.2181957302127323e-05, "loss": 0.6105, "step": 1076 }, { "epoch": 0.8842364532019704, "grad_norm": 0.017428966239094734, "learning_rate": 1.2169106947696177e-05, "loss": 0.619, "step": 1077 }, { "epoch": 0.8850574712643678, "grad_norm": 0.01687300018966198, "learning_rate": 1.2156252834549092e-05, "loss": 0.5948, "step": 1078 }, { "epoch": 0.8858784893267652, "grad_norm": 0.017413344234228134, "learning_rate": 1.2143394984970472e-05, "loss": 0.5795, "step": 1079 }, { "epoch": 0.8866995073891626, "grad_norm": 0.016341840848326683, "learning_rate": 1.2130533421251191e-05, "loss": 0.5891, "step": 1080 }, { "epoch": 0.8875205254515599, "grad_norm": 0.018019378185272217, "learning_rate": 1.2117668165688567e-05, "loss": 0.6076, "step": 1081 }, { "epoch": 0.8883415435139573, "grad_norm": 0.017460975795984268, "learning_rate": 1.2104799240586316e-05, "loss": 0.5878, "step": 1082 }, { "epoch": 0.8891625615763546, "grad_norm": 0.017692498862743378, "learning_rate": 1.2091926668254514e-05, "loss": 0.6065, "step": 1083 }, { "epoch": 0.8899835796387521, "grad_norm": 0.018301308155059814, "learning_rate": 1.2079050471009561e-05, "loss": 0.609, "step": 1084 }, { "epoch": 0.8908045977011494, "grad_norm": 0.01789926365017891, "learning_rate": 1.206617067117414e-05, "loss": 0.5838, "step": 1085 }, { "epoch": 0.8916256157635468, "grad_norm": 0.01743137836456299, "learning_rate": 1.2053287291077187e-05, "loss": 0.6168, "step": 1086 }, { "epoch": 0.8924466338259441, "grad_norm": 0.01675173081457615, "learning_rate": 1.2040400353053831e-05, "loss": 0.6218, "step": 1087 }, { "epoch": 0.8932676518883416, "grad_norm": 0.016438787803053856, "learning_rate": 1.2027509879445387e-05, "loss": 0.6159, "step": 1088 }, { "epoch": 0.8940886699507389, "grad_norm": 0.016652993857860565, "learning_rate": 1.2014615892599284e-05, "loss": 0.5697, "step": 1089 }, { "epoch": 0.8949096880131363, "grad_norm": 0.01798146404325962, "learning_rate": 1.2001718414869048e-05, "loss": 0.599, "step": 1090 }, { "epoch": 0.8957307060755336, "grad_norm": 0.016836615279316902, "learning_rate": 1.1988817468614252e-05, "loss": 0.5886, "step": 1091 }, { "epoch": 0.896551724137931, "grad_norm": 0.01657363958656788, "learning_rate": 1.1975913076200496e-05, "loss": 0.6145, "step": 1092 }, { "epoch": 0.8973727422003284, "grad_norm": 0.01692560315132141, "learning_rate": 1.196300525999934e-05, "loss": 0.6145, "step": 1093 }, { "epoch": 0.8981937602627258, "grad_norm": 0.017015445977449417, "learning_rate": 1.1950094042388284e-05, "loss": 0.5921, "step": 1094 }, { "epoch": 0.8990147783251231, "grad_norm": 0.015866441652178764, "learning_rate": 1.1937179445750723e-05, "loss": 0.5868, "step": 1095 }, { "epoch": 0.8998357963875205, "grad_norm": 0.017055731266736984, "learning_rate": 1.1924261492475922e-05, "loss": 0.5602, "step": 1096 }, { "epoch": 0.9006568144499179, "grad_norm": 0.01664452813565731, "learning_rate": 1.191134020495894e-05, "loss": 0.6065, "step": 1097 }, { "epoch": 0.9014778325123153, "grad_norm": 0.017227137461304665, "learning_rate": 1.1898415605600648e-05, "loss": 0.6037, "step": 1098 }, { "epoch": 0.9022988505747126, "grad_norm": 0.01624290645122528, "learning_rate": 1.1885487716807631e-05, "loss": 0.6056, "step": 1099 }, { "epoch": 0.90311986863711, "grad_norm": 0.016836725175380707, "learning_rate": 1.18725565609922e-05, "loss": 0.5848, "step": 1100 }, { "epoch": 0.9039408866995073, "grad_norm": 0.016381600871682167, "learning_rate": 1.1859622160572301e-05, "loss": 0.5732, "step": 1101 }, { "epoch": 0.9047619047619048, "grad_norm": 0.01685541681945324, "learning_rate": 1.1846684537971537e-05, "loss": 0.5918, "step": 1102 }, { "epoch": 0.9055829228243021, "grad_norm": 0.01748223602771759, "learning_rate": 1.1833743715619075e-05, "loss": 0.614, "step": 1103 }, { "epoch": 0.9064039408866995, "grad_norm": 0.017142564058303833, "learning_rate": 1.1820799715949643e-05, "loss": 0.6073, "step": 1104 }, { "epoch": 0.9072249589490968, "grad_norm": 0.017242927104234695, "learning_rate": 1.180785256140346e-05, "loss": 0.5949, "step": 1105 }, { "epoch": 0.9080459770114943, "grad_norm": 0.01614236645400524, "learning_rate": 1.1794902274426232e-05, "loss": 0.5777, "step": 1106 }, { "epoch": 0.9088669950738916, "grad_norm": 0.016959980130195618, "learning_rate": 1.1781948877469089e-05, "loss": 0.5892, "step": 1107 }, { "epoch": 0.909688013136289, "grad_norm": 0.017237814143300056, "learning_rate": 1.1768992392988552e-05, "loss": 0.585, "step": 1108 }, { "epoch": 0.9105090311986864, "grad_norm": 0.016399379819631577, "learning_rate": 1.175603284344649e-05, "loss": 0.6077, "step": 1109 }, { "epoch": 0.9113300492610837, "grad_norm": 0.016955768689513206, "learning_rate": 1.1743070251310095e-05, "loss": 0.5964, "step": 1110 }, { "epoch": 0.9121510673234812, "grad_norm": 0.01607462763786316, "learning_rate": 1.173010463905183e-05, "loss": 0.5673, "step": 1111 }, { "epoch": 0.9129720853858785, "grad_norm": 0.01600598730146885, "learning_rate": 1.1717136029149392e-05, "loss": 0.5713, "step": 1112 }, { "epoch": 0.9137931034482759, "grad_norm": 0.015928048640489578, "learning_rate": 1.1704164444085671e-05, "loss": 0.5923, "step": 1113 }, { "epoch": 0.9146141215106732, "grad_norm": 0.016088126227259636, "learning_rate": 1.1691189906348726e-05, "loss": 0.5905, "step": 1114 }, { "epoch": 0.9154351395730707, "grad_norm": 0.01642848178744316, "learning_rate": 1.1678212438431727e-05, "loss": 0.5794, "step": 1115 }, { "epoch": 0.916256157635468, "grad_norm": 0.016627388074994087, "learning_rate": 1.1665232062832925e-05, "loss": 0.5828, "step": 1116 }, { "epoch": 0.9170771756978654, "grad_norm": 0.01737840473651886, "learning_rate": 1.1652248802055611e-05, "loss": 0.5915, "step": 1117 }, { "epoch": 0.9178981937602627, "grad_norm": 0.01603935845196247, "learning_rate": 1.1639262678608086e-05, "loss": 0.6031, "step": 1118 }, { "epoch": 0.9187192118226601, "grad_norm": 0.016205986961722374, "learning_rate": 1.1626273715003598e-05, "loss": 0.5744, "step": 1119 }, { "epoch": 0.9195402298850575, "grad_norm": 0.016208617016673088, "learning_rate": 1.1613281933760331e-05, "loss": 0.5975, "step": 1120 }, { "epoch": 0.9203612479474549, "grad_norm": 0.016135746613144875, "learning_rate": 1.1600287357401353e-05, "loss": 0.6181, "step": 1121 }, { "epoch": 0.9211822660098522, "grad_norm": 0.01807931810617447, "learning_rate": 1.1587290008454577e-05, "loss": 0.6055, "step": 1122 }, { "epoch": 0.9220032840722496, "grad_norm": 0.016921069473028183, "learning_rate": 1.1574289909452715e-05, "loss": 0.5917, "step": 1123 }, { "epoch": 0.922824302134647, "grad_norm": 0.016570108011364937, "learning_rate": 1.1561287082933255e-05, "loss": 0.5733, "step": 1124 }, { "epoch": 0.9236453201970444, "grad_norm": 0.0165507011115551, "learning_rate": 1.1548281551438414e-05, "loss": 0.6119, "step": 1125 }, { "epoch": 0.9244663382594417, "grad_norm": 0.017894558608531952, "learning_rate": 1.1535273337515093e-05, "loss": 0.6005, "step": 1126 }, { "epoch": 0.9252873563218391, "grad_norm": 0.015404334291815758, "learning_rate": 1.1522262463714846e-05, "loss": 0.5964, "step": 1127 }, { "epoch": 0.9261083743842364, "grad_norm": 0.016996245831251144, "learning_rate": 1.1509248952593837e-05, "loss": 0.5765, "step": 1128 }, { "epoch": 0.9269293924466339, "grad_norm": 0.01596219837665558, "learning_rate": 1.1496232826712807e-05, "loss": 0.6009, "step": 1129 }, { "epoch": 0.9277504105090312, "grad_norm": 0.01610490493476391, "learning_rate": 1.148321410863703e-05, "loss": 0.5858, "step": 1130 }, { "epoch": 0.9285714285714286, "grad_norm": 0.0159321092069149, "learning_rate": 1.1470192820936257e-05, "loss": 0.5689, "step": 1131 }, { "epoch": 0.9293924466338259, "grad_norm": 0.016922827810049057, "learning_rate": 1.145716898618472e-05, "loss": 0.617, "step": 1132 }, { "epoch": 0.9302134646962233, "grad_norm": 0.016236169263720512, "learning_rate": 1.1444142626961047e-05, "loss": 0.6262, "step": 1133 }, { "epoch": 0.9310344827586207, "grad_norm": 0.016764728352427483, "learning_rate": 1.1431113765848256e-05, "loss": 0.5925, "step": 1134 }, { "epoch": 0.9318555008210181, "grad_norm": 0.016531789675354958, "learning_rate": 1.1418082425433686e-05, "loss": 0.5892, "step": 1135 }, { "epoch": 0.9326765188834154, "grad_norm": 0.018079929053783417, "learning_rate": 1.1405048628308994e-05, "loss": 0.5855, "step": 1136 }, { "epoch": 0.9334975369458128, "grad_norm": 0.016748664900660515, "learning_rate": 1.139201239707008e-05, "loss": 0.5921, "step": 1137 }, { "epoch": 0.9343185550082101, "grad_norm": 0.017835237085819244, "learning_rate": 1.1378973754317073e-05, "loss": 0.5818, "step": 1138 }, { "epoch": 0.9351395730706076, "grad_norm": 0.016934873536229134, "learning_rate": 1.1365932722654275e-05, "loss": 0.609, "step": 1139 }, { "epoch": 0.9359605911330049, "grad_norm": 0.017647646367549896, "learning_rate": 1.1352889324690144e-05, "loss": 0.5807, "step": 1140 }, { "epoch": 0.9367816091954023, "grad_norm": 0.016528800129890442, "learning_rate": 1.133984358303722e-05, "loss": 0.5805, "step": 1141 }, { "epoch": 0.9376026272577996, "grad_norm": 0.016718421131372452, "learning_rate": 1.132679552031212e-05, "loss": 0.6116, "step": 1142 }, { "epoch": 0.9384236453201971, "grad_norm": 0.015725215896964073, "learning_rate": 1.1313745159135481e-05, "loss": 0.5848, "step": 1143 }, { "epoch": 0.9392446633825944, "grad_norm": 0.01737361028790474, "learning_rate": 1.1300692522131927e-05, "loss": 0.5981, "step": 1144 }, { "epoch": 0.9400656814449918, "grad_norm": 0.016818955540657043, "learning_rate": 1.1287637631930028e-05, "loss": 0.587, "step": 1145 }, { "epoch": 0.9408866995073891, "grad_norm": 0.01865926943719387, "learning_rate": 1.1274580511162251e-05, "loss": 0.5425, "step": 1146 }, { "epoch": 0.9417077175697866, "grad_norm": 0.016214754432439804, "learning_rate": 1.1261521182464944e-05, "loss": 0.5705, "step": 1147 }, { "epoch": 0.9425287356321839, "grad_norm": 0.01937129907310009, "learning_rate": 1.1248459668478268e-05, "loss": 0.5686, "step": 1148 }, { "epoch": 0.9433497536945813, "grad_norm": 0.015967804938554764, "learning_rate": 1.1235395991846188e-05, "loss": 0.5661, "step": 1149 }, { "epoch": 0.9441707717569786, "grad_norm": 0.018178613856434822, "learning_rate": 1.1222330175216403e-05, "loss": 0.5956, "step": 1150 }, { "epoch": 0.944991789819376, "grad_norm": 0.01585310511291027, "learning_rate": 1.1209262241240337e-05, "loss": 0.5953, "step": 1151 }, { "epoch": 0.9458128078817734, "grad_norm": 0.018491728231310844, "learning_rate": 1.1196192212573072e-05, "loss": 0.5991, "step": 1152 }, { "epoch": 0.9466338259441708, "grad_norm": 0.01616772823035717, "learning_rate": 1.1183120111873324e-05, "loss": 0.5813, "step": 1153 }, { "epoch": 0.9474548440065681, "grad_norm": 0.016376454383134842, "learning_rate": 1.117004596180341e-05, "loss": 0.5697, "step": 1154 }, { "epoch": 0.9482758620689655, "grad_norm": 0.01699935458600521, "learning_rate": 1.1156969785029192e-05, "loss": 0.6095, "step": 1155 }, { "epoch": 0.9490968801313628, "grad_norm": 0.0159684419631958, "learning_rate": 1.1143891604220045e-05, "loss": 0.5823, "step": 1156 }, { "epoch": 0.9499178981937603, "grad_norm": 0.016202116385102272, "learning_rate": 1.1130811442048821e-05, "loss": 0.5948, "step": 1157 }, { "epoch": 0.9507389162561576, "grad_norm": 0.01689833216369152, "learning_rate": 1.1117729321191807e-05, "loss": 0.5989, "step": 1158 }, { "epoch": 0.951559934318555, "grad_norm": 0.01651107333600521, "learning_rate": 1.1104645264328686e-05, "loss": 0.5722, "step": 1159 }, { "epoch": 0.9523809523809523, "grad_norm": 0.016270749270915985, "learning_rate": 1.109155929414249e-05, "loss": 0.574, "step": 1160 }, { "epoch": 0.9532019704433498, "grad_norm": 0.015756996348500252, "learning_rate": 1.1078471433319584e-05, "loss": 0.5896, "step": 1161 }, { "epoch": 0.9540229885057471, "grad_norm": 0.0170670747756958, "learning_rate": 1.1065381704549594e-05, "loss": 0.5626, "step": 1162 }, { "epoch": 0.9548440065681445, "grad_norm": 0.01619679480791092, "learning_rate": 1.1052290130525396e-05, "loss": 0.5681, "step": 1163 }, { "epoch": 0.9556650246305419, "grad_norm": 0.017136523500084877, "learning_rate": 1.1039196733943054e-05, "loss": 0.6126, "step": 1164 }, { "epoch": 0.9564860426929392, "grad_norm": 0.016686104238033295, "learning_rate": 1.1026101537501806e-05, "loss": 0.589, "step": 1165 }, { "epoch": 0.9573070607553367, "grad_norm": 0.017088595777750015, "learning_rate": 1.1013004563903998e-05, "loss": 0.596, "step": 1166 }, { "epoch": 0.958128078817734, "grad_norm": 0.016939761117100716, "learning_rate": 1.0999905835855063e-05, "loss": 0.5969, "step": 1167 }, { "epoch": 0.9589490968801314, "grad_norm": 0.020650051534175873, "learning_rate": 1.0986805376063467e-05, "loss": 0.6024, "step": 1168 }, { "epoch": 0.9597701149425287, "grad_norm": 0.01641935482621193, "learning_rate": 1.0973703207240696e-05, "loss": 0.5685, "step": 1169 }, { "epoch": 0.9605911330049262, "grad_norm": 0.01749393902719021, "learning_rate": 1.0960599352101181e-05, "loss": 0.6189, "step": 1170 }, { "epoch": 0.9614121510673235, "grad_norm": 0.016399389132857323, "learning_rate": 1.0947493833362287e-05, "loss": 0.5925, "step": 1171 }, { "epoch": 0.9622331691297209, "grad_norm": 0.017036376520991325, "learning_rate": 1.0934386673744255e-05, "loss": 0.589, "step": 1172 }, { "epoch": 0.9630541871921182, "grad_norm": 0.01671152561903, "learning_rate": 1.0921277895970183e-05, "loss": 0.5875, "step": 1173 }, { "epoch": 0.9638752052545156, "grad_norm": 0.016384320333600044, "learning_rate": 1.0908167522765956e-05, "loss": 0.5832, "step": 1174 }, { "epoch": 0.964696223316913, "grad_norm": 0.017266463488340378, "learning_rate": 1.0895055576860242e-05, "loss": 0.5647, "step": 1175 }, { "epoch": 0.9655172413793104, "grad_norm": 0.01660304144024849, "learning_rate": 1.0881942080984427e-05, "loss": 0.5982, "step": 1176 }, { "epoch": 0.9663382594417077, "grad_norm": 0.01756059192121029, "learning_rate": 1.0868827057872592e-05, "loss": 0.6186, "step": 1177 }, { "epoch": 0.9671592775041051, "grad_norm": 0.01674589328467846, "learning_rate": 1.0855710530261447e-05, "loss": 0.6165, "step": 1178 }, { "epoch": 0.9679802955665024, "grad_norm": 0.016808710992336273, "learning_rate": 1.0842592520890334e-05, "loss": 0.5617, "step": 1179 }, { "epoch": 0.9688013136288999, "grad_norm": 0.01754768006503582, "learning_rate": 1.082947305250115e-05, "loss": 0.5967, "step": 1180 }, { "epoch": 0.9696223316912972, "grad_norm": 0.016541175544261932, "learning_rate": 1.0816352147838321e-05, "loss": 0.5922, "step": 1181 }, { "epoch": 0.9704433497536946, "grad_norm": 0.017556369304656982, "learning_rate": 1.0803229829648764e-05, "loss": 0.5979, "step": 1182 }, { "epoch": 0.9712643678160919, "grad_norm": 0.015982428565621376, "learning_rate": 1.079010612068186e-05, "loss": 0.5611, "step": 1183 }, { "epoch": 0.9720853858784894, "grad_norm": 0.015981189906597137, "learning_rate": 1.0776981043689378e-05, "loss": 0.5638, "step": 1184 }, { "epoch": 0.9729064039408867, "grad_norm": 0.016471238806843758, "learning_rate": 1.0763854621425482e-05, "loss": 0.5609, "step": 1185 }, { "epoch": 0.9737274220032841, "grad_norm": 0.01605178974568844, "learning_rate": 1.0750726876646642e-05, "loss": 0.5754, "step": 1186 }, { "epoch": 0.9745484400656814, "grad_norm": 0.017658716067671776, "learning_rate": 1.0737597832111648e-05, "loss": 0.6028, "step": 1187 }, { "epoch": 0.9753694581280788, "grad_norm": 0.016069870442152023, "learning_rate": 1.0724467510581525e-05, "loss": 0.5701, "step": 1188 }, { "epoch": 0.9761904761904762, "grad_norm": 0.015769382938742638, "learning_rate": 1.0711335934819521e-05, "loss": 0.5854, "step": 1189 }, { "epoch": 0.9770114942528736, "grad_norm": 0.01676945574581623, "learning_rate": 1.0698203127591046e-05, "loss": 0.6096, "step": 1190 }, { "epoch": 0.9778325123152709, "grad_norm": 0.01608770713210106, "learning_rate": 1.0685069111663668e-05, "loss": 0.5888, "step": 1191 }, { "epoch": 0.9786535303776683, "grad_norm": 0.017610331997275352, "learning_rate": 1.0671933909807024e-05, "loss": 0.5544, "step": 1192 }, { "epoch": 0.9794745484400657, "grad_norm": 0.01692347228527069, "learning_rate": 1.0658797544792825e-05, "loss": 0.5771, "step": 1193 }, { "epoch": 0.9802955665024631, "grad_norm": 0.01930500753223896, "learning_rate": 1.0645660039394792e-05, "loss": 0.5897, "step": 1194 }, { "epoch": 0.9811165845648604, "grad_norm": 0.016026152297854424, "learning_rate": 1.0632521416388628e-05, "loss": 0.5677, "step": 1195 }, { "epoch": 0.9819376026272578, "grad_norm": 0.019363176077604294, "learning_rate": 1.0619381698551961e-05, "loss": 0.5835, "step": 1196 }, { "epoch": 0.9827586206896551, "grad_norm": 0.016007188707590103, "learning_rate": 1.060624090866433e-05, "loss": 0.5775, "step": 1197 }, { "epoch": 0.9835796387520526, "grad_norm": 0.016363436356186867, "learning_rate": 1.0593099069507126e-05, "loss": 0.5636, "step": 1198 }, { "epoch": 0.9844006568144499, "grad_norm": 0.016260500997304916, "learning_rate": 1.0579956203863566e-05, "loss": 0.5823, "step": 1199 }, { "epoch": 0.9852216748768473, "grad_norm": 0.016434431076049805, "learning_rate": 1.0566812334518637e-05, "loss": 0.5903, "step": 1200 }, { "epoch": 0.9860426929392446, "grad_norm": 0.01619420200586319, "learning_rate": 1.0553667484259071e-05, "loss": 0.5659, "step": 1201 }, { "epoch": 0.986863711001642, "grad_norm": 0.017017312347888947, "learning_rate": 1.05405216758733e-05, "loss": 0.6098, "step": 1202 }, { "epoch": 0.9876847290640394, "grad_norm": 0.016330571845173836, "learning_rate": 1.0527374932151421e-05, "loss": 0.5865, "step": 1203 }, { "epoch": 0.9885057471264368, "grad_norm": 0.017104946076869965, "learning_rate": 1.0514227275885145e-05, "loss": 0.5748, "step": 1204 }, { "epoch": 0.9893267651888341, "grad_norm": 0.015867285430431366, "learning_rate": 1.050107872986777e-05, "loss": 0.5801, "step": 1205 }, { "epoch": 0.9901477832512315, "grad_norm": 0.01650787703692913, "learning_rate": 1.0487929316894137e-05, "loss": 0.5611, "step": 1206 }, { "epoch": 0.9909688013136289, "grad_norm": 0.016580892726778984, "learning_rate": 1.0474779059760581e-05, "loss": 0.5734, "step": 1207 }, { "epoch": 0.9917898193760263, "grad_norm": 0.016696719452738762, "learning_rate": 1.0461627981264917e-05, "loss": 0.6021, "step": 1208 }, { "epoch": 0.9926108374384236, "grad_norm": 0.017108280211687088, "learning_rate": 1.0448476104206368e-05, "loss": 0.6034, "step": 1209 }, { "epoch": 0.993431855500821, "grad_norm": 0.016127055510878563, "learning_rate": 1.0435323451385554e-05, "loss": 0.5834, "step": 1210 }, { "epoch": 0.9942528735632183, "grad_norm": 0.017457881942391396, "learning_rate": 1.0422170045604431e-05, "loss": 0.5751, "step": 1211 }, { "epoch": 0.9950738916256158, "grad_norm": 0.01638905704021454, "learning_rate": 1.0409015909666262e-05, "loss": 0.5614, "step": 1212 }, { "epoch": 0.9958949096880131, "grad_norm": 0.016045620664954185, "learning_rate": 1.0395861066375577e-05, "loss": 0.58, "step": 1213 }, { "epoch": 0.9967159277504105, "grad_norm": 0.016097500920295715, "learning_rate": 1.0382705538538135e-05, "loss": 0.5695, "step": 1214 }, { "epoch": 0.9975369458128078, "grad_norm": 0.01798843964934349, "learning_rate": 1.0369549348960876e-05, "loss": 0.5937, "step": 1215 }, { "epoch": 0.9983579638752053, "grad_norm": 0.01695561595261097, "learning_rate": 1.03563925204519e-05, "loss": 0.5817, "step": 1216 }, { "epoch": 0.9991789819376026, "grad_norm": 0.01571866311132908, "learning_rate": 1.0343235075820391e-05, "loss": 0.602, "step": 1217 }, { "epoch": 1.0, "grad_norm": 0.01669403724372387, "learning_rate": 1.0330077037876625e-05, "loss": 0.5781, "step": 1218 }, { "epoch": 1.0008210180623973, "grad_norm": 0.019914407283067703, "learning_rate": 1.031691842943189e-05, "loss": 0.5289, "step": 1219 }, { "epoch": 1.0016420361247949, "grad_norm": 0.019529756158590317, "learning_rate": 1.0303759273298478e-05, "loss": 0.5369, "step": 1220 }, { "epoch": 1.0024630541871922, "grad_norm": 0.021516941487789154, "learning_rate": 1.0290599592289616e-05, "loss": 0.5441, "step": 1221 }, { "epoch": 1.0032840722495895, "grad_norm": 0.017795894294977188, "learning_rate": 1.0277439409219455e-05, "loss": 0.541, "step": 1222 }, { "epoch": 1.0041050903119868, "grad_norm": 0.019721651449799538, "learning_rate": 1.0264278746902993e-05, "loss": 0.5146, "step": 1223 }, { "epoch": 1.0049261083743843, "grad_norm": 0.016439486294984818, "learning_rate": 1.0251117628156094e-05, "loss": 0.5055, "step": 1224 }, { "epoch": 1.0057471264367817, "grad_norm": 0.018522407859563828, "learning_rate": 1.0237956075795378e-05, "loss": 0.5357, "step": 1225 }, { "epoch": 1.006568144499179, "grad_norm": 0.01696150004863739, "learning_rate": 1.0224794112638242e-05, "loss": 0.5251, "step": 1226 }, { "epoch": 1.0073891625615763, "grad_norm": 0.01692981831729412, "learning_rate": 1.0211631761502788e-05, "loss": 0.5542, "step": 1227 }, { "epoch": 1.0082101806239738, "grad_norm": 0.01729629561305046, "learning_rate": 1.0198469045207779e-05, "loss": 0.5439, "step": 1228 }, { "epoch": 1.0090311986863711, "grad_norm": 0.01669936254620552, "learning_rate": 1.0185305986572624e-05, "loss": 0.5226, "step": 1229 }, { "epoch": 1.0098522167487685, "grad_norm": 0.016856878995895386, "learning_rate": 1.0172142608417325e-05, "loss": 0.5392, "step": 1230 }, { "epoch": 1.0106732348111658, "grad_norm": 0.01798987202346325, "learning_rate": 1.0158978933562433e-05, "loss": 0.5412, "step": 1231 }, { "epoch": 1.0114942528735633, "grad_norm": 0.01633612811565399, "learning_rate": 1.0145814984829016e-05, "loss": 0.5371, "step": 1232 }, { "epoch": 1.0123152709359606, "grad_norm": 0.01756327971816063, "learning_rate": 1.0132650785038613e-05, "loss": 0.5233, "step": 1233 }, { "epoch": 1.013136288998358, "grad_norm": 0.03032851219177246, "learning_rate": 1.0119486357013203e-05, "loss": 0.5439, "step": 1234 }, { "epoch": 1.0139573070607553, "grad_norm": 0.016092654317617416, "learning_rate": 1.010632172357516e-05, "loss": 0.539, "step": 1235 }, { "epoch": 1.0147783251231528, "grad_norm": 0.016442958265542984, "learning_rate": 1.0093156907547215e-05, "loss": 0.5082, "step": 1236 }, { "epoch": 1.0155993431855501, "grad_norm": 0.01820369064807892, "learning_rate": 1.0079991931752407e-05, "loss": 0.5349, "step": 1237 }, { "epoch": 1.0164203612479474, "grad_norm": 0.016815466806292534, "learning_rate": 1.0066826819014066e-05, "loss": 0.5436, "step": 1238 }, { "epoch": 1.0172413793103448, "grad_norm": 0.01730716973543167, "learning_rate": 1.0053661592155746e-05, "loss": 0.5244, "step": 1239 }, { "epoch": 1.0180623973727423, "grad_norm": 0.01757870241999626, "learning_rate": 1.004049627400121e-05, "loss": 0.5479, "step": 1240 }, { "epoch": 1.0188834154351396, "grad_norm": 0.01598929800093174, "learning_rate": 1.0027330887374369e-05, "loss": 0.5541, "step": 1241 }, { "epoch": 1.019704433497537, "grad_norm": 0.017755158245563507, "learning_rate": 1.0014165455099263e-05, "loss": 0.5193, "step": 1242 }, { "epoch": 1.0205254515599342, "grad_norm": 0.016775034368038177, "learning_rate": 1.0001000000000001e-05, "loss": 0.5166, "step": 1243 }, { "epoch": 1.0213464696223318, "grad_norm": 0.016258632764220238, "learning_rate": 9.987834544900743e-06, "loss": 0.5368, "step": 1244 }, { "epoch": 1.022167487684729, "grad_norm": 0.016539812088012695, "learning_rate": 9.974669112625632e-06, "loss": 0.5272, "step": 1245 }, { "epoch": 1.0229885057471264, "grad_norm": 0.016961336135864258, "learning_rate": 9.961503725998794e-06, "loss": 0.5151, "step": 1246 }, { "epoch": 1.0238095238095237, "grad_norm": 0.01664678379893303, "learning_rate": 9.948338407844257e-06, "loss": 0.5188, "step": 1247 }, { "epoch": 1.0246305418719213, "grad_norm": 0.017200790345668793, "learning_rate": 9.935173180985937e-06, "loss": 0.522, "step": 1248 }, { "epoch": 1.0254515599343186, "grad_norm": 0.017180820927023888, "learning_rate": 9.922008068247594e-06, "loss": 0.5157, "step": 1249 }, { "epoch": 1.026272577996716, "grad_norm": 0.015882382169365883, "learning_rate": 9.908843092452789e-06, "loss": 0.506, "step": 1250 }, { "epoch": 1.0270935960591132, "grad_norm": 0.017107807099819183, "learning_rate": 9.895678276424842e-06, "loss": 0.5241, "step": 1251 }, { "epoch": 1.0279146141215108, "grad_norm": 0.016119495034217834, "learning_rate": 9.8825136429868e-06, "loss": 0.5313, "step": 1252 }, { "epoch": 1.028735632183908, "grad_norm": 0.018750004470348358, "learning_rate": 9.86934921496139e-06, "loss": 0.5178, "step": 1253 }, { "epoch": 1.0295566502463054, "grad_norm": 0.015819862484931946, "learning_rate": 9.856185015170988e-06, "loss": 0.5233, "step": 1254 }, { "epoch": 1.0303776683087027, "grad_norm": 0.01587473601102829, "learning_rate": 9.843021066437571e-06, "loss": 0.5588, "step": 1255 }, { "epoch": 1.0311986863711002, "grad_norm": 0.01898243837058544, "learning_rate": 9.829857391582677e-06, "loss": 0.5209, "step": 1256 }, { "epoch": 1.0320197044334976, "grad_norm": 0.016045967116951942, "learning_rate": 9.816694013427377e-06, "loss": 0.5348, "step": 1257 }, { "epoch": 1.0328407224958949, "grad_norm": 0.01862511783838272, "learning_rate": 9.803530954792225e-06, "loss": 0.4998, "step": 1258 }, { "epoch": 1.0336617405582922, "grad_norm": 0.016133230179548264, "learning_rate": 9.790368238497215e-06, "loss": 0.5345, "step": 1259 }, { "epoch": 1.0344827586206897, "grad_norm": 0.017468906939029694, "learning_rate": 9.777205887361758e-06, "loss": 0.5387, "step": 1260 }, { "epoch": 1.035303776683087, "grad_norm": 0.016904247924685478, "learning_rate": 9.764043924204623e-06, "loss": 0.5198, "step": 1261 }, { "epoch": 1.0361247947454844, "grad_norm": 0.017530355602502823, "learning_rate": 9.750882371843912e-06, "loss": 0.5344, "step": 1262 }, { "epoch": 1.0369458128078817, "grad_norm": 0.017848003655672073, "learning_rate": 9.737721253097006e-06, "loss": 0.5183, "step": 1263 }, { "epoch": 1.0377668308702792, "grad_norm": 0.016326379030942917, "learning_rate": 9.724560590780553e-06, "loss": 0.5281, "step": 1264 }, { "epoch": 1.0385878489326765, "grad_norm": 0.01659223437309265, "learning_rate": 9.711400407710388e-06, "loss": 0.563, "step": 1265 }, { "epoch": 1.0394088669950738, "grad_norm": 0.0183429978787899, "learning_rate": 9.698240726701525e-06, "loss": 0.5269, "step": 1266 }, { "epoch": 1.0402298850574712, "grad_norm": 0.01677277311682701, "learning_rate": 9.685081570568111e-06, "loss": 0.5462, "step": 1267 }, { "epoch": 1.0410509031198687, "grad_norm": 0.01725882664322853, "learning_rate": 9.67192296212338e-06, "loss": 0.5332, "step": 1268 }, { "epoch": 1.041871921182266, "grad_norm": 0.01754515990614891, "learning_rate": 9.658764924179613e-06, "loss": 0.5309, "step": 1269 }, { "epoch": 1.0426929392446633, "grad_norm": 0.0168323777616024, "learning_rate": 9.645607479548105e-06, "loss": 0.5404, "step": 1270 }, { "epoch": 1.0435139573070606, "grad_norm": 0.018949126824736595, "learning_rate": 9.632450651039123e-06, "loss": 0.5057, "step": 1271 }, { "epoch": 1.0443349753694582, "grad_norm": 0.016508212313055992, "learning_rate": 9.619294461461867e-06, "loss": 0.5237, "step": 1272 }, { "epoch": 1.0451559934318555, "grad_norm": 0.017521405592560768, "learning_rate": 9.606138933624428e-06, "loss": 0.5549, "step": 1273 }, { "epoch": 1.0459770114942528, "grad_norm": 0.0172991082072258, "learning_rate": 9.592984090333742e-06, "loss": 0.5255, "step": 1274 }, { "epoch": 1.0467980295566504, "grad_norm": 0.016572317108511925, "learning_rate": 9.579829954395573e-06, "loss": 0.5157, "step": 1275 }, { "epoch": 1.0476190476190477, "grad_norm": 0.015742501243948936, "learning_rate": 9.566676548614448e-06, "loss": 0.5497, "step": 1276 }, { "epoch": 1.048440065681445, "grad_norm": 0.01613895408809185, "learning_rate": 9.553523895793631e-06, "loss": 0.5192, "step": 1277 }, { "epoch": 1.0492610837438423, "grad_norm": 0.016447672620415688, "learning_rate": 9.540372018735084e-06, "loss": 0.5156, "step": 1278 }, { "epoch": 1.0500821018062398, "grad_norm": 0.017141524702310562, "learning_rate": 9.527220940239422e-06, "loss": 0.5437, "step": 1279 }, { "epoch": 1.0509031198686372, "grad_norm": 0.01706545241177082, "learning_rate": 9.514070683105867e-06, "loss": 0.5086, "step": 1280 }, { "epoch": 1.0517241379310345, "grad_norm": 0.017160585150122643, "learning_rate": 9.500921270132232e-06, "loss": 0.5321, "step": 1281 }, { "epoch": 1.0525451559934318, "grad_norm": 0.016249317675828934, "learning_rate": 9.487772724114856e-06, "loss": 0.5417, "step": 1282 }, { "epoch": 1.0533661740558293, "grad_norm": 0.016462936997413635, "learning_rate": 9.474625067848582e-06, "loss": 0.54, "step": 1283 }, { "epoch": 1.0541871921182266, "grad_norm": 0.01623479649424553, "learning_rate": 9.4614783241267e-06, "loss": 0.5231, "step": 1284 }, { "epoch": 1.055008210180624, "grad_norm": 0.016937267035245895, "learning_rate": 9.44833251574093e-06, "loss": 0.5352, "step": 1285 }, { "epoch": 1.0558292282430213, "grad_norm": 0.01694582775235176, "learning_rate": 9.435187665481364e-06, "loss": 0.5265, "step": 1286 }, { "epoch": 1.0566502463054188, "grad_norm": 0.016040334478020668, "learning_rate": 9.422043796136438e-06, "loss": 0.5246, "step": 1287 }, { "epoch": 1.0574712643678161, "grad_norm": 0.01696915738284588, "learning_rate": 9.408900930492875e-06, "loss": 0.522, "step": 1288 }, { "epoch": 1.0582922824302134, "grad_norm": 0.016466708853840828, "learning_rate": 9.395759091335673e-06, "loss": 0.507, "step": 1289 }, { "epoch": 1.0591133004926108, "grad_norm": 0.015920262783765793, "learning_rate": 9.382618301448043e-06, "loss": 0.5326, "step": 1290 }, { "epoch": 1.0599343185550083, "grad_norm": 0.017963653430342674, "learning_rate": 9.36947858361138e-06, "loss": 0.5367, "step": 1291 }, { "epoch": 1.0607553366174056, "grad_norm": 0.016437392681837082, "learning_rate": 9.356339960605207e-06, "loss": 0.5484, "step": 1292 }, { "epoch": 1.061576354679803, "grad_norm": 0.016690773889422417, "learning_rate": 9.343202455207177e-06, "loss": 0.5121, "step": 1293 }, { "epoch": 1.0623973727422003, "grad_norm": 0.018437013030052185, "learning_rate": 9.330066090192979e-06, "loss": 0.5395, "step": 1294 }, { "epoch": 1.0632183908045978, "grad_norm": 0.016820982098579407, "learning_rate": 9.316930888336338e-06, "loss": 0.5345, "step": 1295 }, { "epoch": 1.064039408866995, "grad_norm": 0.01658771000802517, "learning_rate": 9.303796872408955e-06, "loss": 0.4993, "step": 1296 }, { "epoch": 1.0648604269293924, "grad_norm": 0.016661234200000763, "learning_rate": 9.290664065180485e-06, "loss": 0.5269, "step": 1297 }, { "epoch": 1.0656814449917897, "grad_norm": 0.01590563915669918, "learning_rate": 9.277532489418478e-06, "loss": 0.5256, "step": 1298 }, { "epoch": 1.0665024630541873, "grad_norm": 0.016190888360142708, "learning_rate": 9.264402167888353e-06, "loss": 0.516, "step": 1299 }, { "epoch": 1.0673234811165846, "grad_norm": 0.016272777691483498, "learning_rate": 9.251273123353357e-06, "loss": 0.5369, "step": 1300 }, { "epoch": 1.068144499178982, "grad_norm": 0.01627669855952263, "learning_rate": 9.238145378574524e-06, "loss": 0.5257, "step": 1301 }, { "epoch": 1.0689655172413792, "grad_norm": 0.016052350401878357, "learning_rate": 9.225018956310623e-06, "loss": 0.5012, "step": 1302 }, { "epoch": 1.0697865353037768, "grad_norm": 0.016666488721966743, "learning_rate": 9.211893879318142e-06, "loss": 0.5375, "step": 1303 }, { "epoch": 1.070607553366174, "grad_norm": 0.016034163534641266, "learning_rate": 9.198770170351235e-06, "loss": 0.5142, "step": 1304 }, { "epoch": 1.0714285714285714, "grad_norm": 0.0167247261852026, "learning_rate": 9.185647852161683e-06, "loss": 0.5326, "step": 1305 }, { "epoch": 1.0722495894909687, "grad_norm": 0.01617669314146042, "learning_rate": 9.172526947498856e-06, "loss": 0.5407, "step": 1306 }, { "epoch": 1.0730706075533663, "grad_norm": 0.016686413437128067, "learning_rate": 9.159407479109668e-06, "loss": 0.5309, "step": 1307 }, { "epoch": 1.0738916256157636, "grad_norm": 0.01717156171798706, "learning_rate": 9.146289469738556e-06, "loss": 0.5468, "step": 1308 }, { "epoch": 1.0747126436781609, "grad_norm": 0.01578836515545845, "learning_rate": 9.133172942127414e-06, "loss": 0.5235, "step": 1309 }, { "epoch": 1.0755336617405582, "grad_norm": 0.01699533872306347, "learning_rate": 9.120057919015572e-06, "loss": 0.5491, "step": 1310 }, { "epoch": 1.0763546798029557, "grad_norm": 0.01674419641494751, "learning_rate": 9.10694442313976e-06, "loss": 0.5232, "step": 1311 }, { "epoch": 1.077175697865353, "grad_norm": 0.01658719591796398, "learning_rate": 9.093832477234046e-06, "loss": 0.5247, "step": 1312 }, { "epoch": 1.0779967159277504, "grad_norm": 0.017214160412549973, "learning_rate": 9.080722104029823e-06, "loss": 0.5126, "step": 1313 }, { "epoch": 1.0788177339901477, "grad_norm": 0.017038801684975624, "learning_rate": 9.067613326255744e-06, "loss": 0.5091, "step": 1314 }, { "epoch": 1.0796387520525452, "grad_norm": 0.017452780157327652, "learning_rate": 9.054506166637716e-06, "loss": 0.5321, "step": 1315 }, { "epoch": 1.0804597701149425, "grad_norm": 0.01623319648206234, "learning_rate": 9.04140064789882e-06, "loss": 0.5119, "step": 1316 }, { "epoch": 1.0812807881773399, "grad_norm": 0.016727503389120102, "learning_rate": 9.028296792759306e-06, "loss": 0.5352, "step": 1317 }, { "epoch": 1.0821018062397372, "grad_norm": 0.017070552334189415, "learning_rate": 9.015194623936532e-06, "loss": 0.5144, "step": 1318 }, { "epoch": 1.0829228243021347, "grad_norm": 0.0167935062199831, "learning_rate": 9.002094164144943e-06, "loss": 0.5166, "step": 1319 }, { "epoch": 1.083743842364532, "grad_norm": 0.015757089480757713, "learning_rate": 8.988995436096003e-06, "loss": 0.5203, "step": 1320 }, { "epoch": 1.0845648604269293, "grad_norm": 0.016263891011476517, "learning_rate": 8.975898462498195e-06, "loss": 0.5164, "step": 1321 }, { "epoch": 1.0853858784893267, "grad_norm": 0.01639363169670105, "learning_rate": 8.962803266056947e-06, "loss": 0.5246, "step": 1322 }, { "epoch": 1.0862068965517242, "grad_norm": 0.017945965752005577, "learning_rate": 8.949709869474609e-06, "loss": 0.5243, "step": 1323 }, { "epoch": 1.0870279146141215, "grad_norm": 0.016737738624215126, "learning_rate": 8.936618295450409e-06, "loss": 0.5177, "step": 1324 }, { "epoch": 1.0878489326765188, "grad_norm": 0.016722463071346283, "learning_rate": 8.923528566680419e-06, "loss": 0.5175, "step": 1325 }, { "epoch": 1.0886699507389164, "grad_norm": 0.01670520193874836, "learning_rate": 8.910440705857512e-06, "loss": 0.5226, "step": 1326 }, { "epoch": 1.0894909688013137, "grad_norm": 0.01651105470955372, "learning_rate": 8.897354735671321e-06, "loss": 0.4913, "step": 1327 }, { "epoch": 1.090311986863711, "grad_norm": 0.01661216840147972, "learning_rate": 8.884270678808196e-06, "loss": 0.5123, "step": 1328 }, { "epoch": 1.0911330049261083, "grad_norm": 0.01767611689865589, "learning_rate": 8.871188557951183e-06, "loss": 0.5404, "step": 1329 }, { "epoch": 1.0919540229885056, "grad_norm": 0.01748446188867092, "learning_rate": 8.858108395779957e-06, "loss": 0.5244, "step": 1330 }, { "epoch": 1.0927750410509032, "grad_norm": 0.01922527886927128, "learning_rate": 8.84503021497081e-06, "loss": 0.5173, "step": 1331 }, { "epoch": 1.0935960591133005, "grad_norm": 0.01692078448832035, "learning_rate": 8.83195403819659e-06, "loss": 0.5246, "step": 1332 }, { "epoch": 1.0944170771756978, "grad_norm": 0.01827174797654152, "learning_rate": 8.818879888126678e-06, "loss": 0.5396, "step": 1333 }, { "epoch": 1.0952380952380953, "grad_norm": 0.0161137655377388, "learning_rate": 8.805807787426933e-06, "loss": 0.5023, "step": 1334 }, { "epoch": 1.0960591133004927, "grad_norm": 0.01763700321316719, "learning_rate": 8.792737758759665e-06, "loss": 0.5226, "step": 1335 }, { "epoch": 1.09688013136289, "grad_norm": 0.017385903745889664, "learning_rate": 8.779669824783596e-06, "loss": 0.5407, "step": 1336 }, { "epoch": 1.0977011494252873, "grad_norm": 0.016956990584731102, "learning_rate": 8.766604008153815e-06, "loss": 0.5349, "step": 1337 }, { "epoch": 1.0985221674876848, "grad_norm": 0.01684456877410412, "learning_rate": 8.753540331521734e-06, "loss": 0.5075, "step": 1338 }, { "epoch": 1.0993431855500821, "grad_norm": 0.016326608136296272, "learning_rate": 8.74047881753506e-06, "loss": 0.5003, "step": 1339 }, { "epoch": 1.1001642036124795, "grad_norm": 0.016543744131922722, "learning_rate": 8.727419488837751e-06, "loss": 0.5258, "step": 1340 }, { "epoch": 1.1009852216748768, "grad_norm": 0.01683054491877556, "learning_rate": 8.714362368069973e-06, "loss": 0.497, "step": 1341 }, { "epoch": 1.1018062397372743, "grad_norm": 0.016806093975901604, "learning_rate": 8.701307477868076e-06, "loss": 0.5303, "step": 1342 }, { "epoch": 1.1026272577996716, "grad_norm": 0.01674148254096508, "learning_rate": 8.688254840864518e-06, "loss": 0.5384, "step": 1343 }, { "epoch": 1.103448275862069, "grad_norm": 0.01635219156742096, "learning_rate": 8.675204479687884e-06, "loss": 0.5177, "step": 1344 }, { "epoch": 1.1042692939244663, "grad_norm": 0.016309024766087532, "learning_rate": 8.662156416962784e-06, "loss": 0.5257, "step": 1345 }, { "epoch": 1.1050903119868638, "grad_norm": 0.016065005213022232, "learning_rate": 8.649110675309862e-06, "loss": 0.5117, "step": 1346 }, { "epoch": 1.1059113300492611, "grad_norm": 0.016376636922359467, "learning_rate": 8.636067277345722e-06, "loss": 0.5211, "step": 1347 }, { "epoch": 1.1067323481116584, "grad_norm": 0.016352273523807526, "learning_rate": 8.623026245682931e-06, "loss": 0.4987, "step": 1348 }, { "epoch": 1.1075533661740558, "grad_norm": 0.01655934378504753, "learning_rate": 8.609987602929922e-06, "loss": 0.534, "step": 1349 }, { "epoch": 1.1083743842364533, "grad_norm": 0.01611988991498947, "learning_rate": 8.59695137169101e-06, "loss": 0.5025, "step": 1350 }, { "epoch": 1.1091954022988506, "grad_norm": 0.017162352800369263, "learning_rate": 8.583917574566313e-06, "loss": 0.5193, "step": 1351 }, { "epoch": 1.110016420361248, "grad_norm": 0.015951579436659813, "learning_rate": 8.570886234151748e-06, "loss": 0.5064, "step": 1352 }, { "epoch": 1.1108374384236452, "grad_norm": 0.01584707573056221, "learning_rate": 8.557857373038955e-06, "loss": 0.5106, "step": 1353 }, { "epoch": 1.1116584564860428, "grad_norm": 0.016321823000907898, "learning_rate": 8.544831013815285e-06, "loss": 0.5148, "step": 1354 }, { "epoch": 1.11247947454844, "grad_norm": 0.01569022797048092, "learning_rate": 8.531807179063745e-06, "loss": 0.5364, "step": 1355 }, { "epoch": 1.1133004926108374, "grad_norm": 0.016389675438404083, "learning_rate": 8.518785891362975e-06, "loss": 0.5212, "step": 1356 }, { "epoch": 1.1141215106732347, "grad_norm": 0.0158466175198555, "learning_rate": 8.505767173287195e-06, "loss": 0.4964, "step": 1357 }, { "epoch": 1.1149425287356323, "grad_norm": 0.01722538098692894, "learning_rate": 8.492751047406164e-06, "loss": 0.5288, "step": 1358 }, { "epoch": 1.1157635467980296, "grad_norm": 0.016668258234858513, "learning_rate": 8.47973753628516e-06, "loss": 0.5194, "step": 1359 }, { "epoch": 1.116584564860427, "grad_norm": 0.016549566760659218, "learning_rate": 8.466726662484911e-06, "loss": 0.5276, "step": 1360 }, { "epoch": 1.1174055829228242, "grad_norm": 0.016378164291381836, "learning_rate": 8.453718448561587e-06, "loss": 0.5483, "step": 1361 }, { "epoch": 1.1182266009852218, "grad_norm": 0.01640506647527218, "learning_rate": 8.44071291706675e-06, "loss": 0.5175, "step": 1362 }, { "epoch": 1.119047619047619, "grad_norm": 0.01651320420205593, "learning_rate": 8.427710090547288e-06, "loss": 0.5279, "step": 1363 }, { "epoch": 1.1198686371100164, "grad_norm": 0.01632227562367916, "learning_rate": 8.414709991545425e-06, "loss": 0.5344, "step": 1364 }, { "epoch": 1.1206896551724137, "grad_norm": 0.016370650380849838, "learning_rate": 8.40171264259865e-06, "loss": 0.516, "step": 1365 }, { "epoch": 1.1215106732348112, "grad_norm": 0.016123030334711075, "learning_rate": 8.388718066239671e-06, "loss": 0.5061, "step": 1366 }, { "epoch": 1.1223316912972086, "grad_norm": 0.016381455585360527, "learning_rate": 8.375726284996403e-06, "loss": 0.555, "step": 1367 }, { "epoch": 1.1231527093596059, "grad_norm": 0.016304366290569305, "learning_rate": 8.362737321391919e-06, "loss": 0.5299, "step": 1368 }, { "epoch": 1.1239737274220032, "grad_norm": 0.015455090440809727, "learning_rate": 8.349751197944388e-06, "loss": 0.5174, "step": 1369 }, { "epoch": 1.1247947454844007, "grad_norm": 0.016206521540880203, "learning_rate": 8.336767937167081e-06, "loss": 0.529, "step": 1370 }, { "epoch": 1.125615763546798, "grad_norm": 0.015927035361528397, "learning_rate": 8.323787561568274e-06, "loss": 0.5039, "step": 1371 }, { "epoch": 1.1264367816091954, "grad_norm": 0.015932176262140274, "learning_rate": 8.310810093651275e-06, "loss": 0.5333, "step": 1372 }, { "epoch": 1.1272577996715927, "grad_norm": 0.016603751108050346, "learning_rate": 8.297835555914333e-06, "loss": 0.5366, "step": 1373 }, { "epoch": 1.1280788177339902, "grad_norm": 0.017465781420469284, "learning_rate": 8.284863970850614e-06, "loss": 0.5334, "step": 1374 }, { "epoch": 1.1288998357963875, "grad_norm": 0.01684074103832245, "learning_rate": 8.271895360948172e-06, "loss": 0.5208, "step": 1375 }, { "epoch": 1.1297208538587848, "grad_norm": 0.0160768274217844, "learning_rate": 8.258929748689909e-06, "loss": 0.5157, "step": 1376 }, { "epoch": 1.1305418719211824, "grad_norm": 0.01851855218410492, "learning_rate": 8.245967156553513e-06, "loss": 0.5059, "step": 1377 }, { "epoch": 1.1313628899835797, "grad_norm": 0.01661522127687931, "learning_rate": 8.233007607011452e-06, "loss": 0.5154, "step": 1378 }, { "epoch": 1.132183908045977, "grad_norm": 0.017074022442102432, "learning_rate": 8.220051122530912e-06, "loss": 0.5359, "step": 1379 }, { "epoch": 1.1330049261083743, "grad_norm": 0.01702861115336418, "learning_rate": 8.207097725573769e-06, "loss": 0.52, "step": 1380 }, { "epoch": 1.1338259441707716, "grad_norm": 0.017408553510904312, "learning_rate": 8.194147438596545e-06, "loss": 0.5155, "step": 1381 }, { "epoch": 1.1346469622331692, "grad_norm": 0.01775670237839222, "learning_rate": 8.181200284050363e-06, "loss": 0.53, "step": 1382 }, { "epoch": 1.1354679802955665, "grad_norm": 0.017499694600701332, "learning_rate": 8.168256284380925e-06, "loss": 0.53, "step": 1383 }, { "epoch": 1.1362889983579638, "grad_norm": 0.017159895971417427, "learning_rate": 8.155315462028469e-06, "loss": 0.5237, "step": 1384 }, { "epoch": 1.1371100164203614, "grad_norm": 0.016443481668829918, "learning_rate": 8.1423778394277e-06, "loss": 0.542, "step": 1385 }, { "epoch": 1.1379310344827587, "grad_norm": 0.01786183752119541, "learning_rate": 8.129443439007806e-06, "loss": 0.5092, "step": 1386 }, { "epoch": 1.138752052545156, "grad_norm": 0.016319354996085167, "learning_rate": 8.11651228319237e-06, "loss": 0.5099, "step": 1387 }, { "epoch": 1.1395730706075533, "grad_norm": 0.026890931650996208, "learning_rate": 8.103584394399356e-06, "loss": 0.5682, "step": 1388 }, { "epoch": 1.1403940886699506, "grad_norm": 0.017409855499863625, "learning_rate": 8.090659795041058e-06, "loss": 0.5422, "step": 1389 }, { "epoch": 1.1412151067323482, "grad_norm": 0.016212008893489838, "learning_rate": 8.077738507524086e-06, "loss": 0.5083, "step": 1390 }, { "epoch": 1.1420361247947455, "grad_norm": 0.01762961409986019, "learning_rate": 8.064820554249278e-06, "loss": 0.53, "step": 1391 }, { "epoch": 1.1428571428571428, "grad_norm": 0.017848225310444832, "learning_rate": 8.05190595761172e-06, "loss": 0.5365, "step": 1392 }, { "epoch": 1.1436781609195403, "grad_norm": 0.015751121565699577, "learning_rate": 8.038994740000664e-06, "loss": 0.5183, "step": 1393 }, { "epoch": 1.1444991789819376, "grad_norm": 0.01844872161746025, "learning_rate": 8.026086923799505e-06, "loss": 0.5327, "step": 1394 }, { "epoch": 1.145320197044335, "grad_norm": 0.018720924854278564, "learning_rate": 8.013182531385749e-06, "loss": 0.5098, "step": 1395 }, { "epoch": 1.1461412151067323, "grad_norm": 0.01789386197924614, "learning_rate": 8.000281585130956e-06, "loss": 0.5038, "step": 1396 }, { "epoch": 1.1469622331691296, "grad_norm": 0.019849328324198723, "learning_rate": 7.987384107400717e-06, "loss": 0.5416, "step": 1397 }, { "epoch": 1.1477832512315271, "grad_norm": 0.015943145379424095, "learning_rate": 7.974490120554614e-06, "loss": 0.5272, "step": 1398 }, { "epoch": 1.1486042692939245, "grad_norm": 0.016612065955996513, "learning_rate": 7.961599646946168e-06, "loss": 0.497, "step": 1399 }, { "epoch": 1.1494252873563218, "grad_norm": 0.018516408279538155, "learning_rate": 7.948712708922814e-06, "loss": 0.5136, "step": 1400 }, { "epoch": 1.1502463054187193, "grad_norm": 0.0165900606662035, "learning_rate": 7.935829328825863e-06, "loss": 0.5373, "step": 1401 }, { "epoch": 1.1510673234811166, "grad_norm": 0.01742474175989628, "learning_rate": 7.922949528990442e-06, "loss": 0.5114, "step": 1402 }, { "epoch": 1.151888341543514, "grad_norm": 0.0173343475908041, "learning_rate": 7.910073331745492e-06, "loss": 0.4975, "step": 1403 }, { "epoch": 1.1527093596059113, "grad_norm": 0.021662328392267227, "learning_rate": 7.897200759413685e-06, "loss": 0.5088, "step": 1404 }, { "epoch": 1.1535303776683088, "grad_norm": 0.017717912793159485, "learning_rate": 7.884331834311434e-06, "loss": 0.5181, "step": 1405 }, { "epoch": 1.154351395730706, "grad_norm": 0.01699252426624298, "learning_rate": 7.871466578748811e-06, "loss": 0.5296, "step": 1406 }, { "epoch": 1.1551724137931034, "grad_norm": 0.016406061127781868, "learning_rate": 7.85860501502953e-06, "loss": 0.5114, "step": 1407 }, { "epoch": 1.1559934318555007, "grad_norm": 0.02024688571691513, "learning_rate": 7.84574716545091e-06, "loss": 0.5026, "step": 1408 }, { "epoch": 1.1568144499178983, "grad_norm": 0.016704188659787178, "learning_rate": 7.832893052303829e-06, "loss": 0.5489, "step": 1409 }, { "epoch": 1.1576354679802956, "grad_norm": 0.017524993047118187, "learning_rate": 7.820042697872676e-06, "loss": 0.5274, "step": 1410 }, { "epoch": 1.158456486042693, "grad_norm": 0.016422366723418236, "learning_rate": 7.807196124435343e-06, "loss": 0.532, "step": 1411 }, { "epoch": 1.1592775041050902, "grad_norm": 0.01628311723470688, "learning_rate": 7.794353354263157e-06, "loss": 0.49, "step": 1412 }, { "epoch": 1.1600985221674878, "grad_norm": 0.016732865944504738, "learning_rate": 7.781514409620849e-06, "loss": 0.5255, "step": 1413 }, { "epoch": 1.160919540229885, "grad_norm": 0.016535811126232147, "learning_rate": 7.768679312766524e-06, "loss": 0.4937, "step": 1414 }, { "epoch": 1.1617405582922824, "grad_norm": 0.016377368941903114, "learning_rate": 7.755848085951609e-06, "loss": 0.512, "step": 1415 }, { "epoch": 1.1625615763546797, "grad_norm": 0.016959168016910553, "learning_rate": 7.743020751420827e-06, "loss": 0.5417, "step": 1416 }, { "epoch": 1.1633825944170773, "grad_norm": 0.017272569239139557, "learning_rate": 7.730197331412162e-06, "loss": 0.5157, "step": 1417 }, { "epoch": 1.1642036124794746, "grad_norm": 0.015872566029429436, "learning_rate": 7.717377848156788e-06, "loss": 0.5375, "step": 1418 }, { "epoch": 1.1650246305418719, "grad_norm": 0.016047483310103416, "learning_rate": 7.704562323879075e-06, "loss": 0.5042, "step": 1419 }, { "epoch": 1.1658456486042692, "grad_norm": 0.016882307827472687, "learning_rate": 7.691750780796522e-06, "loss": 0.4915, "step": 1420 }, { "epoch": 1.1666666666666667, "grad_norm": 0.015864210203289986, "learning_rate": 7.678943241119727e-06, "loss": 0.5014, "step": 1421 }, { "epoch": 1.167487684729064, "grad_norm": 0.016586720943450928, "learning_rate": 7.666139727052344e-06, "loss": 0.527, "step": 1422 }, { "epoch": 1.1683087027914614, "grad_norm": 0.016303809359669685, "learning_rate": 7.653340260791063e-06, "loss": 0.5157, "step": 1423 }, { "epoch": 1.1691297208538587, "grad_norm": 0.01613624021410942, "learning_rate": 7.640544864525526e-06, "loss": 0.5392, "step": 1424 }, { "epoch": 1.1699507389162562, "grad_norm": 0.01656419038772583, "learning_rate": 7.627753560438356e-06, "loss": 0.5485, "step": 1425 }, { "epoch": 1.1707717569786535, "grad_norm": 0.016115980222821236, "learning_rate": 7.614966370705047e-06, "loss": 0.5246, "step": 1426 }, { "epoch": 1.1715927750410509, "grad_norm": 0.015690047293901443, "learning_rate": 7.60218331749399e-06, "loss": 0.505, "step": 1427 }, { "epoch": 1.1724137931034484, "grad_norm": 0.015392713248729706, "learning_rate": 7.5894044229663865e-06, "loss": 0.4866, "step": 1428 }, { "epoch": 1.1732348111658457, "grad_norm": 0.016604576259851456, "learning_rate": 7.576629709276232e-06, "loss": 0.5131, "step": 1429 }, { "epoch": 1.174055829228243, "grad_norm": 0.01614469103515148, "learning_rate": 7.563859198570276e-06, "loss": 0.5024, "step": 1430 }, { "epoch": 1.1748768472906403, "grad_norm": 0.016882969066500664, "learning_rate": 7.551092912987986e-06, "loss": 0.4971, "step": 1431 }, { "epoch": 1.1756978653530377, "grad_norm": 0.01611751690506935, "learning_rate": 7.538330874661489e-06, "loss": 0.493, "step": 1432 }, { "epoch": 1.1765188834154352, "grad_norm": 0.01701856032013893, "learning_rate": 7.525573105715569e-06, "loss": 0.4982, "step": 1433 }, { "epoch": 1.1773399014778325, "grad_norm": 0.016138305887579918, "learning_rate": 7.5128196282675954e-06, "loss": 0.4979, "step": 1434 }, { "epoch": 1.1781609195402298, "grad_norm": 0.01598946936428547, "learning_rate": 7.500070464427499e-06, "loss": 0.5286, "step": 1435 }, { "epoch": 1.1789819376026274, "grad_norm": 0.015963122248649597, "learning_rate": 7.4873256362977385e-06, "loss": 0.5367, "step": 1436 }, { "epoch": 1.1798029556650247, "grad_norm": 0.015564845874905586, "learning_rate": 7.4745851659732475e-06, "loss": 0.4813, "step": 1437 }, { "epoch": 1.180623973727422, "grad_norm": 0.0165276899933815, "learning_rate": 7.461849075541409e-06, "loss": 0.5036, "step": 1438 }, { "epoch": 1.1814449917898193, "grad_norm": 0.01595321111381054, "learning_rate": 7.449117387082021e-06, "loss": 0.5216, "step": 1439 }, { "epoch": 1.1822660098522166, "grad_norm": 0.016018273308873177, "learning_rate": 7.436390122667228e-06, "loss": 0.5354, "step": 1440 }, { "epoch": 1.1830870279146142, "grad_norm": 0.015979217365384102, "learning_rate": 7.423667304361531e-06, "loss": 0.507, "step": 1441 }, { "epoch": 1.1839080459770115, "grad_norm": 0.016732502728700638, "learning_rate": 7.410948954221708e-06, "loss": 0.5587, "step": 1442 }, { "epoch": 1.1847290640394088, "grad_norm": 0.016822757199406624, "learning_rate": 7.398235094296792e-06, "loss": 0.4963, "step": 1443 }, { "epoch": 1.1855500821018063, "grad_norm": 0.01609109342098236, "learning_rate": 7.385525746628029e-06, "loss": 0.5069, "step": 1444 }, { "epoch": 1.1863711001642037, "grad_norm": 0.016064146533608437, "learning_rate": 7.372820933248861e-06, "loss": 0.5093, "step": 1445 }, { "epoch": 1.187192118226601, "grad_norm": 0.01715187355875969, "learning_rate": 7.360120676184836e-06, "loss": 0.5271, "step": 1446 }, { "epoch": 1.1880131362889983, "grad_norm": 0.015517405234277248, "learning_rate": 7.347424997453636e-06, "loss": 0.5039, "step": 1447 }, { "epoch": 1.1888341543513956, "grad_norm": 0.016599643975496292, "learning_rate": 7.334733919064985e-06, "loss": 0.5009, "step": 1448 }, { "epoch": 1.1896551724137931, "grad_norm": 0.016821494325995445, "learning_rate": 7.322047463020641e-06, "loss": 0.5428, "step": 1449 }, { "epoch": 1.1904761904761905, "grad_norm": 0.017970936372876167, "learning_rate": 7.309365651314347e-06, "loss": 0.5433, "step": 1450 }, { "epoch": 1.1912972085385878, "grad_norm": 0.016280682757496834, "learning_rate": 7.296688505931787e-06, "loss": 0.5191, "step": 1451 }, { "epoch": 1.1921182266009853, "grad_norm": 0.01773475483059883, "learning_rate": 7.284016048850565e-06, "loss": 0.4886, "step": 1452 }, { "epoch": 1.1929392446633826, "grad_norm": 0.01603797823190689, "learning_rate": 7.271348302040159e-06, "loss": 0.5034, "step": 1453 }, { "epoch": 1.19376026272578, "grad_norm": 0.016318537294864655, "learning_rate": 7.258685287461869e-06, "loss": 0.5034, "step": 1454 }, { "epoch": 1.1945812807881773, "grad_norm": 0.016924038529396057, "learning_rate": 7.246027027068798e-06, "loss": 0.5077, "step": 1455 }, { "epoch": 1.1954022988505748, "grad_norm": 0.017280930653214455, "learning_rate": 7.233373542805813e-06, "loss": 0.5255, "step": 1456 }, { "epoch": 1.1962233169129721, "grad_norm": 0.017569996416568756, "learning_rate": 7.2207248566094865e-06, "loss": 0.5071, "step": 1457 }, { "epoch": 1.1970443349753694, "grad_norm": 0.016519127413630486, "learning_rate": 7.208080990408087e-06, "loss": 0.5226, "step": 1458 }, { "epoch": 1.1978653530377668, "grad_norm": 0.017674347385764122, "learning_rate": 7.195441966121516e-06, "loss": 0.4943, "step": 1459 }, { "epoch": 1.1986863711001643, "grad_norm": 0.01680927909910679, "learning_rate": 7.182807805661291e-06, "loss": 0.5075, "step": 1460 }, { "epoch": 1.1995073891625616, "grad_norm": 0.016115231439471245, "learning_rate": 7.17017853093049e-06, "loss": 0.4942, "step": 1461 }, { "epoch": 1.200328407224959, "grad_norm": 0.016495777294039726, "learning_rate": 7.1575541638237215e-06, "loss": 0.473, "step": 1462 }, { "epoch": 1.2011494252873562, "grad_norm": 0.01661892607808113, "learning_rate": 7.144934726227089e-06, "loss": 0.5083, "step": 1463 }, { "epoch": 1.2019704433497538, "grad_norm": 0.01646577939391136, "learning_rate": 7.132320240018149e-06, "loss": 0.5076, "step": 1464 }, { "epoch": 1.202791461412151, "grad_norm": 0.01637924090027809, "learning_rate": 7.11971072706587e-06, "loss": 0.514, "step": 1465 }, { "epoch": 1.2036124794745484, "grad_norm": 0.01622910425066948, "learning_rate": 7.107106209230606e-06, "loss": 0.4992, "step": 1466 }, { "epoch": 1.2044334975369457, "grad_norm": 0.017483368515968323, "learning_rate": 7.094506708364049e-06, "loss": 0.5043, "step": 1467 }, { "epoch": 1.2052545155993433, "grad_norm": 0.016326293349266052, "learning_rate": 7.081912246309187e-06, "loss": 0.4975, "step": 1468 }, { "epoch": 1.2060755336617406, "grad_norm": 0.016601387411355972, "learning_rate": 7.069322844900281e-06, "loss": 0.5, "step": 1469 }, { "epoch": 1.206896551724138, "grad_norm": 0.017025040462613106, "learning_rate": 7.056738525962815e-06, "loss": 0.4902, "step": 1470 }, { "epoch": 1.2077175697865352, "grad_norm": 0.01618053950369358, "learning_rate": 7.044159311313459e-06, "loss": 0.504, "step": 1471 }, { "epoch": 1.2085385878489328, "grad_norm": 0.01677946001291275, "learning_rate": 7.031585222760043e-06, "loss": 0.5012, "step": 1472 }, { "epoch": 1.20935960591133, "grad_norm": 0.01756678707897663, "learning_rate": 7.019016282101495e-06, "loss": 0.5087, "step": 1473 }, { "epoch": 1.2101806239737274, "grad_norm": 0.016086481511592865, "learning_rate": 7.0064525111278325e-06, "loss": 0.5086, "step": 1474 }, { "epoch": 1.2110016420361247, "grad_norm": 0.017537886276841164, "learning_rate": 6.99389393162011e-06, "loss": 0.4939, "step": 1475 }, { "epoch": 1.2118226600985222, "grad_norm": 0.015980638563632965, "learning_rate": 6.981340565350369e-06, "loss": 0.5186, "step": 1476 }, { "epoch": 1.2126436781609196, "grad_norm": 0.017119966447353363, "learning_rate": 6.968792434081627e-06, "loss": 0.5157, "step": 1477 }, { "epoch": 1.2134646962233169, "grad_norm": 0.016454966738820076, "learning_rate": 6.9562495595678215e-06, "loss": 0.5322, "step": 1478 }, { "epoch": 1.2142857142857142, "grad_norm": 0.016483623534440994, "learning_rate": 6.943711963553769e-06, "loss": 0.5093, "step": 1479 }, { "epoch": 1.2151067323481117, "grad_norm": 0.016667401418089867, "learning_rate": 6.9311796677751485e-06, "loss": 0.5169, "step": 1480 }, { "epoch": 1.215927750410509, "grad_norm": 0.016067208722233772, "learning_rate": 6.918652693958443e-06, "loss": 0.5083, "step": 1481 }, { "epoch": 1.2167487684729064, "grad_norm": 0.016918374225497246, "learning_rate": 6.906131063820907e-06, "loss": 0.5392, "step": 1482 }, { "epoch": 1.2175697865353037, "grad_norm": 0.016905425116419792, "learning_rate": 6.8936147990705345e-06, "loss": 0.489, "step": 1483 }, { "epoch": 1.2183908045977012, "grad_norm": 0.015728887170553207, "learning_rate": 6.881103921406017e-06, "loss": 0.4836, "step": 1484 }, { "epoch": 1.2192118226600985, "grad_norm": 0.016742896288633347, "learning_rate": 6.868598452516704e-06, "loss": 0.49, "step": 1485 }, { "epoch": 1.2200328407224958, "grad_norm": 0.017370697110891342, "learning_rate": 6.85609841408258e-06, "loss": 0.5211, "step": 1486 }, { "epoch": 1.2208538587848934, "grad_norm": 0.015931809321045876, "learning_rate": 6.843603827774194e-06, "loss": 0.4917, "step": 1487 }, { "epoch": 1.2216748768472907, "grad_norm": 0.016627846285700798, "learning_rate": 6.8311147152526604e-06, "loss": 0.5224, "step": 1488 }, { "epoch": 1.222495894909688, "grad_norm": 0.016920849680900574, "learning_rate": 6.818631098169602e-06, "loss": 0.5041, "step": 1489 }, { "epoch": 1.2233169129720853, "grad_norm": 0.01656419038772583, "learning_rate": 6.8061529981671045e-06, "loss": 0.4921, "step": 1490 }, { "epoch": 1.2241379310344827, "grad_norm": 0.018271589651703835, "learning_rate": 6.793680436877697e-06, "loss": 0.5352, "step": 1491 }, { "epoch": 1.2249589490968802, "grad_norm": 0.01682882569730282, "learning_rate": 6.781213435924311e-06, "loss": 0.5093, "step": 1492 }, { "epoch": 1.2257799671592775, "grad_norm": 0.01607169210910797, "learning_rate": 6.768752016920223e-06, "loss": 0.5004, "step": 1493 }, { "epoch": 1.2266009852216748, "grad_norm": 0.016821319237351418, "learning_rate": 6.756296201469052e-06, "loss": 0.4994, "step": 1494 }, { "epoch": 1.2274220032840724, "grad_norm": 0.016252487897872925, "learning_rate": 6.743846011164682e-06, "loss": 0.4784, "step": 1495 }, { "epoch": 1.2282430213464697, "grad_norm": 0.015882575884461403, "learning_rate": 6.7314014675912595e-06, "loss": 0.5046, "step": 1496 }, { "epoch": 1.229064039408867, "grad_norm": 0.016344523057341576, "learning_rate": 6.718962592323142e-06, "loss": 0.5252, "step": 1497 }, { "epoch": 1.2298850574712643, "grad_norm": 0.016861334443092346, "learning_rate": 6.706529406924848e-06, "loss": 0.5016, "step": 1498 }, { "epoch": 1.2307060755336616, "grad_norm": 0.016886170953512192, "learning_rate": 6.694101932951041e-06, "loss": 0.4948, "step": 1499 }, { "epoch": 1.2315270935960592, "grad_norm": 0.016344819217920303, "learning_rate": 6.681680191946487e-06, "loss": 0.497, "step": 1500 }, { "epoch": 1.2323481116584565, "grad_norm": 0.01840398460626602, "learning_rate": 6.669264205445998e-06, "loss": 0.4838, "step": 1501 }, { "epoch": 1.2331691297208538, "grad_norm": 0.016673199832439423, "learning_rate": 6.656853994974428e-06, "loss": 0.5179, "step": 1502 }, { "epoch": 1.2339901477832513, "grad_norm": 0.01643490605056286, "learning_rate": 6.6444495820466045e-06, "loss": 0.5195, "step": 1503 }, { "epoch": 1.2348111658456487, "grad_norm": 0.018148941919207573, "learning_rate": 6.6320509881673086e-06, "loss": 0.5061, "step": 1504 }, { "epoch": 1.235632183908046, "grad_norm": 0.01568223536014557, "learning_rate": 6.6196582348312335e-06, "loss": 0.5017, "step": 1505 }, { "epoch": 1.2364532019704433, "grad_norm": 0.016562843695282936, "learning_rate": 6.607271343522944e-06, "loss": 0.5042, "step": 1506 }, { "epoch": 1.2372742200328406, "grad_norm": 0.018032953143119812, "learning_rate": 6.594890335716846e-06, "loss": 0.5051, "step": 1507 }, { "epoch": 1.2380952380952381, "grad_norm": 0.016838086768984795, "learning_rate": 6.58251523287715e-06, "loss": 0.5187, "step": 1508 }, { "epoch": 1.2389162561576355, "grad_norm": 0.01805775798857212, "learning_rate": 6.570146056457811e-06, "loss": 0.5156, "step": 1509 }, { "epoch": 1.2397372742200328, "grad_norm": 0.018496623262763023, "learning_rate": 6.557782827902532e-06, "loss": 0.5052, "step": 1510 }, { "epoch": 1.2405582922824303, "grad_norm": 0.015971243381500244, "learning_rate": 6.545425568644696e-06, "loss": 0.4966, "step": 1511 }, { "epoch": 1.2413793103448276, "grad_norm": 0.017790451645851135, "learning_rate": 6.533074300107328e-06, "loss": 0.5104, "step": 1512 }, { "epoch": 1.242200328407225, "grad_norm": 0.016816338524222374, "learning_rate": 6.520729043703083e-06, "loss": 0.5111, "step": 1513 }, { "epoch": 1.2430213464696223, "grad_norm": 0.017138054594397545, "learning_rate": 6.508389820834185e-06, "loss": 0.4946, "step": 1514 }, { "epoch": 1.2438423645320198, "grad_norm": 0.017374210059642792, "learning_rate": 6.496056652892391e-06, "loss": 0.5174, "step": 1515 }, { "epoch": 1.2446633825944171, "grad_norm": 0.017023487016558647, "learning_rate": 6.48372956125898e-06, "loss": 0.5306, "step": 1516 }, { "epoch": 1.2454844006568144, "grad_norm": 0.016883185133337975, "learning_rate": 6.471408567304677e-06, "loss": 0.5094, "step": 1517 }, { "epoch": 1.2463054187192117, "grad_norm": 0.016680976375937462, "learning_rate": 6.459093692389646e-06, "loss": 0.4811, "step": 1518 }, { "epoch": 1.2471264367816093, "grad_norm": 0.01691296324133873, "learning_rate": 6.446784957863445e-06, "loss": 0.5211, "step": 1519 }, { "epoch": 1.2479474548440066, "grad_norm": 0.01626395620405674, "learning_rate": 6.434482385064979e-06, "loss": 0.5086, "step": 1520 }, { "epoch": 1.248768472906404, "grad_norm": 0.017594145610928535, "learning_rate": 6.422185995322473e-06, "loss": 0.5008, "step": 1521 }, { "epoch": 1.2495894909688012, "grad_norm": 0.016415350139141083, "learning_rate": 6.409895809953441e-06, "loss": 0.5016, "step": 1522 }, { "epoch": 1.2504105090311988, "grad_norm": 0.016344871371984482, "learning_rate": 6.3976118502646274e-06, "loss": 0.5121, "step": 1523 }, { "epoch": 1.251231527093596, "grad_norm": 0.016999276354908943, "learning_rate": 6.385334137551996e-06, "loss": 0.5304, "step": 1524 }, { "epoch": 1.2520525451559934, "grad_norm": 0.016115089878439903, "learning_rate": 6.373062693100673e-06, "loss": 0.4957, "step": 1525 }, { "epoch": 1.2528735632183907, "grad_norm": 0.016718868166208267, "learning_rate": 6.360797538184919e-06, "loss": 0.5014, "step": 1526 }, { "epoch": 1.2536945812807883, "grad_norm": 0.01737746223807335, "learning_rate": 6.3485386940680915e-06, "loss": 0.5117, "step": 1527 }, { "epoch": 1.2545155993431856, "grad_norm": 0.01626908965408802, "learning_rate": 6.336286182002605e-06, "loss": 0.5218, "step": 1528 }, { "epoch": 1.2553366174055829, "grad_norm": 0.017156345769762993, "learning_rate": 6.324040023229903e-06, "loss": 0.4991, "step": 1529 }, { "epoch": 1.2561576354679804, "grad_norm": 0.016035031527280807, "learning_rate": 6.311800238980413e-06, "loss": 0.5133, "step": 1530 }, { "epoch": 1.2569786535303777, "grad_norm": 0.01571383886039257, "learning_rate": 6.299566850473502e-06, "loss": 0.5138, "step": 1531 }, { "epoch": 1.257799671592775, "grad_norm": 0.017963556572794914, "learning_rate": 6.28733987891746e-06, "loss": 0.4895, "step": 1532 }, { "epoch": 1.2586206896551724, "grad_norm": 0.015598037280142307, "learning_rate": 6.2751193455094484e-06, "loss": 0.4978, "step": 1533 }, { "epoch": 1.2594417077175697, "grad_norm": 0.01583089493215084, "learning_rate": 6.262905271435462e-06, "loss": 0.5197, "step": 1534 }, { "epoch": 1.2602627257799672, "grad_norm": 0.015680553391575813, "learning_rate": 6.250697677870311e-06, "loss": 0.5082, "step": 1535 }, { "epoch": 1.2610837438423645, "grad_norm": 0.01589246466755867, "learning_rate": 6.238496585977559e-06, "loss": 0.4985, "step": 1536 }, { "epoch": 1.2619047619047619, "grad_norm": 0.016225799918174744, "learning_rate": 6.226302016909499e-06, "loss": 0.5294, "step": 1537 }, { "epoch": 1.2627257799671594, "grad_norm": 0.01647462137043476, "learning_rate": 6.2141139918071216e-06, "loss": 0.5151, "step": 1538 }, { "epoch": 1.2635467980295567, "grad_norm": 0.01597808487713337, "learning_rate": 6.201932531800064e-06, "loss": 0.4941, "step": 1539 }, { "epoch": 1.264367816091954, "grad_norm": 0.01601661555469036, "learning_rate": 6.18975765800659e-06, "loss": 0.5005, "step": 1540 }, { "epoch": 1.2651888341543513, "grad_norm": 0.016257472336292267, "learning_rate": 6.177589391533547e-06, "loss": 0.5064, "step": 1541 }, { "epoch": 1.2660098522167487, "grad_norm": 0.015633413568139076, "learning_rate": 6.165427753476312e-06, "loss": 0.5014, "step": 1542 }, { "epoch": 1.2668308702791462, "grad_norm": 0.016674188897013664, "learning_rate": 6.153272764918792e-06, "loss": 0.5006, "step": 1543 }, { "epoch": 1.2676518883415435, "grad_norm": 0.016763975843787193, "learning_rate": 6.141124446933352e-06, "loss": 0.516, "step": 1544 }, { "epoch": 1.2684729064039408, "grad_norm": 0.016861554235219955, "learning_rate": 6.128982820580794e-06, "loss": 0.5138, "step": 1545 }, { "epoch": 1.2692939244663384, "grad_norm": 0.016086319461464882, "learning_rate": 6.116847906910321e-06, "loss": 0.5197, "step": 1546 }, { "epoch": 1.2701149425287357, "grad_norm": 0.016256313771009445, "learning_rate": 6.1047197269595096e-06, "loss": 0.5163, "step": 1547 }, { "epoch": 1.270935960591133, "grad_norm": 0.017210407182574272, "learning_rate": 6.092598301754237e-06, "loss": 0.5123, "step": 1548 }, { "epoch": 1.2717569786535303, "grad_norm": 0.01670677959918976, "learning_rate": 6.0804836523086995e-06, "loss": 0.5081, "step": 1549 }, { "epoch": 1.2725779967159276, "grad_norm": 0.01633666455745697, "learning_rate": 6.068375799625319e-06, "loss": 0.5328, "step": 1550 }, { "epoch": 1.2733990147783252, "grad_norm": 0.016534943133592606, "learning_rate": 6.056274764694756e-06, "loss": 0.4981, "step": 1551 }, { "epoch": 1.2742200328407225, "grad_norm": 0.01696372590959072, "learning_rate": 6.044180568495843e-06, "loss": 0.5178, "step": 1552 }, { "epoch": 1.2750410509031198, "grad_norm": 0.016210688278079033, "learning_rate": 6.0320932319955525e-06, "loss": 0.5038, "step": 1553 }, { "epoch": 1.2758620689655173, "grad_norm": 0.016511831432580948, "learning_rate": 6.020012776148972e-06, "loss": 0.4935, "step": 1554 }, { "epoch": 1.2766830870279147, "grad_norm": 0.0165561530739069, "learning_rate": 6.0079392218992606e-06, "loss": 0.5025, "step": 1555 }, { "epoch": 1.277504105090312, "grad_norm": 0.015603486448526382, "learning_rate": 5.995872590177602e-06, "loss": 0.5154, "step": 1556 }, { "epoch": 1.2783251231527093, "grad_norm": 0.016517499461770058, "learning_rate": 5.983812901903191e-06, "loss": 0.5342, "step": 1557 }, { "epoch": 1.2791461412151066, "grad_norm": 0.023761102929711342, "learning_rate": 5.971760177983179e-06, "loss": 0.5425, "step": 1558 }, { "epoch": 1.2799671592775042, "grad_norm": 0.017194963991642, "learning_rate": 5.9597144393126435e-06, "loss": 0.5149, "step": 1559 }, { "epoch": 1.2807881773399015, "grad_norm": 0.016303617507219315, "learning_rate": 5.947675706774559e-06, "loss": 0.4875, "step": 1560 }, { "epoch": 1.2816091954022988, "grad_norm": 0.016939660534262657, "learning_rate": 5.935644001239738e-06, "loss": 0.5021, "step": 1561 }, { "epoch": 1.2824302134646963, "grad_norm": 0.01659572310745716, "learning_rate": 5.923619343566827e-06, "loss": 0.5118, "step": 1562 }, { "epoch": 1.2832512315270936, "grad_norm": 0.016531217843294144, "learning_rate": 5.911601754602253e-06, "loss": 0.5132, "step": 1563 }, { "epoch": 1.284072249589491, "grad_norm": 0.016553040593862534, "learning_rate": 5.899591255180174e-06, "loss": 0.4985, "step": 1564 }, { "epoch": 1.2848932676518883, "grad_norm": 0.016095608472824097, "learning_rate": 5.887587866122471e-06, "loss": 0.5258, "step": 1565 }, { "epoch": 1.2857142857142856, "grad_norm": 0.01720551960170269, "learning_rate": 5.875591608238696e-06, "loss": 0.5132, "step": 1566 }, { "epoch": 1.2865353037766831, "grad_norm": 0.016453083604574203, "learning_rate": 5.863602502326032e-06, "loss": 0.5178, "step": 1567 }, { "epoch": 1.2873563218390804, "grad_norm": 0.015897171571850777, "learning_rate": 5.851620569169266e-06, "loss": 0.4696, "step": 1568 }, { "epoch": 1.2881773399014778, "grad_norm": 0.017231963574886322, "learning_rate": 5.839645829540762e-06, "loss": 0.5177, "step": 1569 }, { "epoch": 1.2889983579638753, "grad_norm": 0.016255341470241547, "learning_rate": 5.827678304200387e-06, "loss": 0.5019, "step": 1570 }, { "epoch": 1.2898193760262726, "grad_norm": 0.015828507021069527, "learning_rate": 5.815718013895527e-06, "loss": 0.4748, "step": 1571 }, { "epoch": 1.29064039408867, "grad_norm": 0.016325993463397026, "learning_rate": 5.803764979361011e-06, "loss": 0.5078, "step": 1572 }, { "epoch": 1.2914614121510672, "grad_norm": 0.01684304140508175, "learning_rate": 5.791819221319089e-06, "loss": 0.5094, "step": 1573 }, { "epoch": 1.2922824302134646, "grad_norm": 0.015952205285429955, "learning_rate": 5.779880760479403e-06, "loss": 0.4965, "step": 1574 }, { "epoch": 1.293103448275862, "grad_norm": 0.017252076417207718, "learning_rate": 5.7679496175389384e-06, "loss": 0.5119, "step": 1575 }, { "epoch": 1.2939244663382594, "grad_norm": 0.016201423481106758, "learning_rate": 5.756025813182001e-06, "loss": 0.5036, "step": 1576 }, { "epoch": 1.2947454844006567, "grad_norm": 0.01624220982193947, "learning_rate": 5.7441093680801665e-06, "loss": 0.5196, "step": 1577 }, { "epoch": 1.2955665024630543, "grad_norm": 0.017581792548298836, "learning_rate": 5.732200302892252e-06, "loss": 0.5003, "step": 1578 }, { "epoch": 1.2963875205254516, "grad_norm": 0.016009293496608734, "learning_rate": 5.7202986382642874e-06, "loss": 0.4868, "step": 1579 }, { "epoch": 1.297208538587849, "grad_norm": 0.016555778682231903, "learning_rate": 5.708404394829476e-06, "loss": 0.5026, "step": 1580 }, { "epoch": 1.2980295566502464, "grad_norm": 0.0167497880756855, "learning_rate": 5.696517593208134e-06, "loss": 0.4911, "step": 1581 }, { "epoch": 1.2988505747126438, "grad_norm": 0.016563214361667633, "learning_rate": 5.684638254007702e-06, "loss": 0.5186, "step": 1582 }, { "epoch": 1.299671592775041, "grad_norm": 0.016196226701140404, "learning_rate": 5.672766397822665e-06, "loss": 0.4856, "step": 1583 }, { "epoch": 1.3004926108374384, "grad_norm": 0.0176117941737175, "learning_rate": 5.660902045234548e-06, "loss": 0.4831, "step": 1584 }, { "epoch": 1.3013136288998357, "grad_norm": 0.01588163711130619, "learning_rate": 5.649045216811862e-06, "loss": 0.4958, "step": 1585 }, { "epoch": 1.3021346469622332, "grad_norm": 0.01738322339951992, "learning_rate": 5.637195933110063e-06, "loss": 0.4886, "step": 1586 }, { "epoch": 1.3029556650246306, "grad_norm": 0.0190008245408535, "learning_rate": 5.6253542146715486e-06, "loss": 0.5036, "step": 1587 }, { "epoch": 1.3037766830870279, "grad_norm": 0.016569014638662338, "learning_rate": 5.613520082025592e-06, "loss": 0.499, "step": 1588 }, { "epoch": 1.3045977011494254, "grad_norm": 0.01771748811006546, "learning_rate": 5.6016935556883054e-06, "loss": 0.4942, "step": 1589 }, { "epoch": 1.3054187192118227, "grad_norm": 0.016706738620996475, "learning_rate": 5.589874656162625e-06, "loss": 0.4964, "step": 1590 }, { "epoch": 1.30623973727422, "grad_norm": 0.0162707157433033, "learning_rate": 5.578063403938266e-06, "loss": 0.5077, "step": 1591 }, { "epoch": 1.3070607553366174, "grad_norm": 0.017909372225403786, "learning_rate": 5.566259819491679e-06, "loss": 0.5191, "step": 1592 }, { "epoch": 1.3078817733990147, "grad_norm": 0.01742081716656685, "learning_rate": 5.554463923286026e-06, "loss": 0.5278, "step": 1593 }, { "epoch": 1.3087027914614122, "grad_norm": 0.016257621347904205, "learning_rate": 5.5426757357711355e-06, "loss": 0.517, "step": 1594 }, { "epoch": 1.3095238095238095, "grad_norm": 0.016794035211205482, "learning_rate": 5.530895277383476e-06, "loss": 0.5039, "step": 1595 }, { "epoch": 1.3103448275862069, "grad_norm": 0.017451316118240356, "learning_rate": 5.519122568546124e-06, "loss": 0.5007, "step": 1596 }, { "epoch": 1.3111658456486044, "grad_norm": 0.015949085354804993, "learning_rate": 5.507357629668705e-06, "loss": 0.4948, "step": 1597 }, { "epoch": 1.3119868637110017, "grad_norm": 0.016737988218665123, "learning_rate": 5.495600481147384e-06, "loss": 0.4788, "step": 1598 }, { "epoch": 1.312807881773399, "grad_norm": 0.01740618795156479, "learning_rate": 5.48385114336482e-06, "loss": 0.4987, "step": 1599 }, { "epoch": 1.3136288998357963, "grad_norm": 0.016057351604104042, "learning_rate": 5.472109636690127e-06, "loss": 0.5186, "step": 1600 }, { "epoch": 1.3144499178981937, "grad_norm": 0.016722042113542557, "learning_rate": 5.460375981478849e-06, "loss": 0.5045, "step": 1601 }, { "epoch": 1.3152709359605912, "grad_norm": 0.01601603627204895, "learning_rate": 5.448650198072914e-06, "loss": 0.4865, "step": 1602 }, { "epoch": 1.3160919540229885, "grad_norm": 0.016221648082137108, "learning_rate": 5.436932306800597e-06, "loss": 0.4943, "step": 1603 }, { "epoch": 1.3169129720853858, "grad_norm": 0.016737889498472214, "learning_rate": 5.425222327976509e-06, "loss": 0.5061, "step": 1604 }, { "epoch": 1.3177339901477834, "grad_norm": 0.015825318172574043, "learning_rate": 5.413520281901525e-06, "loss": 0.5061, "step": 1605 }, { "epoch": 1.3185550082101807, "grad_norm": 0.016169486567378044, "learning_rate": 5.401826188862774e-06, "loss": 0.5204, "step": 1606 }, { "epoch": 1.319376026272578, "grad_norm": 0.01563972979784012, "learning_rate": 5.390140069133602e-06, "loss": 0.494, "step": 1607 }, { "epoch": 1.3201970443349753, "grad_norm": 0.016117781400680542, "learning_rate": 5.378461942973523e-06, "loss": 0.5189, "step": 1608 }, { "epoch": 1.3210180623973726, "grad_norm": 0.016807863488793373, "learning_rate": 5.3667918306282065e-06, "loss": 0.5241, "step": 1609 }, { "epoch": 1.3218390804597702, "grad_norm": 0.0390302836894989, "learning_rate": 5.3551297523294155e-06, "loss": 0.505, "step": 1610 }, { "epoch": 1.3226600985221675, "grad_norm": 0.016778867691755295, "learning_rate": 5.343475728294987e-06, "loss": 0.5064, "step": 1611 }, { "epoch": 1.3234811165845648, "grad_norm": 0.01639803871512413, "learning_rate": 5.3318297787288e-06, "loss": 0.4986, "step": 1612 }, { "epoch": 1.3243021346469623, "grad_norm": 0.01598125323653221, "learning_rate": 5.320191923820744e-06, "loss": 0.4973, "step": 1613 }, { "epoch": 1.3251231527093597, "grad_norm": 0.015462282113730907, "learning_rate": 5.308562183746647e-06, "loss": 0.4974, "step": 1614 }, { "epoch": 1.325944170771757, "grad_norm": 0.01637648046016693, "learning_rate": 5.2969405786682985e-06, "loss": 0.5009, "step": 1615 }, { "epoch": 1.3267651888341543, "grad_norm": 0.01748686470091343, "learning_rate": 5.285327128733364e-06, "loss": 0.4965, "step": 1616 }, { "epoch": 1.3275862068965516, "grad_norm": 0.017259908840060234, "learning_rate": 5.2737218540753875e-06, "loss": 0.4992, "step": 1617 }, { "epoch": 1.3284072249589491, "grad_norm": 0.016680024564266205, "learning_rate": 5.262124774813728e-06, "loss": 0.4804, "step": 1618 }, { "epoch": 1.3292282430213465, "grad_norm": 0.016793936491012573, "learning_rate": 5.250535911053538e-06, "loss": 0.505, "step": 1619 }, { "epoch": 1.3300492610837438, "grad_norm": 0.016460614278912544, "learning_rate": 5.238955282885733e-06, "loss": 0.4807, "step": 1620 }, { "epoch": 1.3308702791461413, "grad_norm": 0.017218593508005142, "learning_rate": 5.227382910386957e-06, "loss": 0.5196, "step": 1621 }, { "epoch": 1.3316912972085386, "grad_norm": 0.018216732889413834, "learning_rate": 5.2158188136195144e-06, "loss": 0.4919, "step": 1622 }, { "epoch": 1.332512315270936, "grad_norm": 0.016625942662358284, "learning_rate": 5.204263012631391e-06, "loss": 0.5389, "step": 1623 }, { "epoch": 1.3333333333333333, "grad_norm": 0.016644326969981194, "learning_rate": 5.19271552745618e-06, "loss": 0.4902, "step": 1624 }, { "epoch": 1.3341543513957306, "grad_norm": 0.017717484384775162, "learning_rate": 5.181176378113061e-06, "loss": 0.4971, "step": 1625 }, { "epoch": 1.3349753694581281, "grad_norm": 0.016053512692451477, "learning_rate": 5.169645584606754e-06, "loss": 0.5083, "step": 1626 }, { "epoch": 1.3357963875205254, "grad_norm": 0.016186363995075226, "learning_rate": 5.158123166927498e-06, "loss": 0.4908, "step": 1627 }, { "epoch": 1.3366174055829227, "grad_norm": 0.017137352377176285, "learning_rate": 5.146609145051014e-06, "loss": 0.5042, "step": 1628 }, { "epoch": 1.3374384236453203, "grad_norm": 0.01566295512020588, "learning_rate": 5.135103538938472e-06, "loss": 0.4798, "step": 1629 }, { "epoch": 1.3382594417077176, "grad_norm": 0.016164684668183327, "learning_rate": 5.123606368536433e-06, "loss": 0.4794, "step": 1630 }, { "epoch": 1.339080459770115, "grad_norm": 0.015289668925106525, "learning_rate": 5.112117653776855e-06, "loss": 0.4919, "step": 1631 }, { "epoch": 1.3399014778325122, "grad_norm": 0.016462525352835655, "learning_rate": 5.100637414577028e-06, "loss": 0.4941, "step": 1632 }, { "epoch": 1.3407224958949095, "grad_norm": 0.016100455075502396, "learning_rate": 5.0891656708395466e-06, "loss": 0.492, "step": 1633 }, { "epoch": 1.341543513957307, "grad_norm": 0.01617618463933468, "learning_rate": 5.077702442452276e-06, "loss": 0.4921, "step": 1634 }, { "epoch": 1.3423645320197044, "grad_norm": 0.016862493008375168, "learning_rate": 5.06624774928833e-06, "loss": 0.518, "step": 1635 }, { "epoch": 1.3431855500821017, "grad_norm": 0.016352687031030655, "learning_rate": 5.05480161120601e-06, "loss": 0.5293, "step": 1636 }, { "epoch": 1.3440065681444993, "grad_norm": 0.016841424629092216, "learning_rate": 5.043364048048804e-06, "loss": 0.5026, "step": 1637 }, { "epoch": 1.3448275862068966, "grad_norm": 0.07805012911558151, "learning_rate": 5.0319350796453094e-06, "loss": 0.5122, "step": 1638 }, { "epoch": 1.345648604269294, "grad_norm": 0.016116900369524956, "learning_rate": 5.020514725809244e-06, "loss": 0.4973, "step": 1639 }, { "epoch": 1.3464696223316914, "grad_norm": 0.01628248393535614, "learning_rate": 5.009103006339391e-06, "loss": 0.5268, "step": 1640 }, { "epoch": 1.3472906403940887, "grad_norm": 0.017263662070035934, "learning_rate": 4.997699941019552e-06, "loss": 0.525, "step": 1641 }, { "epoch": 1.348111658456486, "grad_norm": 0.01790282130241394, "learning_rate": 4.98630554961853e-06, "loss": 0.4883, "step": 1642 }, { "epoch": 1.3489326765188834, "grad_norm": 0.016042744740843773, "learning_rate": 4.9749198518901e-06, "loss": 0.5104, "step": 1643 }, { "epoch": 1.3497536945812807, "grad_norm": 0.015843095257878304, "learning_rate": 4.96354286757295e-06, "loss": 0.4951, "step": 1644 }, { "epoch": 1.3505747126436782, "grad_norm": 0.01602538675069809, "learning_rate": 4.952174616390677e-06, "loss": 0.5265, "step": 1645 }, { "epoch": 1.3513957307060755, "grad_norm": 0.016159841790795326, "learning_rate": 4.940815118051726e-06, "loss": 0.5173, "step": 1646 }, { "epoch": 1.3522167487684729, "grad_norm": 0.01707025244832039, "learning_rate": 4.92946439224937e-06, "loss": 0.5226, "step": 1647 }, { "epoch": 1.3530377668308704, "grad_norm": 0.016719335690140724, "learning_rate": 4.9181224586616855e-06, "loss": 0.4873, "step": 1648 }, { "epoch": 1.3538587848932677, "grad_norm": 0.017132321372628212, "learning_rate": 4.90678933695149e-06, "loss": 0.531, "step": 1649 }, { "epoch": 1.354679802955665, "grad_norm": 0.016176994889974594, "learning_rate": 4.895465046766329e-06, "loss": 0.5003, "step": 1650 }, { "epoch": 1.3555008210180624, "grad_norm": 0.01658516377210617, "learning_rate": 4.884149607738447e-06, "loss": 0.5066, "step": 1651 }, { "epoch": 1.3563218390804597, "grad_norm": 0.016328293830156326, "learning_rate": 4.872843039484728e-06, "loss": 0.4752, "step": 1652 }, { "epoch": 1.3571428571428572, "grad_norm": 0.016002044081687927, "learning_rate": 4.861545361606697e-06, "loss": 0.4691, "step": 1653 }, { "epoch": 1.3579638752052545, "grad_norm": 0.01710786297917366, "learning_rate": 4.850256593690447e-06, "loss": 0.5001, "step": 1654 }, { "epoch": 1.3587848932676518, "grad_norm": 0.01591937057673931, "learning_rate": 4.838976755306632e-06, "loss": 0.4827, "step": 1655 }, { "epoch": 1.3596059113300494, "grad_norm": 0.01609291136264801, "learning_rate": 4.827705866010427e-06, "loss": 0.4907, "step": 1656 }, { "epoch": 1.3604269293924467, "grad_norm": 0.01656312867999077, "learning_rate": 4.816443945341497e-06, "loss": 0.4962, "step": 1657 }, { "epoch": 1.361247947454844, "grad_norm": 0.015894297510385513, "learning_rate": 4.805191012823949e-06, "loss": 0.4785, "step": 1658 }, { "epoch": 1.3620689655172413, "grad_norm": 0.015271642245352268, "learning_rate": 4.7939470879663115e-06, "loss": 0.4826, "step": 1659 }, { "epoch": 1.3628899835796386, "grad_norm": 0.01603817194700241, "learning_rate": 4.782712190261491e-06, "loss": 0.4845, "step": 1660 }, { "epoch": 1.3637110016420362, "grad_norm": 0.016729192808270454, "learning_rate": 4.771486339186762e-06, "loss": 0.5253, "step": 1661 }, { "epoch": 1.3645320197044335, "grad_norm": 0.01559630036354065, "learning_rate": 4.760269554203698e-06, "loss": 0.4834, "step": 1662 }, { "epoch": 1.3653530377668308, "grad_norm": 0.016533207148313522, "learning_rate": 4.749061854758157e-06, "loss": 0.5053, "step": 1663 }, { "epoch": 1.3661740558292284, "grad_norm": 0.01645340770483017, "learning_rate": 4.7378632602802545e-06, "loss": 0.4946, "step": 1664 }, { "epoch": 1.3669950738916257, "grad_norm": 0.016393568366765976, "learning_rate": 4.72667379018432e-06, "loss": 0.5052, "step": 1665 }, { "epoch": 1.367816091954023, "grad_norm": 0.01649537682533264, "learning_rate": 4.715493463868857e-06, "loss": 0.5078, "step": 1666 }, { "epoch": 1.3686371100164203, "grad_norm": 0.01636436954140663, "learning_rate": 4.704322300716519e-06, "loss": 0.5072, "step": 1667 }, { "epoch": 1.3694581280788176, "grad_norm": 0.01686609536409378, "learning_rate": 4.693160320094082e-06, "loss": 0.5117, "step": 1668 }, { "epoch": 1.3702791461412152, "grad_norm": 0.01635984145104885, "learning_rate": 4.6820075413523915e-06, "loss": 0.4932, "step": 1669 }, { "epoch": 1.3711001642036125, "grad_norm": 0.01641320437192917, "learning_rate": 4.670863983826355e-06, "loss": 0.4966, "step": 1670 }, { "epoch": 1.3719211822660098, "grad_norm": 0.017103901132941246, "learning_rate": 4.65972966683487e-06, "loss": 0.491, "step": 1671 }, { "epoch": 1.3727422003284073, "grad_norm": 0.016268998384475708, "learning_rate": 4.648604609680838e-06, "loss": 0.5006, "step": 1672 }, { "epoch": 1.3735632183908046, "grad_norm": 0.015512177720665932, "learning_rate": 4.637488831651099e-06, "loss": 0.4889, "step": 1673 }, { "epoch": 1.374384236453202, "grad_norm": 0.016519444063305855, "learning_rate": 4.626382352016402e-06, "loss": 0.4987, "step": 1674 }, { "epoch": 1.3752052545155993, "grad_norm": 0.017181985080242157, "learning_rate": 4.615285190031378e-06, "loss": 0.478, "step": 1675 }, { "epoch": 1.3760262725779966, "grad_norm": 0.016253387555480003, "learning_rate": 4.60419736493451e-06, "loss": 0.4841, "step": 1676 }, { "epoch": 1.3768472906403941, "grad_norm": 0.01749732717871666, "learning_rate": 4.5931188959480846e-06, "loss": 0.5259, "step": 1677 }, { "epoch": 1.3776683087027914, "grad_norm": 0.016517337411642075, "learning_rate": 4.58204980227818e-06, "loss": 0.4814, "step": 1678 }, { "epoch": 1.3784893267651888, "grad_norm": 0.016344062983989716, "learning_rate": 4.570990103114613e-06, "loss": 0.5113, "step": 1679 }, { "epoch": 1.3793103448275863, "grad_norm": 0.01611160859465599, "learning_rate": 4.559939817630909e-06, "loss": 0.4956, "step": 1680 }, { "epoch": 1.3801313628899836, "grad_norm": 0.016290003433823586, "learning_rate": 4.54889896498429e-06, "loss": 0.5184, "step": 1681 }, { "epoch": 1.380952380952381, "grad_norm": 0.016223903745412827, "learning_rate": 4.5378675643156086e-06, "loss": 0.4935, "step": 1682 }, { "epoch": 1.3817733990147782, "grad_norm": 0.01697399653494358, "learning_rate": 4.526845634749334e-06, "loss": 0.4788, "step": 1683 }, { "epoch": 1.3825944170771756, "grad_norm": 0.016582271084189415, "learning_rate": 4.515833195393528e-06, "loss": 0.4687, "step": 1684 }, { "epoch": 1.383415435139573, "grad_norm": 0.01689249277114868, "learning_rate": 4.504830265339783e-06, "loss": 0.5055, "step": 1685 }, { "epoch": 1.3842364532019704, "grad_norm": 0.016160577535629272, "learning_rate": 4.493836863663219e-06, "loss": 0.4856, "step": 1686 }, { "epoch": 1.3850574712643677, "grad_norm": 0.015963392332196236, "learning_rate": 4.482853009422431e-06, "loss": 0.4967, "step": 1687 }, { "epoch": 1.3858784893267653, "grad_norm": 0.016536112874746323, "learning_rate": 4.471878721659457e-06, "loss": 0.5057, "step": 1688 }, { "epoch": 1.3866995073891626, "grad_norm": 0.01600906252861023, "learning_rate": 4.4609140193997615e-06, "loss": 0.4775, "step": 1689 }, { "epoch": 1.38752052545156, "grad_norm": 0.015677744522690773, "learning_rate": 4.4499589216521935e-06, "loss": 0.4774, "step": 1690 }, { "epoch": 1.3883415435139574, "grad_norm": 0.016767427325248718, "learning_rate": 4.439013447408927e-06, "loss": 0.5005, "step": 1691 }, { "epoch": 1.3891625615763548, "grad_norm": 0.01600239798426628, "learning_rate": 4.42807761564548e-06, "loss": 0.4933, "step": 1692 }, { "epoch": 1.389983579638752, "grad_norm": 0.01675335504114628, "learning_rate": 4.417151445320638e-06, "loss": 0.4764, "step": 1693 }, { "epoch": 1.3908045977011494, "grad_norm": 0.016013359650969505, "learning_rate": 4.406234955376447e-06, "loss": 0.4856, "step": 1694 }, { "epoch": 1.3916256157635467, "grad_norm": 0.016268085688352585, "learning_rate": 4.395328164738163e-06, "loss": 0.5238, "step": 1695 }, { "epoch": 1.3924466338259442, "grad_norm": 0.01726417802274227, "learning_rate": 4.384431092314228e-06, "loss": 0.5012, "step": 1696 }, { "epoch": 1.3932676518883416, "grad_norm": 0.016475891694426537, "learning_rate": 4.373543756996235e-06, "loss": 0.4933, "step": 1697 }, { "epoch": 1.3940886699507389, "grad_norm": 0.016244657337665558, "learning_rate": 4.362666177658913e-06, "loss": 0.4984, "step": 1698 }, { "epoch": 1.3949096880131364, "grad_norm": 0.01574135012924671, "learning_rate": 4.3517983731600455e-06, "loss": 0.5148, "step": 1699 }, { "epoch": 1.3957307060755337, "grad_norm": 0.016135117039084435, "learning_rate": 4.340940362340496e-06, "loss": 0.4955, "step": 1700 }, { "epoch": 1.396551724137931, "grad_norm": 0.015914611518383026, "learning_rate": 4.330092164024146e-06, "loss": 0.4785, "step": 1701 }, { "epoch": 1.3973727422003284, "grad_norm": 0.016355788335204124, "learning_rate": 4.319253797017856e-06, "loss": 0.525, "step": 1702 }, { "epoch": 1.3981937602627257, "grad_norm": 0.015914790332317352, "learning_rate": 4.308425280111448e-06, "loss": 0.4709, "step": 1703 }, { "epoch": 1.3990147783251232, "grad_norm": 0.01742439717054367, "learning_rate": 4.297606632077665e-06, "loss": 0.498, "step": 1704 }, { "epoch": 1.3998357963875205, "grad_norm": 0.01618076115846634, "learning_rate": 4.286797871672144e-06, "loss": 0.5101, "step": 1705 }, { "epoch": 1.4006568144499179, "grad_norm": 0.015803616493940353, "learning_rate": 4.2759990176333845e-06, "loss": 0.476, "step": 1706 }, { "epoch": 1.4014778325123154, "grad_norm": 0.015950782224535942, "learning_rate": 4.265210088682704e-06, "loss": 0.4953, "step": 1707 }, { "epoch": 1.4022988505747127, "grad_norm": 0.015836119651794434, "learning_rate": 4.254431103524211e-06, "loss": 0.494, "step": 1708 }, { "epoch": 1.40311986863711, "grad_norm": 0.01610424928367138, "learning_rate": 4.24366208084479e-06, "loss": 0.4901, "step": 1709 }, { "epoch": 1.4039408866995073, "grad_norm": 0.01587124913930893, "learning_rate": 4.232903039314038e-06, "loss": 0.4833, "step": 1710 }, { "epoch": 1.4047619047619047, "grad_norm": 0.017478501424193382, "learning_rate": 4.222153997584254e-06, "loss": 0.4904, "step": 1711 }, { "epoch": 1.4055829228243022, "grad_norm": 0.01637875661253929, "learning_rate": 4.211414974290407e-06, "loss": 0.5341, "step": 1712 }, { "epoch": 1.4064039408866995, "grad_norm": 0.016793059185147285, "learning_rate": 4.200685988050086e-06, "loss": 0.4957, "step": 1713 }, { "epoch": 1.4072249589490968, "grad_norm": 0.016692323610186577, "learning_rate": 4.189967057463492e-06, "loss": 0.4938, "step": 1714 }, { "epoch": 1.4080459770114944, "grad_norm": 0.01645640842616558, "learning_rate": 4.179258201113385e-06, "loss": 0.5114, "step": 1715 }, { "epoch": 1.4088669950738917, "grad_norm": 0.016591472551226616, "learning_rate": 4.168559437565058e-06, "loss": 0.511, "step": 1716 }, { "epoch": 1.409688013136289, "grad_norm": 0.01690101996064186, "learning_rate": 4.157870785366315e-06, "loss": 0.5021, "step": 1717 }, { "epoch": 1.4105090311986863, "grad_norm": 0.015816153958439827, "learning_rate": 4.147192263047423e-06, "loss": 0.5036, "step": 1718 }, { "epoch": 1.4113300492610836, "grad_norm": 0.015660235658288002, "learning_rate": 4.136523889121094e-06, "loss": 0.49, "step": 1719 }, { "epoch": 1.4121510673234812, "grad_norm": 0.016461890190839767, "learning_rate": 4.125865682082443e-06, "loss": 0.5098, "step": 1720 }, { "epoch": 1.4129720853858785, "grad_norm": 0.016515564173460007, "learning_rate": 4.115217660408953e-06, "loss": 0.5033, "step": 1721 }, { "epoch": 1.4137931034482758, "grad_norm": 0.016689851880073547, "learning_rate": 4.104579842560462e-06, "loss": 0.4998, "step": 1722 }, { "epoch": 1.4146141215106733, "grad_norm": 0.016054656356573105, "learning_rate": 4.093952246979111e-06, "loss": 0.4996, "step": 1723 }, { "epoch": 1.4154351395730707, "grad_norm": 0.016394807025790215, "learning_rate": 4.083334892089313e-06, "loss": 0.4842, "step": 1724 }, { "epoch": 1.416256157635468, "grad_norm": 0.016269566491246223, "learning_rate": 4.072727796297741e-06, "loss": 0.4867, "step": 1725 }, { "epoch": 1.4170771756978653, "grad_norm": 0.016568703576922417, "learning_rate": 4.062130977993276e-06, "loss": 0.5087, "step": 1726 }, { "epoch": 1.4178981937602626, "grad_norm": 0.016157176345586777, "learning_rate": 4.051544455546981e-06, "loss": 0.4828, "step": 1727 }, { "epoch": 1.4187192118226601, "grad_norm": 0.015998512506484985, "learning_rate": 4.040968247312068e-06, "loss": 0.5108, "step": 1728 }, { "epoch": 1.4195402298850575, "grad_norm": 0.015671899542212486, "learning_rate": 4.030402371623866e-06, "loss": 0.4914, "step": 1729 }, { "epoch": 1.4203612479474548, "grad_norm": 0.01632794551551342, "learning_rate": 4.019846846799802e-06, "loss": 0.4815, "step": 1730 }, { "epoch": 1.4211822660098523, "grad_norm": 0.016358893364667892, "learning_rate": 4.009301691139349e-06, "loss": 0.4819, "step": 1731 }, { "epoch": 1.4220032840722496, "grad_norm": 0.01576661318540573, "learning_rate": 3.998766922924002e-06, "loss": 0.4787, "step": 1732 }, { "epoch": 1.422824302134647, "grad_norm": 0.015424797311425209, "learning_rate": 3.988242560417254e-06, "loss": 0.4996, "step": 1733 }, { "epoch": 1.4236453201970443, "grad_norm": 0.016595851629972458, "learning_rate": 3.97772862186456e-06, "loss": 0.4895, "step": 1734 }, { "epoch": 1.4244663382594416, "grad_norm": 0.016807885840535164, "learning_rate": 3.967225125493297e-06, "loss": 0.5292, "step": 1735 }, { "epoch": 1.4252873563218391, "grad_norm": 0.016221603378653526, "learning_rate": 3.956732089512737e-06, "loss": 0.4872, "step": 1736 }, { "epoch": 1.4261083743842364, "grad_norm": 0.015987025573849678, "learning_rate": 3.946249532114029e-06, "loss": 0.5042, "step": 1737 }, { "epoch": 1.4269293924466337, "grad_norm": 0.015703381970524788, "learning_rate": 3.935777471470144e-06, "loss": 0.4945, "step": 1738 }, { "epoch": 1.4277504105090313, "grad_norm": 0.016541115939617157, "learning_rate": 3.925315925735868e-06, "loss": 0.5285, "step": 1739 }, { "epoch": 1.4285714285714286, "grad_norm": 0.01626407355070114, "learning_rate": 3.914864913047739e-06, "loss": 0.5053, "step": 1740 }, { "epoch": 1.429392446633826, "grad_norm": 0.01582110859453678, "learning_rate": 3.904424451524051e-06, "loss": 0.4845, "step": 1741 }, { "epoch": 1.4302134646962232, "grad_norm": 0.01587524265050888, "learning_rate": 3.893994559264805e-06, "loss": 0.4836, "step": 1742 }, { "epoch": 1.4310344827586206, "grad_norm": 0.015770789235830307, "learning_rate": 3.883575254351672e-06, "loss": 0.4914, "step": 1743 }, { "epoch": 1.431855500821018, "grad_norm": 0.015490228310227394, "learning_rate": 3.873166554847966e-06, "loss": 0.5003, "step": 1744 }, { "epoch": 1.4326765188834154, "grad_norm": 0.01637321338057518, "learning_rate": 3.862768478798628e-06, "loss": 0.5066, "step": 1745 }, { "epoch": 1.4334975369458127, "grad_norm": 0.016110209748148918, "learning_rate": 3.852381044230168e-06, "loss": 0.4895, "step": 1746 }, { "epoch": 1.4343185550082103, "grad_norm": 0.01552515011280775, "learning_rate": 3.8420042691506575e-06, "loss": 0.5038, "step": 1747 }, { "epoch": 1.4351395730706076, "grad_norm": 0.01634242758154869, "learning_rate": 3.831638171549681e-06, "loss": 0.4887, "step": 1748 }, { "epoch": 1.435960591133005, "grad_norm": 0.01628069020807743, "learning_rate": 3.8212827693983145e-06, "loss": 0.4861, "step": 1749 }, { "epoch": 1.4367816091954024, "grad_norm": 0.015853237360715866, "learning_rate": 3.810938080649097e-06, "loss": 0.4931, "step": 1750 }, { "epoch": 1.4376026272577997, "grad_norm": 0.01576998084783554, "learning_rate": 3.8006041232359862e-06, "loss": 0.5133, "step": 1751 }, { "epoch": 1.438423645320197, "grad_norm": 0.015522085130214691, "learning_rate": 3.7902809150743384e-06, "loss": 0.4769, "step": 1752 }, { "epoch": 1.4392446633825944, "grad_norm": 0.01570752263069153, "learning_rate": 3.77996847406088e-06, "loss": 0.4718, "step": 1753 }, { "epoch": 1.4400656814449917, "grad_norm": 0.015881160274147987, "learning_rate": 3.769666818073661e-06, "loss": 0.4996, "step": 1754 }, { "epoch": 1.4408866995073892, "grad_norm": 0.01594473421573639, "learning_rate": 3.7593759649720457e-06, "loss": 0.4775, "step": 1755 }, { "epoch": 1.4417077175697866, "grad_norm": 0.0167029220610857, "learning_rate": 3.749095932596661e-06, "loss": 0.4836, "step": 1756 }, { "epoch": 1.4425287356321839, "grad_norm": 0.01599602960050106, "learning_rate": 3.738826738769374e-06, "loss": 0.4749, "step": 1757 }, { "epoch": 1.4433497536945814, "grad_norm": 0.01651759073138237, "learning_rate": 3.728568401293267e-06, "loss": 0.4891, "step": 1758 }, { "epoch": 1.4441707717569787, "grad_norm": 0.016692008823156357, "learning_rate": 3.7183209379526085e-06, "loss": 0.5032, "step": 1759 }, { "epoch": 1.444991789819376, "grad_norm": 0.015688546001911163, "learning_rate": 3.7080843665127924e-06, "loss": 0.4919, "step": 1760 }, { "epoch": 1.4458128078817734, "grad_norm": 0.016200270503759384, "learning_rate": 3.6978587047203527e-06, "loss": 0.4899, "step": 1761 }, { "epoch": 1.4466338259441707, "grad_norm": 0.01649155095219612, "learning_rate": 3.6876439703028953e-06, "loss": 0.4964, "step": 1762 }, { "epoch": 1.4474548440065682, "grad_norm": 0.01650436967611313, "learning_rate": 3.6774401809690936e-06, "loss": 0.5139, "step": 1763 }, { "epoch": 1.4482758620689655, "grad_norm": 0.01594744436442852, "learning_rate": 3.667247354408637e-06, "loss": 0.4967, "step": 1764 }, { "epoch": 1.4490968801313628, "grad_norm": 0.01667179726064205, "learning_rate": 3.65706550829221e-06, "loss": 0.5046, "step": 1765 }, { "epoch": 1.4499178981937604, "grad_norm": 0.015912609174847603, "learning_rate": 3.6468946602714663e-06, "loss": 0.5011, "step": 1766 }, { "epoch": 1.4507389162561577, "grad_norm": 0.016103865578770638, "learning_rate": 3.6367348279789965e-06, "loss": 0.4924, "step": 1767 }, { "epoch": 1.451559934318555, "grad_norm": 0.016113216057419777, "learning_rate": 3.6265860290282754e-06, "loss": 0.4678, "step": 1768 }, { "epoch": 1.4523809523809523, "grad_norm": 0.01774243265390396, "learning_rate": 3.6164482810136693e-06, "loss": 0.4965, "step": 1769 }, { "epoch": 1.4532019704433496, "grad_norm": 0.015887869521975517, "learning_rate": 3.6063216015103793e-06, "loss": 0.5041, "step": 1770 }, { "epoch": 1.4540229885057472, "grad_norm": 0.01597549580037594, "learning_rate": 3.5962060080744184e-06, "loss": 0.4911, "step": 1771 }, { "epoch": 1.4548440065681445, "grad_norm": 0.015385903418064117, "learning_rate": 3.5861015182425755e-06, "loss": 0.4569, "step": 1772 }, { "epoch": 1.4556650246305418, "grad_norm": 0.01629272662103176, "learning_rate": 3.5760081495323928e-06, "loss": 0.4982, "step": 1773 }, { "epoch": 1.4564860426929394, "grad_norm": 0.01573648676276207, "learning_rate": 3.565925919442135e-06, "loss": 0.4903, "step": 1774 }, { "epoch": 1.4573070607553367, "grad_norm": 0.01727268286049366, "learning_rate": 3.55585484545076e-06, "loss": 0.5046, "step": 1775 }, { "epoch": 1.458128078817734, "grad_norm": 0.015656432136893272, "learning_rate": 3.5457949450178747e-06, "loss": 0.5045, "step": 1776 }, { "epoch": 1.4589490968801313, "grad_norm": 0.0159356240183115, "learning_rate": 3.5357462355837183e-06, "loss": 0.4901, "step": 1777 }, { "epoch": 1.4597701149425286, "grad_norm": 0.01603950560092926, "learning_rate": 3.5257087345691376e-06, "loss": 0.5004, "step": 1778 }, { "epoch": 1.4605911330049262, "grad_norm": 0.015506523661315441, "learning_rate": 3.515682459375538e-06, "loss": 0.4912, "step": 1779 }, { "epoch": 1.4614121510673235, "grad_norm": 0.015681063756346703, "learning_rate": 3.505667427384861e-06, "loss": 0.484, "step": 1780 }, { "epoch": 1.4622331691297208, "grad_norm": 0.021944429725408554, "learning_rate": 3.495663655959572e-06, "loss": 0.4718, "step": 1781 }, { "epoch": 1.4630541871921183, "grad_norm": 0.015665240585803986, "learning_rate": 3.4856711624425948e-06, "loss": 0.4718, "step": 1782 }, { "epoch": 1.4638752052545156, "grad_norm": 0.015869904309511185, "learning_rate": 3.475689964157321e-06, "loss": 0.4975, "step": 1783 }, { "epoch": 1.464696223316913, "grad_norm": 0.015804462134838104, "learning_rate": 3.4657200784075456e-06, "loss": 0.4927, "step": 1784 }, { "epoch": 1.4655172413793103, "grad_norm": 0.015700949355959892, "learning_rate": 3.455761522477454e-06, "loss": 0.4891, "step": 1785 }, { "epoch": 1.4663382594417076, "grad_norm": 0.016060061752796173, "learning_rate": 3.445814313631598e-06, "loss": 0.4837, "step": 1786 }, { "epoch": 1.4671592775041051, "grad_norm": 0.016152681782841682, "learning_rate": 3.435878469114847e-06, "loss": 0.4935, "step": 1787 }, { "epoch": 1.4679802955665024, "grad_norm": 0.01628580503165722, "learning_rate": 3.42595400615238e-06, "loss": 0.5027, "step": 1788 }, { "epoch": 1.4688013136288998, "grad_norm": 0.016248255968093872, "learning_rate": 3.4160409419496355e-06, "loss": 0.4926, "step": 1789 }, { "epoch": 1.4696223316912973, "grad_norm": 0.016340939328074455, "learning_rate": 3.4061392936922898e-06, "loss": 0.5056, "step": 1790 }, { "epoch": 1.4704433497536946, "grad_norm": 0.015700742602348328, "learning_rate": 3.3962490785462386e-06, "loss": 0.4661, "step": 1791 }, { "epoch": 1.471264367816092, "grad_norm": 0.017287708818912506, "learning_rate": 3.3863703136575454e-06, "loss": 0.5054, "step": 1792 }, { "epoch": 1.4720853858784892, "grad_norm": 0.016118528321385384, "learning_rate": 3.376503016152427e-06, "loss": 0.4922, "step": 1793 }, { "epoch": 1.4729064039408866, "grad_norm": 0.031863413751125336, "learning_rate": 3.3666472031372247e-06, "loss": 0.5076, "step": 1794 }, { "epoch": 1.473727422003284, "grad_norm": 0.017524005845189095, "learning_rate": 3.356802891698362e-06, "loss": 0.5128, "step": 1795 }, { "epoch": 1.4745484400656814, "grad_norm": 0.015434886328876019, "learning_rate": 3.346970098902329e-06, "loss": 0.4652, "step": 1796 }, { "epoch": 1.4753694581280787, "grad_norm": 0.016214722767472267, "learning_rate": 3.3371488417956445e-06, "loss": 0.4949, "step": 1797 }, { "epoch": 1.4761904761904763, "grad_norm": 0.017616940662264824, "learning_rate": 3.327339137404822e-06, "loss": 0.4793, "step": 1798 }, { "epoch": 1.4770114942528736, "grad_norm": 0.01642577536404133, "learning_rate": 3.3175410027363574e-06, "loss": 0.493, "step": 1799 }, { "epoch": 1.477832512315271, "grad_norm": 0.016006316989660263, "learning_rate": 3.3077544547766916e-06, "loss": 0.5026, "step": 1800 }, { "epoch": 1.4786535303776684, "grad_norm": 0.016360443085432053, "learning_rate": 3.297979510492157e-06, "loss": 0.4897, "step": 1801 }, { "epoch": 1.4794745484400658, "grad_norm": 0.016926873475313187, "learning_rate": 3.2882161868289913e-06, "loss": 0.4936, "step": 1802 }, { "epoch": 1.480295566502463, "grad_norm": 0.016755202785134315, "learning_rate": 3.278464500713281e-06, "loss": 0.4856, "step": 1803 }, { "epoch": 1.4811165845648604, "grad_norm": 0.016006631776690483, "learning_rate": 3.2687244690509307e-06, "loss": 0.4978, "step": 1804 }, { "epoch": 1.4819376026272577, "grad_norm": 0.018267512321472168, "learning_rate": 3.2589961087276457e-06, "loss": 0.51, "step": 1805 }, { "epoch": 1.4827586206896552, "grad_norm": 0.018182098865509033, "learning_rate": 3.2492794366088917e-06, "loss": 0.4726, "step": 1806 }, { "epoch": 1.4835796387520526, "grad_norm": 0.016504261642694473, "learning_rate": 3.239574469539879e-06, "loss": 0.4873, "step": 1807 }, { "epoch": 1.4844006568144499, "grad_norm": 0.016546620056033134, "learning_rate": 3.2298812243455313e-06, "loss": 0.4952, "step": 1808 }, { "epoch": 1.4852216748768474, "grad_norm": 0.017086289823055267, "learning_rate": 3.220199717830425e-06, "loss": 0.4709, "step": 1809 }, { "epoch": 1.4860426929392447, "grad_norm": 0.01761605590581894, "learning_rate": 3.2105299667788113e-06, "loss": 0.4932, "step": 1810 }, { "epoch": 1.486863711001642, "grad_norm": 0.015749230980873108, "learning_rate": 3.200871987954557e-06, "loss": 0.4942, "step": 1811 }, { "epoch": 1.4876847290640394, "grad_norm": 0.01746760495007038, "learning_rate": 3.1912257981011114e-06, "loss": 0.4919, "step": 1812 }, { "epoch": 1.4885057471264367, "grad_norm": 0.01610916666686535, "learning_rate": 3.181591413941487e-06, "loss": 0.486, "step": 1813 }, { "epoch": 1.4893267651888342, "grad_norm": 0.016016118228435516, "learning_rate": 3.1719688521782403e-06, "loss": 0.5081, "step": 1814 }, { "epoch": 1.4901477832512315, "grad_norm": 0.017170250415802002, "learning_rate": 3.162358129493419e-06, "loss": 0.5306, "step": 1815 }, { "epoch": 1.4909688013136289, "grad_norm": 0.016354048624634743, "learning_rate": 3.152759262548561e-06, "loss": 0.5063, "step": 1816 }, { "epoch": 1.4917898193760264, "grad_norm": 0.017313700169324875, "learning_rate": 3.1431722679846276e-06, "loss": 0.5038, "step": 1817 }, { "epoch": 1.4926108374384237, "grad_norm": 0.015850329771637917, "learning_rate": 3.1335971624220202e-06, "loss": 0.5035, "step": 1818 }, { "epoch": 1.493431855500821, "grad_norm": 0.016181841492652893, "learning_rate": 3.124033962460522e-06, "loss": 0.5261, "step": 1819 }, { "epoch": 1.4942528735632183, "grad_norm": 0.01592717319726944, "learning_rate": 3.114482684679273e-06, "loss": 0.4992, "step": 1820 }, { "epoch": 1.4950738916256157, "grad_norm": 0.016473127529025078, "learning_rate": 3.104943345636741e-06, "loss": 0.4973, "step": 1821 }, { "epoch": 1.4958949096880132, "grad_norm": 0.017254067584872246, "learning_rate": 3.0954159618707103e-06, "loss": 0.4893, "step": 1822 }, { "epoch": 1.4967159277504105, "grad_norm": 0.01633891835808754, "learning_rate": 3.085900549898221e-06, "loss": 0.4905, "step": 1823 }, { "epoch": 1.4975369458128078, "grad_norm": 0.016628887504339218, "learning_rate": 3.0763971262155744e-06, "loss": 0.5103, "step": 1824 }, { "epoch": 1.4983579638752054, "grad_norm": 0.016512641683220863, "learning_rate": 3.0669057072982796e-06, "loss": 0.5069, "step": 1825 }, { "epoch": 1.4991789819376027, "grad_norm": 0.015940312296152115, "learning_rate": 3.057426309601032e-06, "loss": 0.5138, "step": 1826 }, { "epoch": 1.5, "grad_norm": 0.015734445303678513, "learning_rate": 3.047958949557696e-06, "loss": 0.479, "step": 1827 }, { "epoch": 1.5008210180623975, "grad_norm": 0.015417561866343021, "learning_rate": 3.0385036435812594e-06, "loss": 0.4714, "step": 1828 }, { "epoch": 1.5016420361247946, "grad_norm": 0.01666441187262535, "learning_rate": 3.0290604080638115e-06, "loss": 0.5133, "step": 1829 }, { "epoch": 1.5024630541871922, "grad_norm": 0.015651021152734756, "learning_rate": 3.0196292593765225e-06, "loss": 0.4616, "step": 1830 }, { "epoch": 1.5032840722495895, "grad_norm": 0.016008486971259117, "learning_rate": 3.0102102138696014e-06, "loss": 0.5051, "step": 1831 }, { "epoch": 1.5041050903119868, "grad_norm": 0.015711242333054543, "learning_rate": 3.0008032878722833e-06, "loss": 0.4872, "step": 1832 }, { "epoch": 1.5049261083743843, "grad_norm": 0.015744850039482117, "learning_rate": 2.991408497692783e-06, "loss": 0.4839, "step": 1833 }, { "epoch": 1.5057471264367817, "grad_norm": 0.016017595306038857, "learning_rate": 2.9820258596182776e-06, "loss": 0.4945, "step": 1834 }, { "epoch": 1.506568144499179, "grad_norm": 0.015782902017235756, "learning_rate": 2.972655389914883e-06, "loss": 0.476, "step": 1835 }, { "epoch": 1.5073891625615765, "grad_norm": 0.015705162659287453, "learning_rate": 2.963297104827622e-06, "loss": 0.4727, "step": 1836 }, { "epoch": 1.5082101806239736, "grad_norm": 0.016282562166452408, "learning_rate": 2.9539510205803758e-06, "loss": 0.4806, "step": 1837 }, { "epoch": 1.5090311986863711, "grad_norm": 0.015788551419973373, "learning_rate": 2.9446171533758933e-06, "loss": 0.4603, "step": 1838 }, { "epoch": 1.5098522167487685, "grad_norm": 0.015066917985677719, "learning_rate": 2.9352955193957313e-06, "loss": 0.4436, "step": 1839 }, { "epoch": 1.5106732348111658, "grad_norm": 0.01571269892156124, "learning_rate": 2.9259861348002483e-06, "loss": 0.4843, "step": 1840 }, { "epoch": 1.5114942528735633, "grad_norm": 0.016046926379203796, "learning_rate": 2.916689015728559e-06, "loss": 0.4921, "step": 1841 }, { "epoch": 1.5123152709359606, "grad_norm": 0.016173269599676132, "learning_rate": 2.9074041782985122e-06, "loss": 0.4919, "step": 1842 }, { "epoch": 1.513136288998358, "grad_norm": 0.01609210856258869, "learning_rate": 2.8981316386066743e-06, "loss": 0.4903, "step": 1843 }, { "epoch": 1.5139573070607555, "grad_norm": 0.016541680321097374, "learning_rate": 2.888871412728289e-06, "loss": 0.4879, "step": 1844 }, { "epoch": 1.5147783251231526, "grad_norm": 0.01608441397547722, "learning_rate": 2.8796235167172464e-06, "loss": 0.4746, "step": 1845 }, { "epoch": 1.5155993431855501, "grad_norm": 0.015723232179880142, "learning_rate": 2.870387966606062e-06, "loss": 0.4823, "step": 1846 }, { "epoch": 1.5164203612479474, "grad_norm": 0.016490289941430092, "learning_rate": 2.8611647784058573e-06, "loss": 0.5127, "step": 1847 }, { "epoch": 1.5172413793103448, "grad_norm": 0.015327083878219128, "learning_rate": 2.8519539681063135e-06, "loss": 0.4723, "step": 1848 }, { "epoch": 1.5180623973727423, "grad_norm": 0.016062747687101364, "learning_rate": 2.8427555516756523e-06, "loss": 0.4999, "step": 1849 }, { "epoch": 1.5188834154351396, "grad_norm": 0.01587042212486267, "learning_rate": 2.8335695450606126e-06, "loss": 0.4622, "step": 1850 }, { "epoch": 1.519704433497537, "grad_norm": 0.015951644629240036, "learning_rate": 2.824395964186421e-06, "loss": 0.5122, "step": 1851 }, { "epoch": 1.5205254515599345, "grad_norm": 0.01601056195795536, "learning_rate": 2.8152348249567625e-06, "loss": 0.5121, "step": 1852 }, { "epoch": 1.5213464696223316, "grad_norm": 0.014851457439363003, "learning_rate": 2.8060861432537477e-06, "loss": 0.4562, "step": 1853 }, { "epoch": 1.522167487684729, "grad_norm": 0.015827039256691933, "learning_rate": 2.7969499349378894e-06, "loss": 0.4824, "step": 1854 }, { "epoch": 1.5229885057471264, "grad_norm": 0.01535165123641491, "learning_rate": 2.7878262158480855e-06, "loss": 0.4618, "step": 1855 }, { "epoch": 1.5238095238095237, "grad_norm": 0.015763266012072563, "learning_rate": 2.778715001801571e-06, "loss": 0.4647, "step": 1856 }, { "epoch": 1.5246305418719213, "grad_norm": 0.015846991911530495, "learning_rate": 2.7696163085939115e-06, "loss": 0.4693, "step": 1857 }, { "epoch": 1.5254515599343186, "grad_norm": 0.015390144661068916, "learning_rate": 2.7605301519989596e-06, "loss": 0.4808, "step": 1858 }, { "epoch": 1.526272577996716, "grad_norm": 0.016332169994711876, "learning_rate": 2.751456547768833e-06, "loss": 0.4987, "step": 1859 }, { "epoch": 1.5270935960591134, "grad_norm": 0.01603347435593605, "learning_rate": 2.742395511633895e-06, "loss": 0.512, "step": 1860 }, { "epoch": 1.5279146141215105, "grad_norm": 0.01630253717303276, "learning_rate": 2.733347059302715e-06, "loss": 0.5184, "step": 1861 }, { "epoch": 1.528735632183908, "grad_norm": 0.015461920760571957, "learning_rate": 2.7243112064620435e-06, "loss": 0.4791, "step": 1862 }, { "epoch": 1.5295566502463054, "grad_norm": 0.01554048340767622, "learning_rate": 2.7152879687767996e-06, "loss": 0.4739, "step": 1863 }, { "epoch": 1.5303776683087027, "grad_norm": 0.015708548948168755, "learning_rate": 2.7062773618900167e-06, "loss": 0.4754, "step": 1864 }, { "epoch": 1.5311986863711002, "grad_norm": 0.015938108786940575, "learning_rate": 2.6972794014228483e-06, "loss": 0.4961, "step": 1865 }, { "epoch": 1.5320197044334976, "grad_norm": 0.015506520867347717, "learning_rate": 2.6882941029745073e-06, "loss": 0.4674, "step": 1866 }, { "epoch": 1.5328407224958949, "grad_norm": 0.015804840251803398, "learning_rate": 2.679321482122263e-06, "loss": 0.4616, "step": 1867 }, { "epoch": 1.5336617405582924, "grad_norm": 0.01575363799929619, "learning_rate": 2.6703615544214086e-06, "loss": 0.4992, "step": 1868 }, { "epoch": 1.5344827586206895, "grad_norm": 0.016107680276036263, "learning_rate": 2.661414335405232e-06, "loss": 0.4978, "step": 1869 }, { "epoch": 1.535303776683087, "grad_norm": 0.01596527360379696, "learning_rate": 2.652479840584977e-06, "loss": 0.4928, "step": 1870 }, { "epoch": 1.5361247947454844, "grad_norm": 0.015927845612168312, "learning_rate": 2.643558085449845e-06, "loss": 0.5075, "step": 1871 }, { "epoch": 1.5369458128078817, "grad_norm": 0.015915559604763985, "learning_rate": 2.6346490854669365e-06, "loss": 0.4792, "step": 1872 }, { "epoch": 1.5377668308702792, "grad_norm": 0.016632167622447014, "learning_rate": 2.6257528560812533e-06, "loss": 0.4969, "step": 1873 }, { "epoch": 1.5385878489326765, "grad_norm": 0.015808328986167908, "learning_rate": 2.616869412715647e-06, "loss": 0.5, "step": 1874 }, { "epoch": 1.5394088669950738, "grad_norm": 0.0162336602807045, "learning_rate": 2.6079987707708045e-06, "loss": 0.4827, "step": 1875 }, { "epoch": 1.5402298850574714, "grad_norm": 0.015916477888822556, "learning_rate": 2.599140945625222e-06, "loss": 0.497, "step": 1876 }, { "epoch": 1.5410509031198685, "grad_norm": 0.015614441595971584, "learning_rate": 2.5902959526351834e-06, "loss": 0.4715, "step": 1877 }, { "epoch": 1.541871921182266, "grad_norm": 0.016244906932115555, "learning_rate": 2.581463807134706e-06, "loss": 0.4671, "step": 1878 }, { "epoch": 1.5426929392446633, "grad_norm": 0.016475308686494827, "learning_rate": 2.5726445244355523e-06, "loss": 0.4809, "step": 1879 }, { "epoch": 1.5435139573070606, "grad_norm": 0.015760285779833794, "learning_rate": 2.5638381198271813e-06, "loss": 0.4902, "step": 1880 }, { "epoch": 1.5443349753694582, "grad_norm": 0.016566110774874687, "learning_rate": 2.5550446085767233e-06, "loss": 0.5032, "step": 1881 }, { "epoch": 1.5451559934318555, "grad_norm": 0.016543706879019737, "learning_rate": 2.546264005928956e-06, "loss": 0.4919, "step": 1882 }, { "epoch": 1.5459770114942528, "grad_norm": 0.01645726151764393, "learning_rate": 2.5374963271062766e-06, "loss": 0.4904, "step": 1883 }, { "epoch": 1.5467980295566504, "grad_norm": 0.016130411997437477, "learning_rate": 2.5287415873086823e-06, "loss": 0.5082, "step": 1884 }, { "epoch": 1.5476190476190477, "grad_norm": 0.01631920598447323, "learning_rate": 2.5199998017137435e-06, "loss": 0.4786, "step": 1885 }, { "epoch": 1.548440065681445, "grad_norm": 0.017591379582881927, "learning_rate": 2.511270985476551e-06, "loss": 0.4876, "step": 1886 }, { "epoch": 1.5492610837438425, "grad_norm": 0.015623513609170914, "learning_rate": 2.5025551537297344e-06, "loss": 0.4817, "step": 1887 }, { "epoch": 1.5500821018062396, "grad_norm": 0.01776067167520523, "learning_rate": 2.4938523215834055e-06, "loss": 0.4763, "step": 1888 }, { "epoch": 1.5509031198686372, "grad_norm": 0.01697317697107792, "learning_rate": 2.4851625041251363e-06, "loss": 0.483, "step": 1889 }, { "epoch": 1.5517241379310345, "grad_norm": 0.015433291904628277, "learning_rate": 2.476485716419934e-06, "loss": 0.4567, "step": 1890 }, { "epoch": 1.5525451559934318, "grad_norm": 0.015932945534586906, "learning_rate": 2.4678219735102284e-06, "loss": 0.5191, "step": 1891 }, { "epoch": 1.5533661740558293, "grad_norm": 0.015454375185072422, "learning_rate": 2.4591712904158186e-06, "loss": 0.4622, "step": 1892 }, { "epoch": 1.5541871921182266, "grad_norm": 0.01569574512541294, "learning_rate": 2.4505336821338804e-06, "loss": 0.4766, "step": 1893 }, { "epoch": 1.555008210180624, "grad_norm": 0.016376560553908348, "learning_rate": 2.4419091636389023e-06, "loss": 0.4763, "step": 1894 }, { "epoch": 1.5558292282430215, "grad_norm": 0.016491059213876724, "learning_rate": 2.433297749882697e-06, "loss": 0.4873, "step": 1895 }, { "epoch": 1.5566502463054186, "grad_norm": 0.015268388204276562, "learning_rate": 2.424699455794353e-06, "loss": 0.4677, "step": 1896 }, { "epoch": 1.5574712643678161, "grad_norm": 0.015677891671657562, "learning_rate": 2.4161142962802097e-06, "loss": 0.4781, "step": 1897 }, { "epoch": 1.5582922824302134, "grad_norm": 0.015629008412361145, "learning_rate": 2.407542286223839e-06, "loss": 0.4862, "step": 1898 }, { "epoch": 1.5591133004926108, "grad_norm": 0.016713371500372887, "learning_rate": 2.398983440486019e-06, "loss": 0.4724, "step": 1899 }, { "epoch": 1.5599343185550083, "grad_norm": 0.014949311502277851, "learning_rate": 2.3904377739047e-06, "loss": 0.4654, "step": 1900 }, { "epoch": 1.5607553366174056, "grad_norm": 0.01566718891263008, "learning_rate": 2.3819053012949916e-06, "loss": 0.4777, "step": 1901 }, { "epoch": 1.561576354679803, "grad_norm": 0.016346188262104988, "learning_rate": 2.373386037449124e-06, "loss": 0.4853, "step": 1902 }, { "epoch": 1.5623973727422005, "grad_norm": 0.015849128365516663, "learning_rate": 2.364879997136426e-06, "loss": 0.4877, "step": 1903 }, { "epoch": 1.5632183908045976, "grad_norm": 0.017064644023776054, "learning_rate": 2.356387195103311e-06, "loss": 0.4868, "step": 1904 }, { "epoch": 1.564039408866995, "grad_norm": 0.01621677353978157, "learning_rate": 2.347907646073235e-06, "loss": 0.4958, "step": 1905 }, { "epoch": 1.5648604269293924, "grad_norm": 0.0162211824208498, "learning_rate": 2.339441364746674e-06, "loss": 0.4895, "step": 1906 }, { "epoch": 1.5656814449917897, "grad_norm": 0.016178473830223083, "learning_rate": 2.3309883658011172e-06, "loss": 0.5264, "step": 1907 }, { "epoch": 1.5665024630541873, "grad_norm": 0.016229422762989998, "learning_rate": 2.32254866389101e-06, "loss": 0.4777, "step": 1908 }, { "epoch": 1.5673234811165846, "grad_norm": 0.015940597280859947, "learning_rate": 2.3141222736477585e-06, "loss": 0.506, "step": 1909 }, { "epoch": 1.568144499178982, "grad_norm": 0.01580042764544487, "learning_rate": 2.3057092096796847e-06, "loss": 0.4771, "step": 1910 }, { "epoch": 1.5689655172413794, "grad_norm": 0.015447542071342468, "learning_rate": 2.297309486572006e-06, "loss": 0.4814, "step": 1911 }, { "epoch": 1.5697865353037765, "grad_norm": 0.015568530187010765, "learning_rate": 2.2889231188868164e-06, "loss": 0.4649, "step": 1912 }, { "epoch": 1.570607553366174, "grad_norm": 0.015696143731474876, "learning_rate": 2.280550121163058e-06, "loss": 0.474, "step": 1913 }, { "epoch": 1.5714285714285714, "grad_norm": 0.015379379503428936, "learning_rate": 2.2721905079164884e-06, "loss": 0.4802, "step": 1914 }, { "epoch": 1.5722495894909687, "grad_norm": 0.015729527920484543, "learning_rate": 2.2638442936396614e-06, "loss": 0.4723, "step": 1915 }, { "epoch": 1.5730706075533663, "grad_norm": 0.0155901238322258, "learning_rate": 2.255511492801904e-06, "loss": 0.4842, "step": 1916 }, { "epoch": 1.5738916256157636, "grad_norm": 0.01583964377641678, "learning_rate": 2.2471921198492947e-06, "loss": 0.4874, "step": 1917 }, { "epoch": 1.5747126436781609, "grad_norm": 0.015598983503878117, "learning_rate": 2.2388861892046225e-06, "loss": 0.4662, "step": 1918 }, { "epoch": 1.5755336617405584, "grad_norm": 0.016408590599894524, "learning_rate": 2.2305937152673753e-06, "loss": 0.4903, "step": 1919 }, { "epoch": 1.5763546798029555, "grad_norm": 0.017679080367088318, "learning_rate": 2.2223147124137175e-06, "loss": 0.4879, "step": 1920 }, { "epoch": 1.577175697865353, "grad_norm": 0.015338685363531113, "learning_rate": 2.214049194996458e-06, "loss": 0.4888, "step": 1921 }, { "epoch": 1.5779967159277504, "grad_norm": 0.015879876911640167, "learning_rate": 2.2057971773450206e-06, "loss": 0.4959, "step": 1922 }, { "epoch": 1.5788177339901477, "grad_norm": 0.01575079932808876, "learning_rate": 2.1975586737654288e-06, "loss": 0.4832, "step": 1923 }, { "epoch": 1.5796387520525452, "grad_norm": 0.01565314084291458, "learning_rate": 2.1893336985402834e-06, "loss": 0.4934, "step": 1924 }, { "epoch": 1.5804597701149425, "grad_norm": 0.015439679846167564, "learning_rate": 2.1811222659287195e-06, "loss": 0.4788, "step": 1925 }, { "epoch": 1.5812807881773399, "grad_norm": 0.016545888036489487, "learning_rate": 2.1729243901664108e-06, "loss": 0.5017, "step": 1926 }, { "epoch": 1.5821018062397374, "grad_norm": 0.016459206119179726, "learning_rate": 2.164740085465508e-06, "loss": 0.4986, "step": 1927 }, { "epoch": 1.5829228243021345, "grad_norm": 0.015283244661986828, "learning_rate": 2.1565693660146506e-06, "loss": 0.464, "step": 1928 }, { "epoch": 1.583743842364532, "grad_norm": 0.015454614534974098, "learning_rate": 2.148412245978923e-06, "loss": 0.447, "step": 1929 }, { "epoch": 1.5845648604269293, "grad_norm": 0.01652132347226143, "learning_rate": 2.140268739499829e-06, "loss": 0.5035, "step": 1930 }, { "epoch": 1.5853858784893267, "grad_norm": 0.015475360676646233, "learning_rate": 2.1321388606952707e-06, "loss": 0.4819, "step": 1931 }, { "epoch": 1.5862068965517242, "grad_norm": 0.0158186387270689, "learning_rate": 2.1240226236595337e-06, "loss": 0.4927, "step": 1932 }, { "epoch": 1.5870279146141215, "grad_norm": 0.015229251235723495, "learning_rate": 2.11592004246324e-06, "loss": 0.4895, "step": 1933 }, { "epoch": 1.5878489326765188, "grad_norm": 0.015820791944861412, "learning_rate": 2.107831131153352e-06, "loss": 0.4859, "step": 1934 }, { "epoch": 1.5886699507389164, "grad_norm": 0.015486907213926315, "learning_rate": 2.0997559037531224e-06, "loss": 0.4809, "step": 1935 }, { "epoch": 1.5894909688013135, "grad_norm": 0.015888821333646774, "learning_rate": 2.0916943742620825e-06, "loss": 0.4937, "step": 1936 }, { "epoch": 1.590311986863711, "grad_norm": 0.016018465161323547, "learning_rate": 2.0836465566560242e-06, "loss": 0.48, "step": 1937 }, { "epoch": 1.5911330049261085, "grad_norm": 0.01598304696381092, "learning_rate": 2.0756124648869595e-06, "loss": 0.4945, "step": 1938 }, { "epoch": 1.5919540229885056, "grad_norm": 0.015892794355750084, "learning_rate": 2.0675921128831035e-06, "loss": 0.4725, "step": 1939 }, { "epoch": 1.5927750410509032, "grad_norm": 0.015462912619113922, "learning_rate": 2.059585514548864e-06, "loss": 0.5079, "step": 1940 }, { "epoch": 1.5935960591133005, "grad_norm": 0.01626044511795044, "learning_rate": 2.051592683764788e-06, "loss": 0.5102, "step": 1941 }, { "epoch": 1.5944170771756978, "grad_norm": 0.01545952819287777, "learning_rate": 2.04361363438757e-06, "loss": 0.4818, "step": 1942 }, { "epoch": 1.5952380952380953, "grad_norm": 0.01623893901705742, "learning_rate": 2.035648380250003e-06, "loss": 0.5058, "step": 1943 }, { "epoch": 1.5960591133004927, "grad_norm": 0.015128539875149727, "learning_rate": 2.0276969351609626e-06, "loss": 0.4735, "step": 1944 }, { "epoch": 1.59688013136289, "grad_norm": 0.016033314168453217, "learning_rate": 2.0197593129053917e-06, "loss": 0.4806, "step": 1945 }, { "epoch": 1.5977011494252875, "grad_norm": 0.015621437691152096, "learning_rate": 2.0118355272442718e-06, "loss": 0.4814, "step": 1946 }, { "epoch": 1.5985221674876846, "grad_norm": 0.015524248592555523, "learning_rate": 2.00392559191458e-06, "loss": 0.4919, "step": 1947 }, { "epoch": 1.5993431855500821, "grad_norm": 0.01565590687096119, "learning_rate": 1.996029520629302e-06, "loss": 0.4849, "step": 1948 }, { "epoch": 1.6001642036124795, "grad_norm": 0.016836225986480713, "learning_rate": 1.9881473270773717e-06, "loss": 0.4925, "step": 1949 }, { "epoch": 1.6009852216748768, "grad_norm": 0.016094274818897247, "learning_rate": 1.9802790249236785e-06, "loss": 0.4756, "step": 1950 }, { "epoch": 1.6018062397372743, "grad_norm": 0.015539838932454586, "learning_rate": 1.9724246278090194e-06, "loss": 0.4794, "step": 1951 }, { "epoch": 1.6026272577996716, "grad_norm": 0.016037996858358383, "learning_rate": 1.964584149350084e-06, "loss": 0.5027, "step": 1952 }, { "epoch": 1.603448275862069, "grad_norm": 0.01562398299574852, "learning_rate": 1.95675760313944e-06, "loss": 0.4705, "step": 1953 }, { "epoch": 1.6042692939244665, "grad_norm": 0.015694711357355118, "learning_rate": 1.9489450027455017e-06, "loss": 0.4753, "step": 1954 }, { "epoch": 1.6050903119868636, "grad_norm": 0.015679875388741493, "learning_rate": 1.941146361712492e-06, "loss": 0.4951, "step": 1955 }, { "epoch": 1.6059113300492611, "grad_norm": 0.015887128189206123, "learning_rate": 1.9333616935604485e-06, "loss": 0.4899, "step": 1956 }, { "epoch": 1.6067323481116584, "grad_norm": 0.015506242401897907, "learning_rate": 1.925591011785182e-06, "loss": 0.4651, "step": 1957 }, { "epoch": 1.6075533661740558, "grad_norm": 0.01599552296102047, "learning_rate": 1.917834329858251e-06, "loss": 0.4801, "step": 1958 }, { "epoch": 1.6083743842364533, "grad_norm": 0.01593782566487789, "learning_rate": 1.910091661226946e-06, "loss": 0.4914, "step": 1959 }, { "epoch": 1.6091954022988506, "grad_norm": 0.01553129032254219, "learning_rate": 1.9023630193142617e-06, "loss": 0.4668, "step": 1960 }, { "epoch": 1.610016420361248, "grad_norm": 0.0158458910882473, "learning_rate": 1.8946484175188797e-06, "loss": 0.4867, "step": 1961 }, { "epoch": 1.6108374384236455, "grad_norm": 0.015690842643380165, "learning_rate": 1.88694786921514e-06, "loss": 0.4904, "step": 1962 }, { "epoch": 1.6116584564860426, "grad_norm": 0.016395214945077896, "learning_rate": 1.879261387753017e-06, "loss": 0.4826, "step": 1963 }, { "epoch": 1.61247947454844, "grad_norm": 0.017209570854902267, "learning_rate": 1.8715889864580958e-06, "loss": 0.482, "step": 1964 }, { "epoch": 1.6133004926108374, "grad_norm": 0.015885401517152786, "learning_rate": 1.863930678631558e-06, "loss": 0.5148, "step": 1965 }, { "epoch": 1.6141215106732347, "grad_norm": 0.015720441937446594, "learning_rate": 1.8562864775501482e-06, "loss": 0.4767, "step": 1966 }, { "epoch": 1.6149425287356323, "grad_norm": 0.015652090311050415, "learning_rate": 1.8486563964661536e-06, "loss": 0.4648, "step": 1967 }, { "epoch": 1.6157635467980296, "grad_norm": 0.015637246891856194, "learning_rate": 1.8410404486073868e-06, "loss": 0.5051, "step": 1968 }, { "epoch": 1.616584564860427, "grad_norm": 0.01548423059284687, "learning_rate": 1.8334386471771548e-06, "loss": 0.4817, "step": 1969 }, { "epoch": 1.6174055829228244, "grad_norm": 0.015508806332945824, "learning_rate": 1.825851005354242e-06, "loss": 0.4818, "step": 1970 }, { "epoch": 1.6182266009852215, "grad_norm": 0.016123181208968163, "learning_rate": 1.8182775362928856e-06, "loss": 0.4982, "step": 1971 }, { "epoch": 1.619047619047619, "grad_norm": 0.015823356807231903, "learning_rate": 1.8107182531227468e-06, "loss": 0.4959, "step": 1972 }, { "epoch": 1.6198686371100164, "grad_norm": 0.015498347580432892, "learning_rate": 1.8031731689489029e-06, "loss": 0.4771, "step": 1973 }, { "epoch": 1.6206896551724137, "grad_norm": 0.01600535213947296, "learning_rate": 1.7956422968518076e-06, "loss": 0.4926, "step": 1974 }, { "epoch": 1.6215106732348112, "grad_norm": 0.015418567694723606, "learning_rate": 1.788125649887276e-06, "loss": 0.497, "step": 1975 }, { "epoch": 1.6223316912972086, "grad_norm": 0.01551859825849533, "learning_rate": 1.78062324108647e-06, "loss": 0.4645, "step": 1976 }, { "epoch": 1.6231527093596059, "grad_norm": 0.015696529299020767, "learning_rate": 1.7731350834558567e-06, "loss": 0.4698, "step": 1977 }, { "epoch": 1.6239737274220034, "grad_norm": 0.015740003436803818, "learning_rate": 1.765661189977207e-06, "loss": 0.4941, "step": 1978 }, { "epoch": 1.6247947454844005, "grad_norm": 0.016558723524212837, "learning_rate": 1.7582015736075565e-06, "loss": 0.4855, "step": 1979 }, { "epoch": 1.625615763546798, "grad_norm": 0.015828408300876617, "learning_rate": 1.750756247279188e-06, "loss": 0.5024, "step": 1980 }, { "epoch": 1.6264367816091954, "grad_norm": 0.015253191813826561, "learning_rate": 1.7433252238996166e-06, "loss": 0.4867, "step": 1981 }, { "epoch": 1.6272577996715927, "grad_norm": 0.016014771535992622, "learning_rate": 1.7359085163515601e-06, "loss": 0.4778, "step": 1982 }, { "epoch": 1.6280788177339902, "grad_norm": 0.016061678528785706, "learning_rate": 1.7285061374929134e-06, "loss": 0.4755, "step": 1983 }, { "epoch": 1.6288998357963875, "grad_norm": 0.01628752239048481, "learning_rate": 1.7211181001567334e-06, "loss": 0.4691, "step": 1984 }, { "epoch": 1.6297208538587848, "grad_norm": 0.015998616814613342, "learning_rate": 1.7137444171512124e-06, "loss": 0.4964, "step": 1985 }, { "epoch": 1.6305418719211824, "grad_norm": 0.01486665103584528, "learning_rate": 1.7063851012596625e-06, "loss": 0.4798, "step": 1986 }, { "epoch": 1.6313628899835795, "grad_norm": 0.015691667795181274, "learning_rate": 1.6990401652404825e-06, "loss": 0.4902, "step": 1987 }, { "epoch": 1.632183908045977, "grad_norm": 0.015749597921967506, "learning_rate": 1.6917096218271418e-06, "loss": 0.4954, "step": 1988 }, { "epoch": 1.6330049261083743, "grad_norm": 0.015742674469947815, "learning_rate": 1.684393483728163e-06, "loss": 0.4613, "step": 1989 }, { "epoch": 1.6338259441707716, "grad_norm": 0.01611177623271942, "learning_rate": 1.677091763627094e-06, "loss": 0.5019, "step": 1990 }, { "epoch": 1.6346469622331692, "grad_norm": 0.015637114644050598, "learning_rate": 1.669804474182482e-06, "loss": 0.4689, "step": 1991 }, { "epoch": 1.6354679802955665, "grad_norm": 0.015532470308244228, "learning_rate": 1.6625316280278612e-06, "loss": 0.4991, "step": 1992 }, { "epoch": 1.6362889983579638, "grad_norm": 0.015088187530636787, "learning_rate": 1.6552732377717273e-06, "loss": 0.466, "step": 1993 }, { "epoch": 1.6371100164203614, "grad_norm": 0.01568402163684368, "learning_rate": 1.6480293159975082e-06, "loss": 0.4797, "step": 1994 }, { "epoch": 1.6379310344827587, "grad_norm": 0.015868451446294785, "learning_rate": 1.6407998752635608e-06, "loss": 0.4741, "step": 1995 }, { "epoch": 1.638752052545156, "grad_norm": 0.015653511509299278, "learning_rate": 1.6335849281031208e-06, "loss": 0.4646, "step": 1996 }, { "epoch": 1.6395730706075535, "grad_norm": 0.015679484233260155, "learning_rate": 1.6263844870243083e-06, "loss": 0.5019, "step": 1997 }, { "epoch": 1.6403940886699506, "grad_norm": 0.015676412731409073, "learning_rate": 1.6191985645100966e-06, "loss": 0.4757, "step": 1998 }, { "epoch": 1.6412151067323482, "grad_norm": 0.015784261748194695, "learning_rate": 1.6120271730182838e-06, "loss": 0.4685, "step": 1999 }, { "epoch": 1.6420361247947455, "grad_norm": 0.015728861093521118, "learning_rate": 1.6048703249814721e-06, "loss": 0.4866, "step": 2000 }, { "epoch": 1.6428571428571428, "grad_norm": 0.01626911573112011, "learning_rate": 1.597728032807064e-06, "loss": 0.5272, "step": 2001 }, { "epoch": 1.6436781609195403, "grad_norm": 0.01550866849720478, "learning_rate": 1.5906003088772146e-06, "loss": 0.5048, "step": 2002 }, { "epoch": 1.6444991789819376, "grad_norm": 0.01563875563442707, "learning_rate": 1.583487165548831e-06, "loss": 0.4814, "step": 2003 }, { "epoch": 1.645320197044335, "grad_norm": 0.015220959670841694, "learning_rate": 1.5763886151535382e-06, "loss": 0.4792, "step": 2004 }, { "epoch": 1.6461412151067325, "grad_norm": 0.016066614538431168, "learning_rate": 1.5693046699976617e-06, "loss": 0.5004, "step": 2005 }, { "epoch": 1.6469622331691296, "grad_norm": 0.015524661168456078, "learning_rate": 1.5622353423622137e-06, "loss": 0.4585, "step": 2006 }, { "epoch": 1.6477832512315271, "grad_norm": 0.015538550913333893, "learning_rate": 1.5551806445028585e-06, "loss": 0.4789, "step": 2007 }, { "epoch": 1.6486042692939245, "grad_norm": 0.015526456758379936, "learning_rate": 1.5481405886498946e-06, "loss": 0.4944, "step": 2008 }, { "epoch": 1.6494252873563218, "grad_norm": 0.015546659007668495, "learning_rate": 1.5411151870082483e-06, "loss": 0.4744, "step": 2009 }, { "epoch": 1.6502463054187193, "grad_norm": 0.01639384776353836, "learning_rate": 1.5341044517574283e-06, "loss": 0.5052, "step": 2010 }, { "epoch": 1.6510673234811166, "grad_norm": 0.016197599470615387, "learning_rate": 1.5271083950515257e-06, "loss": 0.4938, "step": 2011 }, { "epoch": 1.651888341543514, "grad_norm": 0.01581249199807644, "learning_rate": 1.5201270290191808e-06, "loss": 0.4858, "step": 2012 }, { "epoch": 1.6527093596059115, "grad_norm": 0.01576329581439495, "learning_rate": 1.5131603657635624e-06, "loss": 0.4928, "step": 2013 }, { "epoch": 1.6535303776683086, "grad_norm": 0.015252877958118916, "learning_rate": 1.5062084173623558e-06, "loss": 0.4661, "step": 2014 }, { "epoch": 1.654351395730706, "grad_norm": 0.015963822603225708, "learning_rate": 1.4992711958677372e-06, "loss": 0.496, "step": 2015 }, { "epoch": 1.6551724137931034, "grad_norm": 0.015794062986969948, "learning_rate": 1.4923487133063418e-06, "loss": 0.5056, "step": 2016 }, { "epoch": 1.6559934318555007, "grad_norm": 0.01539048831909895, "learning_rate": 1.4854409816792625e-06, "loss": 0.4539, "step": 2017 }, { "epoch": 1.6568144499178983, "grad_norm": 0.015466923825442791, "learning_rate": 1.478548012962015e-06, "loss": 0.483, "step": 2018 }, { "epoch": 1.6576354679802956, "grad_norm": 0.01584423892199993, "learning_rate": 1.4716698191045242e-06, "loss": 0.4886, "step": 2019 }, { "epoch": 1.658456486042693, "grad_norm": 0.015421071089804173, "learning_rate": 1.4648064120310968e-06, "loss": 0.4948, "step": 2020 }, { "epoch": 1.6592775041050905, "grad_norm": 0.0154110137373209, "learning_rate": 1.457957803640406e-06, "loss": 0.4912, "step": 2021 }, { "epoch": 1.6600985221674875, "grad_norm": 0.01591498963534832, "learning_rate": 1.451124005805471e-06, "loss": 0.476, "step": 2022 }, { "epoch": 1.660919540229885, "grad_norm": 0.01656123250722885, "learning_rate": 1.4443050303736397e-06, "loss": 0.4972, "step": 2023 }, { "epoch": 1.6617405582922824, "grad_norm": 0.017036881297826767, "learning_rate": 1.4375008891665474e-06, "loss": 0.502, "step": 2024 }, { "epoch": 1.6625615763546797, "grad_norm": 0.016035238280892372, "learning_rate": 1.4307115939801276e-06, "loss": 0.4909, "step": 2025 }, { "epoch": 1.6633825944170773, "grad_norm": 0.015836592763662338, "learning_rate": 1.4239371565845719e-06, "loss": 0.5, "step": 2026 }, { "epoch": 1.6642036124794746, "grad_norm": 0.015833750367164612, "learning_rate": 1.4171775887243122e-06, "loss": 0.4783, "step": 2027 }, { "epoch": 1.6650246305418719, "grad_norm": 0.015752112492918968, "learning_rate": 1.4104329021180027e-06, "loss": 0.4775, "step": 2028 }, { "epoch": 1.6658456486042694, "grad_norm": 0.015482007525861263, "learning_rate": 1.4037031084584948e-06, "loss": 0.4808, "step": 2029 }, { "epoch": 1.6666666666666665, "grad_norm": 0.015523916110396385, "learning_rate": 1.396988219412828e-06, "loss": 0.4769, "step": 2030 }, { "epoch": 1.667487684729064, "grad_norm": 0.015568318776786327, "learning_rate": 1.3902882466222029e-06, "loss": 0.4819, "step": 2031 }, { "epoch": 1.6683087027914614, "grad_norm": 0.015001797117292881, "learning_rate": 1.3836032017019532e-06, "loss": 0.4604, "step": 2032 }, { "epoch": 1.6691297208538587, "grad_norm": 0.015303738415241241, "learning_rate": 1.3769330962415356e-06, "loss": 0.478, "step": 2033 }, { "epoch": 1.6699507389162562, "grad_norm": 0.0158770140260458, "learning_rate": 1.370277941804513e-06, "loss": 0.4669, "step": 2034 }, { "epoch": 1.6707717569786535, "grad_norm": 0.01618236117064953, "learning_rate": 1.3636377499285228e-06, "loss": 0.5075, "step": 2035 }, { "epoch": 1.6715927750410509, "grad_norm": 0.0160455834120512, "learning_rate": 1.357012532125261e-06, "loss": 0.4937, "step": 2036 }, { "epoch": 1.6724137931034484, "grad_norm": 0.015862135216593742, "learning_rate": 1.350402299880472e-06, "loss": 0.4935, "step": 2037 }, { "epoch": 1.6732348111658455, "grad_norm": 0.016203027218580246, "learning_rate": 1.3438070646539109e-06, "loss": 0.5147, "step": 2038 }, { "epoch": 1.674055829228243, "grad_norm": 0.015451003797352314, "learning_rate": 1.3372268378793438e-06, "loss": 0.4679, "step": 2039 }, { "epoch": 1.6748768472906403, "grad_norm": 0.015685701742768288, "learning_rate": 1.3306616309645094e-06, "loss": 0.5019, "step": 2040 }, { "epoch": 1.6756978653530377, "grad_norm": 0.015762388706207275, "learning_rate": 1.3241114552911075e-06, "loss": 0.4854, "step": 2041 }, { "epoch": 1.6765188834154352, "grad_norm": 0.015071883797645569, "learning_rate": 1.3175763222147853e-06, "loss": 0.4699, "step": 2042 }, { "epoch": 1.6773399014778325, "grad_norm": 0.015742506831884384, "learning_rate": 1.3110562430651055e-06, "loss": 0.4726, "step": 2043 }, { "epoch": 1.6781609195402298, "grad_norm": 0.015501638874411583, "learning_rate": 1.3045512291455378e-06, "loss": 0.4821, "step": 2044 }, { "epoch": 1.6789819376026274, "grad_norm": 0.015396186150610447, "learning_rate": 1.2980612917334295e-06, "loss": 0.463, "step": 2045 }, { "epoch": 1.6798029556650245, "grad_norm": 0.015435964800417423, "learning_rate": 1.291586442079992e-06, "loss": 0.4692, "step": 2046 }, { "epoch": 1.680623973727422, "grad_norm": 0.016745395958423615, "learning_rate": 1.2851266914102828e-06, "loss": 0.5167, "step": 2047 }, { "epoch": 1.6814449917898193, "grad_norm": 0.015125054866075516, "learning_rate": 1.2786820509231806e-06, "loss": 0.4722, "step": 2048 }, { "epoch": 1.6822660098522166, "grad_norm": 0.01636912301182747, "learning_rate": 1.2722525317913665e-06, "loss": 0.4942, "step": 2049 }, { "epoch": 1.6830870279146142, "grad_norm": 0.015484650619328022, "learning_rate": 1.265838145161313e-06, "loss": 0.458, "step": 2050 }, { "epoch": 1.6839080459770115, "grad_norm": 0.0158610250800848, "learning_rate": 1.2594389021532508e-06, "loss": 0.4761, "step": 2051 }, { "epoch": 1.6847290640394088, "grad_norm": 0.01632123999297619, "learning_rate": 1.253054813861164e-06, "loss": 0.5171, "step": 2052 }, { "epoch": 1.6855500821018063, "grad_norm": 0.01526773814111948, "learning_rate": 1.2466858913527596e-06, "loss": 0.4995, "step": 2053 }, { "epoch": 1.6863711001642037, "grad_norm": 0.015935787931084633, "learning_rate": 1.24033214566945e-06, "loss": 0.4763, "step": 2054 }, { "epoch": 1.687192118226601, "grad_norm": 0.015317106619477272, "learning_rate": 1.2339935878263428e-06, "loss": 0.4737, "step": 2055 }, { "epoch": 1.6880131362889985, "grad_norm": 0.015093964524567127, "learning_rate": 1.2276702288122156e-06, "loss": 0.4735, "step": 2056 }, { "epoch": 1.6888341543513956, "grad_norm": 0.016071287915110588, "learning_rate": 1.221362079589488e-06, "loss": 0.478, "step": 2057 }, { "epoch": 1.6896551724137931, "grad_norm": 0.01538119837641716, "learning_rate": 1.2150691510942183e-06, "loss": 0.4732, "step": 2058 }, { "epoch": 1.6904761904761905, "grad_norm": 0.015424251556396484, "learning_rate": 1.208791454236079e-06, "loss": 0.4711, "step": 2059 }, { "epoch": 1.6912972085385878, "grad_norm": 0.016360469162464142, "learning_rate": 1.2025289998983316e-06, "loss": 0.5121, "step": 2060 }, { "epoch": 1.6921182266009853, "grad_norm": 0.015696777030825615, "learning_rate": 1.196281798937816e-06, "loss": 0.4755, "step": 2061 }, { "epoch": 1.6929392446633826, "grad_norm": 0.01575440540909767, "learning_rate": 1.1900498621849226e-06, "loss": 0.4821, "step": 2062 }, { "epoch": 1.69376026272578, "grad_norm": 0.015944097191095352, "learning_rate": 1.1838332004435858e-06, "loss": 0.4848, "step": 2063 }, { "epoch": 1.6945812807881775, "grad_norm": 0.01584324985742569, "learning_rate": 1.1776318244912622e-06, "loss": 0.4587, "step": 2064 }, { "epoch": 1.6954022988505746, "grad_norm": 0.016223037615418434, "learning_rate": 1.1714457450788915e-06, "loss": 0.4744, "step": 2065 }, { "epoch": 1.6962233169129721, "grad_norm": 0.015747593715786934, "learning_rate": 1.1652749729309126e-06, "loss": 0.4561, "step": 2066 }, { "epoch": 1.6970443349753694, "grad_norm": 0.015842510387301445, "learning_rate": 1.1591195187452187e-06, "loss": 0.4861, "step": 2067 }, { "epoch": 1.6978653530377668, "grad_norm": 0.016176441684365273, "learning_rate": 1.1529793931931497e-06, "loss": 0.4895, "step": 2068 }, { "epoch": 1.6986863711001643, "grad_norm": 0.016653161495923996, "learning_rate": 1.1468546069194666e-06, "loss": 0.4875, "step": 2069 }, { "epoch": 1.6995073891625616, "grad_norm": 0.015825165435671806, "learning_rate": 1.140745170542345e-06, "loss": 0.464, "step": 2070 }, { "epoch": 1.700328407224959, "grad_norm": 0.016034720465540886, "learning_rate": 1.134651094653341e-06, "loss": 0.4541, "step": 2071 }, { "epoch": 1.7011494252873565, "grad_norm": 0.015851562842726707, "learning_rate": 1.1285723898173925e-06, "loss": 0.5031, "step": 2072 }, { "epoch": 1.7019704433497536, "grad_norm": 0.015767419710755348, "learning_rate": 1.1225090665727744e-06, "loss": 0.4775, "step": 2073 }, { "epoch": 1.702791461412151, "grad_norm": 0.015090251341462135, "learning_rate": 1.116461135431106e-06, "loss": 0.4706, "step": 2074 }, { "epoch": 1.7036124794745484, "grad_norm": 0.01600392907857895, "learning_rate": 1.1104286068773245e-06, "loss": 0.4864, "step": 2075 }, { "epoch": 1.7044334975369457, "grad_norm": 0.01571963168680668, "learning_rate": 1.1044114913696573e-06, "loss": 0.5234, "step": 2076 }, { "epoch": 1.7052545155993433, "grad_norm": 0.01666342280805111, "learning_rate": 1.0984097993396102e-06, "loss": 0.4873, "step": 2077 }, { "epoch": 1.7060755336617406, "grad_norm": 0.01565195992588997, "learning_rate": 1.0924235411919583e-06, "loss": 0.4845, "step": 2078 }, { "epoch": 1.706896551724138, "grad_norm": 0.014956934377551079, "learning_rate": 1.086452727304713e-06, "loss": 0.481, "step": 2079 }, { "epoch": 1.7077175697865354, "grad_norm": 0.01591748557984829, "learning_rate": 1.080497368029116e-06, "loss": 0.4998, "step": 2080 }, { "epoch": 1.7085385878489325, "grad_norm": 0.015323350206017494, "learning_rate": 1.0745574736896117e-06, "loss": 0.4915, "step": 2081 }, { "epoch": 1.70935960591133, "grad_norm": 0.015289954841136932, "learning_rate": 1.0686330545838338e-06, "loss": 0.5004, "step": 2082 }, { "epoch": 1.7101806239737274, "grad_norm": 0.015453536063432693, "learning_rate": 1.0627241209825936e-06, "loss": 0.4898, "step": 2083 }, { "epoch": 1.7110016420361247, "grad_norm": 0.015360808931291103, "learning_rate": 1.0568306831298506e-06, "loss": 0.4587, "step": 2084 }, { "epoch": 1.7118226600985222, "grad_norm": 0.015217814594507217, "learning_rate": 1.050952751242699e-06, "loss": 0.4806, "step": 2085 }, { "epoch": 1.7126436781609196, "grad_norm": 0.015033245086669922, "learning_rate": 1.045090335511359e-06, "loss": 0.4684, "step": 2086 }, { "epoch": 1.7134646962233169, "grad_norm": 0.015733687207102776, "learning_rate": 1.0392434460991403e-06, "loss": 0.5054, "step": 2087 }, { "epoch": 1.7142857142857144, "grad_norm": 0.015921618789434433, "learning_rate": 1.0334120931424475e-06, "loss": 0.4842, "step": 2088 }, { "epoch": 1.7151067323481115, "grad_norm": 0.015839990228414536, "learning_rate": 1.027596286750741e-06, "loss": 0.4829, "step": 2089 }, { "epoch": 1.715927750410509, "grad_norm": 0.01586214266717434, "learning_rate": 1.0217960370065332e-06, "loss": 0.5062, "step": 2090 }, { "epoch": 1.7167487684729064, "grad_norm": 0.015566175803542137, "learning_rate": 1.016011353965366e-06, "loss": 0.4926, "step": 2091 }, { "epoch": 1.7175697865353037, "grad_norm": 0.01525910384953022, "learning_rate": 1.0102422476557997e-06, "loss": 0.481, "step": 2092 }, { "epoch": 1.7183908045977012, "grad_norm": 0.015301541425287724, "learning_rate": 1.004488728079377e-06, "loss": 0.4738, "step": 2093 }, { "epoch": 1.7192118226600985, "grad_norm": 0.015459192916750908, "learning_rate": 9.987508052106317e-07, "loss": 0.4911, "step": 2094 }, { "epoch": 1.7200328407224958, "grad_norm": 0.01622798480093479, "learning_rate": 9.930284889970523e-07, "loss": 0.507, "step": 2095 }, { "epoch": 1.7208538587848934, "grad_norm": 0.014863235875964165, "learning_rate": 9.873217893590748e-07, "loss": 0.456, "step": 2096 }, { "epoch": 1.7216748768472905, "grad_norm": 0.01534042414277792, "learning_rate": 9.816307161900584e-07, "loss": 0.4789, "step": 2097 }, { "epoch": 1.722495894909688, "grad_norm": 0.01636558771133423, "learning_rate": 9.759552793562688e-07, "loss": 0.5, "step": 2098 }, { "epoch": 1.7233169129720853, "grad_norm": 0.015529455617070198, "learning_rate": 9.702954886968707e-07, "loss": 0.4664, "step": 2099 }, { "epoch": 1.7241379310344827, "grad_norm": 0.01603534072637558, "learning_rate": 9.646513540239035e-07, "loss": 0.5031, "step": 2100 }, { "epoch": 1.7249589490968802, "grad_norm": 0.014952403493225574, "learning_rate": 9.590228851222592e-07, "loss": 0.5015, "step": 2101 }, { "epoch": 1.7257799671592775, "grad_norm": 0.015509990975260735, "learning_rate": 9.53410091749674e-07, "loss": 0.4846, "step": 2102 }, { "epoch": 1.7266009852216748, "grad_norm": 0.014958679676055908, "learning_rate": 9.478129836367099e-07, "loss": 0.463, "step": 2103 }, { "epoch": 1.7274220032840724, "grad_norm": 0.015841830521821976, "learning_rate": 9.422315704867341e-07, "loss": 0.4658, "step": 2104 }, { "epoch": 1.7282430213464697, "grad_norm": 0.015340582467615604, "learning_rate": 9.366658619759054e-07, "loss": 0.4746, "step": 2105 }, { "epoch": 1.729064039408867, "grad_norm": 0.015921082347631454, "learning_rate": 9.311158677531536e-07, "loss": 0.4988, "step": 2106 }, { "epoch": 1.7298850574712645, "grad_norm": 0.014810705557465553, "learning_rate": 9.255815974401708e-07, "loss": 0.4669, "step": 2107 }, { "epoch": 1.7307060755336616, "grad_norm": 0.01562414038926363, "learning_rate": 9.200630606313877e-07, "loss": 0.4843, "step": 2108 }, { "epoch": 1.7315270935960592, "grad_norm": 0.015357793308794498, "learning_rate": 9.145602668939573e-07, "loss": 0.5065, "step": 2109 }, { "epoch": 1.7323481116584565, "grad_norm": 0.016098149120807648, "learning_rate": 9.090732257677367e-07, "loss": 0.5079, "step": 2110 }, { "epoch": 1.7331691297208538, "grad_norm": 0.01582753099501133, "learning_rate": 9.036019467652833e-07, "loss": 0.4857, "step": 2111 }, { "epoch": 1.7339901477832513, "grad_norm": 0.016597777605056763, "learning_rate": 8.981464393718175e-07, "loss": 0.4943, "step": 2112 }, { "epoch": 1.7348111658456487, "grad_norm": 0.0160269383341074, "learning_rate": 8.927067130452268e-07, "loss": 0.4815, "step": 2113 }, { "epoch": 1.735632183908046, "grad_norm": 0.01501698512583971, "learning_rate": 8.872827772160338e-07, "loss": 0.4734, "step": 2114 }, { "epoch": 1.7364532019704435, "grad_norm": 0.015269134193658829, "learning_rate": 8.818746412873866e-07, "loss": 0.48, "step": 2115 }, { "epoch": 1.7372742200328406, "grad_norm": 0.015069263987243176, "learning_rate": 8.76482314635047e-07, "loss": 0.456, "step": 2116 }, { "epoch": 1.7380952380952381, "grad_norm": 0.016815979033708572, "learning_rate": 8.711058066073635e-07, "loss": 0.4994, "step": 2117 }, { "epoch": 1.7389162561576355, "grad_norm": 0.015854621306061745, "learning_rate": 8.657451265252631e-07, "loss": 0.5029, "step": 2118 }, { "epoch": 1.7397372742200328, "grad_norm": 0.015168856829404831, "learning_rate": 8.604002836822335e-07, "loss": 0.4618, "step": 2119 }, { "epoch": 1.7405582922824303, "grad_norm": 0.015261489897966385, "learning_rate": 8.55071287344303e-07, "loss": 0.4639, "step": 2120 }, { "epoch": 1.7413793103448276, "grad_norm": 0.015366725623607635, "learning_rate": 8.49758146750036e-07, "loss": 0.4561, "step": 2121 }, { "epoch": 1.742200328407225, "grad_norm": 0.015112039633095264, "learning_rate": 8.444608711104987e-07, "loss": 0.4577, "step": 2122 }, { "epoch": 1.7430213464696225, "grad_norm": 0.01497553288936615, "learning_rate": 8.391794696092563e-07, "loss": 0.4654, "step": 2123 }, { "epoch": 1.7438423645320196, "grad_norm": 0.015393216162919998, "learning_rate": 8.33913951402358e-07, "loss": 0.49, "step": 2124 }, { "epoch": 1.7446633825944171, "grad_norm": 0.015450459904968739, "learning_rate": 8.28664325618318e-07, "loss": 0.4805, "step": 2125 }, { "epoch": 1.7454844006568144, "grad_norm": 0.016087999567389488, "learning_rate": 8.234306013580867e-07, "loss": 0.4937, "step": 2126 }, { "epoch": 1.7463054187192117, "grad_norm": 0.015580431558191776, "learning_rate": 8.182127876950623e-07, "loss": 0.4937, "step": 2127 }, { "epoch": 1.7471264367816093, "grad_norm": 0.014850424602627754, "learning_rate": 8.13010893675048e-07, "loss": 0.4548, "step": 2128 }, { "epoch": 1.7479474548440066, "grad_norm": 0.015039444901049137, "learning_rate": 8.078249283162575e-07, "loss": 0.4566, "step": 2129 }, { "epoch": 1.748768472906404, "grad_norm": 0.015349899418652058, "learning_rate": 8.026549006092833e-07, "loss": 0.4764, "step": 2130 }, { "epoch": 1.7495894909688015, "grad_norm": 0.015569412149488926, "learning_rate": 7.975008195170878e-07, "loss": 0.4789, "step": 2131 }, { "epoch": 1.7504105090311985, "grad_norm": 0.015145953744649887, "learning_rate": 7.923626939749923e-07, "loss": 0.4772, "step": 2132 }, { "epoch": 1.751231527093596, "grad_norm": 0.016421262174844742, "learning_rate": 7.872405328906569e-07, "loss": 0.4809, "step": 2133 }, { "epoch": 1.7520525451559934, "grad_norm": 0.015398701652884483, "learning_rate": 7.821343451440558e-07, "loss": 0.5053, "step": 2134 }, { "epoch": 1.7528735632183907, "grad_norm": 0.015453897416591644, "learning_rate": 7.77044139587482e-07, "loss": 0.4741, "step": 2135 }, { "epoch": 1.7536945812807883, "grad_norm": 0.015196345746517181, "learning_rate": 7.71969925045519e-07, "loss": 0.4743, "step": 2136 }, { "epoch": 1.7545155993431856, "grad_norm": 0.015317702665925026, "learning_rate": 7.669117103150238e-07, "loss": 0.4898, "step": 2137 }, { "epoch": 1.7553366174055829, "grad_norm": 0.015206136740744114, "learning_rate": 7.618695041651166e-07, "loss": 0.4954, "step": 2138 }, { "epoch": 1.7561576354679804, "grad_norm": 0.015694070607423782, "learning_rate": 7.568433153371653e-07, "loss": 0.5144, "step": 2139 }, { "epoch": 1.7569786535303775, "grad_norm": 0.01578361913561821, "learning_rate": 7.518331525447691e-07, "loss": 0.5019, "step": 2140 }, { "epoch": 1.757799671592775, "grad_norm": 0.01525607705116272, "learning_rate": 7.468390244737504e-07, "loss": 0.4756, "step": 2141 }, { "epoch": 1.7586206896551724, "grad_norm": 0.014851718209683895, "learning_rate": 7.418609397821183e-07, "loss": 0.4649, "step": 2142 }, { "epoch": 1.7594417077175697, "grad_norm": 0.015416168607771397, "learning_rate": 7.368989071000809e-07, "loss": 0.4649, "step": 2143 }, { "epoch": 1.7602627257799672, "grad_norm": 0.019347622990608215, "learning_rate": 7.319529350300173e-07, "loss": 0.4912, "step": 2144 }, { "epoch": 1.7610837438423645, "grad_norm": 0.016116976737976074, "learning_rate": 7.270230321464567e-07, "loss": 0.4888, "step": 2145 }, { "epoch": 1.7619047619047619, "grad_norm": 0.015480166301131248, "learning_rate": 7.221092069960729e-07, "loss": 0.4619, "step": 2146 }, { "epoch": 1.7627257799671594, "grad_norm": 0.015195691958069801, "learning_rate": 7.17211468097671e-07, "loss": 0.4571, "step": 2147 }, { "epoch": 1.7635467980295565, "grad_norm": 0.015291008166968822, "learning_rate": 7.123298239421613e-07, "loss": 0.4754, "step": 2148 }, { "epoch": 1.764367816091954, "grad_norm": 0.01524385716766119, "learning_rate": 7.074642829925607e-07, "loss": 0.461, "step": 2149 }, { "epoch": 1.7651888341543513, "grad_norm": 0.015616542659699917, "learning_rate": 7.02614853683956e-07, "loss": 0.467, "step": 2150 }, { "epoch": 1.7660098522167487, "grad_norm": 0.01563451625406742, "learning_rate": 6.977815444235131e-07, "loss": 0.4744, "step": 2151 }, { "epoch": 1.7668308702791462, "grad_norm": 0.015570216812193394, "learning_rate": 6.929643635904496e-07, "loss": 0.486, "step": 2152 }, { "epoch": 1.7676518883415435, "grad_norm": 0.015299690887331963, "learning_rate": 6.881633195360192e-07, "loss": 0.4677, "step": 2153 }, { "epoch": 1.7684729064039408, "grad_norm": 0.015313737094402313, "learning_rate": 6.833784205834998e-07, "loss": 0.4698, "step": 2154 }, { "epoch": 1.7692939244663384, "grad_norm": 0.0155660230666399, "learning_rate": 6.786096750281845e-07, "loss": 0.4782, "step": 2155 }, { "epoch": 1.7701149425287355, "grad_norm": 0.015201478265225887, "learning_rate": 6.738570911373559e-07, "loss": 0.478, "step": 2156 }, { "epoch": 1.770935960591133, "grad_norm": 0.015885740518569946, "learning_rate": 6.691206771502867e-07, "loss": 0.4865, "step": 2157 }, { "epoch": 1.7717569786535303, "grad_norm": 0.015546221286058426, "learning_rate": 6.644004412782063e-07, "loss": 0.461, "step": 2158 }, { "epoch": 1.7725779967159276, "grad_norm": 0.01562671549618244, "learning_rate": 6.596963917043034e-07, "loss": 0.4906, "step": 2159 }, { "epoch": 1.7733990147783252, "grad_norm": 0.01557452604174614, "learning_rate": 6.550085365837073e-07, "loss": 0.4757, "step": 2160 }, { "epoch": 1.7742200328407225, "grad_norm": 0.015770206227898598, "learning_rate": 6.503368840434654e-07, "loss": 0.5115, "step": 2161 }, { "epoch": 1.7750410509031198, "grad_norm": 0.015603831969201565, "learning_rate": 6.4568144218254e-07, "loss": 0.49, "step": 2162 }, { "epoch": 1.7758620689655173, "grad_norm": 0.01510606613010168, "learning_rate": 6.410422190717916e-07, "loss": 0.4719, "step": 2163 }, { "epoch": 1.7766830870279147, "grad_norm": 0.015445725992321968, "learning_rate": 6.364192227539563e-07, "loss": 0.4764, "step": 2164 }, { "epoch": 1.777504105090312, "grad_norm": 0.015509750694036484, "learning_rate": 6.318124612436484e-07, "loss": 0.4726, "step": 2165 }, { "epoch": 1.7783251231527095, "grad_norm": 0.01569550670683384, "learning_rate": 6.272219425273292e-07, "loss": 0.4773, "step": 2166 }, { "epoch": 1.7791461412151066, "grad_norm": 0.015374436043202877, "learning_rate": 6.226476745633025e-07, "loss": 0.4761, "step": 2167 }, { "epoch": 1.7799671592775042, "grad_norm": 0.01618749089539051, "learning_rate": 6.180896652817004e-07, "loss": 0.5067, "step": 2168 }, { "epoch": 1.7807881773399015, "grad_norm": 0.015438567847013474, "learning_rate": 6.135479225844719e-07, "loss": 0.4608, "step": 2169 }, { "epoch": 1.7816091954022988, "grad_norm": 0.015340916812419891, "learning_rate": 6.090224543453574e-07, "loss": 0.4776, "step": 2170 }, { "epoch": 1.7824302134646963, "grad_norm": 0.015322774648666382, "learning_rate": 6.045132684098903e-07, "loss": 0.4808, "step": 2171 }, { "epoch": 1.7832512315270936, "grad_norm": 0.015410608612000942, "learning_rate": 6.000203725953704e-07, "loss": 0.4912, "step": 2172 }, { "epoch": 1.784072249589491, "grad_norm": 0.015733590349555016, "learning_rate": 5.95543774690862e-07, "loss": 0.4917, "step": 2173 }, { "epoch": 1.7848932676518885, "grad_norm": 0.015107972547411919, "learning_rate": 5.910834824571726e-07, "loss": 0.4632, "step": 2174 }, { "epoch": 1.7857142857142856, "grad_norm": 0.01514780055731535, "learning_rate": 5.866395036268384e-07, "loss": 0.4707, "step": 2175 }, { "epoch": 1.7865353037766831, "grad_norm": 0.015190456062555313, "learning_rate": 5.822118459041188e-07, "loss": 0.4604, "step": 2176 }, { "epoch": 1.7873563218390804, "grad_norm": 0.015435704030096531, "learning_rate": 5.778005169649777e-07, "loss": 0.4787, "step": 2177 }, { "epoch": 1.7881773399014778, "grad_norm": 0.014880494214594364, "learning_rate": 5.734055244570687e-07, "loss": 0.4717, "step": 2178 }, { "epoch": 1.7889983579638753, "grad_norm": 0.015714647248387337, "learning_rate": 5.69026875999721e-07, "loss": 0.5081, "step": 2179 }, { "epoch": 1.7898193760262726, "grad_norm": 0.015134638175368309, "learning_rate": 5.646645791839371e-07, "loss": 0.4654, "step": 2180 }, { "epoch": 1.79064039408867, "grad_norm": 0.015331555157899857, "learning_rate": 5.603186415723652e-07, "loss": 0.4622, "step": 2181 }, { "epoch": 1.7914614121510675, "grad_norm": 0.01487436331808567, "learning_rate": 5.559890706992974e-07, "loss": 0.4682, "step": 2182 }, { "epoch": 1.7922824302134646, "grad_norm": 0.0154042337089777, "learning_rate": 5.516758740706431e-07, "loss": 0.4476, "step": 2183 }, { "epoch": 1.793103448275862, "grad_norm": 0.01589024066925049, "learning_rate": 5.473790591639344e-07, "loss": 0.4921, "step": 2184 }, { "epoch": 1.7939244663382594, "grad_norm": 0.015481101348996162, "learning_rate": 5.430986334283006e-07, "loss": 0.4558, "step": 2185 }, { "epoch": 1.7947454844006567, "grad_norm": 0.015417640097439289, "learning_rate": 5.388346042844562e-07, "loss": 0.4876, "step": 2186 }, { "epoch": 1.7955665024630543, "grad_norm": 0.014990502037107944, "learning_rate": 5.34586979124688e-07, "loss": 0.481, "step": 2187 }, { "epoch": 1.7963875205254516, "grad_norm": 0.015080283395946026, "learning_rate": 5.303557653128499e-07, "loss": 0.4722, "step": 2188 }, { "epoch": 1.797208538587849, "grad_norm": 0.015395242720842361, "learning_rate": 5.261409701843397e-07, "loss": 0.4756, "step": 2189 }, { "epoch": 1.7980295566502464, "grad_norm": 0.024952076375484467, "learning_rate": 5.219426010460956e-07, "loss": 0.4885, "step": 2190 }, { "epoch": 1.7988505747126435, "grad_norm": 0.01530367136001587, "learning_rate": 5.177606651765737e-07, "loss": 0.4824, "step": 2191 }, { "epoch": 1.799671592775041, "grad_norm": 0.015464072115719318, "learning_rate": 5.135951698257434e-07, "loss": 0.4532, "step": 2192 }, { "epoch": 1.8004926108374384, "grad_norm": 0.016016820445656776, "learning_rate": 5.094461222150741e-07, "loss": 0.5077, "step": 2193 }, { "epoch": 1.8013136288998357, "grad_norm": 0.014925846830010414, "learning_rate": 5.053135295375178e-07, "loss": 0.4896, "step": 2194 }, { "epoch": 1.8021346469622332, "grad_norm": 0.016018545255064964, "learning_rate": 5.011973989574986e-07, "loss": 0.5098, "step": 2195 }, { "epoch": 1.8029556650246306, "grad_norm": 0.015256466343998909, "learning_rate": 4.970977376109063e-07, "loss": 0.4699, "step": 2196 }, { "epoch": 1.8037766830870279, "grad_norm": 0.01508869044482708, "learning_rate": 4.930145526050745e-07, "loss": 0.4774, "step": 2197 }, { "epoch": 1.8045977011494254, "grad_norm": 0.016155073419213295, "learning_rate": 4.889478510187743e-07, "loss": 0.4873, "step": 2198 }, { "epoch": 1.8054187192118225, "grad_norm": 0.016019541770219803, "learning_rate": 4.848976399022015e-07, "loss": 0.4986, "step": 2199 }, { "epoch": 1.80623973727422, "grad_norm": 0.015575544908642769, "learning_rate": 4.80863926276959e-07, "loss": 0.4641, "step": 2200 }, { "epoch": 1.8070607553366174, "grad_norm": 0.015047204680740833, "learning_rate": 4.768467171360552e-07, "loss": 0.466, "step": 2201 }, { "epoch": 1.8078817733990147, "grad_norm": 0.015078497119247913, "learning_rate": 4.728460194438842e-07, "loss": 0.4616, "step": 2202 }, { "epoch": 1.8087027914614122, "grad_norm": 0.0152956023812294, "learning_rate": 4.6886184013620955e-07, "loss": 0.4523, "step": 2203 }, { "epoch": 1.8095238095238095, "grad_norm": 0.015425956808030605, "learning_rate": 4.648941861201634e-07, "loss": 0.4856, "step": 2204 }, { "epoch": 1.8103448275862069, "grad_norm": 0.015189622528851032, "learning_rate": 4.6094306427422866e-07, "loss": 0.495, "step": 2205 }, { "epoch": 1.8111658456486044, "grad_norm": 0.014836854301393032, "learning_rate": 4.570084814482259e-07, "loss": 0.4625, "step": 2206 }, { "epoch": 1.8119868637110015, "grad_norm": 0.01580096036195755, "learning_rate": 4.530904444633049e-07, "loss": 0.4805, "step": 2207 }, { "epoch": 1.812807881773399, "grad_norm": 0.015270494855940342, "learning_rate": 4.491889601119257e-07, "loss": 0.4805, "step": 2208 }, { "epoch": 1.8136288998357963, "grad_norm": 0.015618212521076202, "learning_rate": 4.4530403515785875e-07, "loss": 0.4706, "step": 2209 }, { "epoch": 1.8144499178981937, "grad_norm": 0.015255572274327278, "learning_rate": 4.414356763361656e-07, "loss": 0.4576, "step": 2210 }, { "epoch": 1.8152709359605912, "grad_norm": 0.015442331321537495, "learning_rate": 4.3758389035318066e-07, "loss": 0.4925, "step": 2211 }, { "epoch": 1.8160919540229885, "grad_norm": 0.015752149745821953, "learning_rate": 4.337486838865151e-07, "loss": 0.4846, "step": 2212 }, { "epoch": 1.8169129720853858, "grad_norm": 0.015507672913372517, "learning_rate": 4.2993006358503597e-07, "loss": 0.4777, "step": 2213 }, { "epoch": 1.8177339901477834, "grad_norm": 0.014925277791917324, "learning_rate": 4.261280360688529e-07, "loss": 0.479, "step": 2214 }, { "epoch": 1.8185550082101807, "grad_norm": 0.015582118183374405, "learning_rate": 4.223426079293126e-07, "loss": 0.469, "step": 2215 }, { "epoch": 1.819376026272578, "grad_norm": 0.01486912276595831, "learning_rate": 4.1857378572897867e-07, "loss": 0.4706, "step": 2216 }, { "epoch": 1.8201970443349755, "grad_norm": 0.015484574250876904, "learning_rate": 4.148215760016351e-07, "loss": 0.4961, "step": 2217 }, { "epoch": 1.8210180623973726, "grad_norm": 0.015445464290678501, "learning_rate": 4.1108598525226066e-07, "loss": 0.4786, "step": 2218 }, { "epoch": 1.8218390804597702, "grad_norm": 0.015311065129935741, "learning_rate": 4.073670199570201e-07, "loss": 0.4772, "step": 2219 }, { "epoch": 1.8226600985221675, "grad_norm": 0.015310128219425678, "learning_rate": 4.0366468656325966e-07, "loss": 0.4807, "step": 2220 }, { "epoch": 1.8234811165845648, "grad_norm": 0.015288349241018295, "learning_rate": 3.9997899148949373e-07, "loss": 0.4895, "step": 2221 }, { "epoch": 1.8243021346469623, "grad_norm": 0.01535724475979805, "learning_rate": 3.963099411253861e-07, "loss": 0.4785, "step": 2222 }, { "epoch": 1.8251231527093597, "grad_norm": 0.015312973409891129, "learning_rate": 3.9265754183174744e-07, "loss": 0.5019, "step": 2223 }, { "epoch": 1.825944170771757, "grad_norm": 0.015401681885123253, "learning_rate": 3.8902179994052457e-07, "loss": 0.4707, "step": 2224 }, { "epoch": 1.8267651888341545, "grad_norm": 0.016326820477843285, "learning_rate": 3.8540272175477906e-07, "loss": 0.5084, "step": 2225 }, { "epoch": 1.8275862068965516, "grad_norm": 0.014230319298803806, "learning_rate": 3.8180031354869197e-07, "loss": 0.4513, "step": 2226 }, { "epoch": 1.8284072249589491, "grad_norm": 0.014893174171447754, "learning_rate": 3.782145815675391e-07, "loss": 0.469, "step": 2227 }, { "epoch": 1.8292282430213465, "grad_norm": 0.015149365179240704, "learning_rate": 3.746455320276857e-07, "loss": 0.4894, "step": 2228 }, { "epoch": 1.8300492610837438, "grad_norm": 0.015360807999968529, "learning_rate": 3.7109317111657983e-07, "loss": 0.4874, "step": 2229 }, { "epoch": 1.8308702791461413, "grad_norm": 0.015750547870993614, "learning_rate": 3.67557504992733e-07, "loss": 0.4881, "step": 2230 }, { "epoch": 1.8316912972085386, "grad_norm": 0.01548411138355732, "learning_rate": 3.6403853978571524e-07, "loss": 0.4624, "step": 2231 }, { "epoch": 1.832512315270936, "grad_norm": 0.01503883209079504, "learning_rate": 3.605362815961472e-07, "loss": 0.4718, "step": 2232 }, { "epoch": 1.8333333333333335, "grad_norm": 0.015267870388925076, "learning_rate": 3.5705073649567776e-07, "loss": 0.4964, "step": 2233 }, { "epoch": 1.8341543513957306, "grad_norm": 0.01521692331880331, "learning_rate": 3.535819105269893e-07, "loss": 0.484, "step": 2234 }, { "epoch": 1.8349753694581281, "grad_norm": 0.015637477859854698, "learning_rate": 3.5012980970377296e-07, "loss": 0.4792, "step": 2235 }, { "epoch": 1.8357963875205254, "grad_norm": 0.015153719112277031, "learning_rate": 3.46694440010729e-07, "loss": 0.4647, "step": 2236 }, { "epoch": 1.8366174055829227, "grad_norm": 0.015574854798614979, "learning_rate": 3.432758074035486e-07, "loss": 0.4694, "step": 2237 }, { "epoch": 1.8374384236453203, "grad_norm": 0.015485401265323162, "learning_rate": 3.398739178089109e-07, "loss": 0.482, "step": 2238 }, { "epoch": 1.8382594417077176, "grad_norm": 0.015001550316810608, "learning_rate": 3.364887771244646e-07, "loss": 0.4867, "step": 2239 }, { "epoch": 1.839080459770115, "grad_norm": 0.015446125529706478, "learning_rate": 3.331203912188235e-07, "loss": 0.4722, "step": 2240 }, { "epoch": 1.8399014778325125, "grad_norm": 0.0146271251142025, "learning_rate": 3.297687659315541e-07, "loss": 0.4461, "step": 2241 }, { "epoch": 1.8407224958949095, "grad_norm": 0.015643101185560226, "learning_rate": 3.264339070731671e-07, "loss": 0.4804, "step": 2242 }, { "epoch": 1.841543513957307, "grad_norm": 0.015141674317419529, "learning_rate": 3.2311582042510675e-07, "loss": 0.4704, "step": 2243 }, { "epoch": 1.8423645320197044, "grad_norm": 0.015366453677415848, "learning_rate": 3.1981451173973707e-07, "loss": 0.4873, "step": 2244 }, { "epoch": 1.8431855500821017, "grad_norm": 0.015893856063485146, "learning_rate": 3.165299867403382e-07, "loss": 0.4764, "step": 2245 }, { "epoch": 1.8440065681444993, "grad_norm": 0.015404456295073032, "learning_rate": 3.1326225112109416e-07, "loss": 0.474, "step": 2246 }, { "epoch": 1.8448275862068966, "grad_norm": 0.01591719314455986, "learning_rate": 3.100113105470785e-07, "loss": 0.4852, "step": 2247 }, { "epoch": 1.845648604269294, "grad_norm": 0.01561382319778204, "learning_rate": 3.067771706542512e-07, "loss": 0.5003, "step": 2248 }, { "epoch": 1.8464696223316914, "grad_norm": 0.015889927744865417, "learning_rate": 3.035598370494449e-07, "loss": 0.5055, "step": 2249 }, { "epoch": 1.8472906403940885, "grad_norm": 0.015345151536166668, "learning_rate": 3.0035931531035623e-07, "loss": 0.4861, "step": 2250 }, { "epoch": 1.848111658456486, "grad_norm": 0.014596505090594292, "learning_rate": 2.971756109855382e-07, "loss": 0.4631, "step": 2251 }, { "epoch": 1.8489326765188834, "grad_norm": 0.015132762491703033, "learning_rate": 2.9400872959438337e-07, "loss": 0.4542, "step": 2252 }, { "epoch": 1.8497536945812807, "grad_norm": 0.015626264736056328, "learning_rate": 2.908586766271236e-07, "loss": 0.4893, "step": 2253 }, { "epoch": 1.8505747126436782, "grad_norm": 0.01598695106804371, "learning_rate": 2.877254575448164e-07, "loss": 0.5021, "step": 2254 }, { "epoch": 1.8513957307060755, "grad_norm": 0.015104359947144985, "learning_rate": 2.84609077779334e-07, "loss": 0.4747, "step": 2255 }, { "epoch": 1.8522167487684729, "grad_norm": 0.015188918448984623, "learning_rate": 2.815095427333539e-07, "loss": 0.4663, "step": 2256 }, { "epoch": 1.8530377668308704, "grad_norm": 0.014330818317830563, "learning_rate": 2.784268577803545e-07, "loss": 0.4722, "step": 2257 }, { "epoch": 1.8538587848932675, "grad_norm": 0.015418116934597492, "learning_rate": 2.753610282645981e-07, "loss": 0.4718, "step": 2258 }, { "epoch": 1.854679802955665, "grad_norm": 0.015287382528185844, "learning_rate": 2.723120595011311e-07, "loss": 0.4609, "step": 2259 }, { "epoch": 1.8555008210180624, "grad_norm": 0.015386098995804787, "learning_rate": 2.692799567757651e-07, "loss": 0.4795, "step": 2260 }, { "epoch": 1.8563218390804597, "grad_norm": 0.014863336458802223, "learning_rate": 2.662647253450715e-07, "loss": 0.4548, "step": 2261 }, { "epoch": 1.8571428571428572, "grad_norm": 0.015416731126606464, "learning_rate": 2.6326637043637667e-07, "loss": 0.4706, "step": 2262 }, { "epoch": 1.8579638752052545, "grad_norm": 0.015211060643196106, "learning_rate": 2.6028489724774785e-07, "loss": 0.4805, "step": 2263 }, { "epoch": 1.8587848932676518, "grad_norm": 0.01500970870256424, "learning_rate": 2.57320310947982e-07, "loss": 0.4535, "step": 2264 }, { "epoch": 1.8596059113300494, "grad_norm": 0.014990514144301414, "learning_rate": 2.5437261667660577e-07, "loss": 0.4795, "step": 2265 }, { "epoch": 1.8604269293924465, "grad_norm": 0.015359378419816494, "learning_rate": 2.514418195438565e-07, "loss": 0.4861, "step": 2266 }, { "epoch": 1.861247947454844, "grad_norm": 0.015207471325993538, "learning_rate": 2.485279246306814e-07, "loss": 0.4834, "step": 2267 }, { "epoch": 1.8620689655172413, "grad_norm": 0.016145996749401093, "learning_rate": 2.4563093698872506e-07, "loss": 0.4736, "step": 2268 }, { "epoch": 1.8628899835796386, "grad_norm": 0.02187061868607998, "learning_rate": 2.427508616403161e-07, "loss": 0.4915, "step": 2269 }, { "epoch": 1.8637110016420362, "grad_norm": 0.015269560739398003, "learning_rate": 2.398877035784698e-07, "loss": 0.4682, "step": 2270 }, { "epoch": 1.8645320197044335, "grad_norm": 0.016414988785982132, "learning_rate": 2.370414677668721e-07, "loss": 0.4916, "step": 2271 }, { "epoch": 1.8653530377668308, "grad_norm": 0.015437723137438297, "learning_rate": 2.3421215913986536e-07, "loss": 0.4828, "step": 2272 }, { "epoch": 1.8661740558292284, "grad_norm": 0.015225778333842754, "learning_rate": 2.3139978260245395e-07, "loss": 0.4765, "step": 2273 }, { "epoch": 1.8669950738916257, "grad_norm": 0.01484403945505619, "learning_rate": 2.2860434303028198e-07, "loss": 0.4591, "step": 2274 }, { "epoch": 1.867816091954023, "grad_norm": 0.015388760715723038, "learning_rate": 2.2582584526963882e-07, "loss": 0.4865, "step": 2275 }, { "epoch": 1.8686371100164205, "grad_norm": 0.014903387986123562, "learning_rate": 2.2306429413743377e-07, "loss": 0.4653, "step": 2276 }, { "epoch": 1.8694581280788176, "grad_norm": 0.015131459571421146, "learning_rate": 2.2031969442120347e-07, "loss": 0.4514, "step": 2277 }, { "epoch": 1.8702791461412152, "grad_norm": 0.015423638746142387, "learning_rate": 2.1759205087909227e-07, "loss": 0.4467, "step": 2278 }, { "epoch": 1.8711001642036125, "grad_norm": 0.01541722659021616, "learning_rate": 2.1488136823985531e-07, "loss": 0.4682, "step": 2279 }, { "epoch": 1.8719211822660098, "grad_norm": 0.01609201170504093, "learning_rate": 2.1218765120283323e-07, "loss": 0.4981, "step": 2280 }, { "epoch": 1.8727422003284073, "grad_norm": 0.014873280189931393, "learning_rate": 2.0951090443796413e-07, "loss": 0.4545, "step": 2281 }, { "epoch": 1.8735632183908046, "grad_norm": 0.015327694825828075, "learning_rate": 2.068511325857626e-07, "loss": 0.4738, "step": 2282 }, { "epoch": 1.874384236453202, "grad_norm": 0.015122122131288052, "learning_rate": 2.04208340257312e-07, "loss": 0.4919, "step": 2283 }, { "epoch": 1.8752052545155995, "grad_norm": 0.015592072159051895, "learning_rate": 2.0158253203426323e-07, "loss": 0.4512, "step": 2284 }, { "epoch": 1.8760262725779966, "grad_norm": 0.015078229829668999, "learning_rate": 1.989737124688193e-07, "loss": 0.4629, "step": 2285 }, { "epoch": 1.8768472906403941, "grad_norm": 0.015502789057791233, "learning_rate": 1.9638188608373524e-07, "loss": 0.5005, "step": 2286 }, { "epoch": 1.8776683087027914, "grad_norm": 0.01520951185375452, "learning_rate": 1.938070573723026e-07, "loss": 0.48, "step": 2287 }, { "epoch": 1.8784893267651888, "grad_norm": 0.01580154150724411, "learning_rate": 1.9124923079834725e-07, "loss": 0.469, "step": 2288 }, { "epoch": 1.8793103448275863, "grad_norm": 0.015161740593612194, "learning_rate": 1.8870841079621714e-07, "loss": 0.4756, "step": 2289 }, { "epoch": 1.8801313628899836, "grad_norm": 0.016212163493037224, "learning_rate": 1.8618460177077892e-07, "loss": 0.4982, "step": 2290 }, { "epoch": 1.880952380952381, "grad_norm": 0.015119411051273346, "learning_rate": 1.8367780809740705e-07, "loss": 0.4805, "step": 2291 }, { "epoch": 1.8817733990147785, "grad_norm": 0.015376249328255653, "learning_rate": 1.8118803412197898e-07, "loss": 0.4768, "step": 2292 }, { "epoch": 1.8825944170771756, "grad_norm": 0.015516343526542187, "learning_rate": 1.7871528416086558e-07, "loss": 0.5055, "step": 2293 }, { "epoch": 1.883415435139573, "grad_norm": 0.015533638186752796, "learning_rate": 1.7625956250092305e-07, "loss": 0.487, "step": 2294 }, { "epoch": 1.8842364532019704, "grad_norm": 0.01518923882395029, "learning_rate": 1.7382087339948866e-07, "loss": 0.4643, "step": 2295 }, { "epoch": 1.8850574712643677, "grad_norm": 0.015647394582629204, "learning_rate": 1.7139922108436958e-07, "loss": 0.4924, "step": 2296 }, { "epoch": 1.8858784893267653, "grad_norm": 0.015119189396500587, "learning_rate": 1.6899460975383624e-07, "loss": 0.4774, "step": 2297 }, { "epoch": 1.8866995073891626, "grad_norm": 0.015249375253915787, "learning_rate": 1.6660704357662226e-07, "loss": 0.486, "step": 2298 }, { "epoch": 1.88752052545156, "grad_norm": 0.015351799316704273, "learning_rate": 1.642365266919024e-07, "loss": 0.4748, "step": 2299 }, { "epoch": 1.8883415435139574, "grad_norm": 0.014982398599386215, "learning_rate": 1.6188306320930127e-07, "loss": 0.4784, "step": 2300 }, { "epoch": 1.8891625615763545, "grad_norm": 0.014860417693853378, "learning_rate": 1.5954665720887459e-07, "loss": 0.4945, "step": 2301 }, { "epoch": 1.889983579638752, "grad_norm": 0.015161916613578796, "learning_rate": 1.572273127411091e-07, "loss": 0.4962, "step": 2302 }, { "epoch": 1.8908045977011494, "grad_norm": 0.015124011784791946, "learning_rate": 1.5492503382691153e-07, "loss": 0.4739, "step": 2303 }, { "epoch": 1.8916256157635467, "grad_norm": 0.015296337194740772, "learning_rate": 1.5263982445760517e-07, "loss": 0.4548, "step": 2304 }, { "epoch": 1.8924466338259442, "grad_norm": 0.015139610506594181, "learning_rate": 1.503716885949167e-07, "loss": 0.4647, "step": 2305 }, { "epoch": 1.8932676518883416, "grad_norm": 0.0153737748041749, "learning_rate": 1.4812063017097827e-07, "loss": 0.4784, "step": 2306 }, { "epoch": 1.8940886699507389, "grad_norm": 0.014753188006579876, "learning_rate": 1.4588665308831205e-07, "loss": 0.4567, "step": 2307 }, { "epoch": 1.8949096880131364, "grad_norm": 0.0153097128495574, "learning_rate": 1.4366976121983125e-07, "loss": 0.4583, "step": 2308 }, { "epoch": 1.8957307060755335, "grad_norm": 0.01570820063352585, "learning_rate": 1.414699584088258e-07, "loss": 0.4883, "step": 2309 }, { "epoch": 1.896551724137931, "grad_norm": 0.015372912399470806, "learning_rate": 1.3928724846896e-07, "loss": 0.4691, "step": 2310 }, { "epoch": 1.8973727422003284, "grad_norm": 0.014666399918496609, "learning_rate": 1.3712163518426935e-07, "loss": 0.4723, "step": 2311 }, { "epoch": 1.8981937602627257, "grad_norm": 0.015325483866035938, "learning_rate": 1.349731223091437e-07, "loss": 0.4791, "step": 2312 }, { "epoch": 1.8990147783251232, "grad_norm": 0.015232360921800137, "learning_rate": 1.3284171356832968e-07, "loss": 0.4701, "step": 2313 }, { "epoch": 1.8998357963875205, "grad_norm": 0.015033064410090446, "learning_rate": 1.307274126569228e-07, "loss": 0.4748, "step": 2314 }, { "epoch": 1.9006568144499179, "grad_norm": 0.014903790317475796, "learning_rate": 1.2863022324035968e-07, "loss": 0.4861, "step": 2315 }, { "epoch": 1.9014778325123154, "grad_norm": 0.01533140242099762, "learning_rate": 1.265501489544103e-07, "loss": 0.4815, "step": 2316 }, { "epoch": 1.9022988505747125, "grad_norm": 0.015469180420041084, "learning_rate": 1.2448719340517253e-07, "loss": 0.4485, "step": 2317 }, { "epoch": 1.90311986863711, "grad_norm": 0.015554878860712051, "learning_rate": 1.2244136016906864e-07, "loss": 0.4856, "step": 2318 }, { "epoch": 1.9039408866995073, "grad_norm": 0.01548820175230503, "learning_rate": 1.2041265279283543e-07, "loss": 0.4844, "step": 2319 }, { "epoch": 1.9047619047619047, "grad_norm": 0.014682622626423836, "learning_rate": 1.1840107479352199e-07, "loss": 0.474, "step": 2320 }, { "epoch": 1.9055829228243022, "grad_norm": 0.015887096524238586, "learning_rate": 1.1640662965847744e-07, "loss": 0.4869, "step": 2321 }, { "epoch": 1.9064039408866995, "grad_norm": 0.015378888696432114, "learning_rate": 1.144293208453499e-07, "loss": 0.482, "step": 2322 }, { "epoch": 1.9072249589490968, "grad_norm": 0.01514795608818531, "learning_rate": 1.1246915178208306e-07, "loss": 0.4743, "step": 2323 }, { "epoch": 1.9080459770114944, "grad_norm": 0.016094617545604706, "learning_rate": 1.1052612586690295e-07, "loss": 0.4946, "step": 2324 }, { "epoch": 1.9088669950738915, "grad_norm": 0.01541482750326395, "learning_rate": 1.0860024646831344e-07, "loss": 0.4936, "step": 2325 }, { "epoch": 1.909688013136289, "grad_norm": 0.015286142006516457, "learning_rate": 1.0669151692509849e-07, "loss": 0.4975, "step": 2326 }, { "epoch": 1.9105090311986865, "grad_norm": 0.015187742188572884, "learning_rate": 1.0479994054630437e-07, "loss": 0.4898, "step": 2327 }, { "epoch": 1.9113300492610836, "grad_norm": 0.0148614551872015, "learning_rate": 1.0292552061124524e-07, "loss": 0.4489, "step": 2328 }, { "epoch": 1.9121510673234812, "grad_norm": 0.014923889189958572, "learning_rate": 1.010682603694876e-07, "loss": 0.4751, "step": 2329 }, { "epoch": 1.9129720853858785, "grad_norm": 0.014661014080047607, "learning_rate": 9.922816304085134e-08, "loss": 0.4764, "step": 2330 }, { "epoch": 1.9137931034482758, "grad_norm": 0.01495309267193079, "learning_rate": 9.740523181540314e-08, "loss": 0.4716, "step": 2331 }, { "epoch": 1.9146141215106733, "grad_norm": 0.014790463261306286, "learning_rate": 9.559946985344873e-08, "loss": 0.4526, "step": 2332 }, { "epoch": 1.9154351395730707, "grad_norm": 0.01444925181567669, "learning_rate": 9.381088028552728e-08, "loss": 0.45, "step": 2333 }, { "epoch": 1.916256157635468, "grad_norm": 0.015388376079499722, "learning_rate": 9.203946621240915e-08, "loss": 0.4973, "step": 2334 }, { "epoch": 1.9170771756978655, "grad_norm": 0.015289773233234882, "learning_rate": 9.028523070508709e-08, "loss": 0.4817, "step": 2335 }, { "epoch": 1.9178981937602626, "grad_norm": 0.01580299623310566, "learning_rate": 8.854817680477403e-08, "loss": 0.4937, "step": 2336 }, { "epoch": 1.9187192118226601, "grad_norm": 0.015340659767389297, "learning_rate": 8.682830752289519e-08, "loss": 0.4782, "step": 2337 }, { "epoch": 1.9195402298850575, "grad_norm": 0.015471559017896652, "learning_rate": 8.512562584108261e-08, "loss": 0.4795, "step": 2338 }, { "epoch": 1.9203612479474548, "grad_norm": 0.015449115075170994, "learning_rate": 8.344013471117525e-08, "loss": 0.5033, "step": 2339 }, { "epoch": 1.9211822660098523, "grad_norm": 0.01570102386176586, "learning_rate": 8.177183705520429e-08, "loss": 0.4748, "step": 2340 }, { "epoch": 1.9220032840722496, "grad_norm": 0.014731856063008308, "learning_rate": 8.012073576539897e-08, "loss": 0.4778, "step": 2341 }, { "epoch": 1.922824302134647, "grad_norm": 0.015431157313287258, "learning_rate": 7.848683370417194e-08, "loss": 0.4662, "step": 2342 }, { "epoch": 1.9236453201970445, "grad_norm": 0.015194197185337543, "learning_rate": 7.687013370412057e-08, "loss": 0.4635, "step": 2343 }, { "epoch": 1.9244663382594416, "grad_norm": 0.015501155517995358, "learning_rate": 7.52706385680212e-08, "loss": 0.4792, "step": 2344 }, { "epoch": 1.9252873563218391, "grad_norm": 0.015052792616188526, "learning_rate": 7.368835106882149e-08, "loss": 0.4616, "step": 2345 }, { "epoch": 1.9261083743842364, "grad_norm": 0.014951469376683235, "learning_rate": 7.212327394963596e-08, "loss": 0.4687, "step": 2346 }, { "epoch": 1.9269293924466337, "grad_norm": 0.015144369564950466, "learning_rate": 7.057540992374595e-08, "loss": 0.4755, "step": 2347 }, { "epoch": 1.9277504105090313, "grad_norm": 0.014515471644699574, "learning_rate": 6.90447616745897e-08, "loss": 0.4695, "step": 2348 }, { "epoch": 1.9285714285714286, "grad_norm": 0.01541223470121622, "learning_rate": 6.753133185575896e-08, "loss": 0.4778, "step": 2349 }, { "epoch": 1.929392446633826, "grad_norm": 0.014813839457929134, "learning_rate": 6.603512309099456e-08, "loss": 0.4763, "step": 2350 }, { "epoch": 1.9302134646962235, "grad_norm": 0.015283233486115932, "learning_rate": 6.455613797418529e-08, "loss": 0.497, "step": 2351 }, { "epoch": 1.9310344827586206, "grad_norm": 0.014593685045838356, "learning_rate": 6.309437906935689e-08, "loss": 0.4576, "step": 2352 }, { "epoch": 1.931855500821018, "grad_norm": 0.01514139212667942, "learning_rate": 6.1649848910673e-08, "loss": 0.4835, "step": 2353 }, { "epoch": 1.9326765188834154, "grad_norm": 0.014766537584364414, "learning_rate": 6.022255000242752e-08, "loss": 0.4749, "step": 2354 }, { "epoch": 1.9334975369458127, "grad_norm": 0.015389851294457912, "learning_rate": 5.881248481904237e-08, "loss": 0.4769, "step": 2355 }, { "epoch": 1.9343185550082103, "grad_norm": 0.015501394867897034, "learning_rate": 5.741965580506296e-08, "loss": 0.4748, "step": 2356 }, { "epoch": 1.9351395730706076, "grad_norm": 0.015523744747042656, "learning_rate": 5.604406537515278e-08, "loss": 0.4633, "step": 2357 }, { "epoch": 1.935960591133005, "grad_norm": 0.015382915735244751, "learning_rate": 5.468571591408881e-08, "loss": 0.4719, "step": 2358 }, { "epoch": 1.9367816091954024, "grad_norm": 0.015075638890266418, "learning_rate": 5.3344609776760536e-08, "loss": 0.4842, "step": 2359 }, { "epoch": 1.9376026272577995, "grad_norm": 0.016605209559202194, "learning_rate": 5.202074928816101e-08, "loss": 0.4773, "step": 2360 }, { "epoch": 1.938423645320197, "grad_norm": 0.015294373966753483, "learning_rate": 5.0714136743389066e-08, "loss": 0.4708, "step": 2361 }, { "epoch": 1.9392446633825944, "grad_norm": 0.015327363274991512, "learning_rate": 4.942477440763824e-08, "loss": 0.4878, "step": 2362 }, { "epoch": 1.9400656814449917, "grad_norm": 0.015385082922875881, "learning_rate": 4.8152664516198964e-08, "loss": 0.4705, "step": 2363 }, { "epoch": 1.9408866995073892, "grad_norm": 0.015252226032316685, "learning_rate": 4.689780927445195e-08, "loss": 0.4559, "step": 2364 }, { "epoch": 1.9417077175697866, "grad_norm": 0.01549266092479229, "learning_rate": 4.5660210857861477e-08, "loss": 0.4918, "step": 2365 }, { "epoch": 1.9425287356321839, "grad_norm": 0.015748847275972366, "learning_rate": 4.4439871411979866e-08, "loss": 0.4916, "step": 2366 }, { "epoch": 1.9433497536945814, "grad_norm": 0.015205973759293556, "learning_rate": 4.3236793052434154e-08, "loss": 0.4706, "step": 2367 }, { "epoch": 1.9441707717569785, "grad_norm": 0.015322347171604633, "learning_rate": 4.20509778649283e-08, "loss": 0.4845, "step": 2368 }, { "epoch": 1.944991789819376, "grad_norm": 0.015182742848992348, "learning_rate": 4.0882427905239876e-08, "loss": 0.4921, "step": 2369 }, { "epoch": 1.9458128078817734, "grad_norm": 0.01528804562985897, "learning_rate": 3.973114519921226e-08, "loss": 0.481, "step": 2370 }, { "epoch": 1.9466338259441707, "grad_norm": 0.015447237528860569, "learning_rate": 3.8597131742754685e-08, "loss": 0.5016, "step": 2371 }, { "epoch": 1.9474548440065682, "grad_norm": 0.015526745468378067, "learning_rate": 3.7480389501837756e-08, "loss": 0.4696, "step": 2372 }, { "epoch": 1.9482758620689655, "grad_norm": 0.015222259797155857, "learning_rate": 3.638092041249015e-08, "loss": 0.4891, "step": 2373 }, { "epoch": 1.9490968801313628, "grad_norm": 0.015381956472992897, "learning_rate": 3.529872638079527e-08, "loss": 0.5164, "step": 2374 }, { "epoch": 1.9499178981937604, "grad_norm": 0.015073809772729874, "learning_rate": 3.423380928288679e-08, "loss": 0.4727, "step": 2375 }, { "epoch": 1.9507389162561575, "grad_norm": 0.015016633085906506, "learning_rate": 3.3186170964947595e-08, "loss": 0.4828, "step": 2376 }, { "epoch": 1.951559934318555, "grad_norm": 0.014783023856580257, "learning_rate": 3.2155813243205285e-08, "loss": 0.4786, "step": 2377 }, { "epoch": 1.9523809523809523, "grad_norm": 0.01484466902911663, "learning_rate": 3.1142737903927755e-08, "loss": 0.4505, "step": 2378 }, { "epoch": 1.9532019704433496, "grad_norm": 0.014960236847400665, "learning_rate": 3.0146946703423204e-08, "loss": 0.4611, "step": 2379 }, { "epoch": 1.9540229885057472, "grad_norm": 0.015473098494112492, "learning_rate": 2.9168441368033468e-08, "loss": 0.47, "step": 2380 }, { "epoch": 1.9548440065681445, "grad_norm": 0.014925132505595684, "learning_rate": 2.8207223594136243e-08, "loss": 0.482, "step": 2381 }, { "epoch": 1.9556650246305418, "grad_norm": 0.015020204707980156, "learning_rate": 2.7263295048135085e-08, "loss": 0.4713, "step": 2382 }, { "epoch": 1.9564860426929394, "grad_norm": 0.015034380368888378, "learning_rate": 2.6336657366462752e-08, "loss": 0.4738, "step": 2383 }, { "epoch": 1.9573070607553367, "grad_norm": 0.015546063892543316, "learning_rate": 2.542731215557342e-08, "loss": 0.4969, "step": 2384 }, { "epoch": 1.958128078817734, "grad_norm": 0.015370727516710758, "learning_rate": 2.453526099194491e-08, "loss": 0.4941, "step": 2385 }, { "epoch": 1.9589490968801315, "grad_norm": 0.01554510835558176, "learning_rate": 2.3660505422073153e-08, "loss": 0.4926, "step": 2386 }, { "epoch": 1.9597701149425286, "grad_norm": 0.015030594542622566, "learning_rate": 2.2803046962466608e-08, "loss": 0.4856, "step": 2387 }, { "epoch": 1.9605911330049262, "grad_norm": 0.015367834828794003, "learning_rate": 2.196288709965183e-08, "loss": 0.4896, "step": 2388 }, { "epoch": 1.9614121510673235, "grad_norm": 0.014817137271165848, "learning_rate": 2.114002729016238e-08, "loss": 0.4793, "step": 2389 }, { "epoch": 1.9622331691297208, "grad_norm": 0.015613330528140068, "learning_rate": 2.0334468960539914e-08, "loss": 0.4847, "step": 2390 }, { "epoch": 1.9630541871921183, "grad_norm": 0.015316348522901535, "learning_rate": 1.954621350733308e-08, "loss": 0.4755, "step": 2391 }, { "epoch": 1.9638752052545156, "grad_norm": 0.014886623248457909, "learning_rate": 1.8775262297091984e-08, "loss": 0.4628, "step": 2392 }, { "epoch": 1.964696223316913, "grad_norm": 0.014942088164389133, "learning_rate": 1.802161666636815e-08, "loss": 0.4842, "step": 2393 }, { "epoch": 1.9655172413793105, "grad_norm": 0.015175285749137402, "learning_rate": 1.7285277921712346e-08, "loss": 0.4634, "step": 2394 }, { "epoch": 1.9663382594417076, "grad_norm": 0.01460226159542799, "learning_rate": 1.656624733966901e-08, "loss": 0.4849, "step": 2395 }, { "epoch": 1.9671592775041051, "grad_norm": 0.015037557110190392, "learning_rate": 1.5864526166778466e-08, "loss": 0.475, "step": 2396 }, { "epoch": 1.9679802955665024, "grad_norm": 0.014700337313115597, "learning_rate": 1.51801156195736e-08, "loss": 0.4769, "step": 2397 }, { "epoch": 1.9688013136288998, "grad_norm": 0.015581055544316769, "learning_rate": 1.4513016884574312e-08, "loss": 0.4824, "step": 2398 }, { "epoch": 1.9696223316912973, "grad_norm": 0.014462685212492943, "learning_rate": 1.3863231118289747e-08, "loss": 0.4584, "step": 2399 }, { "epoch": 1.9704433497536946, "grad_norm": 0.014947470277547836, "learning_rate": 1.3230759447213819e-08, "loss": 0.4563, "step": 2400 }, { "epoch": 1.971264367816092, "grad_norm": 0.015496054664254189, "learning_rate": 1.2615602967825248e-08, "loss": 0.4685, "step": 2401 }, { "epoch": 1.9720853858784895, "grad_norm": 0.015509115532040596, "learning_rate": 1.2017762746581994e-08, "loss": 0.4759, "step": 2402 }, { "epoch": 1.9729064039408866, "grad_norm": 0.015031293965876102, "learning_rate": 1.1437239819925693e-08, "loss": 0.4835, "step": 2403 }, { "epoch": 1.973727422003284, "grad_norm": 0.014577899128198624, "learning_rate": 1.0874035194272788e-08, "loss": 0.4624, "step": 2404 }, { "epoch": 1.9745484400656814, "grad_norm": 0.015408586710691452, "learning_rate": 1.032814984601674e-08, "loss": 0.4974, "step": 2405 }, { "epoch": 1.9753694581280787, "grad_norm": 0.015507093630731106, "learning_rate": 9.799584721526922e-09, "loss": 0.4903, "step": 2406 }, { "epoch": 1.9761904761904763, "grad_norm": 0.015329570509493351, "learning_rate": 9.288340737143061e-09, "loss": 0.4611, "step": 2407 }, { "epoch": 1.9770114942528736, "grad_norm": 0.01562531851232052, "learning_rate": 8.794418779179694e-09, "loss": 0.5066, "step": 2408 }, { "epoch": 1.977832512315271, "grad_norm": 0.01528861466795206, "learning_rate": 8.317819703918384e-09, "loss": 0.475, "step": 2409 }, { "epoch": 1.9786535303776684, "grad_norm": 0.015193655155599117, "learning_rate": 7.85854433761105e-09, "loss": 0.4893, "step": 2410 }, { "epoch": 1.9794745484400655, "grad_norm": 0.015363416634500027, "learning_rate": 7.416593476474429e-09, "loss": 0.48, "step": 2411 }, { "epoch": 1.980295566502463, "grad_norm": 0.015456761233508587, "learning_rate": 6.99196788669339e-09, "loss": 0.487, "step": 2412 }, { "epoch": 1.9811165845648604, "grad_norm": 0.01536094956099987, "learning_rate": 6.584668304414288e-09, "loss": 0.478, "step": 2413 }, { "epoch": 1.9819376026272577, "grad_norm": 0.01530491840094328, "learning_rate": 6.194695435749392e-09, "loss": 0.4731, "step": 2414 }, { "epoch": 1.9827586206896552, "grad_norm": 0.015588871203362942, "learning_rate": 5.822049956769125e-09, "loss": 0.4782, "step": 2415 }, { "epoch": 1.9835796387520526, "grad_norm": 0.014674595557153225, "learning_rate": 5.466732513507605e-09, "loss": 0.4592, "step": 2416 }, { "epoch": 1.9844006568144499, "grad_norm": 0.014861774630844593, "learning_rate": 5.1287437219571025e-09, "loss": 0.4618, "step": 2417 }, { "epoch": 1.9852216748768474, "grad_norm": 0.015453887172043324, "learning_rate": 4.808084168069145e-09, "loss": 0.4846, "step": 2418 }, { "epoch": 1.9860426929392445, "grad_norm": 0.015321428887546062, "learning_rate": 4.504754407750081e-09, "loss": 0.4839, "step": 2419 }, { "epoch": 1.986863711001642, "grad_norm": 0.01495120208710432, "learning_rate": 4.218754966865515e-09, "loss": 0.4803, "step": 2420 }, { "epoch": 1.9876847290640394, "grad_norm": 0.014852608554065228, "learning_rate": 3.950086341234762e-09, "loss": 0.4652, "step": 2421 }, { "epoch": 1.9885057471264367, "grad_norm": 0.015172898769378662, "learning_rate": 3.6987489966330647e-09, "loss": 0.468, "step": 2422 }, { "epoch": 1.9893267651888342, "grad_norm": 0.01570308953523636, "learning_rate": 3.464743368788266e-09, "loss": 0.492, "step": 2423 }, { "epoch": 1.9901477832512315, "grad_norm": 0.015210078097879887, "learning_rate": 3.248069863381917e-09, "loss": 0.4663, "step": 2424 }, { "epoch": 1.9909688013136289, "grad_norm": 0.015290279872715473, "learning_rate": 3.0487288560481636e-09, "loss": 0.4865, "step": 2425 }, { "epoch": 1.9917898193760264, "grad_norm": 0.014876478351652622, "learning_rate": 2.866720692371536e-09, "loss": 0.4979, "step": 2426 }, { "epoch": 1.9926108374384235, "grad_norm": 0.014895363710820675, "learning_rate": 2.702045687889158e-09, "loss": 0.4912, "step": 2427 }, { "epoch": 1.993431855500821, "grad_norm": 0.014767020009458065, "learning_rate": 2.554704128088533e-09, "loss": 0.456, "step": 2428 }, { "epoch": 1.9942528735632183, "grad_norm": 0.015800345689058304, "learning_rate": 2.4246962684064335e-09, "loss": 0.5045, "step": 2429 }, { "epoch": 1.9950738916256157, "grad_norm": 0.015566859394311905, "learning_rate": 2.3120223342288995e-09, "loss": 0.497, "step": 2430 }, { "epoch": 1.9958949096880132, "grad_norm": 0.01547219231724739, "learning_rate": 2.216682520893458e-09, "loss": 0.4867, "step": 2431 }, { "epoch": 1.9967159277504105, "grad_norm": 0.015549108386039734, "learning_rate": 2.1386769936835754e-09, "loss": 0.4847, "step": 2432 }, { "epoch": 1.9975369458128078, "grad_norm": 0.015337791293859482, "learning_rate": 2.0780058878330966e-09, "loss": 0.4638, "step": 2433 }, { "epoch": 1.9983579638752054, "grad_norm": 0.015806861221790314, "learning_rate": 2.0346693085251332e-09, "loss": 0.473, "step": 2434 }, { "epoch": 1.9991789819376025, "grad_norm": 0.01592666655778885, "learning_rate": 2.008667330887625e-09, "loss": 0.4761, "step": 2435 }, { "epoch": 2.0, "grad_norm": 0.015056383796036243, "learning_rate": 2e-09, "loss": 0.4871, "step": 2436 } ], "logging_steps": 1.0, "max_steps": 2436, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8252110628176632e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }