{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9992566897918733, "eval_steps": 500, "global_step": 4035, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007433102081268583, "grad_norm": 5.905340671539307, "learning_rate": 2.4752475247524754e-08, "loss": 0.8843, "step": 1 }, { "epoch": 0.0014866204162537165, "grad_norm": 5.807815074920654, "learning_rate": 4.950495049504951e-08, "loss": 0.8665, "step": 2 }, { "epoch": 0.0022299306243805748, "grad_norm": 5.660592555999756, "learning_rate": 7.425742574257426e-08, "loss": 0.8595, "step": 3 }, { "epoch": 0.002973240832507433, "grad_norm": 5.751865386962891, "learning_rate": 9.900990099009901e-08, "loss": 0.8283, "step": 4 }, { "epoch": 0.0037165510406342913, "grad_norm": 5.583338737487793, "learning_rate": 1.2376237623762377e-07, "loss": 0.8795, "step": 5 }, { "epoch": 0.0044598612487611496, "grad_norm": 5.825685501098633, "learning_rate": 1.4851485148514852e-07, "loss": 0.8833, "step": 6 }, { "epoch": 0.005203171456888008, "grad_norm": 5.812551975250244, "learning_rate": 1.7326732673267329e-07, "loss": 0.8685, "step": 7 }, { "epoch": 0.005946481665014866, "grad_norm": 5.448821544647217, "learning_rate": 1.9801980198019803e-07, "loss": 0.8225, "step": 8 }, { "epoch": 0.006689791873141724, "grad_norm": 5.68565559387207, "learning_rate": 2.2277227722772277e-07, "loss": 0.8437, "step": 9 }, { "epoch": 0.007433102081268583, "grad_norm": 5.729457855224609, "learning_rate": 2.4752475247524754e-07, "loss": 0.8485, "step": 10 }, { "epoch": 0.00817641228939544, "grad_norm": 5.802018642425537, "learning_rate": 2.722772277227723e-07, "loss": 0.8501, "step": 11 }, { "epoch": 0.008919722497522299, "grad_norm": 5.636158466339111, "learning_rate": 2.9702970297029703e-07, "loss": 0.8553, "step": 12 }, { "epoch": 0.009663032705649157, "grad_norm": 5.485172748565674, "learning_rate": 3.217821782178218e-07, "loss": 0.8246, "step": 13 }, { "epoch": 0.010406342913776016, "grad_norm": 5.555150985717773, "learning_rate": 3.4653465346534657e-07, "loss": 0.8131, "step": 14 }, { "epoch": 0.011149653121902874, "grad_norm": 5.51214075088501, "learning_rate": 3.712871287128713e-07, "loss": 0.8394, "step": 15 }, { "epoch": 0.011892963330029732, "grad_norm": 5.548535346984863, "learning_rate": 3.9603960396039606e-07, "loss": 0.8574, "step": 16 }, { "epoch": 0.01263627353815659, "grad_norm": 5.266436576843262, "learning_rate": 4.2079207920792083e-07, "loss": 0.8468, "step": 17 }, { "epoch": 0.013379583746283449, "grad_norm": 5.129281520843506, "learning_rate": 4.4554455445544555e-07, "loss": 0.8136, "step": 18 }, { "epoch": 0.014122893954410307, "grad_norm": 5.342949390411377, "learning_rate": 4.702970297029703e-07, "loss": 0.841, "step": 19 }, { "epoch": 0.014866204162537165, "grad_norm": 4.552680492401123, "learning_rate": 4.950495049504951e-07, "loss": 0.822, "step": 20 }, { "epoch": 0.015609514370664023, "grad_norm": 4.465524196624756, "learning_rate": 5.198019801980199e-07, "loss": 0.8274, "step": 21 }, { "epoch": 0.01635282457879088, "grad_norm": 4.227555751800537, "learning_rate": 5.445544554455446e-07, "loss": 0.8077, "step": 22 }, { "epoch": 0.01709613478691774, "grad_norm": 4.310589790344238, "learning_rate": 5.693069306930694e-07, "loss": 0.8252, "step": 23 }, { "epoch": 0.017839444995044598, "grad_norm": 3.9970104694366455, "learning_rate": 5.940594059405941e-07, "loss": 0.7887, "step": 24 }, { "epoch": 0.018582755203171458, "grad_norm": 4.083194732666016, "learning_rate": 6.188118811881188e-07, "loss": 0.8144, "step": 25 }, { "epoch": 0.019326065411298315, "grad_norm": 4.095208644866943, "learning_rate": 6.435643564356436e-07, "loss": 0.777, "step": 26 }, { "epoch": 0.020069375619425175, "grad_norm": 2.3572006225585938, "learning_rate": 6.683168316831684e-07, "loss": 0.7574, "step": 27 }, { "epoch": 0.02081268582755203, "grad_norm": 2.2594964504241943, "learning_rate": 6.930693069306931e-07, "loss": 0.7421, "step": 28 }, { "epoch": 0.02155599603567889, "grad_norm": 2.391563892364502, "learning_rate": 7.178217821782178e-07, "loss": 0.7559, "step": 29 }, { "epoch": 0.022299306243805748, "grad_norm": 2.4013009071350098, "learning_rate": 7.425742574257426e-07, "loss": 0.7675, "step": 30 }, { "epoch": 0.023042616451932608, "grad_norm": 2.298351526260376, "learning_rate": 7.673267326732673e-07, "loss": 0.7719, "step": 31 }, { "epoch": 0.023785926660059464, "grad_norm": 2.113934278488159, "learning_rate": 7.920792079207921e-07, "loss": 0.7689, "step": 32 }, { "epoch": 0.024529236868186324, "grad_norm": 2.0293784141540527, "learning_rate": 8.168316831683169e-07, "loss": 0.7959, "step": 33 }, { "epoch": 0.02527254707631318, "grad_norm": 1.8610485792160034, "learning_rate": 8.415841584158417e-07, "loss": 0.7517, "step": 34 }, { "epoch": 0.02601585728444004, "grad_norm": 1.7773295640945435, "learning_rate": 8.663366336633663e-07, "loss": 0.7614, "step": 35 }, { "epoch": 0.026759167492566897, "grad_norm": 1.366025686264038, "learning_rate": 8.910891089108911e-07, "loss": 0.745, "step": 36 }, { "epoch": 0.027502477700693757, "grad_norm": 1.5288234949111938, "learning_rate": 9.158415841584159e-07, "loss": 0.6911, "step": 37 }, { "epoch": 0.028245787908820614, "grad_norm": 2.0033557415008545, "learning_rate": 9.405940594059406e-07, "loss": 0.7438, "step": 38 }, { "epoch": 0.028989098116947474, "grad_norm": 2.1585214138031006, "learning_rate": 9.653465346534655e-07, "loss": 0.7197, "step": 39 }, { "epoch": 0.02973240832507433, "grad_norm": 2.012362003326416, "learning_rate": 9.900990099009902e-07, "loss": 0.7224, "step": 40 }, { "epoch": 0.03047571853320119, "grad_norm": 2.19296932220459, "learning_rate": 1.014851485148515e-06, "loss": 0.7317, "step": 41 }, { "epoch": 0.031219028741328047, "grad_norm": 1.8439847230911255, "learning_rate": 1.0396039603960397e-06, "loss": 0.6863, "step": 42 }, { "epoch": 0.0319623389494549, "grad_norm": 1.7286558151245117, "learning_rate": 1.0643564356435644e-06, "loss": 0.6546, "step": 43 }, { "epoch": 0.03270564915758176, "grad_norm": 1.7375023365020752, "learning_rate": 1.0891089108910893e-06, "loss": 0.6892, "step": 44 }, { "epoch": 0.03344895936570862, "grad_norm": 1.4694201946258545, "learning_rate": 1.113861386138614e-06, "loss": 0.6956, "step": 45 }, { "epoch": 0.03419226957383548, "grad_norm": 1.2022805213928223, "learning_rate": 1.1386138613861388e-06, "loss": 0.6803, "step": 46 }, { "epoch": 0.034935579781962336, "grad_norm": 1.0188475847244263, "learning_rate": 1.1633663366336635e-06, "loss": 0.6848, "step": 47 }, { "epoch": 0.035678889990089196, "grad_norm": 0.9226506352424622, "learning_rate": 1.1881188118811881e-06, "loss": 0.6663, "step": 48 }, { "epoch": 0.036422200198216056, "grad_norm": 0.8798975348472595, "learning_rate": 1.212871287128713e-06, "loss": 0.6634, "step": 49 }, { "epoch": 0.037165510406342916, "grad_norm": 0.9661669135093689, "learning_rate": 1.2376237623762377e-06, "loss": 0.6404, "step": 50 }, { "epoch": 0.03790882061446977, "grad_norm": 1.0756473541259766, "learning_rate": 1.2623762376237625e-06, "loss": 0.6502, "step": 51 }, { "epoch": 0.03865213082259663, "grad_norm": 1.027353286743164, "learning_rate": 1.2871287128712872e-06, "loss": 0.6611, "step": 52 }, { "epoch": 0.03939544103072349, "grad_norm": 0.9659296274185181, "learning_rate": 1.311881188118812e-06, "loss": 0.6251, "step": 53 }, { "epoch": 0.04013875123885035, "grad_norm": 0.8395365476608276, "learning_rate": 1.3366336633663367e-06, "loss": 0.6596, "step": 54 }, { "epoch": 0.0408820614469772, "grad_norm": 0.8122340440750122, "learning_rate": 1.3613861386138616e-06, "loss": 0.6441, "step": 55 }, { "epoch": 0.04162537165510406, "grad_norm": 0.8509111404418945, "learning_rate": 1.3861386138613863e-06, "loss": 0.6641, "step": 56 }, { "epoch": 0.04236868186323092, "grad_norm": 0.7928901314735413, "learning_rate": 1.410891089108911e-06, "loss": 0.6555, "step": 57 }, { "epoch": 0.04311199207135778, "grad_norm": 0.7211742997169495, "learning_rate": 1.4356435643564356e-06, "loss": 0.6609, "step": 58 }, { "epoch": 0.043855302279484636, "grad_norm": 0.6866978406906128, "learning_rate": 1.4603960396039605e-06, "loss": 0.6609, "step": 59 }, { "epoch": 0.044598612487611496, "grad_norm": 0.7188010811805725, "learning_rate": 1.4851485148514852e-06, "loss": 0.6563, "step": 60 }, { "epoch": 0.045341922695738356, "grad_norm": 0.6553677916526794, "learning_rate": 1.50990099009901e-06, "loss": 0.6514, "step": 61 }, { "epoch": 0.046085232903865216, "grad_norm": 0.6269822120666504, "learning_rate": 1.5346534653465347e-06, "loss": 0.6441, "step": 62 }, { "epoch": 0.04682854311199207, "grad_norm": 0.6455215215682983, "learning_rate": 1.5594059405940596e-06, "loss": 0.584, "step": 63 }, { "epoch": 0.04757185332011893, "grad_norm": 0.6785221695899963, "learning_rate": 1.5841584158415842e-06, "loss": 0.6057, "step": 64 }, { "epoch": 0.04831516352824579, "grad_norm": 0.7301404476165771, "learning_rate": 1.6089108910891091e-06, "loss": 0.6239, "step": 65 }, { "epoch": 0.04905847373637265, "grad_norm": 0.5305691957473755, "learning_rate": 1.6336633663366338e-06, "loss": 0.6195, "step": 66 }, { "epoch": 0.0498017839444995, "grad_norm": 0.5761064887046814, "learning_rate": 1.6584158415841587e-06, "loss": 0.623, "step": 67 }, { "epoch": 0.05054509415262636, "grad_norm": 0.4838932454586029, "learning_rate": 1.6831683168316833e-06, "loss": 0.6202, "step": 68 }, { "epoch": 0.05128840436075322, "grad_norm": 0.4967688322067261, "learning_rate": 1.7079207920792082e-06, "loss": 0.599, "step": 69 }, { "epoch": 0.05203171456888008, "grad_norm": 0.5389752388000488, "learning_rate": 1.7326732673267326e-06, "loss": 0.5817, "step": 70 }, { "epoch": 0.052775024777006935, "grad_norm": 0.5388440489768982, "learning_rate": 1.7574257425742575e-06, "loss": 0.5713, "step": 71 }, { "epoch": 0.053518334985133795, "grad_norm": 0.5632448196411133, "learning_rate": 1.7821782178217822e-06, "loss": 0.6255, "step": 72 }, { "epoch": 0.054261645193260655, "grad_norm": 0.5152658224105835, "learning_rate": 1.806930693069307e-06, "loss": 0.6237, "step": 73 }, { "epoch": 0.055004955401387515, "grad_norm": 0.5288457870483398, "learning_rate": 1.8316831683168317e-06, "loss": 0.629, "step": 74 }, { "epoch": 0.05574826560951437, "grad_norm": 0.520173192024231, "learning_rate": 1.8564356435643566e-06, "loss": 0.6076, "step": 75 }, { "epoch": 0.05649157581764123, "grad_norm": 0.5277500152587891, "learning_rate": 1.8811881188118813e-06, "loss": 0.6227, "step": 76 }, { "epoch": 0.05723488602576809, "grad_norm": 0.4581058621406555, "learning_rate": 1.9059405940594061e-06, "loss": 0.6009, "step": 77 }, { "epoch": 0.05797819623389495, "grad_norm": 0.4728521406650543, "learning_rate": 1.930693069306931e-06, "loss": 0.5692, "step": 78 }, { "epoch": 0.0587215064420218, "grad_norm": 0.4677862524986267, "learning_rate": 1.9554455445544555e-06, "loss": 0.6089, "step": 79 }, { "epoch": 0.05946481665014866, "grad_norm": 0.5398072600364685, "learning_rate": 1.9801980198019803e-06, "loss": 0.5969, "step": 80 }, { "epoch": 0.06020812685827552, "grad_norm": 0.5086443424224854, "learning_rate": 2.0049504950495052e-06, "loss": 0.5702, "step": 81 }, { "epoch": 0.06095143706640238, "grad_norm": 0.4599297046661377, "learning_rate": 2.02970297029703e-06, "loss": 0.559, "step": 82 }, { "epoch": 0.061694747274529234, "grad_norm": 0.4818575084209442, "learning_rate": 2.0544554455445546e-06, "loss": 0.6081, "step": 83 }, { "epoch": 0.062438057482656094, "grad_norm": 0.4580310881137848, "learning_rate": 2.0792079207920794e-06, "loss": 0.604, "step": 84 }, { "epoch": 0.06318136769078295, "grad_norm": 0.4662623405456543, "learning_rate": 2.103960396039604e-06, "loss": 0.5857, "step": 85 }, { "epoch": 0.0639246778989098, "grad_norm": 0.4752725064754486, "learning_rate": 2.1287128712871288e-06, "loss": 0.5507, "step": 86 }, { "epoch": 0.06466798810703667, "grad_norm": 0.4286254346370697, "learning_rate": 2.1534653465346536e-06, "loss": 0.5688, "step": 87 }, { "epoch": 0.06541129831516353, "grad_norm": 0.43976491689682007, "learning_rate": 2.1782178217821785e-06, "loss": 0.5984, "step": 88 }, { "epoch": 0.06615460852329039, "grad_norm": 0.45546266436576843, "learning_rate": 2.202970297029703e-06, "loss": 0.5806, "step": 89 }, { "epoch": 0.06689791873141725, "grad_norm": 0.4776824116706848, "learning_rate": 2.227722772277228e-06, "loss": 0.5606, "step": 90 }, { "epoch": 0.0676412289395441, "grad_norm": 0.46758347749710083, "learning_rate": 2.2524752475247527e-06, "loss": 0.5744, "step": 91 }, { "epoch": 0.06838453914767097, "grad_norm": 0.40587735176086426, "learning_rate": 2.2772277227722776e-06, "loss": 0.5572, "step": 92 }, { "epoch": 0.06912784935579781, "grad_norm": 0.40530866384506226, "learning_rate": 2.3019801980198025e-06, "loss": 0.5749, "step": 93 }, { "epoch": 0.06987115956392467, "grad_norm": 0.4385751187801361, "learning_rate": 2.326732673267327e-06, "loss": 0.5677, "step": 94 }, { "epoch": 0.07061446977205153, "grad_norm": 0.5101872086524963, "learning_rate": 2.3514851485148514e-06, "loss": 0.5825, "step": 95 }, { "epoch": 0.07135777998017839, "grad_norm": 0.48427459597587585, "learning_rate": 2.3762376237623762e-06, "loss": 0.5784, "step": 96 }, { "epoch": 0.07210109018830525, "grad_norm": 0.4452694058418274, "learning_rate": 2.400990099009901e-06, "loss": 0.5532, "step": 97 }, { "epoch": 0.07284440039643211, "grad_norm": 0.4033268094062805, "learning_rate": 2.425742574257426e-06, "loss": 0.5693, "step": 98 }, { "epoch": 0.07358771060455897, "grad_norm": 0.4787071645259857, "learning_rate": 2.4504950495049505e-06, "loss": 0.5251, "step": 99 }, { "epoch": 0.07433102081268583, "grad_norm": 0.4482133984565735, "learning_rate": 2.4752475247524753e-06, "loss": 0.5925, "step": 100 }, { "epoch": 0.07507433102081268, "grad_norm": 0.4687333405017853, "learning_rate": 2.5e-06, "loss": 0.5716, "step": 101 }, { "epoch": 0.07581764122893954, "grad_norm": 0.46060818433761597, "learning_rate": 2.524752475247525e-06, "loss": 0.5822, "step": 102 }, { "epoch": 0.0765609514370664, "grad_norm": 0.49632108211517334, "learning_rate": 2.54950495049505e-06, "loss": 0.5906, "step": 103 }, { "epoch": 0.07730426164519326, "grad_norm": 0.4452047646045685, "learning_rate": 2.5742574257425744e-06, "loss": 0.5609, "step": 104 }, { "epoch": 0.07804757185332012, "grad_norm": 0.4802757799625397, "learning_rate": 2.5990099009900993e-06, "loss": 0.6013, "step": 105 }, { "epoch": 0.07879088206144698, "grad_norm": 0.4209423363208771, "learning_rate": 2.623762376237624e-06, "loss": 0.5459, "step": 106 }, { "epoch": 0.07953419226957384, "grad_norm": 0.3964388966560364, "learning_rate": 2.648514851485149e-06, "loss": 0.5646, "step": 107 }, { "epoch": 0.0802775024777007, "grad_norm": 0.40172046422958374, "learning_rate": 2.6732673267326735e-06, "loss": 0.554, "step": 108 }, { "epoch": 0.08102081268582755, "grad_norm": 0.4461441934108734, "learning_rate": 2.6980198019801984e-06, "loss": 0.5411, "step": 109 }, { "epoch": 0.0817641228939544, "grad_norm": 0.4360045790672302, "learning_rate": 2.7227722772277232e-06, "loss": 0.5492, "step": 110 }, { "epoch": 0.08250743310208127, "grad_norm": 0.47421205043792725, "learning_rate": 2.747524752475248e-06, "loss": 0.5471, "step": 111 }, { "epoch": 0.08325074331020813, "grad_norm": 0.4908078908920288, "learning_rate": 2.7722772277227726e-06, "loss": 0.5508, "step": 112 }, { "epoch": 0.08399405351833499, "grad_norm": 0.46661320328712463, "learning_rate": 2.7970297029702974e-06, "loss": 0.565, "step": 113 }, { "epoch": 0.08473736372646185, "grad_norm": 0.42289769649505615, "learning_rate": 2.821782178217822e-06, "loss": 0.5535, "step": 114 }, { "epoch": 0.0854806739345887, "grad_norm": 0.4719426929950714, "learning_rate": 2.8465346534653464e-06, "loss": 0.5628, "step": 115 }, { "epoch": 0.08622398414271557, "grad_norm": 0.4354778230190277, "learning_rate": 2.8712871287128712e-06, "loss": 0.551, "step": 116 }, { "epoch": 0.08696729435084243, "grad_norm": 0.45441314578056335, "learning_rate": 2.896039603960396e-06, "loss": 0.5605, "step": 117 }, { "epoch": 0.08771060455896927, "grad_norm": 0.47440457344055176, "learning_rate": 2.920792079207921e-06, "loss": 0.5403, "step": 118 }, { "epoch": 0.08845391476709613, "grad_norm": 0.47835442423820496, "learning_rate": 2.9455445544554454e-06, "loss": 0.5607, "step": 119 }, { "epoch": 0.08919722497522299, "grad_norm": 0.44359564781188965, "learning_rate": 2.9702970297029703e-06, "loss": 0.5455, "step": 120 }, { "epoch": 0.08994053518334985, "grad_norm": 0.48243454098701477, "learning_rate": 2.995049504950495e-06, "loss": 0.5541, "step": 121 }, { "epoch": 0.09068384539147671, "grad_norm": 0.42605650424957275, "learning_rate": 3.01980198019802e-06, "loss": 0.5434, "step": 122 }, { "epoch": 0.09142715559960357, "grad_norm": 0.4616245925426483, "learning_rate": 3.044554455445545e-06, "loss": 0.5501, "step": 123 }, { "epoch": 0.09217046580773043, "grad_norm": 0.4019681513309479, "learning_rate": 3.0693069306930694e-06, "loss": 0.5366, "step": 124 }, { "epoch": 0.09291377601585729, "grad_norm": 0.4853626787662506, "learning_rate": 3.0940594059405943e-06, "loss": 0.5359, "step": 125 }, { "epoch": 0.09365708622398414, "grad_norm": 0.4740798771381378, "learning_rate": 3.118811881188119e-06, "loss": 0.5608, "step": 126 }, { "epoch": 0.094400396432111, "grad_norm": 0.4716520309448242, "learning_rate": 3.143564356435644e-06, "loss": 0.5403, "step": 127 }, { "epoch": 0.09514370664023786, "grad_norm": 0.4860478341579437, "learning_rate": 3.1683168316831685e-06, "loss": 0.5735, "step": 128 }, { "epoch": 0.09588701684836472, "grad_norm": 0.4342196583747864, "learning_rate": 3.1930693069306933e-06, "loss": 0.5544, "step": 129 }, { "epoch": 0.09663032705649158, "grad_norm": 0.464524507522583, "learning_rate": 3.2178217821782182e-06, "loss": 0.5876, "step": 130 }, { "epoch": 0.09737363726461844, "grad_norm": 0.4703315496444702, "learning_rate": 3.242574257425743e-06, "loss": 0.513, "step": 131 }, { "epoch": 0.0981169474727453, "grad_norm": 0.4774945080280304, "learning_rate": 3.2673267326732676e-06, "loss": 0.5622, "step": 132 }, { "epoch": 0.09886025768087216, "grad_norm": 0.4305424690246582, "learning_rate": 3.2920792079207924e-06, "loss": 0.541, "step": 133 }, { "epoch": 0.099603567888999, "grad_norm": 0.451345294713974, "learning_rate": 3.3168316831683173e-06, "loss": 0.5317, "step": 134 }, { "epoch": 0.10034687809712586, "grad_norm": 0.46744972467422485, "learning_rate": 3.341584158415842e-06, "loss": 0.5486, "step": 135 }, { "epoch": 0.10109018830525272, "grad_norm": 0.4475635290145874, "learning_rate": 3.3663366336633666e-06, "loss": 0.5307, "step": 136 }, { "epoch": 0.10183349851337958, "grad_norm": 0.44471287727355957, "learning_rate": 3.3910891089108915e-06, "loss": 0.5363, "step": 137 }, { "epoch": 0.10257680872150644, "grad_norm": 0.4605703055858612, "learning_rate": 3.4158415841584164e-06, "loss": 0.533, "step": 138 }, { "epoch": 0.1033201189296333, "grad_norm": 0.4224656820297241, "learning_rate": 3.4405940594059404e-06, "loss": 0.5802, "step": 139 }, { "epoch": 0.10406342913776016, "grad_norm": 0.4675084352493286, "learning_rate": 3.4653465346534653e-06, "loss": 0.5063, "step": 140 }, { "epoch": 0.10480673934588702, "grad_norm": 0.4248298108577728, "learning_rate": 3.49009900990099e-06, "loss": 0.5487, "step": 141 }, { "epoch": 0.10555004955401387, "grad_norm": 0.45625585317611694, "learning_rate": 3.514851485148515e-06, "loss": 0.5455, "step": 142 }, { "epoch": 0.10629335976214073, "grad_norm": 0.4366876184940338, "learning_rate": 3.5396039603960395e-06, "loss": 0.5585, "step": 143 }, { "epoch": 0.10703666997026759, "grad_norm": 0.48566946387290955, "learning_rate": 3.5643564356435644e-06, "loss": 0.5504, "step": 144 }, { "epoch": 0.10777998017839445, "grad_norm": 0.4654909074306488, "learning_rate": 3.5891089108910892e-06, "loss": 0.5379, "step": 145 }, { "epoch": 0.10852329038652131, "grad_norm": 0.48410966992378235, "learning_rate": 3.613861386138614e-06, "loss": 0.5422, "step": 146 }, { "epoch": 0.10926660059464817, "grad_norm": 0.46971452236175537, "learning_rate": 3.638613861386139e-06, "loss": 0.5394, "step": 147 }, { "epoch": 0.11000991080277503, "grad_norm": 0.43932268023490906, "learning_rate": 3.6633663366336635e-06, "loss": 0.5513, "step": 148 }, { "epoch": 0.11075322101090189, "grad_norm": 0.47325316071510315, "learning_rate": 3.6881188118811883e-06, "loss": 0.5332, "step": 149 }, { "epoch": 0.11149653121902874, "grad_norm": 0.46883144974708557, "learning_rate": 3.712871287128713e-06, "loss": 0.5228, "step": 150 }, { "epoch": 0.1122398414271556, "grad_norm": 0.504351794719696, "learning_rate": 3.737623762376238e-06, "loss": 0.5353, "step": 151 }, { "epoch": 0.11298315163528246, "grad_norm": 0.4592736065387726, "learning_rate": 3.7623762376237625e-06, "loss": 0.5483, "step": 152 }, { "epoch": 0.11372646184340932, "grad_norm": 0.5053746104240417, "learning_rate": 3.7871287128712874e-06, "loss": 0.533, "step": 153 }, { "epoch": 0.11446977205153618, "grad_norm": 0.46648526191711426, "learning_rate": 3.8118811881188123e-06, "loss": 0.5226, "step": 154 }, { "epoch": 0.11521308225966304, "grad_norm": 0.44674375653266907, "learning_rate": 3.836633663366337e-06, "loss": 0.5264, "step": 155 }, { "epoch": 0.1159563924677899, "grad_norm": 0.492818683385849, "learning_rate": 3.861386138613862e-06, "loss": 0.5488, "step": 156 }, { "epoch": 0.11669970267591676, "grad_norm": 0.4794859290122986, "learning_rate": 3.886138613861387e-06, "loss": 0.5607, "step": 157 }, { "epoch": 0.1174430128840436, "grad_norm": 0.43738847970962524, "learning_rate": 3.910891089108911e-06, "loss": 0.5313, "step": 158 }, { "epoch": 0.11818632309217046, "grad_norm": 0.4613577127456665, "learning_rate": 3.935643564356436e-06, "loss": 0.5449, "step": 159 }, { "epoch": 0.11892963330029732, "grad_norm": 0.47308024764060974, "learning_rate": 3.960396039603961e-06, "loss": 0.5584, "step": 160 }, { "epoch": 0.11967294350842418, "grad_norm": 0.4822494685649872, "learning_rate": 3.9851485148514856e-06, "loss": 0.5136, "step": 161 }, { "epoch": 0.12041625371655104, "grad_norm": 0.434329092502594, "learning_rate": 4.0099009900990104e-06, "loss": 0.5435, "step": 162 }, { "epoch": 0.1211595639246779, "grad_norm": 0.4368073344230652, "learning_rate": 4.034653465346535e-06, "loss": 0.5309, "step": 163 }, { "epoch": 0.12190287413280476, "grad_norm": 0.4989222288131714, "learning_rate": 4.05940594059406e-06, "loss": 0.5547, "step": 164 }, { "epoch": 0.12264618434093162, "grad_norm": 0.4628196358680725, "learning_rate": 4.084158415841584e-06, "loss": 0.5399, "step": 165 }, { "epoch": 0.12338949454905847, "grad_norm": 0.44373273849487305, "learning_rate": 4.108910891089109e-06, "loss": 0.5358, "step": 166 }, { "epoch": 0.12413280475718533, "grad_norm": 0.4264870584011078, "learning_rate": 4.133663366336634e-06, "loss": 0.5226, "step": 167 }, { "epoch": 0.12487611496531219, "grad_norm": 0.4214656949043274, "learning_rate": 4.158415841584159e-06, "loss": 0.5351, "step": 168 }, { "epoch": 0.12561942517343905, "grad_norm": 0.42062050104141235, "learning_rate": 4.183168316831684e-06, "loss": 0.52, "step": 169 }, { "epoch": 0.1263627353815659, "grad_norm": 0.4452720284461975, "learning_rate": 4.207920792079208e-06, "loss": 0.542, "step": 170 }, { "epoch": 0.12710604558969277, "grad_norm": 0.48112401366233826, "learning_rate": 4.232673267326733e-06, "loss": 0.52, "step": 171 }, { "epoch": 0.1278493557978196, "grad_norm": 0.4752514362335205, "learning_rate": 4.2574257425742575e-06, "loss": 0.595, "step": 172 }, { "epoch": 0.1285926660059465, "grad_norm": 0.4438402056694031, "learning_rate": 4.282178217821782e-06, "loss": 0.4896, "step": 173 }, { "epoch": 0.12933597621407333, "grad_norm": 0.431855708360672, "learning_rate": 4.306930693069307e-06, "loss": 0.5293, "step": 174 }, { "epoch": 0.1300792864222002, "grad_norm": 0.4909333288669586, "learning_rate": 4.331683168316832e-06, "loss": 0.524, "step": 175 }, { "epoch": 0.13082259663032705, "grad_norm": 0.40157315135002136, "learning_rate": 4.356435643564357e-06, "loss": 0.5304, "step": 176 }, { "epoch": 0.13156590683845393, "grad_norm": 0.4373876452445984, "learning_rate": 4.381188118811882e-06, "loss": 0.52, "step": 177 }, { "epoch": 0.13230921704658077, "grad_norm": 0.44756701588630676, "learning_rate": 4.405940594059406e-06, "loss": 0.5346, "step": 178 }, { "epoch": 0.13305252725470762, "grad_norm": 0.47474202513694763, "learning_rate": 4.430693069306931e-06, "loss": 0.5332, "step": 179 }, { "epoch": 0.1337958374628345, "grad_norm": 0.45652738213539124, "learning_rate": 4.455445544554456e-06, "loss": 0.5619, "step": 180 }, { "epoch": 0.13453914767096134, "grad_norm": 0.4373515844345093, "learning_rate": 4.4801980198019806e-06, "loss": 0.5496, "step": 181 }, { "epoch": 0.1352824578790882, "grad_norm": 0.4370967447757721, "learning_rate": 4.5049504950495054e-06, "loss": 0.4986, "step": 182 }, { "epoch": 0.13602576808721506, "grad_norm": 0.44371795654296875, "learning_rate": 4.52970297029703e-06, "loss": 0.5396, "step": 183 }, { "epoch": 0.13676907829534193, "grad_norm": 0.45473968982696533, "learning_rate": 4.554455445544555e-06, "loss": 0.5188, "step": 184 }, { "epoch": 0.13751238850346878, "grad_norm": 0.44284147024154663, "learning_rate": 4.57920792079208e-06, "loss": 0.514, "step": 185 }, { "epoch": 0.13825569871159563, "grad_norm": 0.42653924226760864, "learning_rate": 4.603960396039605e-06, "loss": 0.5448, "step": 186 }, { "epoch": 0.1389990089197225, "grad_norm": 0.45377421379089355, "learning_rate": 4.628712871287129e-06, "loss": 0.5361, "step": 187 }, { "epoch": 0.13974231912784935, "grad_norm": 0.48410120606422424, "learning_rate": 4.653465346534654e-06, "loss": 0.5074, "step": 188 }, { "epoch": 0.14048562933597622, "grad_norm": 0.45418596267700195, "learning_rate": 4.678217821782179e-06, "loss": 0.5131, "step": 189 }, { "epoch": 0.14122893954410307, "grad_norm": 0.4985758662223816, "learning_rate": 4.702970297029703e-06, "loss": 0.5193, "step": 190 }, { "epoch": 0.14197224975222994, "grad_norm": 0.4467200040817261, "learning_rate": 4.727722772277228e-06, "loss": 0.5075, "step": 191 }, { "epoch": 0.14271555996035679, "grad_norm": 0.4127119183540344, "learning_rate": 4.7524752475247525e-06, "loss": 0.5031, "step": 192 }, { "epoch": 0.14345887016848366, "grad_norm": 0.4410676956176758, "learning_rate": 4.777227722772277e-06, "loss": 0.5105, "step": 193 }, { "epoch": 0.1442021803766105, "grad_norm": 0.5219435691833496, "learning_rate": 4.801980198019802e-06, "loss": 0.5238, "step": 194 }, { "epoch": 0.14494549058473735, "grad_norm": 0.427232027053833, "learning_rate": 4.826732673267327e-06, "loss": 0.5349, "step": 195 }, { "epoch": 0.14568880079286423, "grad_norm": 0.47921016812324524, "learning_rate": 4.851485148514852e-06, "loss": 0.5548, "step": 196 }, { "epoch": 0.14643211100099107, "grad_norm": 0.4536798894405365, "learning_rate": 4.876237623762377e-06, "loss": 0.4965, "step": 197 }, { "epoch": 0.14717542120911795, "grad_norm": 0.45389771461486816, "learning_rate": 4.900990099009901e-06, "loss": 0.5198, "step": 198 }, { "epoch": 0.1479187314172448, "grad_norm": 0.38764092326164246, "learning_rate": 4.925742574257426e-06, "loss": 0.488, "step": 199 }, { "epoch": 0.14866204162537167, "grad_norm": 0.4550466239452362, "learning_rate": 4.950495049504951e-06, "loss": 0.517, "step": 200 }, { "epoch": 0.1494053518334985, "grad_norm": 0.46836602687835693, "learning_rate": 4.9752475247524755e-06, "loss": 0.527, "step": 201 }, { "epoch": 0.15014866204162536, "grad_norm": 0.471284419298172, "learning_rate": 5e-06, "loss": 0.5121, "step": 202 }, { "epoch": 0.15089197224975223, "grad_norm": 0.47552749514579773, "learning_rate": 5.024752475247525e-06, "loss": 0.5158, "step": 203 }, { "epoch": 0.15163528245787908, "grad_norm": 0.45879486203193665, "learning_rate": 5.04950495049505e-06, "loss": 0.4999, "step": 204 }, { "epoch": 0.15237859266600595, "grad_norm": 0.4500840902328491, "learning_rate": 5.074257425742575e-06, "loss": 0.5253, "step": 205 }, { "epoch": 0.1531219028741328, "grad_norm": 0.4661167860031128, "learning_rate": 5.0990099009901e-06, "loss": 0.5129, "step": 206 }, { "epoch": 0.15386521308225967, "grad_norm": 0.4430326223373413, "learning_rate": 5.123762376237624e-06, "loss": 0.5156, "step": 207 }, { "epoch": 0.15460852329038652, "grad_norm": 0.44413721561431885, "learning_rate": 5.148514851485149e-06, "loss": 0.5231, "step": 208 }, { "epoch": 0.1553518334985134, "grad_norm": 0.48977646231651306, "learning_rate": 5.173267326732674e-06, "loss": 0.5578, "step": 209 }, { "epoch": 0.15609514370664024, "grad_norm": 0.4958733320236206, "learning_rate": 5.1980198019801986e-06, "loss": 0.525, "step": 210 }, { "epoch": 0.15683845391476708, "grad_norm": 0.4345434010028839, "learning_rate": 5.2227722772277234e-06, "loss": 0.5456, "step": 211 }, { "epoch": 0.15758176412289396, "grad_norm": 0.4231167733669281, "learning_rate": 5.247524752475248e-06, "loss": 0.4866, "step": 212 }, { "epoch": 0.1583250743310208, "grad_norm": 0.5175653696060181, "learning_rate": 5.272277227722773e-06, "loss": 0.556, "step": 213 }, { "epoch": 0.15906838453914768, "grad_norm": 0.49100637435913086, "learning_rate": 5.297029702970298e-06, "loss": 0.5485, "step": 214 }, { "epoch": 0.15981169474727452, "grad_norm": 0.4906803369522095, "learning_rate": 5.321782178217822e-06, "loss": 0.5081, "step": 215 }, { "epoch": 0.1605550049554014, "grad_norm": 0.5360707640647888, "learning_rate": 5.346534653465347e-06, "loss": 0.5131, "step": 216 }, { "epoch": 0.16129831516352824, "grad_norm": 0.4741499125957489, "learning_rate": 5.371287128712872e-06, "loss": 0.5487, "step": 217 }, { "epoch": 0.1620416253716551, "grad_norm": 0.5384502410888672, "learning_rate": 5.396039603960397e-06, "loss": 0.5026, "step": 218 }, { "epoch": 0.16278493557978196, "grad_norm": 0.48085111379623413, "learning_rate": 5.420792079207922e-06, "loss": 0.482, "step": 219 }, { "epoch": 0.1635282457879088, "grad_norm": 0.5121098160743713, "learning_rate": 5.4455445544554465e-06, "loss": 0.5292, "step": 220 }, { "epoch": 0.16427155599603568, "grad_norm": 0.4785178303718567, "learning_rate": 5.470297029702971e-06, "loss": 0.4944, "step": 221 }, { "epoch": 0.16501486620416253, "grad_norm": 0.4850371778011322, "learning_rate": 5.495049504950496e-06, "loss": 0.5344, "step": 222 }, { "epoch": 0.1657581764122894, "grad_norm": 0.5366983413696289, "learning_rate": 5.519801980198021e-06, "loss": 0.5059, "step": 223 }, { "epoch": 0.16650148662041625, "grad_norm": 0.425606906414032, "learning_rate": 5.544554455445545e-06, "loss": 0.5207, "step": 224 }, { "epoch": 0.16724479682854312, "grad_norm": 0.5467171669006348, "learning_rate": 5.56930693069307e-06, "loss": 0.5087, "step": 225 }, { "epoch": 0.16798810703666997, "grad_norm": 0.509044349193573, "learning_rate": 5.594059405940595e-06, "loss": 0.4891, "step": 226 }, { "epoch": 0.16873141724479682, "grad_norm": 0.6025151014328003, "learning_rate": 5.61881188118812e-06, "loss": 0.5281, "step": 227 }, { "epoch": 0.1694747274529237, "grad_norm": 0.5316563844680786, "learning_rate": 5.643564356435644e-06, "loss": 0.5041, "step": 228 }, { "epoch": 0.17021803766105054, "grad_norm": 0.44227367639541626, "learning_rate": 5.668316831683169e-06, "loss": 0.5178, "step": 229 }, { "epoch": 0.1709613478691774, "grad_norm": 0.6156481504440308, "learning_rate": 5.693069306930693e-06, "loss": 0.5179, "step": 230 }, { "epoch": 0.17170465807730426, "grad_norm": 0.5856399536132812, "learning_rate": 5.717821782178218e-06, "loss": 0.5109, "step": 231 }, { "epoch": 0.17244796828543113, "grad_norm": 0.5178152918815613, "learning_rate": 5.7425742574257425e-06, "loss": 0.502, "step": 232 }, { "epoch": 0.17319127849355798, "grad_norm": 0.5159707069396973, "learning_rate": 5.767326732673267e-06, "loss": 0.5138, "step": 233 }, { "epoch": 0.17393458870168485, "grad_norm": 0.5301708579063416, "learning_rate": 5.792079207920792e-06, "loss": 0.5268, "step": 234 }, { "epoch": 0.1746778989098117, "grad_norm": 0.4690225124359131, "learning_rate": 5.816831683168317e-06, "loss": 0.5008, "step": 235 }, { "epoch": 0.17542120911793854, "grad_norm": 0.4541289508342743, "learning_rate": 5.841584158415842e-06, "loss": 0.5227, "step": 236 }, { "epoch": 0.17616451932606542, "grad_norm": 0.4861178994178772, "learning_rate": 5.866336633663367e-06, "loss": 0.5308, "step": 237 }, { "epoch": 0.17690782953419226, "grad_norm": 0.5014662742614746, "learning_rate": 5.891089108910891e-06, "loss": 0.5045, "step": 238 }, { "epoch": 0.17765113974231914, "grad_norm": 0.5350708365440369, "learning_rate": 5.915841584158416e-06, "loss": 0.5242, "step": 239 }, { "epoch": 0.17839444995044598, "grad_norm": 0.509087860584259, "learning_rate": 5.940594059405941e-06, "loss": 0.5493, "step": 240 }, { "epoch": 0.17913776015857286, "grad_norm": 0.5118116736412048, "learning_rate": 5.9653465346534655e-06, "loss": 0.509, "step": 241 }, { "epoch": 0.1798810703666997, "grad_norm": 0.5453997850418091, "learning_rate": 5.99009900990099e-06, "loss": 0.5001, "step": 242 }, { "epoch": 0.18062438057482655, "grad_norm": 0.535991370677948, "learning_rate": 6.014851485148515e-06, "loss": 0.5119, "step": 243 }, { "epoch": 0.18136769078295342, "grad_norm": 0.5806535482406616, "learning_rate": 6.03960396039604e-06, "loss": 0.4795, "step": 244 }, { "epoch": 0.18211100099108027, "grad_norm": 0.5645806193351746, "learning_rate": 6.064356435643565e-06, "loss": 0.5121, "step": 245 }, { "epoch": 0.18285431119920714, "grad_norm": 0.5311123728752136, "learning_rate": 6.08910891089109e-06, "loss": 0.5021, "step": 246 }, { "epoch": 0.183597621407334, "grad_norm": 0.6022430658340454, "learning_rate": 6.113861386138614e-06, "loss": 0.5811, "step": 247 }, { "epoch": 0.18434093161546086, "grad_norm": 0.5211747288703918, "learning_rate": 6.138613861386139e-06, "loss": 0.5148, "step": 248 }, { "epoch": 0.1850842418235877, "grad_norm": 0.5665345191955566, "learning_rate": 6.163366336633664e-06, "loss": 0.5248, "step": 249 }, { "epoch": 0.18582755203171458, "grad_norm": 0.582675039768219, "learning_rate": 6.1881188118811885e-06, "loss": 0.533, "step": 250 }, { "epoch": 0.18657086223984143, "grad_norm": 0.4879077076911926, "learning_rate": 6.212871287128713e-06, "loss": 0.5107, "step": 251 }, { "epoch": 0.18731417244796827, "grad_norm": 0.552381694316864, "learning_rate": 6.237623762376238e-06, "loss": 0.513, "step": 252 }, { "epoch": 0.18805748265609515, "grad_norm": 0.48247870802879333, "learning_rate": 6.262376237623763e-06, "loss": 0.4847, "step": 253 }, { "epoch": 0.188800792864222, "grad_norm": 0.5372512936592102, "learning_rate": 6.287128712871288e-06, "loss": 0.4768, "step": 254 }, { "epoch": 0.18954410307234887, "grad_norm": 0.5095828175544739, "learning_rate": 6.311881188118812e-06, "loss": 0.4982, "step": 255 }, { "epoch": 0.19028741328047571, "grad_norm": 0.5282643437385559, "learning_rate": 6.336633663366337e-06, "loss": 0.4986, "step": 256 }, { "epoch": 0.1910307234886026, "grad_norm": 0.4941370487213135, "learning_rate": 6.361386138613862e-06, "loss": 0.5029, "step": 257 }, { "epoch": 0.19177403369672943, "grad_norm": 0.5199927687644958, "learning_rate": 6.386138613861387e-06, "loss": 0.5044, "step": 258 }, { "epoch": 0.19251734390485628, "grad_norm": 0.5414229035377502, "learning_rate": 6.4108910891089116e-06, "loss": 0.5362, "step": 259 }, { "epoch": 0.19326065411298315, "grad_norm": 0.5082730650901794, "learning_rate": 6.4356435643564364e-06, "loss": 0.5237, "step": 260 }, { "epoch": 0.19400396432111, "grad_norm": 0.6030643582344055, "learning_rate": 6.460396039603961e-06, "loss": 0.5328, "step": 261 }, { "epoch": 0.19474727452923687, "grad_norm": 0.555672287940979, "learning_rate": 6.485148514851486e-06, "loss": 0.5057, "step": 262 }, { "epoch": 0.19549058473736372, "grad_norm": 0.6111627221107483, "learning_rate": 6.509900990099011e-06, "loss": 0.5101, "step": 263 }, { "epoch": 0.1962338949454906, "grad_norm": 0.6092895269393921, "learning_rate": 6.534653465346535e-06, "loss": 0.528, "step": 264 }, { "epoch": 0.19697720515361744, "grad_norm": 0.5410378575325012, "learning_rate": 6.55940594059406e-06, "loss": 0.5285, "step": 265 }, { "epoch": 0.19772051536174431, "grad_norm": 0.45867103338241577, "learning_rate": 6.584158415841585e-06, "loss": 0.4987, "step": 266 }, { "epoch": 0.19846382556987116, "grad_norm": 0.5248444080352783, "learning_rate": 6.60891089108911e-06, "loss": 0.5381, "step": 267 }, { "epoch": 0.199207135777998, "grad_norm": 0.5353764295578003, "learning_rate": 6.633663366336635e-06, "loss": 0.5286, "step": 268 }, { "epoch": 0.19995044598612488, "grad_norm": 0.458858460187912, "learning_rate": 6.6584158415841595e-06, "loss": 0.5042, "step": 269 }, { "epoch": 0.20069375619425173, "grad_norm": 0.5823876857757568, "learning_rate": 6.683168316831684e-06, "loss": 0.4924, "step": 270 }, { "epoch": 0.2014370664023786, "grad_norm": 0.5166022181510925, "learning_rate": 6.707920792079209e-06, "loss": 0.5041, "step": 271 }, { "epoch": 0.20218037661050545, "grad_norm": 0.5376091003417969, "learning_rate": 6.732673267326733e-06, "loss": 0.5328, "step": 272 }, { "epoch": 0.20292368681863232, "grad_norm": 0.5261780023574829, "learning_rate": 6.757425742574258e-06, "loss": 0.5038, "step": 273 }, { "epoch": 0.20366699702675917, "grad_norm": 0.5278216600418091, "learning_rate": 6.782178217821783e-06, "loss": 0.507, "step": 274 }, { "epoch": 0.204410307234886, "grad_norm": 0.5488489866256714, "learning_rate": 6.806930693069308e-06, "loss": 0.4984, "step": 275 }, { "epoch": 0.2051536174430129, "grad_norm": 0.4797666072845459, "learning_rate": 6.831683168316833e-06, "loss": 0.5038, "step": 276 }, { "epoch": 0.20589692765113973, "grad_norm": 0.5875706076622009, "learning_rate": 6.856435643564358e-06, "loss": 0.5027, "step": 277 }, { "epoch": 0.2066402378592666, "grad_norm": 0.477387398481369, "learning_rate": 6.881188118811881e-06, "loss": 0.5107, "step": 278 }, { "epoch": 0.20738354806739345, "grad_norm": 0.5426738262176514, "learning_rate": 6.905940594059406e-06, "loss": 0.5072, "step": 279 }, { "epoch": 0.20812685827552033, "grad_norm": 0.48128244280815125, "learning_rate": 6.930693069306931e-06, "loss": 0.4997, "step": 280 }, { "epoch": 0.20887016848364717, "grad_norm": 0.5680053234100342, "learning_rate": 6.9554455445544555e-06, "loss": 0.5226, "step": 281 }, { "epoch": 0.20961347869177405, "grad_norm": 0.5856984257698059, "learning_rate": 6.98019801980198e-06, "loss": 0.5155, "step": 282 }, { "epoch": 0.2103567888999009, "grad_norm": 0.5017432570457458, "learning_rate": 7.004950495049505e-06, "loss": 0.4891, "step": 283 }, { "epoch": 0.21110009910802774, "grad_norm": 0.5291122198104858, "learning_rate": 7.02970297029703e-06, "loss": 0.5122, "step": 284 }, { "epoch": 0.2118434093161546, "grad_norm": 0.6198121905326843, "learning_rate": 7.054455445544555e-06, "loss": 0.4878, "step": 285 }, { "epoch": 0.21258671952428146, "grad_norm": 0.5196989178657532, "learning_rate": 7.079207920792079e-06, "loss": 0.5143, "step": 286 }, { "epoch": 0.21333002973240833, "grad_norm": 0.5688585638999939, "learning_rate": 7.103960396039604e-06, "loss": 0.4957, "step": 287 }, { "epoch": 0.21407333994053518, "grad_norm": 0.6560046672821045, "learning_rate": 7.128712871287129e-06, "loss": 0.5026, "step": 288 }, { "epoch": 0.21481665014866205, "grad_norm": 0.4811052680015564, "learning_rate": 7.153465346534654e-06, "loss": 0.4782, "step": 289 }, { "epoch": 0.2155599603567889, "grad_norm": 0.5147226452827454, "learning_rate": 7.1782178217821785e-06, "loss": 0.4971, "step": 290 }, { "epoch": 0.21630327056491575, "grad_norm": 0.5403855443000793, "learning_rate": 7.202970297029703e-06, "loss": 0.5251, "step": 291 }, { "epoch": 0.21704658077304262, "grad_norm": 0.5252436399459839, "learning_rate": 7.227722772277228e-06, "loss": 0.5254, "step": 292 }, { "epoch": 0.21778989098116946, "grad_norm": 0.4940452575683594, "learning_rate": 7.252475247524753e-06, "loss": 0.509, "step": 293 }, { "epoch": 0.21853320118929634, "grad_norm": 0.5129565000534058, "learning_rate": 7.277227722772278e-06, "loss": 0.4797, "step": 294 }, { "epoch": 0.21927651139742318, "grad_norm": 0.5257061123847961, "learning_rate": 7.301980198019802e-06, "loss": 0.5007, "step": 295 }, { "epoch": 0.22001982160555006, "grad_norm": 0.5801428556442261, "learning_rate": 7.326732673267327e-06, "loss": 0.5141, "step": 296 }, { "epoch": 0.2207631318136769, "grad_norm": 0.48855528235435486, "learning_rate": 7.351485148514852e-06, "loss": 0.509, "step": 297 }, { "epoch": 0.22150644202180378, "grad_norm": 0.5569970607757568, "learning_rate": 7.376237623762377e-06, "loss": 0.503, "step": 298 }, { "epoch": 0.22224975222993062, "grad_norm": 0.5113475918769836, "learning_rate": 7.4009900990099015e-06, "loss": 0.4805, "step": 299 }, { "epoch": 0.22299306243805747, "grad_norm": 0.5131458044052124, "learning_rate": 7.425742574257426e-06, "loss": 0.511, "step": 300 }, { "epoch": 0.22373637264618434, "grad_norm": 0.5449301600456238, "learning_rate": 7.450495049504951e-06, "loss": 0.515, "step": 301 }, { "epoch": 0.2244796828543112, "grad_norm": 0.4476087987422943, "learning_rate": 7.475247524752476e-06, "loss": 0.5009, "step": 302 }, { "epoch": 0.22522299306243806, "grad_norm": 0.45732805132865906, "learning_rate": 7.500000000000001e-06, "loss": 0.4918, "step": 303 }, { "epoch": 0.2259663032705649, "grad_norm": 0.4608948528766632, "learning_rate": 7.524752475247525e-06, "loss": 0.4959, "step": 304 }, { "epoch": 0.22670961347869178, "grad_norm": 0.4578394591808319, "learning_rate": 7.54950495049505e-06, "loss": 0.5215, "step": 305 }, { "epoch": 0.22745292368681863, "grad_norm": 0.44605159759521484, "learning_rate": 7.574257425742575e-06, "loss": 0.5357, "step": 306 }, { "epoch": 0.22819623389494548, "grad_norm": 0.4310837686061859, "learning_rate": 7.5990099009901e-06, "loss": 0.486, "step": 307 }, { "epoch": 0.22893954410307235, "grad_norm": 0.5104194283485413, "learning_rate": 7.6237623762376246e-06, "loss": 0.4939, "step": 308 }, { "epoch": 0.2296828543111992, "grad_norm": 0.47748953104019165, "learning_rate": 7.648514851485149e-06, "loss": 0.5009, "step": 309 }, { "epoch": 0.23042616451932607, "grad_norm": 0.44622260332107544, "learning_rate": 7.673267326732674e-06, "loss": 0.5054, "step": 310 }, { "epoch": 0.23116947472745292, "grad_norm": 0.5874997973442078, "learning_rate": 7.698019801980198e-06, "loss": 0.5274, "step": 311 }, { "epoch": 0.2319127849355798, "grad_norm": 0.4257439076900482, "learning_rate": 7.722772277227724e-06, "loss": 0.4888, "step": 312 }, { "epoch": 0.23265609514370664, "grad_norm": 0.43075627088546753, "learning_rate": 7.747524752475248e-06, "loss": 0.4701, "step": 313 }, { "epoch": 0.2333994053518335, "grad_norm": 0.4804212152957916, "learning_rate": 7.772277227722774e-06, "loss": 0.5025, "step": 314 }, { "epoch": 0.23414271555996036, "grad_norm": 0.43794122338294983, "learning_rate": 7.797029702970298e-06, "loss": 0.5053, "step": 315 }, { "epoch": 0.2348860257680872, "grad_norm": 0.5216785073280334, "learning_rate": 7.821782178217822e-06, "loss": 0.5234, "step": 316 }, { "epoch": 0.23562933597621408, "grad_norm": 0.4367227554321289, "learning_rate": 7.846534653465348e-06, "loss": 0.4901, "step": 317 }, { "epoch": 0.23637264618434092, "grad_norm": 0.5354276895523071, "learning_rate": 7.871287128712872e-06, "loss": 0.4869, "step": 318 }, { "epoch": 0.2371159563924678, "grad_norm": 0.4983923137187958, "learning_rate": 7.896039603960397e-06, "loss": 0.5059, "step": 319 }, { "epoch": 0.23785926660059464, "grad_norm": 0.4833095371723175, "learning_rate": 7.920792079207921e-06, "loss": 0.4917, "step": 320 }, { "epoch": 0.23860257680872152, "grad_norm": 0.49613890051841736, "learning_rate": 7.945544554455447e-06, "loss": 0.5285, "step": 321 }, { "epoch": 0.23934588701684836, "grad_norm": 0.53828364610672, "learning_rate": 7.970297029702971e-06, "loss": 0.5052, "step": 322 }, { "epoch": 0.2400891972249752, "grad_norm": 0.4568215608596802, "learning_rate": 7.995049504950497e-06, "loss": 0.4989, "step": 323 }, { "epoch": 0.24083250743310208, "grad_norm": 0.5643250942230225, "learning_rate": 8.019801980198021e-06, "loss": 0.5133, "step": 324 }, { "epoch": 0.24157581764122893, "grad_norm": 0.5668647885322571, "learning_rate": 8.044554455445545e-06, "loss": 0.5027, "step": 325 }, { "epoch": 0.2423191278493558, "grad_norm": 0.4972526729106903, "learning_rate": 8.06930693069307e-06, "loss": 0.4947, "step": 326 }, { "epoch": 0.24306243805748265, "grad_norm": 0.6086891293525696, "learning_rate": 8.094059405940595e-06, "loss": 0.517, "step": 327 }, { "epoch": 0.24380574826560952, "grad_norm": 0.532262921333313, "learning_rate": 8.11881188118812e-06, "loss": 0.4825, "step": 328 }, { "epoch": 0.24454905847373637, "grad_norm": 0.5551033616065979, "learning_rate": 8.143564356435644e-06, "loss": 0.5024, "step": 329 }, { "epoch": 0.24529236868186324, "grad_norm": 0.5989863276481628, "learning_rate": 8.168316831683168e-06, "loss": 0.5058, "step": 330 }, { "epoch": 0.2460356788899901, "grad_norm": 0.5406901240348816, "learning_rate": 8.193069306930692e-06, "loss": 0.4959, "step": 331 }, { "epoch": 0.24677898909811694, "grad_norm": 0.5194695591926575, "learning_rate": 8.217821782178218e-06, "loss": 0.4294, "step": 332 }, { "epoch": 0.2475222993062438, "grad_norm": 0.5767866373062134, "learning_rate": 8.242574257425742e-06, "loss": 0.5049, "step": 333 }, { "epoch": 0.24826560951437066, "grad_norm": 0.5343612432479858, "learning_rate": 8.267326732673268e-06, "loss": 0.5252, "step": 334 }, { "epoch": 0.24900891972249753, "grad_norm": 0.5689433217048645, "learning_rate": 8.292079207920792e-06, "loss": 0.4895, "step": 335 }, { "epoch": 0.24975222993062438, "grad_norm": 0.4871666729450226, "learning_rate": 8.316831683168318e-06, "loss": 0.4888, "step": 336 }, { "epoch": 0.25049554013875125, "grad_norm": 0.590591549873352, "learning_rate": 8.341584158415842e-06, "loss": 0.5001, "step": 337 }, { "epoch": 0.2512388503468781, "grad_norm": 0.549049973487854, "learning_rate": 8.366336633663367e-06, "loss": 0.5264, "step": 338 }, { "epoch": 0.25198216055500494, "grad_norm": 0.5549241304397583, "learning_rate": 8.391089108910891e-06, "loss": 0.5034, "step": 339 }, { "epoch": 0.2527254707631318, "grad_norm": 0.4865972697734833, "learning_rate": 8.415841584158416e-06, "loss": 0.4732, "step": 340 }, { "epoch": 0.2534687809712587, "grad_norm": 0.5994433164596558, "learning_rate": 8.440594059405941e-06, "loss": 0.5056, "step": 341 }, { "epoch": 0.25421209117938554, "grad_norm": 0.5332654118537903, "learning_rate": 8.465346534653465e-06, "loss": 0.5214, "step": 342 }, { "epoch": 0.2549554013875124, "grad_norm": 0.6018056869506836, "learning_rate": 8.490099009900991e-06, "loss": 0.5229, "step": 343 }, { "epoch": 0.2556987115956392, "grad_norm": 0.5076673030853271, "learning_rate": 8.514851485148515e-06, "loss": 0.4898, "step": 344 }, { "epoch": 0.25644202180376613, "grad_norm": 0.5899639129638672, "learning_rate": 8.53960396039604e-06, "loss": 0.5272, "step": 345 }, { "epoch": 0.257185332011893, "grad_norm": 0.5329291224479675, "learning_rate": 8.564356435643565e-06, "loss": 0.5418, "step": 346 }, { "epoch": 0.2579286422200198, "grad_norm": 0.5399122834205627, "learning_rate": 8.58910891089109e-06, "loss": 0.5084, "step": 347 }, { "epoch": 0.25867195242814667, "grad_norm": 0.5656750798225403, "learning_rate": 8.613861386138615e-06, "loss": 0.519, "step": 348 }, { "epoch": 0.2594152626362735, "grad_norm": 0.5160282254219055, "learning_rate": 8.638613861386139e-06, "loss": 0.4677, "step": 349 }, { "epoch": 0.2601585728444004, "grad_norm": 0.5869655013084412, "learning_rate": 8.663366336633664e-06, "loss": 0.4812, "step": 350 }, { "epoch": 0.26090188305252726, "grad_norm": 0.5388080477714539, "learning_rate": 8.688118811881188e-06, "loss": 0.5137, "step": 351 }, { "epoch": 0.2616451932606541, "grad_norm": 0.5506287217140198, "learning_rate": 8.712871287128714e-06, "loss": 0.493, "step": 352 }, { "epoch": 0.26238850346878095, "grad_norm": 0.5026602149009705, "learning_rate": 8.737623762376238e-06, "loss": 0.5015, "step": 353 }, { "epoch": 0.26313181367690786, "grad_norm": 0.5491843819618225, "learning_rate": 8.762376237623764e-06, "loss": 0.5097, "step": 354 }, { "epoch": 0.2638751238850347, "grad_norm": 0.5250900983810425, "learning_rate": 8.787128712871288e-06, "loss": 0.4881, "step": 355 }, { "epoch": 0.26461843409316155, "grad_norm": 0.518272340297699, "learning_rate": 8.811881188118812e-06, "loss": 0.4925, "step": 356 }, { "epoch": 0.2653617443012884, "grad_norm": 0.5581690669059753, "learning_rate": 8.836633663366338e-06, "loss": 0.5089, "step": 357 }, { "epoch": 0.26610505450941524, "grad_norm": 0.48361197113990784, "learning_rate": 8.861386138613862e-06, "loss": 0.4906, "step": 358 }, { "epoch": 0.26684836471754214, "grad_norm": 0.5112094283103943, "learning_rate": 8.886138613861387e-06, "loss": 0.4808, "step": 359 }, { "epoch": 0.267591674925669, "grad_norm": 0.5314705967903137, "learning_rate": 8.910891089108911e-06, "loss": 0.5022, "step": 360 }, { "epoch": 0.26833498513379583, "grad_norm": 0.5173097252845764, "learning_rate": 8.935643564356437e-06, "loss": 0.504, "step": 361 }, { "epoch": 0.2690782953419227, "grad_norm": 0.5030562281608582, "learning_rate": 8.960396039603961e-06, "loss": 0.4943, "step": 362 }, { "epoch": 0.2698216055500496, "grad_norm": 0.5062738656997681, "learning_rate": 8.985148514851487e-06, "loss": 0.5016, "step": 363 }, { "epoch": 0.2705649157581764, "grad_norm": 0.4763631224632263, "learning_rate": 9.009900990099011e-06, "loss": 0.5109, "step": 364 }, { "epoch": 0.2713082259663033, "grad_norm": 0.5753836035728455, "learning_rate": 9.034653465346535e-06, "loss": 0.499, "step": 365 }, { "epoch": 0.2720515361744301, "grad_norm": 0.479875385761261, "learning_rate": 9.05940594059406e-06, "loss": 0.5216, "step": 366 }, { "epoch": 0.27279484638255697, "grad_norm": 0.5494627356529236, "learning_rate": 9.084158415841585e-06, "loss": 0.4922, "step": 367 }, { "epoch": 0.27353815659068387, "grad_norm": 0.5853993892669678, "learning_rate": 9.10891089108911e-06, "loss": 0.467, "step": 368 }, { "epoch": 0.2742814667988107, "grad_norm": 0.49697166681289673, "learning_rate": 9.133663366336634e-06, "loss": 0.4848, "step": 369 }, { "epoch": 0.27502477700693756, "grad_norm": 0.5494098663330078, "learning_rate": 9.15841584158416e-06, "loss": 0.487, "step": 370 }, { "epoch": 0.2757680872150644, "grad_norm": 0.7070233821868896, "learning_rate": 9.183168316831684e-06, "loss": 0.5006, "step": 371 }, { "epoch": 0.27651139742319125, "grad_norm": 0.5181058645248413, "learning_rate": 9.20792079207921e-06, "loss": 0.4904, "step": 372 }, { "epoch": 0.27725470763131815, "grad_norm": 0.5348058938980103, "learning_rate": 9.232673267326734e-06, "loss": 0.5038, "step": 373 }, { "epoch": 0.277998017839445, "grad_norm": 0.5366873741149902, "learning_rate": 9.257425742574258e-06, "loss": 0.4952, "step": 374 }, { "epoch": 0.27874132804757185, "grad_norm": 0.48175108432769775, "learning_rate": 9.282178217821784e-06, "loss": 0.4891, "step": 375 }, { "epoch": 0.2794846382556987, "grad_norm": 0.5470430850982666, "learning_rate": 9.306930693069308e-06, "loss": 0.4907, "step": 376 }, { "epoch": 0.2802279484638256, "grad_norm": 0.5287208557128906, "learning_rate": 9.331683168316833e-06, "loss": 0.4708, "step": 377 }, { "epoch": 0.28097125867195244, "grad_norm": 0.6337811946868896, "learning_rate": 9.356435643564357e-06, "loss": 0.5265, "step": 378 }, { "epoch": 0.2817145688800793, "grad_norm": 0.49275144934654236, "learning_rate": 9.381188118811881e-06, "loss": 0.4844, "step": 379 }, { "epoch": 0.28245787908820613, "grad_norm": 0.5628886222839355, "learning_rate": 9.405940594059405e-06, "loss": 0.4585, "step": 380 }, { "epoch": 0.283201189296333, "grad_norm": 0.5878376364707947, "learning_rate": 9.430693069306931e-06, "loss": 0.4726, "step": 381 }, { "epoch": 0.2839444995044599, "grad_norm": 0.5588452219963074, "learning_rate": 9.455445544554455e-06, "loss": 0.5207, "step": 382 }, { "epoch": 0.2846878097125867, "grad_norm": 0.6335049867630005, "learning_rate": 9.480198019801981e-06, "loss": 0.5338, "step": 383 }, { "epoch": 0.28543111992071357, "grad_norm": 0.580923318862915, "learning_rate": 9.504950495049505e-06, "loss": 0.4884, "step": 384 }, { "epoch": 0.2861744301288404, "grad_norm": 0.4616159200668335, "learning_rate": 9.52970297029703e-06, "loss": 0.5122, "step": 385 }, { "epoch": 0.2869177403369673, "grad_norm": 0.6275190711021423, "learning_rate": 9.554455445544555e-06, "loss": 0.4981, "step": 386 }, { "epoch": 0.28766105054509417, "grad_norm": 0.595289409160614, "learning_rate": 9.579207920792079e-06, "loss": 0.5178, "step": 387 }, { "epoch": 0.288404360753221, "grad_norm": 0.592597246170044, "learning_rate": 9.603960396039604e-06, "loss": 0.5068, "step": 388 }, { "epoch": 0.28914767096134786, "grad_norm": 0.5859998464584351, "learning_rate": 9.628712871287129e-06, "loss": 0.4911, "step": 389 }, { "epoch": 0.2898909811694747, "grad_norm": 0.54558265209198, "learning_rate": 9.653465346534654e-06, "loss": 0.493, "step": 390 }, { "epoch": 0.2906342913776016, "grad_norm": 0.5608166456222534, "learning_rate": 9.678217821782178e-06, "loss": 0.5088, "step": 391 }, { "epoch": 0.29137760158572845, "grad_norm": 0.5343917608261108, "learning_rate": 9.702970297029704e-06, "loss": 0.4648, "step": 392 }, { "epoch": 0.2921209117938553, "grad_norm": 0.4750935733318329, "learning_rate": 9.727722772277228e-06, "loss": 0.4696, "step": 393 }, { "epoch": 0.29286422200198214, "grad_norm": 0.5714291334152222, "learning_rate": 9.752475247524754e-06, "loss": 0.4902, "step": 394 }, { "epoch": 0.29360753221010905, "grad_norm": 0.5159726142883301, "learning_rate": 9.777227722772278e-06, "loss": 0.4918, "step": 395 }, { "epoch": 0.2943508424182359, "grad_norm": 0.5843749642372131, "learning_rate": 9.801980198019802e-06, "loss": 0.5031, "step": 396 }, { "epoch": 0.29509415262636274, "grad_norm": 0.4830499589443207, "learning_rate": 9.826732673267328e-06, "loss": 0.4823, "step": 397 }, { "epoch": 0.2958374628344896, "grad_norm": 0.6220623850822449, "learning_rate": 9.851485148514852e-06, "loss": 0.4796, "step": 398 }, { "epoch": 0.29658077304261643, "grad_norm": 0.5517994165420532, "learning_rate": 9.876237623762377e-06, "loss": 0.4823, "step": 399 }, { "epoch": 0.29732408325074333, "grad_norm": 0.714491605758667, "learning_rate": 9.900990099009901e-06, "loss": 0.5158, "step": 400 }, { "epoch": 0.2980673934588702, "grad_norm": 0.5913609862327576, "learning_rate": 9.925742574257427e-06, "loss": 0.491, "step": 401 }, { "epoch": 0.298810703666997, "grad_norm": 0.5994058847427368, "learning_rate": 9.950495049504951e-06, "loss": 0.481, "step": 402 }, { "epoch": 0.29955401387512387, "grad_norm": 0.6006872057914734, "learning_rate": 9.975247524752477e-06, "loss": 0.4899, "step": 403 }, { "epoch": 0.3002973240832507, "grad_norm": 0.6204814910888672, "learning_rate": 1e-05, "loss": 0.5045, "step": 404 }, { "epoch": 0.3010406342913776, "grad_norm": 0.49750056862831116, "learning_rate": 9.999998128511214e-06, "loss": 0.4934, "step": 405 }, { "epoch": 0.30178394449950446, "grad_norm": 0.6335426568984985, "learning_rate": 9.999992514046258e-06, "loss": 0.4925, "step": 406 }, { "epoch": 0.3025272547076313, "grad_norm": 0.5564846396446228, "learning_rate": 9.999983156609334e-06, "loss": 0.49, "step": 407 }, { "epoch": 0.30327056491575816, "grad_norm": 0.5476748943328857, "learning_rate": 9.999970056207445e-06, "loss": 0.5282, "step": 408 }, { "epoch": 0.30401387512388506, "grad_norm": 0.5818845629692078, "learning_rate": 9.999953212850402e-06, "loss": 0.4879, "step": 409 }, { "epoch": 0.3047571853320119, "grad_norm": 0.5520055294036865, "learning_rate": 9.99993262655081e-06, "loss": 0.5273, "step": 410 }, { "epoch": 0.30550049554013875, "grad_norm": 0.5399459004402161, "learning_rate": 9.999908297324083e-06, "loss": 0.5031, "step": 411 }, { "epoch": 0.3062438057482656, "grad_norm": 0.5103101134300232, "learning_rate": 9.999880225188431e-06, "loss": 0.4589, "step": 412 }, { "epoch": 0.30698711595639244, "grad_norm": 0.6715165376663208, "learning_rate": 9.999848410164871e-06, "loss": 0.5076, "step": 413 }, { "epoch": 0.30773042616451934, "grad_norm": 0.617375910282135, "learning_rate": 9.99981285227722e-06, "loss": 0.5212, "step": 414 }, { "epoch": 0.3084737363726462, "grad_norm": 0.5255922079086304, "learning_rate": 9.999773551552093e-06, "loss": 0.5193, "step": 415 }, { "epoch": 0.30921704658077304, "grad_norm": 0.6237926483154297, "learning_rate": 9.999730508018914e-06, "loss": 0.5156, "step": 416 }, { "epoch": 0.3099603567888999, "grad_norm": 0.475346177816391, "learning_rate": 9.999683721709902e-06, "loss": 0.4827, "step": 417 }, { "epoch": 0.3107036669970268, "grad_norm": 0.6023895740509033, "learning_rate": 9.999633192660084e-06, "loss": 0.4862, "step": 418 }, { "epoch": 0.31144697720515363, "grad_norm": 0.5087665319442749, "learning_rate": 9.999578920907283e-06, "loss": 0.4774, "step": 419 }, { "epoch": 0.3121902874132805, "grad_norm": 0.5809246897697449, "learning_rate": 9.999520906492129e-06, "loss": 0.5069, "step": 420 }, { "epoch": 0.3129335976214073, "grad_norm": 0.5472290515899658, "learning_rate": 9.99945914945805e-06, "loss": 0.5319, "step": 421 }, { "epoch": 0.31367690782953417, "grad_norm": 0.49253055453300476, "learning_rate": 9.999393649851277e-06, "loss": 0.471, "step": 422 }, { "epoch": 0.31442021803766107, "grad_norm": 0.5199219584465027, "learning_rate": 9.999324407720842e-06, "loss": 0.5016, "step": 423 }, { "epoch": 0.3151635282457879, "grad_norm": 0.48456427454948425, "learning_rate": 9.999251423118584e-06, "loss": 0.4454, "step": 424 }, { "epoch": 0.31590683845391476, "grad_norm": 0.5524627566337585, "learning_rate": 9.999174696099133e-06, "loss": 0.4941, "step": 425 }, { "epoch": 0.3166501486620416, "grad_norm": 0.49961355328559875, "learning_rate": 9.99909422671993e-06, "loss": 0.4771, "step": 426 }, { "epoch": 0.3173934588701685, "grad_norm": 0.5012865662574768, "learning_rate": 9.999010015041212e-06, "loss": 0.4686, "step": 427 }, { "epoch": 0.31813676907829536, "grad_norm": 0.5203090310096741, "learning_rate": 9.998922061126021e-06, "loss": 0.5029, "step": 428 }, { "epoch": 0.3188800792864222, "grad_norm": 0.49878522753715515, "learning_rate": 9.998830365040199e-06, "loss": 0.5068, "step": 429 }, { "epoch": 0.31962338949454905, "grad_norm": 0.49596357345581055, "learning_rate": 9.998734926852387e-06, "loss": 0.508, "step": 430 }, { "epoch": 0.3203666997026759, "grad_norm": 0.4770609736442566, "learning_rate": 9.998635746634032e-06, "loss": 0.476, "step": 431 }, { "epoch": 0.3211100099108028, "grad_norm": 0.4898109436035156, "learning_rate": 9.998532824459379e-06, "loss": 0.461, "step": 432 }, { "epoch": 0.32185332011892964, "grad_norm": 0.5054411888122559, "learning_rate": 9.998426160405475e-06, "loss": 0.4907, "step": 433 }, { "epoch": 0.3225966303270565, "grad_norm": 0.5375227332115173, "learning_rate": 9.998315754552169e-06, "loss": 0.5018, "step": 434 }, { "epoch": 0.32333994053518333, "grad_norm": 0.5269981622695923, "learning_rate": 9.99820160698211e-06, "loss": 0.4995, "step": 435 }, { "epoch": 0.3240832507433102, "grad_norm": 0.5755949020385742, "learning_rate": 9.998083717780746e-06, "loss": 0.4902, "step": 436 }, { "epoch": 0.3248265609514371, "grad_norm": 0.48860323429107666, "learning_rate": 9.997962087036333e-06, "loss": 0.4505, "step": 437 }, { "epoch": 0.32556987115956393, "grad_norm": 0.5488467216491699, "learning_rate": 9.99783671483992e-06, "loss": 0.48, "step": 438 }, { "epoch": 0.3263131813676908, "grad_norm": 0.548007607460022, "learning_rate": 9.997707601285359e-06, "loss": 0.4904, "step": 439 }, { "epoch": 0.3270564915758176, "grad_norm": 0.5025060772895813, "learning_rate": 9.997574746469307e-06, "loss": 0.4783, "step": 440 }, { "epoch": 0.3277998017839445, "grad_norm": 0.6320803165435791, "learning_rate": 9.997438150491216e-06, "loss": 0.5061, "step": 441 }, { "epoch": 0.32854311199207137, "grad_norm": 0.5825727581977844, "learning_rate": 9.997297813453344e-06, "loss": 0.4794, "step": 442 }, { "epoch": 0.3292864222001982, "grad_norm": 0.6636599898338318, "learning_rate": 9.997153735460742e-06, "loss": 0.4921, "step": 443 }, { "epoch": 0.33002973240832506, "grad_norm": 0.5351666808128357, "learning_rate": 9.997005916621272e-06, "loss": 0.5061, "step": 444 }, { "epoch": 0.3307730426164519, "grad_norm": 0.625179648399353, "learning_rate": 9.996854357045587e-06, "loss": 0.4779, "step": 445 }, { "epoch": 0.3315163528245788, "grad_norm": 0.5778199434280396, "learning_rate": 9.996699056847143e-06, "loss": 0.4911, "step": 446 }, { "epoch": 0.33225966303270565, "grad_norm": 0.575410008430481, "learning_rate": 9.9965400161422e-06, "loss": 0.4504, "step": 447 }, { "epoch": 0.3330029732408325, "grad_norm": 0.5481746196746826, "learning_rate": 9.996377235049812e-06, "loss": 0.4756, "step": 448 }, { "epoch": 0.33374628344895935, "grad_norm": 0.5678203701972961, "learning_rate": 9.996210713691838e-06, "loss": 0.4909, "step": 449 }, { "epoch": 0.33448959365708625, "grad_norm": 0.5475006699562073, "learning_rate": 9.996040452192935e-06, "loss": 0.5069, "step": 450 }, { "epoch": 0.3352329038652131, "grad_norm": 0.5999965071678162, "learning_rate": 9.99586645068056e-06, "loss": 0.4891, "step": 451 }, { "epoch": 0.33597621407333994, "grad_norm": 0.557653546333313, "learning_rate": 9.995688709284968e-06, "loss": 0.5045, "step": 452 }, { "epoch": 0.3367195242814668, "grad_norm": 0.5363184809684753, "learning_rate": 9.995507228139219e-06, "loss": 0.4612, "step": 453 }, { "epoch": 0.33746283448959363, "grad_norm": 0.6086477637290955, "learning_rate": 9.995322007379167e-06, "loss": 0.4752, "step": 454 }, { "epoch": 0.33820614469772053, "grad_norm": 0.5822886228561401, "learning_rate": 9.995133047143466e-06, "loss": 0.5059, "step": 455 }, { "epoch": 0.3389494549058474, "grad_norm": 0.5608342289924622, "learning_rate": 9.994940347573574e-06, "loss": 0.4821, "step": 456 }, { "epoch": 0.3396927651139742, "grad_norm": 0.5600447058677673, "learning_rate": 9.994743908813742e-06, "loss": 0.4838, "step": 457 }, { "epoch": 0.3404360753221011, "grad_norm": 0.5481860637664795, "learning_rate": 9.994543731011024e-06, "loss": 0.5038, "step": 458 }, { "epoch": 0.341179385530228, "grad_norm": 0.6090419888496399, "learning_rate": 9.994339814315275e-06, "loss": 0.493, "step": 459 }, { "epoch": 0.3419226957383548, "grad_norm": 0.5067794919013977, "learning_rate": 9.994132158879141e-06, "loss": 0.4546, "step": 460 }, { "epoch": 0.34266600594648167, "grad_norm": 0.531303882598877, "learning_rate": 9.993920764858075e-06, "loss": 0.4845, "step": 461 }, { "epoch": 0.3434093161546085, "grad_norm": 0.49158045649528503, "learning_rate": 9.993705632410327e-06, "loss": 0.4799, "step": 462 }, { "epoch": 0.34415262636273536, "grad_norm": 0.6180090308189392, "learning_rate": 9.993486761696943e-06, "loss": 0.4943, "step": 463 }, { "epoch": 0.34489593657086226, "grad_norm": 0.4700208306312561, "learning_rate": 9.993264152881766e-06, "loss": 0.5006, "step": 464 }, { "epoch": 0.3456392467789891, "grad_norm": 0.5312856435775757, "learning_rate": 9.993037806131443e-06, "loss": 0.4925, "step": 465 }, { "epoch": 0.34638255698711595, "grad_norm": 0.5068499445915222, "learning_rate": 9.992807721615414e-06, "loss": 0.4607, "step": 466 }, { "epoch": 0.3471258671952428, "grad_norm": 0.5449957251548767, "learning_rate": 9.992573899505921e-06, "loss": 0.4968, "step": 467 }, { "epoch": 0.3478691774033697, "grad_norm": 0.5942329168319702, "learning_rate": 9.992336339978001e-06, "loss": 0.5011, "step": 468 }, { "epoch": 0.34861248761149655, "grad_norm": 0.5435182452201843, "learning_rate": 9.992095043209494e-06, "loss": 0.4861, "step": 469 }, { "epoch": 0.3493557978196234, "grad_norm": 0.4819882810115814, "learning_rate": 9.991850009381027e-06, "loss": 0.4756, "step": 470 }, { "epoch": 0.35009910802775024, "grad_norm": 0.532892107963562, "learning_rate": 9.991601238676036e-06, "loss": 0.4878, "step": 471 }, { "epoch": 0.3508424182358771, "grad_norm": 0.478719562292099, "learning_rate": 9.991348731280748e-06, "loss": 0.4686, "step": 472 }, { "epoch": 0.351585728444004, "grad_norm": 0.5004722476005554, "learning_rate": 9.991092487384189e-06, "loss": 0.473, "step": 473 }, { "epoch": 0.35232903865213083, "grad_norm": 0.619777500629425, "learning_rate": 9.99083250717818e-06, "loss": 0.5309, "step": 474 }, { "epoch": 0.3530723488602577, "grad_norm": 0.5509745478630066, "learning_rate": 9.990568790857347e-06, "loss": 0.498, "step": 475 }, { "epoch": 0.3538156590683845, "grad_norm": 0.5727150440216064, "learning_rate": 9.9903013386191e-06, "loss": 0.4987, "step": 476 }, { "epoch": 0.35455896927651137, "grad_norm": 0.6386494040489197, "learning_rate": 9.990030150663656e-06, "loss": 0.4695, "step": 477 }, { "epoch": 0.3553022794846383, "grad_norm": 0.5451343059539795, "learning_rate": 9.989755227194027e-06, "loss": 0.4729, "step": 478 }, { "epoch": 0.3560455896927651, "grad_norm": 0.5648337006568909, "learning_rate": 9.989476568416014e-06, "loss": 0.4817, "step": 479 }, { "epoch": 0.35678889990089196, "grad_norm": 0.5866321325302124, "learning_rate": 9.989194174538224e-06, "loss": 0.457, "step": 480 }, { "epoch": 0.3575322101090188, "grad_norm": 0.5510798692703247, "learning_rate": 9.988908045772055e-06, "loss": 0.4797, "step": 481 }, { "epoch": 0.3582755203171457, "grad_norm": 0.5691381096839905, "learning_rate": 9.988618182331701e-06, "loss": 0.4755, "step": 482 }, { "epoch": 0.35901883052527256, "grad_norm": 0.582089900970459, "learning_rate": 9.988324584434153e-06, "loss": 0.4747, "step": 483 }, { "epoch": 0.3597621407333994, "grad_norm": 0.624828577041626, "learning_rate": 9.988027252299198e-06, "loss": 0.4939, "step": 484 }, { "epoch": 0.36050545094152625, "grad_norm": 0.4551810026168823, "learning_rate": 9.987726186149414e-06, "loss": 0.4807, "step": 485 }, { "epoch": 0.3612487611496531, "grad_norm": 0.5872930884361267, "learning_rate": 9.98742138621018e-06, "loss": 0.4676, "step": 486 }, { "epoch": 0.36199207135778, "grad_norm": 0.594639778137207, "learning_rate": 9.98711285270967e-06, "loss": 0.4485, "step": 487 }, { "epoch": 0.36273538156590684, "grad_norm": 0.5599005818367004, "learning_rate": 9.986800585878848e-06, "loss": 0.5059, "step": 488 }, { "epoch": 0.3634786917740337, "grad_norm": 0.5168454051017761, "learning_rate": 9.986484585951477e-06, "loss": 0.4798, "step": 489 }, { "epoch": 0.36422200198216054, "grad_norm": 0.4914504587650299, "learning_rate": 9.986164853164111e-06, "loss": 0.4719, "step": 490 }, { "epoch": 0.36496531219028744, "grad_norm": 0.46753111481666565, "learning_rate": 9.985841387756102e-06, "loss": 0.4831, "step": 491 }, { "epoch": 0.3657086223984143, "grad_norm": 0.48763561248779297, "learning_rate": 9.985514189969596e-06, "loss": 0.4617, "step": 492 }, { "epoch": 0.36645193260654113, "grad_norm": 0.5012291073799133, "learning_rate": 9.985183260049529e-06, "loss": 0.4539, "step": 493 }, { "epoch": 0.367195242814668, "grad_norm": 0.5575759410858154, "learning_rate": 9.984848598243637e-06, "loss": 0.4798, "step": 494 }, { "epoch": 0.3679385530227948, "grad_norm": 0.49914416670799255, "learning_rate": 9.984510204802443e-06, "loss": 0.4843, "step": 495 }, { "epoch": 0.3686818632309217, "grad_norm": 0.5977864265441895, "learning_rate": 9.98416807997927e-06, "loss": 0.5119, "step": 496 }, { "epoch": 0.36942517343904857, "grad_norm": 0.5695549845695496, "learning_rate": 9.983822224030227e-06, "loss": 0.4807, "step": 497 }, { "epoch": 0.3701684836471754, "grad_norm": 0.5131955146789551, "learning_rate": 9.983472637214225e-06, "loss": 0.4725, "step": 498 }, { "epoch": 0.37091179385530226, "grad_norm": 0.6787784695625305, "learning_rate": 9.98311931979296e-06, "loss": 0.4668, "step": 499 }, { "epoch": 0.37165510406342916, "grad_norm": 0.586571991443634, "learning_rate": 9.982762272030924e-06, "loss": 0.4702, "step": 500 }, { "epoch": 0.372398414271556, "grad_norm": 0.47793737053871155, "learning_rate": 9.982401494195402e-06, "loss": 0.4763, "step": 501 }, { "epoch": 0.37314172447968286, "grad_norm": 0.6702011823654175, "learning_rate": 9.982036986556472e-06, "loss": 0.4799, "step": 502 }, { "epoch": 0.3738850346878097, "grad_norm": 0.6476152539253235, "learning_rate": 9.981668749387e-06, "loss": 0.516, "step": 503 }, { "epoch": 0.37462834489593655, "grad_norm": 0.5418809652328491, "learning_rate": 9.981296782962648e-06, "loss": 0.4991, "step": 504 }, { "epoch": 0.37537165510406345, "grad_norm": 0.7416400909423828, "learning_rate": 9.980921087561869e-06, "loss": 0.5022, "step": 505 }, { "epoch": 0.3761149653121903, "grad_norm": 0.540590763092041, "learning_rate": 9.980541663465906e-06, "loss": 0.4947, "step": 506 }, { "epoch": 0.37685827552031714, "grad_norm": 0.648009717464447, "learning_rate": 9.980158510958795e-06, "loss": 0.4879, "step": 507 }, { "epoch": 0.377601585728444, "grad_norm": 0.5023894906044006, "learning_rate": 9.979771630327362e-06, "loss": 0.4947, "step": 508 }, { "epoch": 0.37834489593657084, "grad_norm": 0.6940127015113831, "learning_rate": 9.979381021861223e-06, "loss": 0.4714, "step": 509 }, { "epoch": 0.37908820614469774, "grad_norm": 0.4911518394947052, "learning_rate": 9.978986685852788e-06, "loss": 0.467, "step": 510 }, { "epoch": 0.3798315163528246, "grad_norm": 0.6214549541473389, "learning_rate": 9.978588622597253e-06, "loss": 0.5076, "step": 511 }, { "epoch": 0.38057482656095143, "grad_norm": 0.577860414981842, "learning_rate": 9.978186832392608e-06, "loss": 0.4822, "step": 512 }, { "epoch": 0.3813181367690783, "grad_norm": 0.5750202536582947, "learning_rate": 9.97778131553963e-06, "loss": 0.4823, "step": 513 }, { "epoch": 0.3820614469772052, "grad_norm": 0.7063978314399719, "learning_rate": 9.97737207234189e-06, "loss": 0.5175, "step": 514 }, { "epoch": 0.382804757185332, "grad_norm": 0.5641224384307861, "learning_rate": 9.976959103105738e-06, "loss": 0.4966, "step": 515 }, { "epoch": 0.38354806739345887, "grad_norm": 0.6598021388053894, "learning_rate": 9.97654240814033e-06, "loss": 0.4874, "step": 516 }, { "epoch": 0.3842913776015857, "grad_norm": 0.5088967680931091, "learning_rate": 9.976121987757598e-06, "loss": 0.4687, "step": 517 }, { "epoch": 0.38503468780971256, "grad_norm": 0.616483211517334, "learning_rate": 9.975697842272265e-06, "loss": 0.4696, "step": 518 }, { "epoch": 0.38577799801783946, "grad_norm": 0.4908723831176758, "learning_rate": 9.975269972001848e-06, "loss": 0.4606, "step": 519 }, { "epoch": 0.3865213082259663, "grad_norm": 0.5487363934516907, "learning_rate": 9.974838377266647e-06, "loss": 0.4814, "step": 520 }, { "epoch": 0.38726461843409316, "grad_norm": 0.5914155840873718, "learning_rate": 9.974403058389753e-06, "loss": 0.4692, "step": 521 }, { "epoch": 0.38800792864222, "grad_norm": 0.5199363231658936, "learning_rate": 9.973964015697041e-06, "loss": 0.4542, "step": 522 }, { "epoch": 0.3887512388503469, "grad_norm": 0.5319889783859253, "learning_rate": 9.97352124951718e-06, "loss": 0.4668, "step": 523 }, { "epoch": 0.38949454905847375, "grad_norm": 0.4993482828140259, "learning_rate": 9.973074760181618e-06, "loss": 0.4814, "step": 524 }, { "epoch": 0.3902378592666006, "grad_norm": 0.6336639523506165, "learning_rate": 9.9726245480246e-06, "loss": 0.4876, "step": 525 }, { "epoch": 0.39098116947472744, "grad_norm": 0.5717144012451172, "learning_rate": 9.972170613383151e-06, "loss": 0.4638, "step": 526 }, { "epoch": 0.3917244796828543, "grad_norm": 0.5836862325668335, "learning_rate": 9.971712956597083e-06, "loss": 0.4903, "step": 527 }, { "epoch": 0.3924677898909812, "grad_norm": 0.47930726408958435, "learning_rate": 9.971251578008998e-06, "loss": 0.4627, "step": 528 }, { "epoch": 0.39321110009910804, "grad_norm": 0.5524035096168518, "learning_rate": 9.970786477964281e-06, "loss": 0.463, "step": 529 }, { "epoch": 0.3939544103072349, "grad_norm": 0.5703684091567993, "learning_rate": 9.9703176568111e-06, "loss": 0.4599, "step": 530 }, { "epoch": 0.3946977205153617, "grad_norm": 0.5938689708709717, "learning_rate": 9.969845114900421e-06, "loss": 0.4861, "step": 531 }, { "epoch": 0.39544103072348863, "grad_norm": 0.47876855731010437, "learning_rate": 9.96936885258598e-06, "loss": 0.459, "step": 532 }, { "epoch": 0.3961843409316155, "grad_norm": 0.606468141078949, "learning_rate": 9.968888870224307e-06, "loss": 0.5035, "step": 533 }, { "epoch": 0.3969276511397423, "grad_norm": 0.5284757018089294, "learning_rate": 9.968405168174714e-06, "loss": 0.4344, "step": 534 }, { "epoch": 0.39767096134786917, "grad_norm": 0.5825899839401245, "learning_rate": 9.967917746799298e-06, "loss": 0.4766, "step": 535 }, { "epoch": 0.398414271555996, "grad_norm": 0.5452151298522949, "learning_rate": 9.967426606462942e-06, "loss": 0.471, "step": 536 }, { "epoch": 0.3991575817641229, "grad_norm": 0.5799670815467834, "learning_rate": 9.96693174753331e-06, "loss": 0.4804, "step": 537 }, { "epoch": 0.39990089197224976, "grad_norm": 0.5756499767303467, "learning_rate": 9.966433170380852e-06, "loss": 0.4671, "step": 538 }, { "epoch": 0.4006442021803766, "grad_norm": 0.5751057863235474, "learning_rate": 9.965930875378797e-06, "loss": 0.4813, "step": 539 }, { "epoch": 0.40138751238850345, "grad_norm": 0.5941126346588135, "learning_rate": 9.965424862903168e-06, "loss": 0.4842, "step": 540 }, { "epoch": 0.4021308225966303, "grad_norm": 0.5317739844322205, "learning_rate": 9.964915133332758e-06, "loss": 0.4512, "step": 541 }, { "epoch": 0.4028741328047572, "grad_norm": 0.776828408241272, "learning_rate": 9.96440168704915e-06, "loss": 0.498, "step": 542 }, { "epoch": 0.40361744301288405, "grad_norm": 0.49703362584114075, "learning_rate": 9.96388452443671e-06, "loss": 0.4849, "step": 543 }, { "epoch": 0.4043607532210109, "grad_norm": 0.7468554377555847, "learning_rate": 9.963363645882577e-06, "loss": 0.4852, "step": 544 }, { "epoch": 0.40510406342913774, "grad_norm": 0.5531187653541565, "learning_rate": 9.962839051776685e-06, "loss": 0.4868, "step": 545 }, { "epoch": 0.40584737363726464, "grad_norm": 0.5636987090110779, "learning_rate": 9.96231074251174e-06, "loss": 0.5097, "step": 546 }, { "epoch": 0.4065906838453915, "grad_norm": 0.5738264322280884, "learning_rate": 9.96177871848323e-06, "loss": 0.456, "step": 547 }, { "epoch": 0.40733399405351833, "grad_norm": 0.5867389440536499, "learning_rate": 9.961242980089432e-06, "loss": 0.5123, "step": 548 }, { "epoch": 0.4080773042616452, "grad_norm": 0.52158123254776, "learning_rate": 9.96070352773139e-06, "loss": 0.4962, "step": 549 }, { "epoch": 0.408820614469772, "grad_norm": 0.597061276435852, "learning_rate": 9.960160361812941e-06, "loss": 0.5154, "step": 550 }, { "epoch": 0.4095639246778989, "grad_norm": 0.5463768839836121, "learning_rate": 9.959613482740693e-06, "loss": 0.473, "step": 551 }, { "epoch": 0.4103072348860258, "grad_norm": 0.5058673024177551, "learning_rate": 9.95906289092404e-06, "loss": 0.4603, "step": 552 }, { "epoch": 0.4110505450941526, "grad_norm": 0.6111001968383789, "learning_rate": 9.95850858677515e-06, "loss": 0.4941, "step": 553 }, { "epoch": 0.41179385530227947, "grad_norm": 0.5430198907852173, "learning_rate": 9.957950570708977e-06, "loss": 0.4637, "step": 554 }, { "epoch": 0.41253716551040637, "grad_norm": 0.6374390721321106, "learning_rate": 9.957388843143243e-06, "loss": 0.4639, "step": 555 }, { "epoch": 0.4132804757185332, "grad_norm": 0.6621878743171692, "learning_rate": 9.956823404498458e-06, "loss": 0.4558, "step": 556 }, { "epoch": 0.41402378592666006, "grad_norm": 0.49974730610847473, "learning_rate": 9.956254255197909e-06, "loss": 0.4655, "step": 557 }, { "epoch": 0.4147670961347869, "grad_norm": 0.6885067820549011, "learning_rate": 9.955681395667655e-06, "loss": 0.4978, "step": 558 }, { "epoch": 0.41551040634291375, "grad_norm": 0.5243480801582336, "learning_rate": 9.955104826336539e-06, "loss": 0.4841, "step": 559 }, { "epoch": 0.41625371655104065, "grad_norm": 0.652668297290802, "learning_rate": 9.954524547636175e-06, "loss": 0.5191, "step": 560 }, { "epoch": 0.4169970267591675, "grad_norm": 0.6432493925094604, "learning_rate": 9.953940560000962e-06, "loss": 0.4799, "step": 561 }, { "epoch": 0.41774033696729435, "grad_norm": 0.6089559197425842, "learning_rate": 9.953352863868063e-06, "loss": 0.4705, "step": 562 }, { "epoch": 0.4184836471754212, "grad_norm": 0.5594080686569214, "learning_rate": 9.95276145967743e-06, "loss": 0.4818, "step": 563 }, { "epoch": 0.4192269573835481, "grad_norm": 0.619881272315979, "learning_rate": 9.952166347871786e-06, "loss": 0.4866, "step": 564 }, { "epoch": 0.41997026759167494, "grad_norm": 0.5263139009475708, "learning_rate": 9.951567528896628e-06, "loss": 0.477, "step": 565 }, { "epoch": 0.4207135777998018, "grad_norm": 0.503973126411438, "learning_rate": 9.950965003200227e-06, "loss": 0.4751, "step": 566 }, { "epoch": 0.42145688800792863, "grad_norm": 0.48759156465530396, "learning_rate": 9.950358771233633e-06, "loss": 0.4646, "step": 567 }, { "epoch": 0.4222001982160555, "grad_norm": 0.4430334270000458, "learning_rate": 9.94974883345067e-06, "loss": 0.453, "step": 568 }, { "epoch": 0.4229435084241824, "grad_norm": 0.5764279365539551, "learning_rate": 9.94913519030793e-06, "loss": 0.48, "step": 569 }, { "epoch": 0.4236868186323092, "grad_norm": 0.5012962222099304, "learning_rate": 9.948517842264788e-06, "loss": 0.473, "step": 570 }, { "epoch": 0.42443012884043607, "grad_norm": 0.5135936737060547, "learning_rate": 9.947896789783387e-06, "loss": 0.4529, "step": 571 }, { "epoch": 0.4251734390485629, "grad_norm": 0.5244408249855042, "learning_rate": 9.947272033328645e-06, "loss": 0.5019, "step": 572 }, { "epoch": 0.4259167492566898, "grad_norm": 0.46449771523475647, "learning_rate": 9.946643573368249e-06, "loss": 0.5031, "step": 573 }, { "epoch": 0.42666005946481667, "grad_norm": 0.5138418078422546, "learning_rate": 9.946011410372663e-06, "loss": 0.4729, "step": 574 }, { "epoch": 0.4274033696729435, "grad_norm": 0.5857571363449097, "learning_rate": 9.945375544815118e-06, "loss": 0.4624, "step": 575 }, { "epoch": 0.42814667988107036, "grad_norm": 0.5026090145111084, "learning_rate": 9.944735977171627e-06, "loss": 0.5174, "step": 576 }, { "epoch": 0.4288899900891972, "grad_norm": 0.6173230409622192, "learning_rate": 9.944092707920962e-06, "loss": 0.4533, "step": 577 }, { "epoch": 0.4296333002973241, "grad_norm": 0.5695835947990417, "learning_rate": 9.943445737544674e-06, "loss": 0.4617, "step": 578 }, { "epoch": 0.43037661050545095, "grad_norm": 0.5167635679244995, "learning_rate": 9.942795066527083e-06, "loss": 0.4971, "step": 579 }, { "epoch": 0.4311199207135778, "grad_norm": 0.5369362831115723, "learning_rate": 9.942140695355273e-06, "loss": 0.4878, "step": 580 }, { "epoch": 0.43186323092170464, "grad_norm": 0.6126880645751953, "learning_rate": 9.94148262451911e-06, "loss": 0.4703, "step": 581 }, { "epoch": 0.4326065411298315, "grad_norm": 0.5431976914405823, "learning_rate": 9.94082085451122e-06, "loss": 0.4946, "step": 582 }, { "epoch": 0.4333498513379584, "grad_norm": 0.5065004825592041, "learning_rate": 9.940155385826998e-06, "loss": 0.4915, "step": 583 }, { "epoch": 0.43409316154608524, "grad_norm": 0.5377302765846252, "learning_rate": 9.939486218964616e-06, "loss": 0.4858, "step": 584 }, { "epoch": 0.4348364717542121, "grad_norm": 0.6147332191467285, "learning_rate": 9.938813354425007e-06, "loss": 0.4806, "step": 585 }, { "epoch": 0.43557978196233893, "grad_norm": 0.49938836693763733, "learning_rate": 9.938136792711875e-06, "loss": 0.4568, "step": 586 }, { "epoch": 0.43632309217046583, "grad_norm": 0.5218966603279114, "learning_rate": 9.937456534331689e-06, "loss": 0.4653, "step": 587 }, { "epoch": 0.4370664023785927, "grad_norm": 0.625738799571991, "learning_rate": 9.936772579793691e-06, "loss": 0.4599, "step": 588 }, { "epoch": 0.4378097125867195, "grad_norm": 0.586622416973114, "learning_rate": 9.936084929609884e-06, "loss": 0.4756, "step": 589 }, { "epoch": 0.43855302279484637, "grad_norm": 0.5094144344329834, "learning_rate": 9.935393584295039e-06, "loss": 0.4734, "step": 590 }, { "epoch": 0.4392963330029732, "grad_norm": 0.6149701476097107, "learning_rate": 9.934698544366694e-06, "loss": 0.4955, "step": 591 }, { "epoch": 0.4400396432111001, "grad_norm": 0.5335476994514465, "learning_rate": 9.933999810345156e-06, "loss": 0.485, "step": 592 }, { "epoch": 0.44078295341922696, "grad_norm": 0.5479117631912231, "learning_rate": 9.933297382753491e-06, "loss": 0.4903, "step": 593 }, { "epoch": 0.4415262636273538, "grad_norm": 0.5118146538734436, "learning_rate": 9.932591262117536e-06, "loss": 0.452, "step": 594 }, { "epoch": 0.44226957383548066, "grad_norm": 0.5134488940238953, "learning_rate": 9.931881448965885e-06, "loss": 0.4782, "step": 595 }, { "epoch": 0.44301288404360756, "grad_norm": 0.5181375741958618, "learning_rate": 9.931167943829906e-06, "loss": 0.4736, "step": 596 }, { "epoch": 0.4437561942517344, "grad_norm": 0.5755943655967712, "learning_rate": 9.930450747243724e-06, "loss": 0.4802, "step": 597 }, { "epoch": 0.44449950445986125, "grad_norm": 0.4711308777332306, "learning_rate": 9.929729859744229e-06, "loss": 0.4554, "step": 598 }, { "epoch": 0.4452428146679881, "grad_norm": 0.5302271842956543, "learning_rate": 9.929005281871072e-06, "loss": 0.4659, "step": 599 }, { "epoch": 0.44598612487611494, "grad_norm": 0.46014416217803955, "learning_rate": 9.92827701416667e-06, "loss": 0.4574, "step": 600 }, { "epoch": 0.44672943508424184, "grad_norm": 0.5306234955787659, "learning_rate": 9.927545057176204e-06, "loss": 0.4797, "step": 601 }, { "epoch": 0.4474727452923687, "grad_norm": 0.4577465355396271, "learning_rate": 9.92680941144761e-06, "loss": 0.4599, "step": 602 }, { "epoch": 0.44821605550049554, "grad_norm": 0.5206385850906372, "learning_rate": 9.926070077531591e-06, "loss": 0.4495, "step": 603 }, { "epoch": 0.4489593657086224, "grad_norm": 0.5479946732521057, "learning_rate": 9.925327055981609e-06, "loss": 0.4709, "step": 604 }, { "epoch": 0.4497026759167493, "grad_norm": 0.5019120573997498, "learning_rate": 9.924580347353885e-06, "loss": 0.5076, "step": 605 }, { "epoch": 0.45044598612487613, "grad_norm": 0.5605428218841553, "learning_rate": 9.923829952207403e-06, "loss": 0.449, "step": 606 }, { "epoch": 0.451189296333003, "grad_norm": 0.49342456459999084, "learning_rate": 9.923075871103906e-06, "loss": 0.4831, "step": 607 }, { "epoch": 0.4519326065411298, "grad_norm": 0.5123006701469421, "learning_rate": 9.922318104607894e-06, "loss": 0.4752, "step": 608 }, { "epoch": 0.45267591674925667, "grad_norm": 0.5444505214691162, "learning_rate": 9.921556653286629e-06, "loss": 0.4744, "step": 609 }, { "epoch": 0.45341922695738357, "grad_norm": 0.5340210795402527, "learning_rate": 9.920791517710129e-06, "loss": 0.4578, "step": 610 }, { "epoch": 0.4541625371655104, "grad_norm": 0.4989148676395416, "learning_rate": 9.920022698451173e-06, "loss": 0.4724, "step": 611 }, { "epoch": 0.45490584737363726, "grad_norm": 0.5524435639381409, "learning_rate": 9.919250196085293e-06, "loss": 0.4957, "step": 612 }, { "epoch": 0.4556491575817641, "grad_norm": 0.5104256272315979, "learning_rate": 9.918474011190781e-06, "loss": 0.463, "step": 613 }, { "epoch": 0.45639246778989095, "grad_norm": 0.5444559454917908, "learning_rate": 9.917694144348689e-06, "loss": 0.4611, "step": 614 }, { "epoch": 0.45713577799801786, "grad_norm": 0.47436603903770447, "learning_rate": 9.916910596142819e-06, "loss": 0.4874, "step": 615 }, { "epoch": 0.4578790882061447, "grad_norm": 0.5419439077377319, "learning_rate": 9.91612336715973e-06, "loss": 0.4652, "step": 616 }, { "epoch": 0.45862239841427155, "grad_norm": 0.5047597289085388, "learning_rate": 9.915332457988745e-06, "loss": 0.5152, "step": 617 }, { "epoch": 0.4593657086223984, "grad_norm": 0.5034196972846985, "learning_rate": 9.914537869221927e-06, "loss": 0.4771, "step": 618 }, { "epoch": 0.4601090188305253, "grad_norm": 0.5433529019355774, "learning_rate": 9.913739601454104e-06, "loss": 0.4837, "step": 619 }, { "epoch": 0.46085232903865214, "grad_norm": 0.5222334265708923, "learning_rate": 9.912937655282858e-06, "loss": 0.4267, "step": 620 }, { "epoch": 0.461595639246779, "grad_norm": 0.6051110029220581, "learning_rate": 9.912132031308522e-06, "loss": 0.4813, "step": 621 }, { "epoch": 0.46233894945490583, "grad_norm": 0.4668157398700714, "learning_rate": 9.911322730134179e-06, "loss": 0.4664, "step": 622 }, { "epoch": 0.4630822596630327, "grad_norm": 0.4921639561653137, "learning_rate": 9.910509752365671e-06, "loss": 0.4751, "step": 623 }, { "epoch": 0.4638255698711596, "grad_norm": 0.4957593083381653, "learning_rate": 9.90969309861159e-06, "loss": 0.4805, "step": 624 }, { "epoch": 0.46456888007928643, "grad_norm": 0.4643923044204712, "learning_rate": 9.908872769483279e-06, "loss": 0.4387, "step": 625 }, { "epoch": 0.4653121902874133, "grad_norm": 0.6181691288948059, "learning_rate": 9.908048765594831e-06, "loss": 0.5024, "step": 626 }, { "epoch": 0.4660555004955401, "grad_norm": 0.46761977672576904, "learning_rate": 9.907221087563093e-06, "loss": 0.473, "step": 627 }, { "epoch": 0.466798810703667, "grad_norm": 0.5462128520011902, "learning_rate": 9.90638973600766e-06, "loss": 0.4394, "step": 628 }, { "epoch": 0.46754212091179387, "grad_norm": 0.46988722681999207, "learning_rate": 9.905554711550879e-06, "loss": 0.4912, "step": 629 }, { "epoch": 0.4682854311199207, "grad_norm": 0.5360676646232605, "learning_rate": 9.904716014817848e-06, "loss": 0.4948, "step": 630 }, { "epoch": 0.46902874132804756, "grad_norm": 0.5678356289863586, "learning_rate": 9.903873646436405e-06, "loss": 0.448, "step": 631 }, { "epoch": 0.4697720515361744, "grad_norm": 0.5429752469062805, "learning_rate": 9.90302760703715e-06, "loss": 0.4856, "step": 632 }, { "epoch": 0.4705153617443013, "grad_norm": 0.6103689074516296, "learning_rate": 9.90217789725342e-06, "loss": 0.4547, "step": 633 }, { "epoch": 0.47125867195242815, "grad_norm": 0.5541225075721741, "learning_rate": 9.901324517721306e-06, "loss": 0.4633, "step": 634 }, { "epoch": 0.472001982160555, "grad_norm": 0.5446614623069763, "learning_rate": 9.900467469079646e-06, "loss": 0.4911, "step": 635 }, { "epoch": 0.47274529236868185, "grad_norm": 0.5170345306396484, "learning_rate": 9.899606751970017e-06, "loss": 0.4894, "step": 636 }, { "epoch": 0.47348860257680875, "grad_norm": 0.5190234780311584, "learning_rate": 9.898742367036753e-06, "loss": 0.5041, "step": 637 }, { "epoch": 0.4742319127849356, "grad_norm": 0.4774671494960785, "learning_rate": 9.897874314926925e-06, "loss": 0.4595, "step": 638 }, { "epoch": 0.47497522299306244, "grad_norm": 0.4908256530761719, "learning_rate": 9.897002596290357e-06, "loss": 0.4711, "step": 639 }, { "epoch": 0.4757185332011893, "grad_norm": 0.5735973119735718, "learning_rate": 9.896127211779611e-06, "loss": 0.472, "step": 640 }, { "epoch": 0.47646184340931613, "grad_norm": 0.46258601546287537, "learning_rate": 9.895248162049996e-06, "loss": 0.4852, "step": 641 }, { "epoch": 0.47720515361744303, "grad_norm": 0.6210533976554871, "learning_rate": 9.894365447759565e-06, "loss": 0.4615, "step": 642 }, { "epoch": 0.4779484638255699, "grad_norm": 0.502743124961853, "learning_rate": 9.893479069569113e-06, "loss": 0.4685, "step": 643 }, { "epoch": 0.4786917740336967, "grad_norm": 0.499509334564209, "learning_rate": 9.892589028142182e-06, "loss": 0.4691, "step": 644 }, { "epoch": 0.4794350842418236, "grad_norm": 0.472807377576828, "learning_rate": 9.891695324145052e-06, "loss": 0.472, "step": 645 }, { "epoch": 0.4801783944499504, "grad_norm": 0.5001767873764038, "learning_rate": 9.890797958246742e-06, "loss": 0.4414, "step": 646 }, { "epoch": 0.4809217046580773, "grad_norm": 0.4905308485031128, "learning_rate": 9.88989693111902e-06, "loss": 0.4825, "step": 647 }, { "epoch": 0.48166501486620417, "grad_norm": 0.5648345351219177, "learning_rate": 9.888992243436388e-06, "loss": 0.4907, "step": 648 }, { "epoch": 0.482408325074331, "grad_norm": 0.4872819185256958, "learning_rate": 9.888083895876095e-06, "loss": 0.4818, "step": 649 }, { "epoch": 0.48315163528245786, "grad_norm": 0.44704869389533997, "learning_rate": 9.887171889118121e-06, "loss": 0.452, "step": 650 }, { "epoch": 0.48389494549058476, "grad_norm": 0.5661419034004211, "learning_rate": 9.886256223845196e-06, "loss": 0.4845, "step": 651 }, { "epoch": 0.4846382556987116, "grad_norm": 0.5802968740463257, "learning_rate": 9.885336900742776e-06, "loss": 0.4726, "step": 652 }, { "epoch": 0.48538156590683845, "grad_norm": 0.5797024965286255, "learning_rate": 9.88441392049907e-06, "loss": 0.4663, "step": 653 }, { "epoch": 0.4861248761149653, "grad_norm": 0.5713447332382202, "learning_rate": 9.88348728380501e-06, "loss": 0.4575, "step": 654 }, { "epoch": 0.48686818632309214, "grad_norm": 0.4710460901260376, "learning_rate": 9.882556991354275e-06, "loss": 0.4859, "step": 655 }, { "epoch": 0.48761149653121905, "grad_norm": 0.5686562657356262, "learning_rate": 9.881623043843279e-06, "loss": 0.4807, "step": 656 }, { "epoch": 0.4883548067393459, "grad_norm": 0.4773658514022827, "learning_rate": 9.880685441971169e-06, "loss": 0.4955, "step": 657 }, { "epoch": 0.48909811694747274, "grad_norm": 0.5471131205558777, "learning_rate": 9.87974418643983e-06, "loss": 0.4674, "step": 658 }, { "epoch": 0.4898414271555996, "grad_norm": 0.4857066869735718, "learning_rate": 9.87879927795388e-06, "loss": 0.5082, "step": 659 }, { "epoch": 0.4905847373637265, "grad_norm": 0.561384379863739, "learning_rate": 9.877850717220676e-06, "loss": 0.4912, "step": 660 }, { "epoch": 0.49132804757185333, "grad_norm": 0.47961750626564026, "learning_rate": 9.876898504950307e-06, "loss": 0.4446, "step": 661 }, { "epoch": 0.4920713577799802, "grad_norm": 0.5543147325515747, "learning_rate": 9.875942641855592e-06, "loss": 0.4583, "step": 662 }, { "epoch": 0.492814667988107, "grad_norm": 0.5470408201217651, "learning_rate": 9.874983128652087e-06, "loss": 0.4774, "step": 663 }, { "epoch": 0.49355797819623387, "grad_norm": 0.5253694653511047, "learning_rate": 9.874019966058079e-06, "loss": 0.4642, "step": 664 }, { "epoch": 0.4943012884043608, "grad_norm": 0.7614284157752991, "learning_rate": 9.873053154794587e-06, "loss": 0.4297, "step": 665 }, { "epoch": 0.4950445986124876, "grad_norm": 0.46130627393722534, "learning_rate": 9.872082695585363e-06, "loss": 0.4487, "step": 666 }, { "epoch": 0.49578790882061446, "grad_norm": 0.5756059885025024, "learning_rate": 9.871108589156888e-06, "loss": 0.4693, "step": 667 }, { "epoch": 0.4965312190287413, "grad_norm": 0.5758311748504639, "learning_rate": 9.870130836238372e-06, "loss": 0.4566, "step": 668 }, { "epoch": 0.4972745292368682, "grad_norm": 0.5050128102302551, "learning_rate": 9.869149437561756e-06, "loss": 0.4754, "step": 669 }, { "epoch": 0.49801783944499506, "grad_norm": 0.5773419737815857, "learning_rate": 9.868164393861714e-06, "loss": 0.4608, "step": 670 }, { "epoch": 0.4987611496531219, "grad_norm": 0.5679943561553955, "learning_rate": 9.867175705875644e-06, "loss": 0.4928, "step": 671 }, { "epoch": 0.49950445986124875, "grad_norm": 0.4940268099308014, "learning_rate": 9.866183374343672e-06, "loss": 0.4577, "step": 672 }, { "epoch": 0.5002477700693756, "grad_norm": 0.531087338924408, "learning_rate": 9.865187400008656e-06, "loss": 0.4536, "step": 673 }, { "epoch": 0.5009910802775025, "grad_norm": 0.4906018078327179, "learning_rate": 9.864187783616172e-06, "loss": 0.4565, "step": 674 }, { "epoch": 0.5017343904856293, "grad_norm": 0.5216202735900879, "learning_rate": 9.863184525914536e-06, "loss": 0.4693, "step": 675 }, { "epoch": 0.5024777006937562, "grad_norm": 0.5577301383018494, "learning_rate": 9.862177627654776e-06, "loss": 0.4686, "step": 676 }, { "epoch": 0.5032210109018831, "grad_norm": 0.4859778583049774, "learning_rate": 9.861167089590652e-06, "loss": 0.4407, "step": 677 }, { "epoch": 0.5039643211100099, "grad_norm": 0.5716337561607361, "learning_rate": 9.860152912478653e-06, "loss": 0.4475, "step": 678 }, { "epoch": 0.5047076313181368, "grad_norm": 0.5375843644142151, "learning_rate": 9.859135097077985e-06, "loss": 0.4734, "step": 679 }, { "epoch": 0.5054509415262636, "grad_norm": 0.5094954967498779, "learning_rate": 9.858113644150578e-06, "loss": 0.4746, "step": 680 }, { "epoch": 0.5061942517343905, "grad_norm": 0.5385057926177979, "learning_rate": 9.857088554461089e-06, "loss": 0.4643, "step": 681 }, { "epoch": 0.5069375619425174, "grad_norm": 0.5110798478126526, "learning_rate": 9.856059828776897e-06, "loss": 0.4629, "step": 682 }, { "epoch": 0.5076808721506442, "grad_norm": 0.5102913975715637, "learning_rate": 9.855027467868096e-06, "loss": 0.4746, "step": 683 }, { "epoch": 0.5084241823587711, "grad_norm": 0.552940309047699, "learning_rate": 9.853991472507514e-06, "loss": 0.4766, "step": 684 }, { "epoch": 0.509167492566898, "grad_norm": 0.5385982394218445, "learning_rate": 9.852951843470685e-06, "loss": 0.4318, "step": 685 }, { "epoch": 0.5099108027750248, "grad_norm": 0.5665810108184814, "learning_rate": 9.851908581535879e-06, "loss": 0.475, "step": 686 }, { "epoch": 0.5106541129831517, "grad_norm": 0.5021418333053589, "learning_rate": 9.850861687484069e-06, "loss": 0.4782, "step": 687 }, { "epoch": 0.5113974231912785, "grad_norm": 0.6369080543518066, "learning_rate": 9.849811162098962e-06, "loss": 0.4659, "step": 688 }, { "epoch": 0.5121407333994054, "grad_norm": 0.5041050314903259, "learning_rate": 9.84875700616697e-06, "loss": 0.4421, "step": 689 }, { "epoch": 0.5128840436075323, "grad_norm": 0.6036661863327026, "learning_rate": 9.847699220477236e-06, "loss": 0.473, "step": 690 }, { "epoch": 0.513627353815659, "grad_norm": 0.5912425518035889, "learning_rate": 9.846637805821609e-06, "loss": 0.4597, "step": 691 }, { "epoch": 0.514370664023786, "grad_norm": 0.6546609997749329, "learning_rate": 9.84557276299466e-06, "loss": 0.4801, "step": 692 }, { "epoch": 0.5151139742319127, "grad_norm": 0.6018258929252625, "learning_rate": 9.844504092793677e-06, "loss": 0.4398, "step": 693 }, { "epoch": 0.5158572844400396, "grad_norm": 0.5320812463760376, "learning_rate": 9.843431796018661e-06, "loss": 0.4723, "step": 694 }, { "epoch": 0.5166005946481665, "grad_norm": 0.6172481775283813, "learning_rate": 9.84235587347233e-06, "loss": 0.4478, "step": 695 }, { "epoch": 0.5173439048562933, "grad_norm": 0.500764787197113, "learning_rate": 9.841276325960109e-06, "loss": 0.4626, "step": 696 }, { "epoch": 0.5180872150644202, "grad_norm": 0.6790436506271362, "learning_rate": 9.84019315429015e-06, "loss": 0.5151, "step": 697 }, { "epoch": 0.518830525272547, "grad_norm": 0.565923810005188, "learning_rate": 9.839106359273306e-06, "loss": 0.4785, "step": 698 }, { "epoch": 0.5195738354806739, "grad_norm": 0.6229163408279419, "learning_rate": 9.838015941723147e-06, "loss": 0.4871, "step": 699 }, { "epoch": 0.5203171456888008, "grad_norm": 0.5736317038536072, "learning_rate": 9.836921902455956e-06, "loss": 0.4917, "step": 700 }, { "epoch": 0.5210604558969276, "grad_norm": 0.5578978657722473, "learning_rate": 9.835824242290726e-06, "loss": 0.4842, "step": 701 }, { "epoch": 0.5218037661050545, "grad_norm": 0.5952611565589905, "learning_rate": 9.834722962049159e-06, "loss": 0.486, "step": 702 }, { "epoch": 0.5225470763131813, "grad_norm": 0.5428274869918823, "learning_rate": 9.83361806255567e-06, "loss": 0.4816, "step": 703 }, { "epoch": 0.5232903865213082, "grad_norm": 0.5729207396507263, "learning_rate": 9.832509544637381e-06, "loss": 0.4935, "step": 704 }, { "epoch": 0.5240336967294351, "grad_norm": 0.5044023990631104, "learning_rate": 9.831397409124124e-06, "loss": 0.4864, "step": 705 }, { "epoch": 0.5247770069375619, "grad_norm": 0.5960573554039001, "learning_rate": 9.830281656848437e-06, "loss": 0.4822, "step": 706 }, { "epoch": 0.5255203171456888, "grad_norm": 0.5227779746055603, "learning_rate": 9.82916228864557e-06, "loss": 0.4673, "step": 707 }, { "epoch": 0.5262636273538157, "grad_norm": 0.530681848526001, "learning_rate": 9.828039305353477e-06, "loss": 0.4657, "step": 708 }, { "epoch": 0.5270069375619425, "grad_norm": 0.5905095934867859, "learning_rate": 9.826912707812813e-06, "loss": 0.4864, "step": 709 }, { "epoch": 0.5277502477700694, "grad_norm": 0.593195378780365, "learning_rate": 9.825782496866951e-06, "loss": 0.4629, "step": 710 }, { "epoch": 0.5284935579781962, "grad_norm": 0.5341914892196655, "learning_rate": 9.824648673361956e-06, "loss": 0.4878, "step": 711 }, { "epoch": 0.5292368681863231, "grad_norm": 0.5942769050598145, "learning_rate": 9.823511238146608e-06, "loss": 0.4953, "step": 712 }, { "epoch": 0.52998017839445, "grad_norm": 0.5475106239318848, "learning_rate": 9.822370192072382e-06, "loss": 0.4946, "step": 713 }, { "epoch": 0.5307234886025768, "grad_norm": 0.5444550514221191, "learning_rate": 9.821225535993462e-06, "loss": 0.4524, "step": 714 }, { "epoch": 0.5314667988107037, "grad_norm": 0.4604387581348419, "learning_rate": 9.820077270766731e-06, "loss": 0.4508, "step": 715 }, { "epoch": 0.5322101090188305, "grad_norm": 0.450522780418396, "learning_rate": 9.818925397251778e-06, "loss": 0.4734, "step": 716 }, { "epoch": 0.5329534192269574, "grad_norm": 0.5370664000511169, "learning_rate": 9.817769916310887e-06, "loss": 0.5204, "step": 717 }, { "epoch": 0.5336967294350843, "grad_norm": 0.49352675676345825, "learning_rate": 9.816610828809048e-06, "loss": 0.4718, "step": 718 }, { "epoch": 0.5344400396432111, "grad_norm": 0.5361419320106506, "learning_rate": 9.815448135613948e-06, "loss": 0.4674, "step": 719 }, { "epoch": 0.535183349851338, "grad_norm": 0.45716872811317444, "learning_rate": 9.814281837595974e-06, "loss": 0.4634, "step": 720 }, { "epoch": 0.5359266600594648, "grad_norm": 0.4845161736011505, "learning_rate": 9.813111935628212e-06, "loss": 0.4812, "step": 721 }, { "epoch": 0.5366699702675917, "grad_norm": 0.5154899954795837, "learning_rate": 9.811938430586445e-06, "loss": 0.4589, "step": 722 }, { "epoch": 0.5374132804757186, "grad_norm": 0.4918622076511383, "learning_rate": 9.810761323349152e-06, "loss": 0.4616, "step": 723 }, { "epoch": 0.5381565906838454, "grad_norm": 0.47309625148773193, "learning_rate": 9.809580614797511e-06, "loss": 0.4551, "step": 724 }, { "epoch": 0.5388999008919723, "grad_norm": 0.44127514958381653, "learning_rate": 9.808396305815398e-06, "loss": 0.4452, "step": 725 }, { "epoch": 0.5396432111000992, "grad_norm": 0.47713083028793335, "learning_rate": 9.807208397289378e-06, "loss": 0.4842, "step": 726 }, { "epoch": 0.540386521308226, "grad_norm": 0.5486829876899719, "learning_rate": 9.806016890108716e-06, "loss": 0.4654, "step": 727 }, { "epoch": 0.5411298315163529, "grad_norm": 0.4422832429409027, "learning_rate": 9.804821785165367e-06, "loss": 0.4795, "step": 728 }, { "epoch": 0.5418731417244796, "grad_norm": 0.5345862507820129, "learning_rate": 9.803623083353983e-06, "loss": 0.4747, "step": 729 }, { "epoch": 0.5426164519326065, "grad_norm": 0.504055917263031, "learning_rate": 9.802420785571904e-06, "loss": 0.4774, "step": 730 }, { "epoch": 0.5433597621407334, "grad_norm": 0.4611656963825226, "learning_rate": 9.801214892719169e-06, "loss": 0.4479, "step": 731 }, { "epoch": 0.5441030723488602, "grad_norm": 0.6216002702713013, "learning_rate": 9.800005405698502e-06, "loss": 0.4797, "step": 732 }, { "epoch": 0.5448463825569871, "grad_norm": 0.523231565952301, "learning_rate": 9.798792325415318e-06, "loss": 0.493, "step": 733 }, { "epoch": 0.5455896927651139, "grad_norm": 0.6213244199752808, "learning_rate": 9.797575652777726e-06, "loss": 0.4569, "step": 734 }, { "epoch": 0.5463330029732408, "grad_norm": 0.5133166909217834, "learning_rate": 9.79635538869652e-06, "loss": 0.472, "step": 735 }, { "epoch": 0.5470763131813677, "grad_norm": 0.562609076499939, "learning_rate": 9.795131534085183e-06, "loss": 0.4423, "step": 736 }, { "epoch": 0.5478196233894945, "grad_norm": 0.6615050435066223, "learning_rate": 9.793904089859891e-06, "loss": 0.4646, "step": 737 }, { "epoch": 0.5485629335976214, "grad_norm": 0.6896381974220276, "learning_rate": 9.792673056939501e-06, "loss": 0.4836, "step": 738 }, { "epoch": 0.5493062438057482, "grad_norm": 0.4659193158149719, "learning_rate": 9.791438436245557e-06, "loss": 0.4567, "step": 739 }, { "epoch": 0.5500495540138751, "grad_norm": 0.7640861868858337, "learning_rate": 9.790200228702294e-06, "loss": 0.5106, "step": 740 }, { "epoch": 0.550792864222002, "grad_norm": 0.5317689180374146, "learning_rate": 9.788958435236624e-06, "loss": 0.4891, "step": 741 }, { "epoch": 0.5515361744301288, "grad_norm": 0.686019241809845, "learning_rate": 9.787713056778155e-06, "loss": 0.4634, "step": 742 }, { "epoch": 0.5522794846382557, "grad_norm": 0.5485683679580688, "learning_rate": 9.786464094259163e-06, "loss": 0.4603, "step": 743 }, { "epoch": 0.5530227948463825, "grad_norm": 0.5396677255630493, "learning_rate": 9.785211548614623e-06, "loss": 0.4337, "step": 744 }, { "epoch": 0.5537661050545094, "grad_norm": 0.6290425062179565, "learning_rate": 9.783955420782184e-06, "loss": 0.4596, "step": 745 }, { "epoch": 0.5545094152626363, "grad_norm": 0.603428065776825, "learning_rate": 9.782695711702173e-06, "loss": 0.4762, "step": 746 }, { "epoch": 0.5552527254707631, "grad_norm": 0.6844127178192139, "learning_rate": 9.781432422317607e-06, "loss": 0.4849, "step": 747 }, { "epoch": 0.55599603567889, "grad_norm": 0.5983946919441223, "learning_rate": 9.780165553574177e-06, "loss": 0.5038, "step": 748 }, { "epoch": 0.5567393458870169, "grad_norm": 0.6327676773071289, "learning_rate": 9.778895106420256e-06, "loss": 0.466, "step": 749 }, { "epoch": 0.5574826560951437, "grad_norm": 0.5791441798210144, "learning_rate": 9.777621081806895e-06, "loss": 0.4828, "step": 750 }, { "epoch": 0.5582259663032706, "grad_norm": 0.6937406659126282, "learning_rate": 9.77634348068782e-06, "loss": 0.4891, "step": 751 }, { "epoch": 0.5589692765113974, "grad_norm": 0.5707263350486755, "learning_rate": 9.775062304019442e-06, "loss": 0.4635, "step": 752 }, { "epoch": 0.5597125867195243, "grad_norm": 0.6267306208610535, "learning_rate": 9.773777552760843e-06, "loss": 0.4757, "step": 753 }, { "epoch": 0.5604558969276512, "grad_norm": 0.5999884009361267, "learning_rate": 9.772489227873782e-06, "loss": 0.451, "step": 754 }, { "epoch": 0.561199207135778, "grad_norm": 0.513155996799469, "learning_rate": 9.77119733032269e-06, "loss": 0.4712, "step": 755 }, { "epoch": 0.5619425173439049, "grad_norm": 0.5858469009399414, "learning_rate": 9.769901861074682e-06, "loss": 0.4546, "step": 756 }, { "epoch": 0.5626858275520317, "grad_norm": 0.5111133456230164, "learning_rate": 9.768602821099535e-06, "loss": 0.4471, "step": 757 }, { "epoch": 0.5634291377601586, "grad_norm": 0.4766891896724701, "learning_rate": 9.767300211369705e-06, "loss": 0.4463, "step": 758 }, { "epoch": 0.5641724479682855, "grad_norm": 0.5861994624137878, "learning_rate": 9.765994032860323e-06, "loss": 0.4558, "step": 759 }, { "epoch": 0.5649157581764123, "grad_norm": 0.5520639419555664, "learning_rate": 9.764684286549185e-06, "loss": 0.4784, "step": 760 }, { "epoch": 0.5656590683845392, "grad_norm": 0.6142148971557617, "learning_rate": 9.763370973416763e-06, "loss": 0.4756, "step": 761 }, { "epoch": 0.566402378592666, "grad_norm": 0.6096112728118896, "learning_rate": 9.762054094446198e-06, "loss": 0.4658, "step": 762 }, { "epoch": 0.5671456888007929, "grad_norm": 0.5839915871620178, "learning_rate": 9.760733650623298e-06, "loss": 0.5066, "step": 763 }, { "epoch": 0.5678889990089198, "grad_norm": 0.5892787575721741, "learning_rate": 9.759409642936542e-06, "loss": 0.4719, "step": 764 }, { "epoch": 0.5686323092170465, "grad_norm": 0.6337869763374329, "learning_rate": 9.758082072377076e-06, "loss": 0.4716, "step": 765 }, { "epoch": 0.5693756194251735, "grad_norm": 0.5412096977233887, "learning_rate": 9.756750939938715e-06, "loss": 0.4614, "step": 766 }, { "epoch": 0.5701189296333002, "grad_norm": 0.5808846354484558, "learning_rate": 9.755416246617934e-06, "loss": 0.4621, "step": 767 }, { "epoch": 0.5708622398414271, "grad_norm": 0.5488495826721191, "learning_rate": 9.754077993413887e-06, "loss": 0.4576, "step": 768 }, { "epoch": 0.571605550049554, "grad_norm": 0.5190985202789307, "learning_rate": 9.752736181328376e-06, "loss": 0.4727, "step": 769 }, { "epoch": 0.5723488602576808, "grad_norm": 0.5815384984016418, "learning_rate": 9.751390811365878e-06, "loss": 0.44, "step": 770 }, { "epoch": 0.5730921704658077, "grad_norm": 0.5479460954666138, "learning_rate": 9.750041884533532e-06, "loss": 0.4914, "step": 771 }, { "epoch": 0.5738354806739346, "grad_norm": 0.5546796321868896, "learning_rate": 9.748689401841138e-06, "loss": 0.4858, "step": 772 }, { "epoch": 0.5745787908820614, "grad_norm": 0.485755056142807, "learning_rate": 9.747333364301158e-06, "loss": 0.452, "step": 773 }, { "epoch": 0.5753221010901883, "grad_norm": 0.48897892236709595, "learning_rate": 9.745973772928719e-06, "loss": 0.5184, "step": 774 }, { "epoch": 0.5760654112983151, "grad_norm": 0.5158651471138, "learning_rate": 9.7446106287416e-06, "loss": 0.4885, "step": 775 }, { "epoch": 0.576808721506442, "grad_norm": 0.5078892111778259, "learning_rate": 9.743243932760248e-06, "loss": 0.45, "step": 776 }, { "epoch": 0.5775520317145689, "grad_norm": 0.4950157701969147, "learning_rate": 9.741873686007764e-06, "loss": 0.465, "step": 777 }, { "epoch": 0.5782953419226957, "grad_norm": 0.4934135973453522, "learning_rate": 9.74049988950991e-06, "loss": 0.4527, "step": 778 }, { "epoch": 0.5790386521308226, "grad_norm": 0.5547251105308533, "learning_rate": 9.739122544295101e-06, "loss": 0.4776, "step": 779 }, { "epoch": 0.5797819623389494, "grad_norm": 0.44168493151664734, "learning_rate": 9.737741651394414e-06, "loss": 0.4712, "step": 780 }, { "epoch": 0.5805252725470763, "grad_norm": 0.4891485571861267, "learning_rate": 9.736357211841578e-06, "loss": 0.4751, "step": 781 }, { "epoch": 0.5812685827552032, "grad_norm": 0.4976192116737366, "learning_rate": 9.734969226672979e-06, "loss": 0.4785, "step": 782 }, { "epoch": 0.58201189296333, "grad_norm": 0.5001835227012634, "learning_rate": 9.733577696927657e-06, "loss": 0.4703, "step": 783 }, { "epoch": 0.5827552031714569, "grad_norm": 0.4857095777988434, "learning_rate": 9.732182623647303e-06, "loss": 0.4726, "step": 784 }, { "epoch": 0.5834985133795837, "grad_norm": 0.4940665066242218, "learning_rate": 9.730784007876264e-06, "loss": 0.4795, "step": 785 }, { "epoch": 0.5842418235877106, "grad_norm": 0.5648552179336548, "learning_rate": 9.729381850661537e-06, "loss": 0.4602, "step": 786 }, { "epoch": 0.5849851337958375, "grad_norm": 0.5010696053504944, "learning_rate": 9.727976153052772e-06, "loss": 0.458, "step": 787 }, { "epoch": 0.5857284440039643, "grad_norm": 0.5051469802856445, "learning_rate": 9.726566916102265e-06, "loss": 0.4827, "step": 788 }, { "epoch": 0.5864717542120912, "grad_norm": 0.49138492345809937, "learning_rate": 9.725154140864968e-06, "loss": 0.4648, "step": 789 }, { "epoch": 0.5872150644202181, "grad_norm": 0.557781457901001, "learning_rate": 9.723737828398476e-06, "loss": 0.489, "step": 790 }, { "epoch": 0.5879583746283449, "grad_norm": 0.4769551455974579, "learning_rate": 9.722317979763034e-06, "loss": 0.4642, "step": 791 }, { "epoch": 0.5887016848364718, "grad_norm": 0.45479312539100647, "learning_rate": 9.720894596021536e-06, "loss": 0.4564, "step": 792 }, { "epoch": 0.5894449950445986, "grad_norm": 0.47152793407440186, "learning_rate": 9.719467678239519e-06, "loss": 0.4661, "step": 793 }, { "epoch": 0.5901883052527255, "grad_norm": 0.4859204888343811, "learning_rate": 9.718037227485166e-06, "loss": 0.4896, "step": 794 }, { "epoch": 0.5909316154608524, "grad_norm": 0.41534945368766785, "learning_rate": 9.71660324482931e-06, "loss": 0.4624, "step": 795 }, { "epoch": 0.5916749256689792, "grad_norm": 0.4862332344055176, "learning_rate": 9.71516573134542e-06, "loss": 0.4796, "step": 796 }, { "epoch": 0.5924182358771061, "grad_norm": 0.5535085201263428, "learning_rate": 9.713724688109616e-06, "loss": 0.4933, "step": 797 }, { "epoch": 0.5931615460852329, "grad_norm": 0.49807602167129517, "learning_rate": 9.712280116200654e-06, "loss": 0.468, "step": 798 }, { "epoch": 0.5939048562933598, "grad_norm": 0.6346691250801086, "learning_rate": 9.710832016699933e-06, "loss": 0.4682, "step": 799 }, { "epoch": 0.5946481665014867, "grad_norm": 0.47724059224128723, "learning_rate": 9.709380390691498e-06, "loss": 0.4671, "step": 800 }, { "epoch": 0.5953914767096135, "grad_norm": 0.6016086339950562, "learning_rate": 9.707925239262024e-06, "loss": 0.445, "step": 801 }, { "epoch": 0.5961347869177404, "grad_norm": 0.5732694268226624, "learning_rate": 9.706466563500836e-06, "loss": 0.4749, "step": 802 }, { "epoch": 0.5968780971258671, "grad_norm": 0.555593729019165, "learning_rate": 9.70500436449989e-06, "loss": 0.4335, "step": 803 }, { "epoch": 0.597621407333994, "grad_norm": 0.5321863889694214, "learning_rate": 9.70353864335378e-06, "loss": 0.4121, "step": 804 }, { "epoch": 0.598364717542121, "grad_norm": 0.5500171780586243, "learning_rate": 9.702069401159742e-06, "loss": 0.4644, "step": 805 }, { "epoch": 0.5991080277502477, "grad_norm": 0.4612651765346527, "learning_rate": 9.700596639017641e-06, "loss": 0.4497, "step": 806 }, { "epoch": 0.5998513379583746, "grad_norm": 0.5883724093437195, "learning_rate": 9.699120358029981e-06, "loss": 0.428, "step": 807 }, { "epoch": 0.6005946481665014, "grad_norm": 0.7389349341392517, "learning_rate": 9.697640559301899e-06, "loss": 0.4661, "step": 808 }, { "epoch": 0.6013379583746283, "grad_norm": 0.5897778272628784, "learning_rate": 9.696157243941167e-06, "loss": 0.4615, "step": 809 }, { "epoch": 0.6020812685827552, "grad_norm": 0.7592300176620483, "learning_rate": 9.694670413058187e-06, "loss": 0.4883, "step": 810 }, { "epoch": 0.602824578790882, "grad_norm": 0.5273610353469849, "learning_rate": 9.693180067765994e-06, "loss": 0.4579, "step": 811 }, { "epoch": 0.6035678889990089, "grad_norm": 0.6374710202217102, "learning_rate": 9.691686209180256e-06, "loss": 0.4777, "step": 812 }, { "epoch": 0.6043111992071358, "grad_norm": 0.5528088808059692, "learning_rate": 9.690188838419266e-06, "loss": 0.4849, "step": 813 }, { "epoch": 0.6050545094152626, "grad_norm": 0.5823690891265869, "learning_rate": 9.688687956603949e-06, "loss": 0.4881, "step": 814 }, { "epoch": 0.6057978196233895, "grad_norm": 0.4414238929748535, "learning_rate": 9.687183564857861e-06, "loss": 0.4753, "step": 815 }, { "epoch": 0.6065411298315163, "grad_norm": 0.576725959777832, "learning_rate": 9.685675664307179e-06, "loss": 0.4599, "step": 816 }, { "epoch": 0.6072844400396432, "grad_norm": 0.5099811553955078, "learning_rate": 9.684164256080714e-06, "loss": 0.4621, "step": 817 }, { "epoch": 0.6080277502477701, "grad_norm": 0.528939962387085, "learning_rate": 9.682649341309898e-06, "loss": 0.4612, "step": 818 }, { "epoch": 0.6087710604558969, "grad_norm": 0.6105527281761169, "learning_rate": 9.68113092112879e-06, "loss": 0.4942, "step": 819 }, { "epoch": 0.6095143706640238, "grad_norm": 0.5654282569885254, "learning_rate": 9.679608996674072e-06, "loss": 0.4327, "step": 820 }, { "epoch": 0.6102576808721506, "grad_norm": 0.6891658902168274, "learning_rate": 9.678083569085052e-06, "loss": 0.4847, "step": 821 }, { "epoch": 0.6110009910802775, "grad_norm": 0.5143589377403259, "learning_rate": 9.676554639503655e-06, "loss": 0.4979, "step": 822 }, { "epoch": 0.6117443012884044, "grad_norm": 0.6479382514953613, "learning_rate": 9.675022209074431e-06, "loss": 0.4694, "step": 823 }, { "epoch": 0.6124876114965312, "grad_norm": 0.5231307148933411, "learning_rate": 9.673486278944553e-06, "loss": 0.483, "step": 824 }, { "epoch": 0.6132309217046581, "grad_norm": 0.655732274055481, "learning_rate": 9.67194685026381e-06, "loss": 0.4475, "step": 825 }, { "epoch": 0.6139742319127849, "grad_norm": 0.5325483083724976, "learning_rate": 9.670403924184611e-06, "loss": 0.4485, "step": 826 }, { "epoch": 0.6147175421209118, "grad_norm": 0.5771063566207886, "learning_rate": 9.668857501861984e-06, "loss": 0.4664, "step": 827 }, { "epoch": 0.6154608523290387, "grad_norm": 0.6284517049789429, "learning_rate": 9.667307584453573e-06, "loss": 0.4312, "step": 828 }, { "epoch": 0.6162041625371655, "grad_norm": 0.47194796800613403, "learning_rate": 9.665754173119642e-06, "loss": 0.4628, "step": 829 }, { "epoch": 0.6169474727452924, "grad_norm": 0.6006263494491577, "learning_rate": 9.664197269023065e-06, "loss": 0.4695, "step": 830 }, { "epoch": 0.6176907829534193, "grad_norm": 0.5439622402191162, "learning_rate": 9.662636873329334e-06, "loss": 0.4886, "step": 831 }, { "epoch": 0.6184340931615461, "grad_norm": 0.5727571845054626, "learning_rate": 9.661072987206554e-06, "loss": 0.4779, "step": 832 }, { "epoch": 0.619177403369673, "grad_norm": 0.5324578285217285, "learning_rate": 9.659505611825445e-06, "loss": 0.4706, "step": 833 }, { "epoch": 0.6199207135777998, "grad_norm": 0.5858307480812073, "learning_rate": 9.657934748359334e-06, "loss": 0.4412, "step": 834 }, { "epoch": 0.6206640237859267, "grad_norm": 0.5658852458000183, "learning_rate": 9.656360397984167e-06, "loss": 0.449, "step": 835 }, { "epoch": 0.6214073339940536, "grad_norm": 0.5143101215362549, "learning_rate": 9.654782561878491e-06, "loss": 0.4357, "step": 836 }, { "epoch": 0.6221506442021804, "grad_norm": 0.5866367220878601, "learning_rate": 9.653201241223468e-06, "loss": 0.4584, "step": 837 }, { "epoch": 0.6228939544103073, "grad_norm": 0.49089959263801575, "learning_rate": 9.651616437202869e-06, "loss": 0.4999, "step": 838 }, { "epoch": 0.623637264618434, "grad_norm": 0.5973488092422485, "learning_rate": 9.65002815100307e-06, "loss": 0.423, "step": 839 }, { "epoch": 0.624380574826561, "grad_norm": 0.5002806186676025, "learning_rate": 9.648436383813055e-06, "loss": 0.4673, "step": 840 }, { "epoch": 0.6251238850346879, "grad_norm": 0.5634835958480835, "learning_rate": 9.646841136824416e-06, "loss": 0.4709, "step": 841 }, { "epoch": 0.6258671952428146, "grad_norm": 0.5496928691864014, "learning_rate": 9.645242411231345e-06, "loss": 0.4953, "step": 842 }, { "epoch": 0.6266105054509415, "grad_norm": 0.5221600532531738, "learning_rate": 9.643640208230642e-06, "loss": 0.489, "step": 843 }, { "epoch": 0.6273538156590683, "grad_norm": 0.5426389575004578, "learning_rate": 9.642034529021708e-06, "loss": 0.4752, "step": 844 }, { "epoch": 0.6280971258671952, "grad_norm": 0.528915524482727, "learning_rate": 9.64042537480655e-06, "loss": 0.4671, "step": 845 }, { "epoch": 0.6288404360753221, "grad_norm": 0.5519990921020508, "learning_rate": 9.63881274678977e-06, "loss": 0.4759, "step": 846 }, { "epoch": 0.6295837462834489, "grad_norm": 0.4786272943019867, "learning_rate": 9.637196646178576e-06, "loss": 0.4205, "step": 847 }, { "epoch": 0.6303270564915758, "grad_norm": 0.5662400722503662, "learning_rate": 9.635577074182773e-06, "loss": 0.4544, "step": 848 }, { "epoch": 0.6310703666997026, "grad_norm": 0.6167908906936646, "learning_rate": 9.633954032014768e-06, "loss": 0.46, "step": 849 }, { "epoch": 0.6318136769078295, "grad_norm": 0.44485989212989807, "learning_rate": 9.632327520889558e-06, "loss": 0.4632, "step": 850 }, { "epoch": 0.6325569871159564, "grad_norm": 0.5604773759841919, "learning_rate": 9.630697542024746e-06, "loss": 0.468, "step": 851 }, { "epoch": 0.6333002973240832, "grad_norm": 0.4929850995540619, "learning_rate": 9.629064096640526e-06, "loss": 0.4552, "step": 852 }, { "epoch": 0.6340436075322101, "grad_norm": 0.4834783971309662, "learning_rate": 9.627427185959686e-06, "loss": 0.4627, "step": 853 }, { "epoch": 0.634786917740337, "grad_norm": 0.48295506834983826, "learning_rate": 9.625786811207613e-06, "loss": 0.471, "step": 854 }, { "epoch": 0.6355302279484638, "grad_norm": 0.49519288539886475, "learning_rate": 9.624142973612282e-06, "loss": 0.4619, "step": 855 }, { "epoch": 0.6362735381565907, "grad_norm": 0.4563632011413574, "learning_rate": 9.622495674404263e-06, "loss": 0.4419, "step": 856 }, { "epoch": 0.6370168483647175, "grad_norm": 0.5943471193313599, "learning_rate": 9.620844914816716e-06, "loss": 0.4686, "step": 857 }, { "epoch": 0.6377601585728444, "grad_norm": 0.4672428071498871, "learning_rate": 9.619190696085395e-06, "loss": 0.4356, "step": 858 }, { "epoch": 0.6385034687809713, "grad_norm": 0.4649907350540161, "learning_rate": 9.617533019448638e-06, "loss": 0.4569, "step": 859 }, { "epoch": 0.6392467789890981, "grad_norm": 0.5465190410614014, "learning_rate": 9.615871886147375e-06, "loss": 0.4569, "step": 860 }, { "epoch": 0.639990089197225, "grad_norm": 0.4701210558414459, "learning_rate": 9.614207297425124e-06, "loss": 0.4727, "step": 861 }, { "epoch": 0.6407333994053518, "grad_norm": 0.5167921781539917, "learning_rate": 9.612539254527988e-06, "loss": 0.4825, "step": 862 }, { "epoch": 0.6414767096134787, "grad_norm": 0.4861242175102234, "learning_rate": 9.610867758704654e-06, "loss": 0.4588, "step": 863 }, { "epoch": 0.6422200198216056, "grad_norm": 0.49005022644996643, "learning_rate": 9.6091928112064e-06, "loss": 0.4425, "step": 864 }, { "epoch": 0.6429633300297324, "grad_norm": 0.5339453816413879, "learning_rate": 9.607514413287083e-06, "loss": 0.4918, "step": 865 }, { "epoch": 0.6437066402378593, "grad_norm": 0.455135315656662, "learning_rate": 9.605832566203144e-06, "loss": 0.4411, "step": 866 }, { "epoch": 0.6444499504459861, "grad_norm": 0.5564399361610413, "learning_rate": 9.604147271213604e-06, "loss": 0.4563, "step": 867 }, { "epoch": 0.645193260654113, "grad_norm": 0.4989868700504303, "learning_rate": 9.602458529580069e-06, "loss": 0.4686, "step": 868 }, { "epoch": 0.6459365708622399, "grad_norm": 0.5617790818214417, "learning_rate": 9.600766342566727e-06, "loss": 0.4973, "step": 869 }, { "epoch": 0.6466798810703667, "grad_norm": 0.44572436809539795, "learning_rate": 9.599070711440335e-06, "loss": 0.448, "step": 870 }, { "epoch": 0.6474231912784936, "grad_norm": 0.5089277029037476, "learning_rate": 9.597371637470241e-06, "loss": 0.4574, "step": 871 }, { "epoch": 0.6481665014866204, "grad_norm": 0.4913333058357239, "learning_rate": 9.595669121928359e-06, "loss": 0.4614, "step": 872 }, { "epoch": 0.6489098116947473, "grad_norm": 0.46019700169563293, "learning_rate": 9.593963166089189e-06, "loss": 0.4867, "step": 873 }, { "epoch": 0.6496531219028742, "grad_norm": 0.6139445900917053, "learning_rate": 9.592253771229796e-06, "loss": 0.5018, "step": 874 }, { "epoch": 0.650396432111001, "grad_norm": 0.5299757122993469, "learning_rate": 9.590540938629833e-06, "loss": 0.4604, "step": 875 }, { "epoch": 0.6511397423191279, "grad_norm": 0.46487411856651306, "learning_rate": 9.588824669571513e-06, "loss": 0.4432, "step": 876 }, { "epoch": 0.6518830525272548, "grad_norm": 0.6320675611495972, "learning_rate": 9.587104965339629e-06, "loss": 0.47, "step": 877 }, { "epoch": 0.6526263627353815, "grad_norm": 0.4905308485031128, "learning_rate": 9.585381827221544e-06, "loss": 0.4424, "step": 878 }, { "epoch": 0.6533696729435085, "grad_norm": 0.4289315640926361, "learning_rate": 9.583655256507192e-06, "loss": 0.442, "step": 879 }, { "epoch": 0.6541129831516352, "grad_norm": 0.5506812930107117, "learning_rate": 9.581925254489074e-06, "loss": 0.4498, "step": 880 }, { "epoch": 0.6548562933597621, "grad_norm": 0.43898066878318787, "learning_rate": 9.580191822462265e-06, "loss": 0.4778, "step": 881 }, { "epoch": 0.655599603567889, "grad_norm": 0.43407654762268066, "learning_rate": 9.578454961724402e-06, "loss": 0.468, "step": 882 }, { "epoch": 0.6563429137760158, "grad_norm": 0.4989941716194153, "learning_rate": 9.576714673575691e-06, "loss": 0.4823, "step": 883 }, { "epoch": 0.6570862239841427, "grad_norm": 0.4647194743156433, "learning_rate": 9.574970959318906e-06, "loss": 0.4537, "step": 884 }, { "epoch": 0.6578295341922695, "grad_norm": 0.4613417685031891, "learning_rate": 9.573223820259382e-06, "loss": 0.4678, "step": 885 }, { "epoch": 0.6585728444003964, "grad_norm": 0.5251882076263428, "learning_rate": 9.57147325770502e-06, "loss": 0.4796, "step": 886 }, { "epoch": 0.6593161546085233, "grad_norm": 0.4764990210533142, "learning_rate": 9.569719272966284e-06, "loss": 0.4323, "step": 887 }, { "epoch": 0.6600594648166501, "grad_norm": 0.5145969390869141, "learning_rate": 9.567961867356197e-06, "loss": 0.4435, "step": 888 }, { "epoch": 0.660802775024777, "grad_norm": 0.5065026879310608, "learning_rate": 9.566201042190348e-06, "loss": 0.4594, "step": 889 }, { "epoch": 0.6615460852329038, "grad_norm": 0.53518146276474, "learning_rate": 9.56443679878688e-06, "loss": 0.4789, "step": 890 }, { "epoch": 0.6622893954410307, "grad_norm": 0.4500609040260315, "learning_rate": 9.562669138466498e-06, "loss": 0.47, "step": 891 }, { "epoch": 0.6630327056491576, "grad_norm": 0.4739845097064972, "learning_rate": 9.560898062552466e-06, "loss": 0.4349, "step": 892 }, { "epoch": 0.6637760158572844, "grad_norm": 0.4847898483276367, "learning_rate": 9.559123572370604e-06, "loss": 0.4686, "step": 893 }, { "epoch": 0.6645193260654113, "grad_norm": 0.5365830659866333, "learning_rate": 9.557345669249286e-06, "loss": 0.4553, "step": 894 }, { "epoch": 0.6652626362735382, "grad_norm": 0.5371538400650024, "learning_rate": 9.55556435451944e-06, "loss": 0.4454, "step": 895 }, { "epoch": 0.666005946481665, "grad_norm": 0.5147626996040344, "learning_rate": 9.553779629514555e-06, "loss": 0.4686, "step": 896 }, { "epoch": 0.6667492566897919, "grad_norm": 0.5638909339904785, "learning_rate": 9.551991495570664e-06, "loss": 0.4457, "step": 897 }, { "epoch": 0.6674925668979187, "grad_norm": 0.5371088981628418, "learning_rate": 9.55019995402636e-06, "loss": 0.4771, "step": 898 }, { "epoch": 0.6682358771060456, "grad_norm": 0.5080381631851196, "learning_rate": 9.54840500622278e-06, "loss": 0.5038, "step": 899 }, { "epoch": 0.6689791873141725, "grad_norm": 0.49846217036247253, "learning_rate": 9.546606653503616e-06, "loss": 0.4488, "step": 900 }, { "epoch": 0.6697224975222993, "grad_norm": 0.5480803847312927, "learning_rate": 9.544804897215106e-06, "loss": 0.4712, "step": 901 }, { "epoch": 0.6704658077304262, "grad_norm": 0.5290748476982117, "learning_rate": 9.542999738706033e-06, "loss": 0.4546, "step": 902 }, { "epoch": 0.671209117938553, "grad_norm": 0.46562114357948303, "learning_rate": 9.541191179327735e-06, "loss": 0.4799, "step": 903 }, { "epoch": 0.6719524281466799, "grad_norm": 0.4553632438182831, "learning_rate": 9.539379220434093e-06, "loss": 0.4789, "step": 904 }, { "epoch": 0.6726957383548068, "grad_norm": 0.5217464566230774, "learning_rate": 9.537563863381526e-06, "loss": 0.472, "step": 905 }, { "epoch": 0.6734390485629336, "grad_norm": 0.42763665318489075, "learning_rate": 9.535745109529004e-06, "loss": 0.433, "step": 906 }, { "epoch": 0.6741823587710605, "grad_norm": 0.44725409150123596, "learning_rate": 9.53392296023804e-06, "loss": 0.45, "step": 907 }, { "epoch": 0.6749256689791873, "grad_norm": 0.5011516213417053, "learning_rate": 9.532097416872686e-06, "loss": 0.4491, "step": 908 }, { "epoch": 0.6756689791873142, "grad_norm": 0.47638553380966187, "learning_rate": 9.530268480799533e-06, "loss": 0.4589, "step": 909 }, { "epoch": 0.6764122893954411, "grad_norm": 0.4814852178096771, "learning_rate": 9.52843615338772e-06, "loss": 0.4629, "step": 910 }, { "epoch": 0.6771555996035679, "grad_norm": 0.5059792995452881, "learning_rate": 9.526600436008912e-06, "loss": 0.4779, "step": 911 }, { "epoch": 0.6778989098116948, "grad_norm": 0.48307082056999207, "learning_rate": 9.524761330037322e-06, "loss": 0.4637, "step": 912 }, { "epoch": 0.6786422200198216, "grad_norm": 0.46275049448013306, "learning_rate": 9.5229188368497e-06, "loss": 0.4627, "step": 913 }, { "epoch": 0.6793855302279485, "grad_norm": 0.5437094569206238, "learning_rate": 9.521072957825322e-06, "loss": 0.4565, "step": 914 }, { "epoch": 0.6801288404360754, "grad_norm": 0.4960193634033203, "learning_rate": 9.519223694346009e-06, "loss": 0.4824, "step": 915 }, { "epoch": 0.6808721506442021, "grad_norm": 0.5394682288169861, "learning_rate": 9.51737104779611e-06, "loss": 0.4682, "step": 916 }, { "epoch": 0.681615460852329, "grad_norm": 0.47233057022094727, "learning_rate": 9.515515019562505e-06, "loss": 0.4863, "step": 917 }, { "epoch": 0.682358771060456, "grad_norm": 0.4702315330505371, "learning_rate": 9.513655611034615e-06, "loss": 0.4637, "step": 918 }, { "epoch": 0.6831020812685827, "grad_norm": 0.47492143511772156, "learning_rate": 9.51179282360438e-06, "loss": 0.439, "step": 919 }, { "epoch": 0.6838453914767096, "grad_norm": 0.45277148485183716, "learning_rate": 9.509926658666274e-06, "loss": 0.4857, "step": 920 }, { "epoch": 0.6845887016848364, "grad_norm": 0.46383368968963623, "learning_rate": 9.508057117617301e-06, "loss": 0.4367, "step": 921 }, { "epoch": 0.6853320118929633, "grad_norm": 0.45225027203559875, "learning_rate": 9.506184201856994e-06, "loss": 0.4673, "step": 922 }, { "epoch": 0.6860753221010902, "grad_norm": 0.5063316226005554, "learning_rate": 9.504307912787406e-06, "loss": 0.4678, "step": 923 }, { "epoch": 0.686818632309217, "grad_norm": 0.5209326148033142, "learning_rate": 9.502428251813118e-06, "loss": 0.4857, "step": 924 }, { "epoch": 0.6875619425173439, "grad_norm": 0.5774552226066589, "learning_rate": 9.500545220341237e-06, "loss": 0.4669, "step": 925 }, { "epoch": 0.6883052527254707, "grad_norm": 0.488336443901062, "learning_rate": 9.498658819781391e-06, "loss": 0.4931, "step": 926 }, { "epoch": 0.6890485629335976, "grad_norm": 0.49070051312446594, "learning_rate": 9.496769051545733e-06, "loss": 0.4654, "step": 927 }, { "epoch": 0.6897918731417245, "grad_norm": 0.47521910071372986, "learning_rate": 9.494875917048934e-06, "loss": 0.4496, "step": 928 }, { "epoch": 0.6905351833498513, "grad_norm": 0.4491052031517029, "learning_rate": 9.492979417708185e-06, "loss": 0.4339, "step": 929 }, { "epoch": 0.6912784935579782, "grad_norm": 0.599682629108429, "learning_rate": 9.491079554943197e-06, "loss": 0.476, "step": 930 }, { "epoch": 0.692021803766105, "grad_norm": 0.457552045583725, "learning_rate": 9.489176330176202e-06, "loss": 0.4886, "step": 931 }, { "epoch": 0.6927651139742319, "grad_norm": 0.4602850377559662, "learning_rate": 9.487269744831942e-06, "loss": 0.459, "step": 932 }, { "epoch": 0.6935084241823588, "grad_norm": 0.5094089508056641, "learning_rate": 9.485359800337678e-06, "loss": 0.4268, "step": 933 }, { "epoch": 0.6942517343904856, "grad_norm": 0.538470447063446, "learning_rate": 9.483446498123187e-06, "loss": 0.4708, "step": 934 }, { "epoch": 0.6949950445986125, "grad_norm": 0.6571008563041687, "learning_rate": 9.48152983962076e-06, "loss": 0.4699, "step": 935 }, { "epoch": 0.6957383548067394, "grad_norm": 0.501526415348053, "learning_rate": 9.479609826265196e-06, "loss": 0.4728, "step": 936 }, { "epoch": 0.6964816650148662, "grad_norm": 0.6597030162811279, "learning_rate": 9.477686459493811e-06, "loss": 0.4295, "step": 937 }, { "epoch": 0.6972249752229931, "grad_norm": 0.5226945877075195, "learning_rate": 9.47575974074643e-06, "loss": 0.4779, "step": 938 }, { "epoch": 0.6979682854311199, "grad_norm": 0.5816861987113953, "learning_rate": 9.47382967146538e-06, "loss": 0.4597, "step": 939 }, { "epoch": 0.6987115956392468, "grad_norm": 0.5452780723571777, "learning_rate": 9.471896253095507e-06, "loss": 0.4675, "step": 940 }, { "epoch": 0.6994549058473737, "grad_norm": 0.5364345908164978, "learning_rate": 9.469959487084159e-06, "loss": 0.4396, "step": 941 }, { "epoch": 0.7001982160555005, "grad_norm": 0.5398139953613281, "learning_rate": 9.468019374881187e-06, "loss": 0.4626, "step": 942 }, { "epoch": 0.7009415262636274, "grad_norm": 0.43898624181747437, "learning_rate": 9.466075917938955e-06, "loss": 0.4513, "step": 943 }, { "epoch": 0.7016848364717542, "grad_norm": 0.5783939957618713, "learning_rate": 9.464129117712324e-06, "loss": 0.4569, "step": 944 }, { "epoch": 0.7024281466798811, "grad_norm": 0.5002337098121643, "learning_rate": 9.46217897565866e-06, "loss": 0.4509, "step": 945 }, { "epoch": 0.703171456888008, "grad_norm": 0.5461890697479248, "learning_rate": 9.460225493237829e-06, "loss": 0.4337, "step": 946 }, { "epoch": 0.7039147670961348, "grad_norm": 0.6042929291725159, "learning_rate": 9.458268671912202e-06, "loss": 0.4393, "step": 947 }, { "epoch": 0.7046580773042617, "grad_norm": 0.46235227584838867, "learning_rate": 9.456308513146645e-06, "loss": 0.4334, "step": 948 }, { "epoch": 0.7054013875123885, "grad_norm": 0.5552744269371033, "learning_rate": 9.454345018408525e-06, "loss": 0.4653, "step": 949 }, { "epoch": 0.7061446977205154, "grad_norm": 0.5654956102371216, "learning_rate": 9.452378189167703e-06, "loss": 0.4799, "step": 950 }, { "epoch": 0.7068880079286423, "grad_norm": 0.48988935351371765, "learning_rate": 9.450408026896542e-06, "loss": 0.4674, "step": 951 }, { "epoch": 0.707631318136769, "grad_norm": 0.461681991815567, "learning_rate": 9.448434533069894e-06, "loss": 0.48, "step": 952 }, { "epoch": 0.708374628344896, "grad_norm": 0.590302586555481, "learning_rate": 9.446457709165109e-06, "loss": 0.4555, "step": 953 }, { "epoch": 0.7091179385530227, "grad_norm": 0.5557051301002502, "learning_rate": 9.444477556662028e-06, "loss": 0.4471, "step": 954 }, { "epoch": 0.7098612487611496, "grad_norm": 0.45610612630844116, "learning_rate": 9.442494077042985e-06, "loss": 0.4772, "step": 955 }, { "epoch": 0.7106045589692765, "grad_norm": 0.5471734404563904, "learning_rate": 9.440507271792803e-06, "loss": 0.4632, "step": 956 }, { "epoch": 0.7113478691774033, "grad_norm": 0.479922890663147, "learning_rate": 9.438517142398792e-06, "loss": 0.4354, "step": 957 }, { "epoch": 0.7120911793855302, "grad_norm": 0.43596532940864563, "learning_rate": 9.436523690350763e-06, "loss": 0.4581, "step": 958 }, { "epoch": 0.7128344895936571, "grad_norm": 0.4642520546913147, "learning_rate": 9.434526917140997e-06, "loss": 0.4436, "step": 959 }, { "epoch": 0.7135777998017839, "grad_norm": 0.4428408443927765, "learning_rate": 9.432526824264274e-06, "loss": 0.4837, "step": 960 }, { "epoch": 0.7143211100099108, "grad_norm": 0.5091699957847595, "learning_rate": 9.430523413217854e-06, "loss": 0.4707, "step": 961 }, { "epoch": 0.7150644202180376, "grad_norm": 0.43270573019981384, "learning_rate": 9.428516685501479e-06, "loss": 0.487, "step": 962 }, { "epoch": 0.7158077304261645, "grad_norm": 0.45919594168663025, "learning_rate": 9.426506642617377e-06, "loss": 0.4403, "step": 963 }, { "epoch": 0.7165510406342914, "grad_norm": 0.4532421827316284, "learning_rate": 9.424493286070259e-06, "loss": 0.4625, "step": 964 }, { "epoch": 0.7172943508424182, "grad_norm": 0.449047714471817, "learning_rate": 9.422476617367313e-06, "loss": 0.4668, "step": 965 }, { "epoch": 0.7180376610505451, "grad_norm": 0.44587281346321106, "learning_rate": 9.42045663801821e-06, "loss": 0.4683, "step": 966 }, { "epoch": 0.7187809712586719, "grad_norm": 0.48000311851501465, "learning_rate": 9.418433349535096e-06, "loss": 0.4619, "step": 967 }, { "epoch": 0.7195242814667988, "grad_norm": 0.4348175525665283, "learning_rate": 9.416406753432595e-06, "loss": 0.4876, "step": 968 }, { "epoch": 0.7202675916749257, "grad_norm": 0.4858129322528839, "learning_rate": 9.41437685122781e-06, "loss": 0.4436, "step": 969 }, { "epoch": 0.7210109018830525, "grad_norm": 0.45434442162513733, "learning_rate": 9.412343644440314e-06, "loss": 0.4564, "step": 970 }, { "epoch": 0.7217542120911794, "grad_norm": 0.44029858708381653, "learning_rate": 9.41030713459216e-06, "loss": 0.4528, "step": 971 }, { "epoch": 0.7224975222993062, "grad_norm": 0.4334816634654999, "learning_rate": 9.408267323207866e-06, "loss": 0.4526, "step": 972 }, { "epoch": 0.7232408325074331, "grad_norm": 0.49104952812194824, "learning_rate": 9.40622421181443e-06, "loss": 0.4675, "step": 973 }, { "epoch": 0.72398414271556, "grad_norm": 0.44157522916793823, "learning_rate": 9.40417780194131e-06, "loss": 0.4218, "step": 974 }, { "epoch": 0.7247274529236868, "grad_norm": 0.5173838138580322, "learning_rate": 9.402128095120446e-06, "loss": 0.4383, "step": 975 }, { "epoch": 0.7254707631318137, "grad_norm": 0.5465394854545593, "learning_rate": 9.400075092886234e-06, "loss": 0.4832, "step": 976 }, { "epoch": 0.7262140733399405, "grad_norm": 0.5964661836624146, "learning_rate": 9.398018796775548e-06, "loss": 0.4524, "step": 977 }, { "epoch": 0.7269573835480674, "grad_norm": 0.4812648296356201, "learning_rate": 9.395959208327715e-06, "loss": 0.4582, "step": 978 }, { "epoch": 0.7277006937561943, "grad_norm": 0.5539537668228149, "learning_rate": 9.39389632908454e-06, "loss": 0.467, "step": 979 }, { "epoch": 0.7284440039643211, "grad_norm": 0.4577666223049164, "learning_rate": 9.39183016059028e-06, "loss": 0.4303, "step": 980 }, { "epoch": 0.729187314172448, "grad_norm": 0.5313789248466492, "learning_rate": 9.389760704391664e-06, "loss": 0.4451, "step": 981 }, { "epoch": 0.7299306243805749, "grad_norm": 0.4308892488479614, "learning_rate": 9.387687962037874e-06, "loss": 0.4636, "step": 982 }, { "epoch": 0.7306739345887017, "grad_norm": 0.49718499183654785, "learning_rate": 9.385611935080559e-06, "loss": 0.4523, "step": 983 }, { "epoch": 0.7314172447968286, "grad_norm": 0.4787195026874542, "learning_rate": 9.383532625073818e-06, "loss": 0.4523, "step": 984 }, { "epoch": 0.7321605550049554, "grad_norm": 0.5157716274261475, "learning_rate": 9.381450033574219e-06, "loss": 0.4398, "step": 985 }, { "epoch": 0.7329038652130823, "grad_norm": 0.47888803482055664, "learning_rate": 9.37936416214078e-06, "loss": 0.4464, "step": 986 }, { "epoch": 0.7336471754212092, "grad_norm": 0.4751543700695038, "learning_rate": 9.377275012334972e-06, "loss": 0.4583, "step": 987 }, { "epoch": 0.734390485629336, "grad_norm": 0.49656417965888977, "learning_rate": 9.375182585720724e-06, "loss": 0.4676, "step": 988 }, { "epoch": 0.7351337958374629, "grad_norm": 0.45995622873306274, "learning_rate": 9.373086883864418e-06, "loss": 0.4756, "step": 989 }, { "epoch": 0.7358771060455896, "grad_norm": 0.5394033789634705, "learning_rate": 9.370987908334888e-06, "loss": 0.4831, "step": 990 }, { "epoch": 0.7366204162537165, "grad_norm": 0.44168105721473694, "learning_rate": 9.368885660703415e-06, "loss": 0.4717, "step": 991 }, { "epoch": 0.7373637264618434, "grad_norm": 0.5321037769317627, "learning_rate": 9.366780142543734e-06, "loss": 0.4632, "step": 992 }, { "epoch": 0.7381070366699702, "grad_norm": 0.4744819104671478, "learning_rate": 9.364671355432027e-06, "loss": 0.4718, "step": 993 }, { "epoch": 0.7388503468780971, "grad_norm": 0.49699294567108154, "learning_rate": 9.36255930094692e-06, "loss": 0.4418, "step": 994 }, { "epoch": 0.7395936570862239, "grad_norm": 0.45681872963905334, "learning_rate": 9.36044398066949e-06, "loss": 0.443, "step": 995 }, { "epoch": 0.7403369672943508, "grad_norm": 0.5793610215187073, "learning_rate": 9.358325396183254e-06, "loss": 0.4824, "step": 996 }, { "epoch": 0.7410802775024777, "grad_norm": 0.5281182527542114, "learning_rate": 9.356203549074178e-06, "loss": 0.4846, "step": 997 }, { "epoch": 0.7418235877106045, "grad_norm": 0.4560754597187042, "learning_rate": 9.354078440930665e-06, "loss": 0.4786, "step": 998 }, { "epoch": 0.7425668979187314, "grad_norm": 0.6355721950531006, "learning_rate": 9.351950073343563e-06, "loss": 0.4569, "step": 999 }, { "epoch": 0.7433102081268583, "grad_norm": 0.5297816395759583, "learning_rate": 9.349818447906155e-06, "loss": 0.4575, "step": 1000 }, { "epoch": 0.7440535183349851, "grad_norm": 0.4251619279384613, "learning_rate": 9.34768356621417e-06, "loss": 0.4516, "step": 1001 }, { "epoch": 0.744796828543112, "grad_norm": 0.5207258462905884, "learning_rate": 9.345545429865769e-06, "loss": 0.4384, "step": 1002 }, { "epoch": 0.7455401387512388, "grad_norm": 0.5795654058456421, "learning_rate": 9.343404040461551e-06, "loss": 0.4649, "step": 1003 }, { "epoch": 0.7462834489593657, "grad_norm": 0.42715856432914734, "learning_rate": 9.341259399604552e-06, "loss": 0.481, "step": 1004 }, { "epoch": 0.7470267591674926, "grad_norm": 0.5213301777839661, "learning_rate": 9.339111508900241e-06, "loss": 0.4756, "step": 1005 }, { "epoch": 0.7477700693756194, "grad_norm": 0.5206126570701599, "learning_rate": 9.336960369956516e-06, "loss": 0.4433, "step": 1006 }, { "epoch": 0.7485133795837463, "grad_norm": 0.717562735080719, "learning_rate": 9.334805984383713e-06, "loss": 0.4752, "step": 1007 }, { "epoch": 0.7492566897918731, "grad_norm": 0.5095377564430237, "learning_rate": 9.332648353794594e-06, "loss": 0.454, "step": 1008 }, { "epoch": 0.75, "grad_norm": 0.5031697154045105, "learning_rate": 9.330487479804352e-06, "loss": 0.4452, "step": 1009 }, { "epoch": 0.7507433102081269, "grad_norm": 0.5283638834953308, "learning_rate": 9.328323364030606e-06, "loss": 0.444, "step": 1010 }, { "epoch": 0.7514866204162537, "grad_norm": 0.5628459453582764, "learning_rate": 9.326156008093409e-06, "loss": 0.4521, "step": 1011 }, { "epoch": 0.7522299306243806, "grad_norm": 0.5420393347740173, "learning_rate": 9.323985413615226e-06, "loss": 0.4855, "step": 1012 }, { "epoch": 0.7529732408325074, "grad_norm": 0.5072373151779175, "learning_rate": 9.32181158222096e-06, "loss": 0.4573, "step": 1013 }, { "epoch": 0.7537165510406343, "grad_norm": 0.5136252641677856, "learning_rate": 9.319634515537929e-06, "loss": 0.4654, "step": 1014 }, { "epoch": 0.7544598612487612, "grad_norm": 0.5246591567993164, "learning_rate": 9.317454215195875e-06, "loss": 0.4764, "step": 1015 }, { "epoch": 0.755203171456888, "grad_norm": 0.5466529130935669, "learning_rate": 9.315270682826964e-06, "loss": 0.4624, "step": 1016 }, { "epoch": 0.7559464816650149, "grad_norm": 0.4817536771297455, "learning_rate": 9.313083920065777e-06, "loss": 0.4396, "step": 1017 }, { "epoch": 0.7566897918731417, "grad_norm": 0.47002938389778137, "learning_rate": 9.310893928549311e-06, "loss": 0.4692, "step": 1018 }, { "epoch": 0.7574331020812686, "grad_norm": 0.5388737916946411, "learning_rate": 9.30870070991699e-06, "loss": 0.4546, "step": 1019 }, { "epoch": 0.7581764122893955, "grad_norm": 0.4439884126186371, "learning_rate": 9.306504265810644e-06, "loss": 0.4651, "step": 1020 }, { "epoch": 0.7589197224975223, "grad_norm": 0.5051538944244385, "learning_rate": 9.304304597874522e-06, "loss": 0.4258, "step": 1021 }, { "epoch": 0.7596630327056492, "grad_norm": 0.559989333152771, "learning_rate": 9.302101707755284e-06, "loss": 0.4525, "step": 1022 }, { "epoch": 0.7604063429137761, "grad_norm": 0.5756279826164246, "learning_rate": 9.299895597102007e-06, "loss": 0.4625, "step": 1023 }, { "epoch": 0.7611496531219029, "grad_norm": 0.5083605051040649, "learning_rate": 9.297686267566176e-06, "loss": 0.4398, "step": 1024 }, { "epoch": 0.7618929633300298, "grad_norm": 0.5100540518760681, "learning_rate": 9.295473720801677e-06, "loss": 0.4395, "step": 1025 }, { "epoch": 0.7626362735381566, "grad_norm": 0.6141822338104248, "learning_rate": 9.293257958464823e-06, "loss": 0.4701, "step": 1026 }, { "epoch": 0.7633795837462835, "grad_norm": 0.51729416847229, "learning_rate": 9.291038982214317e-06, "loss": 0.4456, "step": 1027 }, { "epoch": 0.7641228939544104, "grad_norm": 0.5366793870925903, "learning_rate": 9.28881679371128e-06, "loss": 0.4624, "step": 1028 }, { "epoch": 0.7648662041625371, "grad_norm": 0.4644019305706024, "learning_rate": 9.286591394619227e-06, "loss": 0.4567, "step": 1029 }, { "epoch": 0.765609514370664, "grad_norm": 0.5432155728340149, "learning_rate": 9.284362786604083e-06, "loss": 0.456, "step": 1030 }, { "epoch": 0.7663528245787908, "grad_norm": 0.5464529395103455, "learning_rate": 9.282130971334176e-06, "loss": 0.4481, "step": 1031 }, { "epoch": 0.7670961347869177, "grad_norm": 0.5222446918487549, "learning_rate": 9.279895950480233e-06, "loss": 0.4567, "step": 1032 }, { "epoch": 0.7678394449950446, "grad_norm": 0.4935692548751831, "learning_rate": 9.277657725715378e-06, "loss": 0.4476, "step": 1033 }, { "epoch": 0.7685827552031714, "grad_norm": 0.5299301147460938, "learning_rate": 9.275416298715139e-06, "loss": 0.4354, "step": 1034 }, { "epoch": 0.7693260654112983, "grad_norm": 0.5952715873718262, "learning_rate": 9.273171671157435e-06, "loss": 0.4821, "step": 1035 }, { "epoch": 0.7700693756194251, "grad_norm": 0.5490743517875671, "learning_rate": 9.270923844722586e-06, "loss": 0.4774, "step": 1036 }, { "epoch": 0.770812685827552, "grad_norm": 0.5325977802276611, "learning_rate": 9.268672821093305e-06, "loss": 0.4945, "step": 1037 }, { "epoch": 0.7715559960356789, "grad_norm": 0.5105495452880859, "learning_rate": 9.266418601954698e-06, "loss": 0.4735, "step": 1038 }, { "epoch": 0.7722993062438057, "grad_norm": 0.6355246305465698, "learning_rate": 9.264161188994262e-06, "loss": 0.4915, "step": 1039 }, { "epoch": 0.7730426164519326, "grad_norm": 0.5190297365188599, "learning_rate": 9.26190058390189e-06, "loss": 0.4584, "step": 1040 }, { "epoch": 0.7737859266600595, "grad_norm": 0.5666778683662415, "learning_rate": 9.259636788369856e-06, "loss": 0.429, "step": 1041 }, { "epoch": 0.7745292368681863, "grad_norm": 0.4899786412715912, "learning_rate": 9.257369804092831e-06, "loss": 0.4382, "step": 1042 }, { "epoch": 0.7752725470763132, "grad_norm": 0.5233214497566223, "learning_rate": 9.255099632767865e-06, "loss": 0.4783, "step": 1043 }, { "epoch": 0.77601585728444, "grad_norm": 0.5474101305007935, "learning_rate": 9.252826276094402e-06, "loss": 0.452, "step": 1044 }, { "epoch": 0.7767591674925669, "grad_norm": 0.49262097477912903, "learning_rate": 9.250549735774266e-06, "loss": 0.4764, "step": 1045 }, { "epoch": 0.7775024777006938, "grad_norm": 0.6034062504768372, "learning_rate": 9.248270013511665e-06, "loss": 0.473, "step": 1046 }, { "epoch": 0.7782457879088206, "grad_norm": 0.5547810792922974, "learning_rate": 9.245987111013185e-06, "loss": 0.4667, "step": 1047 }, { "epoch": 0.7789890981169475, "grad_norm": 0.5466582775115967, "learning_rate": 9.243701029987801e-06, "loss": 0.4368, "step": 1048 }, { "epoch": 0.7797324083250743, "grad_norm": 0.44203323125839233, "learning_rate": 9.241411772146864e-06, "loss": 0.4797, "step": 1049 }, { "epoch": 0.7804757185332012, "grad_norm": 0.536811113357544, "learning_rate": 9.239119339204096e-06, "loss": 0.4645, "step": 1050 }, { "epoch": 0.7812190287413281, "grad_norm": 0.495128333568573, "learning_rate": 9.236823732875609e-06, "loss": 0.4592, "step": 1051 }, { "epoch": 0.7819623389494549, "grad_norm": 0.4977523386478424, "learning_rate": 9.234524954879879e-06, "loss": 0.4415, "step": 1052 }, { "epoch": 0.7827056491575818, "grad_norm": 0.5145884156227112, "learning_rate": 9.232223006937761e-06, "loss": 0.47, "step": 1053 }, { "epoch": 0.7834489593657086, "grad_norm": 0.4817182421684265, "learning_rate": 9.229917890772486e-06, "loss": 0.4467, "step": 1054 }, { "epoch": 0.7841922695738355, "grad_norm": 0.5318998694419861, "learning_rate": 9.227609608109651e-06, "loss": 0.4542, "step": 1055 }, { "epoch": 0.7849355797819624, "grad_norm": 0.5378772020339966, "learning_rate": 9.225298160677228e-06, "loss": 0.4817, "step": 1056 }, { "epoch": 0.7856788899900892, "grad_norm": 0.5109139680862427, "learning_rate": 9.222983550205553e-06, "loss": 0.4798, "step": 1057 }, { "epoch": 0.7864222001982161, "grad_norm": 0.48046842217445374, "learning_rate": 9.220665778427336e-06, "loss": 0.4709, "step": 1058 }, { "epoch": 0.7871655104063429, "grad_norm": 0.5126140713691711, "learning_rate": 9.21834484707765e-06, "loss": 0.4744, "step": 1059 }, { "epoch": 0.7879088206144698, "grad_norm": 0.462019145488739, "learning_rate": 9.216020757893933e-06, "loss": 0.4736, "step": 1060 }, { "epoch": 0.7886521308225967, "grad_norm": 0.5865974426269531, "learning_rate": 9.213693512615988e-06, "loss": 0.4773, "step": 1061 }, { "epoch": 0.7893954410307235, "grad_norm": 0.5216721296310425, "learning_rate": 9.21136311298598e-06, "loss": 0.4538, "step": 1062 }, { "epoch": 0.7901387512388504, "grad_norm": 0.4796646237373352, "learning_rate": 9.209029560748437e-06, "loss": 0.4558, "step": 1063 }, { "epoch": 0.7908820614469773, "grad_norm": 0.4897122383117676, "learning_rate": 9.206692857650245e-06, "loss": 0.4344, "step": 1064 }, { "epoch": 0.791625371655104, "grad_norm": 0.6594359278678894, "learning_rate": 9.20435300544065e-06, "loss": 0.4325, "step": 1065 }, { "epoch": 0.792368681863231, "grad_norm": 0.5068952441215515, "learning_rate": 9.202010005871253e-06, "loss": 0.4556, "step": 1066 }, { "epoch": 0.7931119920713577, "grad_norm": 0.538832426071167, "learning_rate": 9.199663860696014e-06, "loss": 0.4594, "step": 1067 }, { "epoch": 0.7938553022794846, "grad_norm": 0.5099045634269714, "learning_rate": 9.197314571671248e-06, "loss": 0.4374, "step": 1068 }, { "epoch": 0.7945986124876115, "grad_norm": 0.48046785593032837, "learning_rate": 9.194962140555621e-06, "loss": 0.4402, "step": 1069 }, { "epoch": 0.7953419226957383, "grad_norm": 0.5637921094894409, "learning_rate": 9.192606569110152e-06, "loss": 0.4758, "step": 1070 }, { "epoch": 0.7960852329038652, "grad_norm": 0.5110792517662048, "learning_rate": 9.190247859098214e-06, "loss": 0.4771, "step": 1071 }, { "epoch": 0.796828543111992, "grad_norm": 0.5054506063461304, "learning_rate": 9.187886012285522e-06, "loss": 0.4655, "step": 1072 }, { "epoch": 0.7975718533201189, "grad_norm": 0.4988269805908203, "learning_rate": 9.185521030440148e-06, "loss": 0.464, "step": 1073 }, { "epoch": 0.7983151635282458, "grad_norm": 0.5588865280151367, "learning_rate": 9.183152915332504e-06, "loss": 0.4475, "step": 1074 }, { "epoch": 0.7990584737363726, "grad_norm": 0.4610159993171692, "learning_rate": 9.180781668735353e-06, "loss": 0.4556, "step": 1075 }, { "epoch": 0.7998017839444995, "grad_norm": 0.4697340428829193, "learning_rate": 9.178407292423796e-06, "loss": 0.4468, "step": 1076 }, { "epoch": 0.8005450941526263, "grad_norm": 0.5227687954902649, "learning_rate": 9.176029788175285e-06, "loss": 0.462, "step": 1077 }, { "epoch": 0.8012884043607532, "grad_norm": 0.46274641156196594, "learning_rate": 9.173649157769606e-06, "loss": 0.4709, "step": 1078 }, { "epoch": 0.8020317145688801, "grad_norm": 0.5280256271362305, "learning_rate": 9.17126540298889e-06, "loss": 0.4586, "step": 1079 }, { "epoch": 0.8027750247770069, "grad_norm": 0.5487954020500183, "learning_rate": 9.168878525617601e-06, "loss": 0.4451, "step": 1080 }, { "epoch": 0.8035183349851338, "grad_norm": 0.4690883159637451, "learning_rate": 9.166488527442549e-06, "loss": 0.46, "step": 1081 }, { "epoch": 0.8042616451932606, "grad_norm": 0.5192556381225586, "learning_rate": 9.164095410252877e-06, "loss": 0.451, "step": 1082 }, { "epoch": 0.8050049554013875, "grad_norm": 0.5730867981910706, "learning_rate": 9.161699175840057e-06, "loss": 0.4672, "step": 1083 }, { "epoch": 0.8057482656095144, "grad_norm": 0.5203640460968018, "learning_rate": 9.159299825997903e-06, "loss": 0.4305, "step": 1084 }, { "epoch": 0.8064915758176412, "grad_norm": 0.5817157626152039, "learning_rate": 9.156897362522557e-06, "loss": 0.445, "step": 1085 }, { "epoch": 0.8072348860257681, "grad_norm": 0.5089548826217651, "learning_rate": 9.15449178721249e-06, "loss": 0.4449, "step": 1086 }, { "epoch": 0.807978196233895, "grad_norm": 0.5023419857025146, "learning_rate": 9.152083101868507e-06, "loss": 0.4521, "step": 1087 }, { "epoch": 0.8087215064420218, "grad_norm": 0.573520302772522, "learning_rate": 9.149671308293739e-06, "loss": 0.4624, "step": 1088 }, { "epoch": 0.8094648166501487, "grad_norm": 0.5104497671127319, "learning_rate": 9.147256408293643e-06, "loss": 0.4386, "step": 1089 }, { "epoch": 0.8102081268582755, "grad_norm": 0.5472707748413086, "learning_rate": 9.144838403676002e-06, "loss": 0.4486, "step": 1090 }, { "epoch": 0.8109514370664024, "grad_norm": 0.5311079621315002, "learning_rate": 9.142417296250926e-06, "loss": 0.4696, "step": 1091 }, { "epoch": 0.8116947472745293, "grad_norm": 0.5047554969787598, "learning_rate": 9.139993087830843e-06, "loss": 0.4872, "step": 1092 }, { "epoch": 0.8124380574826561, "grad_norm": 0.568343997001648, "learning_rate": 9.137565780230504e-06, "loss": 0.4792, "step": 1093 }, { "epoch": 0.813181367690783, "grad_norm": 0.5550546646118164, "learning_rate": 9.13513537526698e-06, "loss": 0.4465, "step": 1094 }, { "epoch": 0.8139246778989098, "grad_norm": 0.5016593933105469, "learning_rate": 9.132701874759667e-06, "loss": 0.4583, "step": 1095 }, { "epoch": 0.8146679881070367, "grad_norm": 0.6678863167762756, "learning_rate": 9.130265280530265e-06, "loss": 0.4947, "step": 1096 }, { "epoch": 0.8154112983151636, "grad_norm": 0.48445263504981995, "learning_rate": 9.127825594402804e-06, "loss": 0.4385, "step": 1097 }, { "epoch": 0.8161546085232904, "grad_norm": 0.5363125801086426, "learning_rate": 9.125382818203615e-06, "loss": 0.4437, "step": 1098 }, { "epoch": 0.8168979187314173, "grad_norm": 0.5694593191146851, "learning_rate": 9.122936953761356e-06, "loss": 0.4536, "step": 1099 }, { "epoch": 0.817641228939544, "grad_norm": 0.528186023235321, "learning_rate": 9.120488002906987e-06, "loss": 0.4849, "step": 1100 }, { "epoch": 0.818384539147671, "grad_norm": 0.5086936354637146, "learning_rate": 9.118035967473779e-06, "loss": 0.4397, "step": 1101 }, { "epoch": 0.8191278493557979, "grad_norm": 0.47481048107147217, "learning_rate": 9.11558084929732e-06, "loss": 0.4335, "step": 1102 }, { "epoch": 0.8198711595639246, "grad_norm": 0.5227126479148865, "learning_rate": 9.1131226502155e-06, "loss": 0.4476, "step": 1103 }, { "epoch": 0.8206144697720515, "grad_norm": 0.4567933976650238, "learning_rate": 9.110661372068513e-06, "loss": 0.4358, "step": 1104 }, { "epoch": 0.8213577799801784, "grad_norm": 0.46268343925476074, "learning_rate": 9.10819701669886e-06, "loss": 0.4299, "step": 1105 }, { "epoch": 0.8221010901883052, "grad_norm": 0.5040360689163208, "learning_rate": 9.105729585951348e-06, "loss": 0.4941, "step": 1106 }, { "epoch": 0.8228444003964321, "grad_norm": 0.48094284534454346, "learning_rate": 9.103259081673086e-06, "loss": 0.4211, "step": 1107 }, { "epoch": 0.8235877106045589, "grad_norm": 0.4670538902282715, "learning_rate": 9.10078550571348e-06, "loss": 0.4616, "step": 1108 }, { "epoch": 0.8243310208126858, "grad_norm": 0.47259265184402466, "learning_rate": 9.09830885992424e-06, "loss": 0.4635, "step": 1109 }, { "epoch": 0.8250743310208127, "grad_norm": 0.5086628198623657, "learning_rate": 9.095829146159373e-06, "loss": 0.4712, "step": 1110 }, { "epoch": 0.8258176412289395, "grad_norm": 0.5169985294342041, "learning_rate": 9.093346366275177e-06, "loss": 0.4567, "step": 1111 }, { "epoch": 0.8265609514370664, "grad_norm": 0.549599289894104, "learning_rate": 9.090860522130254e-06, "loss": 0.4523, "step": 1112 }, { "epoch": 0.8273042616451932, "grad_norm": 0.4981691837310791, "learning_rate": 9.088371615585492e-06, "loss": 0.4752, "step": 1113 }, { "epoch": 0.8280475718533201, "grad_norm": 0.6626493334770203, "learning_rate": 9.085879648504079e-06, "loss": 0.4546, "step": 1114 }, { "epoch": 0.828790882061447, "grad_norm": 0.5690227746963501, "learning_rate": 9.083384622751487e-06, "loss": 0.4707, "step": 1115 }, { "epoch": 0.8295341922695738, "grad_norm": 0.45774054527282715, "learning_rate": 9.080886540195486e-06, "loss": 0.4254, "step": 1116 }, { "epoch": 0.8302775024777007, "grad_norm": 0.5989899635314941, "learning_rate": 9.078385402706124e-06, "loss": 0.4305, "step": 1117 }, { "epoch": 0.8310208126858275, "grad_norm": 0.5454801917076111, "learning_rate": 9.075881212155745e-06, "loss": 0.4595, "step": 1118 }, { "epoch": 0.8317641228939544, "grad_norm": 0.5258079767227173, "learning_rate": 9.073373970418973e-06, "loss": 0.4555, "step": 1119 }, { "epoch": 0.8325074331020813, "grad_norm": 0.5341175198554993, "learning_rate": 9.070863679372716e-06, "loss": 0.4365, "step": 1120 }, { "epoch": 0.8332507433102081, "grad_norm": 0.5391564965248108, "learning_rate": 9.068350340896173e-06, "loss": 0.4669, "step": 1121 }, { "epoch": 0.833994053518335, "grad_norm": 0.5213730335235596, "learning_rate": 9.06583395687081e-06, "loss": 0.431, "step": 1122 }, { "epoch": 0.8347373637264618, "grad_norm": 0.4759930670261383, "learning_rate": 9.063314529180388e-06, "loss": 0.4556, "step": 1123 }, { "epoch": 0.8354806739345887, "grad_norm": 0.527105987071991, "learning_rate": 9.060792059710935e-06, "loss": 0.4793, "step": 1124 }, { "epoch": 0.8362239841427156, "grad_norm": 0.42739132046699524, "learning_rate": 9.05826655035076e-06, "loss": 0.4351, "step": 1125 }, { "epoch": 0.8369672943508424, "grad_norm": 0.5820596218109131, "learning_rate": 9.05573800299045e-06, "loss": 0.4309, "step": 1126 }, { "epoch": 0.8377106045589693, "grad_norm": 0.4800148010253906, "learning_rate": 9.053206419522865e-06, "loss": 0.4477, "step": 1127 }, { "epoch": 0.8384539147670962, "grad_norm": 0.4869455397129059, "learning_rate": 9.050671801843133e-06, "loss": 0.4606, "step": 1128 }, { "epoch": 0.839197224975223, "grad_norm": 0.5414069890975952, "learning_rate": 9.048134151848662e-06, "loss": 0.4461, "step": 1129 }, { "epoch": 0.8399405351833499, "grad_norm": 0.42889732122421265, "learning_rate": 9.045593471439122e-06, "loss": 0.4358, "step": 1130 }, { "epoch": 0.8406838453914767, "grad_norm": 0.5170581936836243, "learning_rate": 9.043049762516458e-06, "loss": 0.4726, "step": 1131 }, { "epoch": 0.8414271555996036, "grad_norm": 0.4866901636123657, "learning_rate": 9.040503026984876e-06, "loss": 0.4485, "step": 1132 }, { "epoch": 0.8421704658077305, "grad_norm": 0.4430224299430847, "learning_rate": 9.037953266750852e-06, "loss": 0.4659, "step": 1133 }, { "epoch": 0.8429137760158573, "grad_norm": 0.5231969356536865, "learning_rate": 9.035400483723127e-06, "loss": 0.4576, "step": 1134 }, { "epoch": 0.8436570862239842, "grad_norm": 0.5036267042160034, "learning_rate": 9.032844679812699e-06, "loss": 0.4743, "step": 1135 }, { "epoch": 0.844400396432111, "grad_norm": 0.4956863522529602, "learning_rate": 9.030285856932834e-06, "loss": 0.4725, "step": 1136 }, { "epoch": 0.8451437066402379, "grad_norm": 0.4976566731929779, "learning_rate": 9.027724016999056e-06, "loss": 0.4427, "step": 1137 }, { "epoch": 0.8458870168483648, "grad_norm": 0.4906451404094696, "learning_rate": 9.025159161929144e-06, "loss": 0.4646, "step": 1138 }, { "epoch": 0.8466303270564915, "grad_norm": 0.4398617148399353, "learning_rate": 9.02259129364314e-06, "loss": 0.4585, "step": 1139 }, { "epoch": 0.8473736372646185, "grad_norm": 0.4611119329929352, "learning_rate": 9.020020414063337e-06, "loss": 0.4715, "step": 1140 }, { "epoch": 0.8481169474727452, "grad_norm": 0.5590518116950989, "learning_rate": 9.017446525114285e-06, "loss": 0.4693, "step": 1141 }, { "epoch": 0.8488602576808721, "grad_norm": 0.43929898738861084, "learning_rate": 9.014869628722784e-06, "loss": 0.4612, "step": 1142 }, { "epoch": 0.849603567888999, "grad_norm": 0.5182512998580933, "learning_rate": 9.01228972681789e-06, "loss": 0.4545, "step": 1143 }, { "epoch": 0.8503468780971258, "grad_norm": 0.46792566776275635, "learning_rate": 9.0097068213309e-06, "loss": 0.4331, "step": 1144 }, { "epoch": 0.8510901883052527, "grad_norm": 0.5314329862594604, "learning_rate": 9.007120914195374e-06, "loss": 0.4438, "step": 1145 }, { "epoch": 0.8518334985133796, "grad_norm": 0.49262571334838867, "learning_rate": 9.004532007347104e-06, "loss": 0.4435, "step": 1146 }, { "epoch": 0.8525768087215064, "grad_norm": 0.4623616933822632, "learning_rate": 9.001940102724136e-06, "loss": 0.4399, "step": 1147 }, { "epoch": 0.8533201189296333, "grad_norm": 0.5029604434967041, "learning_rate": 8.99934520226676e-06, "loss": 0.4659, "step": 1148 }, { "epoch": 0.8540634291377601, "grad_norm": 0.42018771171569824, "learning_rate": 8.996747307917503e-06, "loss": 0.4638, "step": 1149 }, { "epoch": 0.854806739345887, "grad_norm": 0.49179691076278687, "learning_rate": 8.99414642162114e-06, "loss": 0.4586, "step": 1150 }, { "epoch": 0.8555500495540139, "grad_norm": 0.45434197783470154, "learning_rate": 8.991542545324684e-06, "loss": 0.4306, "step": 1151 }, { "epoch": 0.8562933597621407, "grad_norm": 0.4674937427043915, "learning_rate": 8.988935680977381e-06, "loss": 0.47, "step": 1152 }, { "epoch": 0.8570366699702676, "grad_norm": 0.6300725340843201, "learning_rate": 8.98632583053072e-06, "loss": 0.473, "step": 1153 }, { "epoch": 0.8577799801783944, "grad_norm": 0.5068283677101135, "learning_rate": 8.983712995938423e-06, "loss": 0.4393, "step": 1154 }, { "epoch": 0.8585232903865213, "grad_norm": 0.5238627791404724, "learning_rate": 8.981097179156448e-06, "loss": 0.4525, "step": 1155 }, { "epoch": 0.8592666005946482, "grad_norm": 0.4531859755516052, "learning_rate": 8.978478382142984e-06, "loss": 0.4042, "step": 1156 }, { "epoch": 0.860009910802775, "grad_norm": 0.48926523327827454, "learning_rate": 8.975856606858447e-06, "loss": 0.4614, "step": 1157 }, { "epoch": 0.8607532210109019, "grad_norm": 0.4896219074726105, "learning_rate": 8.973231855265488e-06, "loss": 0.427, "step": 1158 }, { "epoch": 0.8614965312190287, "grad_norm": 0.5090711116790771, "learning_rate": 8.970604129328984e-06, "loss": 0.418, "step": 1159 }, { "epoch": 0.8622398414271556, "grad_norm": 0.5038774609565735, "learning_rate": 8.96797343101604e-06, "loss": 0.4356, "step": 1160 }, { "epoch": 0.8629831516352825, "grad_norm": 0.4608532190322876, "learning_rate": 8.965339762295985e-06, "loss": 0.458, "step": 1161 }, { "epoch": 0.8637264618434093, "grad_norm": 0.5499937534332275, "learning_rate": 8.962703125140371e-06, "loss": 0.4992, "step": 1162 }, { "epoch": 0.8644697720515362, "grad_norm": 0.4386967718601227, "learning_rate": 8.960063521522973e-06, "loss": 0.4474, "step": 1163 }, { "epoch": 0.865213082259663, "grad_norm": 0.5277020931243896, "learning_rate": 8.957420953419784e-06, "loss": 0.4656, "step": 1164 }, { "epoch": 0.8659563924677899, "grad_norm": 0.4847412109375, "learning_rate": 8.954775422809023e-06, "loss": 0.4569, "step": 1165 }, { "epoch": 0.8666997026759168, "grad_norm": 0.4978225529193878, "learning_rate": 8.952126931671118e-06, "loss": 0.4546, "step": 1166 }, { "epoch": 0.8674430128840436, "grad_norm": 0.4870559573173523, "learning_rate": 8.949475481988723e-06, "loss": 0.4411, "step": 1167 }, { "epoch": 0.8681863230921705, "grad_norm": 0.5012298226356506, "learning_rate": 8.946821075746697e-06, "loss": 0.43, "step": 1168 }, { "epoch": 0.8689296333002974, "grad_norm": 0.44890281558036804, "learning_rate": 8.944163714932117e-06, "loss": 0.4379, "step": 1169 }, { "epoch": 0.8696729435084242, "grad_norm": 0.46615439653396606, "learning_rate": 8.94150340153427e-06, "loss": 0.467, "step": 1170 }, { "epoch": 0.8704162537165511, "grad_norm": 0.4876047968864441, "learning_rate": 8.93884013754466e-06, "loss": 0.4549, "step": 1171 }, { "epoch": 0.8711595639246779, "grad_norm": 0.4439481198787689, "learning_rate": 8.93617392495699e-06, "loss": 0.4651, "step": 1172 }, { "epoch": 0.8719028741328048, "grad_norm": 0.49054282903671265, "learning_rate": 8.933504765767176e-06, "loss": 0.4782, "step": 1173 }, { "epoch": 0.8726461843409317, "grad_norm": 0.569549024105072, "learning_rate": 8.930832661973338e-06, "loss": 0.4708, "step": 1174 }, { "epoch": 0.8733894945490585, "grad_norm": 0.461215078830719, "learning_rate": 8.928157615575802e-06, "loss": 0.4717, "step": 1175 }, { "epoch": 0.8741328047571854, "grad_norm": 0.5555558204650879, "learning_rate": 8.925479628577094e-06, "loss": 0.4667, "step": 1176 }, { "epoch": 0.8748761149653121, "grad_norm": 0.5762171745300293, "learning_rate": 8.922798702981947e-06, "loss": 0.4593, "step": 1177 }, { "epoch": 0.875619425173439, "grad_norm": 0.4543423652648926, "learning_rate": 8.920114840797285e-06, "loss": 0.4425, "step": 1178 }, { "epoch": 0.876362735381566, "grad_norm": 0.5606073141098022, "learning_rate": 8.917428044032237e-06, "loss": 0.4513, "step": 1179 }, { "epoch": 0.8771060455896927, "grad_norm": 0.48809272050857544, "learning_rate": 8.914738314698129e-06, "loss": 0.4482, "step": 1180 }, { "epoch": 0.8778493557978196, "grad_norm": 0.5963600873947144, "learning_rate": 8.91204565480848e-06, "loss": 0.4495, "step": 1181 }, { "epoch": 0.8785926660059464, "grad_norm": 0.4861907958984375, "learning_rate": 8.909350066378997e-06, "loss": 0.4735, "step": 1182 }, { "epoch": 0.8793359762140733, "grad_norm": 0.5377980470657349, "learning_rate": 8.906651551427595e-06, "loss": 0.4866, "step": 1183 }, { "epoch": 0.8800792864222002, "grad_norm": 0.5451145768165588, "learning_rate": 8.903950111974362e-06, "loss": 0.4782, "step": 1184 }, { "epoch": 0.880822596630327, "grad_norm": 0.45915475487709045, "learning_rate": 8.901245750041588e-06, "loss": 0.4774, "step": 1185 }, { "epoch": 0.8815659068384539, "grad_norm": 0.6150170564651489, "learning_rate": 8.898538467653746e-06, "loss": 0.4864, "step": 1186 }, { "epoch": 0.8823092170465807, "grad_norm": 0.563065767288208, "learning_rate": 8.895828266837493e-06, "loss": 0.4503, "step": 1187 }, { "epoch": 0.8830525272547076, "grad_norm": 0.5436647534370422, "learning_rate": 8.893115149621676e-06, "loss": 0.4441, "step": 1188 }, { "epoch": 0.8837958374628345, "grad_norm": 0.6101474761962891, "learning_rate": 8.89039911803732e-06, "loss": 0.452, "step": 1189 }, { "epoch": 0.8845391476709613, "grad_norm": 0.45844709873199463, "learning_rate": 8.887680174117634e-06, "loss": 0.4327, "step": 1190 }, { "epoch": 0.8852824578790882, "grad_norm": 0.5014277100563049, "learning_rate": 8.884958319898012e-06, "loss": 0.4607, "step": 1191 }, { "epoch": 0.8860257680872151, "grad_norm": 0.5075340867042542, "learning_rate": 8.882233557416016e-06, "loss": 0.4472, "step": 1192 }, { "epoch": 0.8867690782953419, "grad_norm": 0.4598458409309387, "learning_rate": 8.879505888711392e-06, "loss": 0.4675, "step": 1193 }, { "epoch": 0.8875123885034688, "grad_norm": 0.467473566532135, "learning_rate": 8.876775315826063e-06, "loss": 0.4445, "step": 1194 }, { "epoch": 0.8882556987115956, "grad_norm": 0.46067431569099426, "learning_rate": 8.874041840804122e-06, "loss": 0.4108, "step": 1195 }, { "epoch": 0.8889990089197225, "grad_norm": 0.5423702001571655, "learning_rate": 8.871305465691838e-06, "loss": 0.496, "step": 1196 }, { "epoch": 0.8897423191278494, "grad_norm": 0.43368107080459595, "learning_rate": 8.868566192537645e-06, "loss": 0.4857, "step": 1197 }, { "epoch": 0.8904856293359762, "grad_norm": 0.5051267743110657, "learning_rate": 8.865824023392156e-06, "loss": 0.4596, "step": 1198 }, { "epoch": 0.8912289395441031, "grad_norm": 0.49934688210487366, "learning_rate": 8.863078960308142e-06, "loss": 0.4542, "step": 1199 }, { "epoch": 0.8919722497522299, "grad_norm": 0.44184908270835876, "learning_rate": 8.860331005340545e-06, "loss": 0.4214, "step": 1200 }, { "epoch": 0.8927155599603568, "grad_norm": 0.47965800762176514, "learning_rate": 8.857580160546476e-06, "loss": 0.4065, "step": 1201 }, { "epoch": 0.8934588701684837, "grad_norm": 0.4283570349216461, "learning_rate": 8.8548264279852e-06, "loss": 0.4509, "step": 1202 }, { "epoch": 0.8942021803766105, "grad_norm": 0.47304266691207886, "learning_rate": 8.852069809718156e-06, "loss": 0.4469, "step": 1203 }, { "epoch": 0.8949454905847374, "grad_norm": 0.5314627885818481, "learning_rate": 8.849310307808928e-06, "loss": 0.4901, "step": 1204 }, { "epoch": 0.8956888007928642, "grad_norm": 0.5125458240509033, "learning_rate": 8.84654792432327e-06, "loss": 0.4494, "step": 1205 }, { "epoch": 0.8964321110009911, "grad_norm": 0.49056002497673035, "learning_rate": 8.84378266132909e-06, "loss": 0.4163, "step": 1206 }, { "epoch": 0.897175421209118, "grad_norm": 0.5613182187080383, "learning_rate": 8.841014520896452e-06, "loss": 0.4515, "step": 1207 }, { "epoch": 0.8979187314172448, "grad_norm": 0.4846705496311188, "learning_rate": 8.838243505097575e-06, "loss": 0.4546, "step": 1208 }, { "epoch": 0.8986620416253717, "grad_norm": 0.5487819314002991, "learning_rate": 8.835469616006824e-06, "loss": 0.4627, "step": 1209 }, { "epoch": 0.8994053518334986, "grad_norm": 0.4438391327857971, "learning_rate": 8.832692855700724e-06, "loss": 0.4535, "step": 1210 }, { "epoch": 0.9001486620416254, "grad_norm": 0.5400130152702332, "learning_rate": 8.829913226257944e-06, "loss": 0.4166, "step": 1211 }, { "epoch": 0.9008919722497523, "grad_norm": 0.4898989498615265, "learning_rate": 8.827130729759304e-06, "loss": 0.4465, "step": 1212 }, { "epoch": 0.901635282457879, "grad_norm": 0.4966314136981964, "learning_rate": 8.824345368287765e-06, "loss": 0.4502, "step": 1213 }, { "epoch": 0.902378592666006, "grad_norm": 0.49474477767944336, "learning_rate": 8.82155714392844e-06, "loss": 0.4448, "step": 1214 }, { "epoch": 0.9031219028741329, "grad_norm": 0.4562070071697235, "learning_rate": 8.818766058768575e-06, "loss": 0.4409, "step": 1215 }, { "epoch": 0.9038652130822596, "grad_norm": 0.4846962094306946, "learning_rate": 8.815972114897571e-06, "loss": 0.4473, "step": 1216 }, { "epoch": 0.9046085232903865, "grad_norm": 0.5864778757095337, "learning_rate": 8.813175314406958e-06, "loss": 0.4553, "step": 1217 }, { "epoch": 0.9053518334985133, "grad_norm": 0.43516939878463745, "learning_rate": 8.81037565939041e-06, "loss": 0.4784, "step": 1218 }, { "epoch": 0.9060951437066402, "grad_norm": 0.47418826818466187, "learning_rate": 8.807573151943734e-06, "loss": 0.4477, "step": 1219 }, { "epoch": 0.9068384539147671, "grad_norm": 0.6252117156982422, "learning_rate": 8.804767794164876e-06, "loss": 0.4575, "step": 1220 }, { "epoch": 0.9075817641228939, "grad_norm": 0.49491164088249207, "learning_rate": 8.801959588153916e-06, "loss": 0.4481, "step": 1221 }, { "epoch": 0.9083250743310208, "grad_norm": 0.46395161747932434, "learning_rate": 8.79914853601306e-06, "loss": 0.4425, "step": 1222 }, { "epoch": 0.9090683845391476, "grad_norm": 0.4495792090892792, "learning_rate": 8.796334639846653e-06, "loss": 0.4306, "step": 1223 }, { "epoch": 0.9098116947472745, "grad_norm": 0.5587769746780396, "learning_rate": 8.793517901761163e-06, "loss": 0.4581, "step": 1224 }, { "epoch": 0.9105550049554014, "grad_norm": 0.5096619725227356, "learning_rate": 8.790698323865187e-06, "loss": 0.4452, "step": 1225 }, { "epoch": 0.9112983151635282, "grad_norm": 0.5103654265403748, "learning_rate": 8.787875908269453e-06, "loss": 0.4541, "step": 1226 }, { "epoch": 0.9120416253716551, "grad_norm": 0.4789261221885681, "learning_rate": 8.7850506570868e-06, "loss": 0.469, "step": 1227 }, { "epoch": 0.9127849355797819, "grad_norm": 0.4233356714248657, "learning_rate": 8.782222572432207e-06, "loss": 0.444, "step": 1228 }, { "epoch": 0.9135282457879088, "grad_norm": 0.46189263463020325, "learning_rate": 8.77939165642276e-06, "loss": 0.4738, "step": 1229 }, { "epoch": 0.9142715559960357, "grad_norm": 0.5069364905357361, "learning_rate": 8.776557911177674e-06, "loss": 0.4664, "step": 1230 }, { "epoch": 0.9150148662041625, "grad_norm": 0.4914691746234894, "learning_rate": 8.773721338818276e-06, "loss": 0.4413, "step": 1231 }, { "epoch": 0.9157581764122894, "grad_norm": 0.459713339805603, "learning_rate": 8.770881941468008e-06, "loss": 0.4456, "step": 1232 }, { "epoch": 0.9165014866204163, "grad_norm": 0.515769362449646, "learning_rate": 8.768039721252435e-06, "loss": 0.4401, "step": 1233 }, { "epoch": 0.9172447968285431, "grad_norm": 0.46340784430503845, "learning_rate": 8.76519468029923e-06, "loss": 0.475, "step": 1234 }, { "epoch": 0.91798810703667, "grad_norm": 0.47998687624931335, "learning_rate": 8.762346820738175e-06, "loss": 0.469, "step": 1235 }, { "epoch": 0.9187314172447968, "grad_norm": 0.4271591007709503, "learning_rate": 8.759496144701168e-06, "loss": 0.4585, "step": 1236 }, { "epoch": 0.9194747274529237, "grad_norm": 0.5063279271125793, "learning_rate": 8.75664265432221e-06, "loss": 0.4406, "step": 1237 }, { "epoch": 0.9202180376610506, "grad_norm": 0.44423967599868774, "learning_rate": 8.753786351737412e-06, "loss": 0.4606, "step": 1238 }, { "epoch": 0.9209613478691774, "grad_norm": 0.43537411093711853, "learning_rate": 8.750927239084987e-06, "loss": 0.4257, "step": 1239 }, { "epoch": 0.9217046580773043, "grad_norm": 0.42236095666885376, "learning_rate": 8.748065318505258e-06, "loss": 0.4591, "step": 1240 }, { "epoch": 0.9224479682854311, "grad_norm": 0.44468823075294495, "learning_rate": 8.745200592140646e-06, "loss": 0.4674, "step": 1241 }, { "epoch": 0.923191278493558, "grad_norm": 0.45986247062683105, "learning_rate": 8.742333062135667e-06, "loss": 0.4446, "step": 1242 }, { "epoch": 0.9239345887016849, "grad_norm": 0.4433731138706207, "learning_rate": 8.739462730636945e-06, "loss": 0.4348, "step": 1243 }, { "epoch": 0.9246778989098117, "grad_norm": 0.43961191177368164, "learning_rate": 8.736589599793198e-06, "loss": 0.4471, "step": 1244 }, { "epoch": 0.9254212091179386, "grad_norm": 0.4242549538612366, "learning_rate": 8.733713671755237e-06, "loss": 0.435, "step": 1245 }, { "epoch": 0.9261645193260654, "grad_norm": 0.515942394733429, "learning_rate": 8.730834948675968e-06, "loss": 0.4502, "step": 1246 }, { "epoch": 0.9269078295341923, "grad_norm": 0.48062852025032043, "learning_rate": 8.727953432710394e-06, "loss": 0.4326, "step": 1247 }, { "epoch": 0.9276511397423192, "grad_norm": 0.4900757372379303, "learning_rate": 8.725069126015601e-06, "loss": 0.4729, "step": 1248 }, { "epoch": 0.928394449950446, "grad_norm": 0.49424275755882263, "learning_rate": 8.72218203075077e-06, "loss": 0.4537, "step": 1249 }, { "epoch": 0.9291377601585729, "grad_norm": 0.5026392340660095, "learning_rate": 8.719292149077166e-06, "loss": 0.4887, "step": 1250 }, { "epoch": 0.9298810703666998, "grad_norm": 0.42945653200149536, "learning_rate": 8.716399483158144e-06, "loss": 0.4391, "step": 1251 }, { "epoch": 0.9306243805748265, "grad_norm": 0.43093907833099365, "learning_rate": 8.71350403515914e-06, "loss": 0.4644, "step": 1252 }, { "epoch": 0.9313676907829535, "grad_norm": 0.4744725823402405, "learning_rate": 8.71060580724767e-06, "loss": 0.4426, "step": 1253 }, { "epoch": 0.9321110009910802, "grad_norm": 0.4402635991573334, "learning_rate": 8.707704801593339e-06, "loss": 0.4422, "step": 1254 }, { "epoch": 0.9328543111992071, "grad_norm": 0.48304325342178345, "learning_rate": 8.704801020367823e-06, "loss": 0.4412, "step": 1255 }, { "epoch": 0.933597621407334, "grad_norm": 0.5250988006591797, "learning_rate": 8.701894465744885e-06, "loss": 0.4735, "step": 1256 }, { "epoch": 0.9343409316154608, "grad_norm": 0.45650920271873474, "learning_rate": 8.698985139900352e-06, "loss": 0.4377, "step": 1257 }, { "epoch": 0.9350842418235877, "grad_norm": 0.602508544921875, "learning_rate": 8.696073045012136e-06, "loss": 0.4528, "step": 1258 }, { "epoch": 0.9358275520317145, "grad_norm": 0.5003390312194824, "learning_rate": 8.69315818326022e-06, "loss": 0.448, "step": 1259 }, { "epoch": 0.9365708622398414, "grad_norm": 0.5037537813186646, "learning_rate": 8.690240556826652e-06, "loss": 0.438, "step": 1260 }, { "epoch": 0.9373141724479683, "grad_norm": 0.6548896431922913, "learning_rate": 8.687320167895556e-06, "loss": 0.4572, "step": 1261 }, { "epoch": 0.9380574826560951, "grad_norm": 0.46693822741508484, "learning_rate": 8.684397018653124e-06, "loss": 0.4061, "step": 1262 }, { "epoch": 0.938800792864222, "grad_norm": 0.5943148732185364, "learning_rate": 8.681471111287609e-06, "loss": 0.4869, "step": 1263 }, { "epoch": 0.9395441030723488, "grad_norm": 0.5996972322463989, "learning_rate": 8.678542447989334e-06, "loss": 0.4534, "step": 1264 }, { "epoch": 0.9402874132804757, "grad_norm": 0.5401907563209534, "learning_rate": 8.675611030950684e-06, "loss": 0.4361, "step": 1265 }, { "epoch": 0.9410307234886026, "grad_norm": 0.6547409296035767, "learning_rate": 8.672676862366102e-06, "loss": 0.4607, "step": 1266 }, { "epoch": 0.9417740336967294, "grad_norm": 0.48992666602134705, "learning_rate": 8.669739944432096e-06, "loss": 0.452, "step": 1267 }, { "epoch": 0.9425173439048563, "grad_norm": 0.6138998866081238, "learning_rate": 8.666800279347229e-06, "loss": 0.4538, "step": 1268 }, { "epoch": 0.9432606541129831, "grad_norm": 0.4767806828022003, "learning_rate": 8.66385786931212e-06, "loss": 0.4592, "step": 1269 }, { "epoch": 0.94400396432111, "grad_norm": 0.4783853590488434, "learning_rate": 8.660912716529446e-06, "loss": 0.4599, "step": 1270 }, { "epoch": 0.9447472745292369, "grad_norm": 0.48707711696624756, "learning_rate": 8.657964823203932e-06, "loss": 0.4738, "step": 1271 }, { "epoch": 0.9454905847373637, "grad_norm": 0.4718414545059204, "learning_rate": 8.655014191542362e-06, "loss": 0.4818, "step": 1272 }, { "epoch": 0.9462338949454906, "grad_norm": 0.5359189510345459, "learning_rate": 8.652060823753562e-06, "loss": 0.471, "step": 1273 }, { "epoch": 0.9469772051536175, "grad_norm": 0.4098649322986603, "learning_rate": 8.649104722048412e-06, "loss": 0.473, "step": 1274 }, { "epoch": 0.9477205153617443, "grad_norm": 0.5605353116989136, "learning_rate": 8.646145888639834e-06, "loss": 0.4384, "step": 1275 }, { "epoch": 0.9484638255698712, "grad_norm": 0.4612846076488495, "learning_rate": 8.6431843257428e-06, "loss": 0.4324, "step": 1276 }, { "epoch": 0.949207135777998, "grad_norm": 0.4625771641731262, "learning_rate": 8.64022003557432e-06, "loss": 0.4374, "step": 1277 }, { "epoch": 0.9499504459861249, "grad_norm": 0.5782881379127502, "learning_rate": 8.637253020353454e-06, "loss": 0.436, "step": 1278 }, { "epoch": 0.9506937561942518, "grad_norm": 0.4762547016143799, "learning_rate": 8.634283282301289e-06, "loss": 0.4695, "step": 1279 }, { "epoch": 0.9514370664023786, "grad_norm": 0.4688927233219147, "learning_rate": 8.63131082364096e-06, "loss": 0.4853, "step": 1280 }, { "epoch": 0.9521803766105055, "grad_norm": 0.4737952649593353, "learning_rate": 8.62833564659764e-06, "loss": 0.4517, "step": 1281 }, { "epoch": 0.9529236868186323, "grad_norm": 0.4871763586997986, "learning_rate": 8.625357753398528e-06, "loss": 0.4621, "step": 1282 }, { "epoch": 0.9536669970267592, "grad_norm": 0.4182206988334656, "learning_rate": 8.622377146272865e-06, "loss": 0.4494, "step": 1283 }, { "epoch": 0.9544103072348861, "grad_norm": 0.4488937258720398, "learning_rate": 8.619393827451919e-06, "loss": 0.4598, "step": 1284 }, { "epoch": 0.9551536174430129, "grad_norm": 0.49460864067077637, "learning_rate": 8.61640779916899e-06, "loss": 0.4767, "step": 1285 }, { "epoch": 0.9558969276511398, "grad_norm": 0.4735492169857025, "learning_rate": 8.613419063659403e-06, "loss": 0.4469, "step": 1286 }, { "epoch": 0.9566402378592666, "grad_norm": 0.43344202637672424, "learning_rate": 8.610427623160513e-06, "loss": 0.4266, "step": 1287 }, { "epoch": 0.9573835480673935, "grad_norm": 0.49380236864089966, "learning_rate": 8.6074334799117e-06, "loss": 0.4604, "step": 1288 }, { "epoch": 0.9581268582755204, "grad_norm": 0.4432218372821808, "learning_rate": 8.604436636154365e-06, "loss": 0.4516, "step": 1289 }, { "epoch": 0.9588701684836471, "grad_norm": 0.45385652780532837, "learning_rate": 8.601437094131932e-06, "loss": 0.4705, "step": 1290 }, { "epoch": 0.959613478691774, "grad_norm": 0.47905436158180237, "learning_rate": 8.598434856089847e-06, "loss": 0.4222, "step": 1291 }, { "epoch": 0.9603567888999008, "grad_norm": 0.5267741680145264, "learning_rate": 8.595429924275569e-06, "loss": 0.4673, "step": 1292 }, { "epoch": 0.9611000991080277, "grad_norm": 0.4377823770046234, "learning_rate": 8.592422300938578e-06, "loss": 0.4244, "step": 1293 }, { "epoch": 0.9618434093161546, "grad_norm": 0.503892719745636, "learning_rate": 8.589411988330365e-06, "loss": 0.4606, "step": 1294 }, { "epoch": 0.9625867195242814, "grad_norm": 0.5026291012763977, "learning_rate": 8.58639898870444e-06, "loss": 0.4591, "step": 1295 }, { "epoch": 0.9633300297324083, "grad_norm": 0.432129830121994, "learning_rate": 8.583383304316319e-06, "loss": 0.4559, "step": 1296 }, { "epoch": 0.9640733399405352, "grad_norm": 0.4352560341358185, "learning_rate": 8.58036493742353e-06, "loss": 0.4657, "step": 1297 }, { "epoch": 0.964816650148662, "grad_norm": 0.4782082736492157, "learning_rate": 8.577343890285608e-06, "loss": 0.4594, "step": 1298 }, { "epoch": 0.9655599603567889, "grad_norm": 0.5171038508415222, "learning_rate": 8.574320165164098e-06, "loss": 0.4669, "step": 1299 }, { "epoch": 0.9663032705649157, "grad_norm": 0.601269543170929, "learning_rate": 8.571293764322543e-06, "loss": 0.4685, "step": 1300 }, { "epoch": 0.9670465807730426, "grad_norm": 0.5345183610916138, "learning_rate": 8.568264690026497e-06, "loss": 0.4633, "step": 1301 }, { "epoch": 0.9677898909811695, "grad_norm": 0.5090839266777039, "learning_rate": 8.56523294454351e-06, "loss": 0.4608, "step": 1302 }, { "epoch": 0.9685332011892963, "grad_norm": 0.502476155757904, "learning_rate": 8.562198530143133e-06, "loss": 0.4662, "step": 1303 }, { "epoch": 0.9692765113974232, "grad_norm": 0.5080944299697876, "learning_rate": 8.559161449096915e-06, "loss": 0.4583, "step": 1304 }, { "epoch": 0.97001982160555, "grad_norm": 0.45631662011146545, "learning_rate": 8.556121703678401e-06, "loss": 0.4609, "step": 1305 }, { "epoch": 0.9707631318136769, "grad_norm": 0.4417707026004791, "learning_rate": 8.55307929616313e-06, "loss": 0.4824, "step": 1306 }, { "epoch": 0.9715064420218038, "grad_norm": 0.43026241660118103, "learning_rate": 8.550034228828638e-06, "loss": 0.4116, "step": 1307 }, { "epoch": 0.9722497522299306, "grad_norm": 0.5246022343635559, "learning_rate": 8.546986503954447e-06, "loss": 0.4826, "step": 1308 }, { "epoch": 0.9729930624380575, "grad_norm": 0.4484599828720093, "learning_rate": 8.543936123822068e-06, "loss": 0.4434, "step": 1309 }, { "epoch": 0.9737363726461843, "grad_norm": 0.4515054523944855, "learning_rate": 8.540883090715003e-06, "loss": 0.4586, "step": 1310 }, { "epoch": 0.9744796828543112, "grad_norm": 0.4994641840457916, "learning_rate": 8.53782740691874e-06, "loss": 0.467, "step": 1311 }, { "epoch": 0.9752229930624381, "grad_norm": 0.608975350856781, "learning_rate": 8.534769074720749e-06, "loss": 0.46, "step": 1312 }, { "epoch": 0.9759663032705649, "grad_norm": 0.4882795512676239, "learning_rate": 8.531708096410485e-06, "loss": 0.4593, "step": 1313 }, { "epoch": 0.9767096134786918, "grad_norm": 0.5413875579833984, "learning_rate": 8.528644474279382e-06, "loss": 0.4327, "step": 1314 }, { "epoch": 0.9774529236868187, "grad_norm": 0.5837482213973999, "learning_rate": 8.525578210620853e-06, "loss": 0.4286, "step": 1315 }, { "epoch": 0.9781962338949455, "grad_norm": 0.4642443358898163, "learning_rate": 8.52250930773029e-06, "loss": 0.4685, "step": 1316 }, { "epoch": 0.9789395441030724, "grad_norm": 0.6055053472518921, "learning_rate": 8.51943776790506e-06, "loss": 0.4561, "step": 1317 }, { "epoch": 0.9796828543111992, "grad_norm": 0.4839946925640106, "learning_rate": 8.516363593444506e-06, "loss": 0.4917, "step": 1318 }, { "epoch": 0.9804261645193261, "grad_norm": 0.5379674434661865, "learning_rate": 8.513286786649937e-06, "loss": 0.457, "step": 1319 }, { "epoch": 0.981169474727453, "grad_norm": 0.5623194575309753, "learning_rate": 8.510207349824637e-06, "loss": 0.4525, "step": 1320 }, { "epoch": 0.9819127849355798, "grad_norm": 0.4606481194496155, "learning_rate": 8.507125285273863e-06, "loss": 0.4507, "step": 1321 }, { "epoch": 0.9826560951437067, "grad_norm": 0.5391710996627808, "learning_rate": 8.504040595304833e-06, "loss": 0.4133, "step": 1322 }, { "epoch": 0.9833994053518335, "grad_norm": 0.5692965984344482, "learning_rate": 8.50095328222673e-06, "loss": 0.4588, "step": 1323 }, { "epoch": 0.9841427155599604, "grad_norm": 0.48797178268432617, "learning_rate": 8.497863348350701e-06, "loss": 0.4731, "step": 1324 }, { "epoch": 0.9848860257680873, "grad_norm": 0.5278564095497131, "learning_rate": 8.494770795989863e-06, "loss": 0.4748, "step": 1325 }, { "epoch": 0.985629335976214, "grad_norm": 0.5556752681732178, "learning_rate": 8.491675627459282e-06, "loss": 0.4652, "step": 1326 }, { "epoch": 0.986372646184341, "grad_norm": 0.4650265574455261, "learning_rate": 8.488577845075989e-06, "loss": 0.4502, "step": 1327 }, { "epoch": 0.9871159563924677, "grad_norm": 0.4959860146045685, "learning_rate": 8.48547745115897e-06, "loss": 0.4517, "step": 1328 }, { "epoch": 0.9878592666005946, "grad_norm": 0.5210519433021545, "learning_rate": 8.482374448029164e-06, "loss": 0.4362, "step": 1329 }, { "epoch": 0.9886025768087215, "grad_norm": 0.49434223771095276, "learning_rate": 8.479268838009469e-06, "loss": 0.4618, "step": 1330 }, { "epoch": 0.9893458870168483, "grad_norm": 0.4834180176258087, "learning_rate": 8.476160623424727e-06, "loss": 0.4454, "step": 1331 }, { "epoch": 0.9900891972249752, "grad_norm": 0.5321545004844666, "learning_rate": 8.473049806601736e-06, "loss": 0.4477, "step": 1332 }, { "epoch": 0.990832507433102, "grad_norm": 0.5302188992500305, "learning_rate": 8.469936389869239e-06, "loss": 0.449, "step": 1333 }, { "epoch": 0.9915758176412289, "grad_norm": 0.48079609870910645, "learning_rate": 8.466820375557925e-06, "loss": 0.4397, "step": 1334 }, { "epoch": 0.9923191278493558, "grad_norm": 0.47685831785202026, "learning_rate": 8.463701766000427e-06, "loss": 0.4625, "step": 1335 }, { "epoch": 0.9930624380574826, "grad_norm": 0.47755560278892517, "learning_rate": 8.460580563531325e-06, "loss": 0.4396, "step": 1336 }, { "epoch": 0.9938057482656095, "grad_norm": 0.43397071957588196, "learning_rate": 8.457456770487137e-06, "loss": 0.4571, "step": 1337 }, { "epoch": 0.9945490584737364, "grad_norm": 0.48438557982444763, "learning_rate": 8.454330389206318e-06, "loss": 0.4411, "step": 1338 }, { "epoch": 0.9952923686818632, "grad_norm": 0.45903024077415466, "learning_rate": 8.451201422029264e-06, "loss": 0.4503, "step": 1339 }, { "epoch": 0.9960356788899901, "grad_norm": 0.5708373785018921, "learning_rate": 8.448069871298308e-06, "loss": 0.4612, "step": 1340 }, { "epoch": 0.9967789890981169, "grad_norm": 0.45725566148757935, "learning_rate": 8.444935739357714e-06, "loss": 0.434, "step": 1341 }, { "epoch": 0.9975222993062438, "grad_norm": 0.5437684059143066, "learning_rate": 8.441799028553677e-06, "loss": 0.4641, "step": 1342 }, { "epoch": 0.9982656095143707, "grad_norm": 0.4959586262702942, "learning_rate": 8.438659741234325e-06, "loss": 0.4547, "step": 1343 }, { "epoch": 0.9990089197224975, "grad_norm": 0.447900652885437, "learning_rate": 8.435517879749716e-06, "loss": 0.4533, "step": 1344 }, { "epoch": 0.9997522299306244, "grad_norm": 0.5647950172424316, "learning_rate": 8.432373446451832e-06, "loss": 0.424, "step": 1345 }, { "epoch": 1.0004955401387512, "grad_norm": 0.9216330647468567, "learning_rate": 8.429226443694582e-06, "loss": 0.7886, "step": 1346 }, { "epoch": 1.001238850346878, "grad_norm": 0.5575891733169556, "learning_rate": 8.4260768738338e-06, "loss": 0.4159, "step": 1347 }, { "epoch": 1.001982160555005, "grad_norm": 0.6077316999435425, "learning_rate": 8.422924739227238e-06, "loss": 0.4257, "step": 1348 }, { "epoch": 1.002725470763132, "grad_norm": 0.4746643304824829, "learning_rate": 8.41977004223457e-06, "loss": 0.3779, "step": 1349 }, { "epoch": 1.0034687809712586, "grad_norm": 0.7446602582931519, "learning_rate": 8.416612785217387e-06, "loss": 0.4497, "step": 1350 }, { "epoch": 1.0042120911793855, "grad_norm": 0.47406822443008423, "learning_rate": 8.4134529705392e-06, "loss": 0.4119, "step": 1351 }, { "epoch": 1.0049554013875124, "grad_norm": 0.5878908038139343, "learning_rate": 8.41029060056543e-06, "loss": 0.4434, "step": 1352 }, { "epoch": 1.0056987115956393, "grad_norm": 0.6395425200462341, "learning_rate": 8.407125677663415e-06, "loss": 0.4519, "step": 1353 }, { "epoch": 1.0064420218037662, "grad_norm": 0.41794589161872864, "learning_rate": 8.403958204202401e-06, "loss": 0.3862, "step": 1354 }, { "epoch": 1.0071853320118929, "grad_norm": 0.5851595997810364, "learning_rate": 8.40078818255354e-06, "loss": 0.4012, "step": 1355 }, { "epoch": 1.0079286422200198, "grad_norm": 0.6615515947341919, "learning_rate": 8.397615615089905e-06, "loss": 0.4338, "step": 1356 }, { "epoch": 1.0086719524281467, "grad_norm": 0.5219057202339172, "learning_rate": 8.394440504186458e-06, "loss": 0.4374, "step": 1357 }, { "epoch": 1.0094152626362736, "grad_norm": 0.5950163006782532, "learning_rate": 8.391262852220079e-06, "loss": 0.4551, "step": 1358 }, { "epoch": 1.0101585728444005, "grad_norm": 0.527938187122345, "learning_rate": 8.388082661569536e-06, "loss": 0.4393, "step": 1359 }, { "epoch": 1.0109018830525272, "grad_norm": 0.5478428602218628, "learning_rate": 8.384899934615514e-06, "loss": 0.4194, "step": 1360 }, { "epoch": 1.011645193260654, "grad_norm": 0.4219658076763153, "learning_rate": 8.381714673740584e-06, "loss": 0.3831, "step": 1361 }, { "epoch": 1.012388503468781, "grad_norm": 0.6353829503059387, "learning_rate": 8.378526881329217e-06, "loss": 0.43, "step": 1362 }, { "epoch": 1.0131318136769079, "grad_norm": 0.5453194379806519, "learning_rate": 8.375336559767782e-06, "loss": 0.4247, "step": 1363 }, { "epoch": 1.0138751238850348, "grad_norm": 0.5197610855102539, "learning_rate": 8.37214371144454e-06, "loss": 0.4731, "step": 1364 }, { "epoch": 1.0146184340931614, "grad_norm": 0.5584667325019836, "learning_rate": 8.368948338749641e-06, "loss": 0.3906, "step": 1365 }, { "epoch": 1.0153617443012883, "grad_norm": 0.4456656277179718, "learning_rate": 8.365750444075126e-06, "loss": 0.3976, "step": 1366 }, { "epoch": 1.0161050545094152, "grad_norm": 0.5502115488052368, "learning_rate": 8.362550029814926e-06, "loss": 0.4085, "step": 1367 }, { "epoch": 1.0168483647175421, "grad_norm": 0.5143947005271912, "learning_rate": 8.359347098364859e-06, "loss": 0.4587, "step": 1368 }, { "epoch": 1.017591674925669, "grad_norm": 0.3863092362880707, "learning_rate": 8.356141652122623e-06, "loss": 0.3695, "step": 1369 }, { "epoch": 1.018334985133796, "grad_norm": 0.5944133400917053, "learning_rate": 8.3529336934878e-06, "loss": 0.4546, "step": 1370 }, { "epoch": 1.0190782953419226, "grad_norm": 0.4469628632068634, "learning_rate": 8.349723224861852e-06, "loss": 0.4536, "step": 1371 }, { "epoch": 1.0198216055500495, "grad_norm": 0.4945685565471649, "learning_rate": 8.346510248648125e-06, "loss": 0.4525, "step": 1372 }, { "epoch": 1.0205649157581764, "grad_norm": 0.4558737277984619, "learning_rate": 8.343294767251838e-06, "loss": 0.3861, "step": 1373 }, { "epoch": 1.0213082259663033, "grad_norm": 0.4560452997684479, "learning_rate": 8.340076783080084e-06, "loss": 0.4743, "step": 1374 }, { "epoch": 1.0220515361744302, "grad_norm": 0.41187193989753723, "learning_rate": 8.336856298541834e-06, "loss": 0.3844, "step": 1375 }, { "epoch": 1.022794846382557, "grad_norm": 0.41694924235343933, "learning_rate": 8.333633316047925e-06, "loss": 0.3886, "step": 1376 }, { "epoch": 1.0235381565906838, "grad_norm": 0.5973553657531738, "learning_rate": 8.33040783801107e-06, "loss": 0.4608, "step": 1377 }, { "epoch": 1.0242814667988107, "grad_norm": 0.41343387961387634, "learning_rate": 8.327179866845846e-06, "loss": 0.3829, "step": 1378 }, { "epoch": 1.0250247770069376, "grad_norm": 0.5074083209037781, "learning_rate": 8.323949404968696e-06, "loss": 0.4635, "step": 1379 }, { "epoch": 1.0257680872150645, "grad_norm": 0.40307262539863586, "learning_rate": 8.320716454797934e-06, "loss": 0.3831, "step": 1380 }, { "epoch": 1.0265113974231912, "grad_norm": 0.5049639344215393, "learning_rate": 8.31748101875373e-06, "loss": 0.4466, "step": 1381 }, { "epoch": 1.027254707631318, "grad_norm": 0.5547864437103271, "learning_rate": 8.314243099258114e-06, "loss": 0.4357, "step": 1382 }, { "epoch": 1.027998017839445, "grad_norm": 0.4714955687522888, "learning_rate": 8.311002698734982e-06, "loss": 0.4058, "step": 1383 }, { "epoch": 1.028741328047572, "grad_norm": 0.4908568263053894, "learning_rate": 8.307759819610081e-06, "loss": 0.4334, "step": 1384 }, { "epoch": 1.0294846382556988, "grad_norm": 0.5381824970245361, "learning_rate": 8.304514464311017e-06, "loss": 0.379, "step": 1385 }, { "epoch": 1.0302279484638255, "grad_norm": 0.5456455945968628, "learning_rate": 8.301266635267248e-06, "loss": 0.4046, "step": 1386 }, { "epoch": 1.0309712586719524, "grad_norm": 0.438999205827713, "learning_rate": 8.298016334910085e-06, "loss": 0.407, "step": 1387 }, { "epoch": 1.0317145688800793, "grad_norm": 0.6798426508903503, "learning_rate": 8.294763565672684e-06, "loss": 0.4008, "step": 1388 }, { "epoch": 1.0324578790882062, "grad_norm": 0.5314680933952332, "learning_rate": 8.291508329990059e-06, "loss": 0.4141, "step": 1389 }, { "epoch": 1.033201189296333, "grad_norm": 0.5397836565971375, "learning_rate": 8.288250630299063e-06, "loss": 0.3872, "step": 1390 }, { "epoch": 1.0339444995044598, "grad_norm": 0.4667796492576599, "learning_rate": 8.284990469038395e-06, "loss": 0.3788, "step": 1391 }, { "epoch": 1.0346878097125867, "grad_norm": 0.45833078026771545, "learning_rate": 8.281727848648596e-06, "loss": 0.4425, "step": 1392 }, { "epoch": 1.0354311199207136, "grad_norm": 0.5301562547683716, "learning_rate": 8.27846277157205e-06, "loss": 0.4686, "step": 1393 }, { "epoch": 1.0361744301288405, "grad_norm": 0.4260234534740448, "learning_rate": 8.27519524025298e-06, "loss": 0.3304, "step": 1394 }, { "epoch": 1.0369177403369674, "grad_norm": 0.5242775678634644, "learning_rate": 8.271925257137445e-06, "loss": 0.4841, "step": 1395 }, { "epoch": 1.037661050545094, "grad_norm": 0.42162254452705383, "learning_rate": 8.268652824673337e-06, "loss": 0.3531, "step": 1396 }, { "epoch": 1.038404360753221, "grad_norm": 0.5454967617988586, "learning_rate": 8.265377945310388e-06, "loss": 0.4021, "step": 1397 }, { "epoch": 1.0391476709613479, "grad_norm": 0.5066623687744141, "learning_rate": 8.262100621500157e-06, "loss": 0.4761, "step": 1398 }, { "epoch": 1.0398909811694748, "grad_norm": 0.41770458221435547, "learning_rate": 8.258820855696032e-06, "loss": 0.3965, "step": 1399 }, { "epoch": 1.0406342913776017, "grad_norm": 0.5548464059829712, "learning_rate": 8.255538650353235e-06, "loss": 0.4762, "step": 1400 }, { "epoch": 1.0413776015857283, "grad_norm": 0.5090433359146118, "learning_rate": 8.252254007928804e-06, "loss": 0.4291, "step": 1401 }, { "epoch": 1.0421209117938552, "grad_norm": 0.404183954000473, "learning_rate": 8.248966930881612e-06, "loss": 0.382, "step": 1402 }, { "epoch": 1.0428642220019821, "grad_norm": 0.635075569152832, "learning_rate": 8.245677421672351e-06, "loss": 0.4292, "step": 1403 }, { "epoch": 1.043607532210109, "grad_norm": 0.5546230673789978, "learning_rate": 8.242385482763528e-06, "loss": 0.4294, "step": 1404 }, { "epoch": 1.044350842418236, "grad_norm": 0.4595355987548828, "learning_rate": 8.239091116619478e-06, "loss": 0.3802, "step": 1405 }, { "epoch": 1.0450941526263628, "grad_norm": 0.6122593879699707, "learning_rate": 8.235794325706347e-06, "loss": 0.4694, "step": 1406 }, { "epoch": 1.0458374628344895, "grad_norm": 0.4862074553966522, "learning_rate": 8.2324951124921e-06, "loss": 0.4512, "step": 1407 }, { "epoch": 1.0465807730426164, "grad_norm": 0.3967507779598236, "learning_rate": 8.229193479446511e-06, "loss": 0.3775, "step": 1408 }, { "epoch": 1.0473240832507433, "grad_norm": 0.5546627044677734, "learning_rate": 8.225889429041167e-06, "loss": 0.4495, "step": 1409 }, { "epoch": 1.0480673934588702, "grad_norm": 0.45290419459342957, "learning_rate": 8.222582963749468e-06, "loss": 0.4455, "step": 1410 }, { "epoch": 1.0488107036669971, "grad_norm": 0.41978907585144043, "learning_rate": 8.219274086046616e-06, "loss": 0.3924, "step": 1411 }, { "epoch": 1.0495540138751238, "grad_norm": 0.49103423953056335, "learning_rate": 8.215962798409624e-06, "loss": 0.397, "step": 1412 }, { "epoch": 1.0502973240832507, "grad_norm": 0.5135620832443237, "learning_rate": 8.21264910331731e-06, "loss": 0.4154, "step": 1413 }, { "epoch": 1.0510406342913776, "grad_norm": 0.4605289101600647, "learning_rate": 8.209333003250284e-06, "loss": 0.424, "step": 1414 }, { "epoch": 1.0517839444995045, "grad_norm": 0.553627073764801, "learning_rate": 8.20601450069097e-06, "loss": 0.4378, "step": 1415 }, { "epoch": 1.0525272547076314, "grad_norm": 0.5135162472724915, "learning_rate": 8.20269359812358e-06, "loss": 0.4263, "step": 1416 }, { "epoch": 1.053270564915758, "grad_norm": 0.4700072407722473, "learning_rate": 8.199370298034129e-06, "loss": 0.444, "step": 1417 }, { "epoch": 1.054013875123885, "grad_norm": 0.5117049813270569, "learning_rate": 8.196044602910425e-06, "loss": 0.4119, "step": 1418 }, { "epoch": 1.054757185332012, "grad_norm": 0.5036429166793823, "learning_rate": 8.192716515242067e-06, "loss": 0.4062, "step": 1419 }, { "epoch": 1.0555004955401388, "grad_norm": 0.4832095801830292, "learning_rate": 8.189386037520446e-06, "loss": 0.4059, "step": 1420 }, { "epoch": 1.0562438057482657, "grad_norm": 0.5160186290740967, "learning_rate": 8.186053172238747e-06, "loss": 0.4479, "step": 1421 }, { "epoch": 1.0569871159563924, "grad_norm": 0.46740061044692993, "learning_rate": 8.182717921891932e-06, "loss": 0.4195, "step": 1422 }, { "epoch": 1.0577304261645193, "grad_norm": 0.5861173272132874, "learning_rate": 8.179380288976757e-06, "loss": 0.4127, "step": 1423 }, { "epoch": 1.0584737363726462, "grad_norm": 0.5086235404014587, "learning_rate": 8.17604027599176e-06, "loss": 0.4155, "step": 1424 }, { "epoch": 1.059217046580773, "grad_norm": 0.40125370025634766, "learning_rate": 8.172697885437257e-06, "loss": 0.4047, "step": 1425 }, { "epoch": 1.0599603567889, "grad_norm": 0.5538062453269958, "learning_rate": 8.169353119815351e-06, "loss": 0.4255, "step": 1426 }, { "epoch": 1.0607036669970267, "grad_norm": 0.4666023254394531, "learning_rate": 8.166005981629915e-06, "loss": 0.4017, "step": 1427 }, { "epoch": 1.0614469772051536, "grad_norm": 0.5683228969573975, "learning_rate": 8.162656473386603e-06, "loss": 0.4562, "step": 1428 }, { "epoch": 1.0621902874132805, "grad_norm": 0.5142205953598022, "learning_rate": 8.159304597592841e-06, "loss": 0.4048, "step": 1429 }, { "epoch": 1.0629335976214074, "grad_norm": 0.4328297972679138, "learning_rate": 8.155950356757829e-06, "loss": 0.385, "step": 1430 }, { "epoch": 1.0636769078295343, "grad_norm": 0.5874948501586914, "learning_rate": 8.152593753392536e-06, "loss": 0.4672, "step": 1431 }, { "epoch": 1.064420218037661, "grad_norm": 0.5124465823173523, "learning_rate": 8.149234790009701e-06, "loss": 0.4148, "step": 1432 }, { "epoch": 1.0651635282457879, "grad_norm": 0.49608999490737915, "learning_rate": 8.145873469123828e-06, "loss": 0.3911, "step": 1433 }, { "epoch": 1.0659068384539148, "grad_norm": 0.44521215558052063, "learning_rate": 8.142509793251188e-06, "loss": 0.4424, "step": 1434 }, { "epoch": 1.0666501486620417, "grad_norm": 0.528067946434021, "learning_rate": 8.139143764909814e-06, "loss": 0.451, "step": 1435 }, { "epoch": 1.0673934588701686, "grad_norm": 0.48761802911758423, "learning_rate": 8.135775386619496e-06, "loss": 0.4204, "step": 1436 }, { "epoch": 1.0681367690782952, "grad_norm": 0.4936544895172119, "learning_rate": 8.13240466090179e-06, "loss": 0.4414, "step": 1437 }, { "epoch": 1.0688800792864221, "grad_norm": 0.4856885075569153, "learning_rate": 8.129031590280008e-06, "loss": 0.3929, "step": 1438 }, { "epoch": 1.069623389494549, "grad_norm": 0.5376114845275879, "learning_rate": 8.12565617727921e-06, "loss": 0.3955, "step": 1439 }, { "epoch": 1.070366699702676, "grad_norm": 0.4625203609466553, "learning_rate": 8.12227842442622e-06, "loss": 0.418, "step": 1440 }, { "epoch": 1.0711100099108029, "grad_norm": 0.44861915707588196, "learning_rate": 8.118898334249607e-06, "loss": 0.4185, "step": 1441 }, { "epoch": 1.0718533201189295, "grad_norm": 0.4843178689479828, "learning_rate": 8.115515909279689e-06, "loss": 0.4242, "step": 1442 }, { "epoch": 1.0725966303270564, "grad_norm": 0.48102647066116333, "learning_rate": 8.11213115204854e-06, "loss": 0.4576, "step": 1443 }, { "epoch": 1.0733399405351833, "grad_norm": 0.3811863660812378, "learning_rate": 8.108744065089967e-06, "loss": 0.373, "step": 1444 }, { "epoch": 1.0740832507433102, "grad_norm": 0.5195821523666382, "learning_rate": 8.105354650939532e-06, "loss": 0.4527, "step": 1445 }, { "epoch": 1.0748265609514371, "grad_norm": 0.4228498041629791, "learning_rate": 8.101962912134538e-06, "loss": 0.4028, "step": 1446 }, { "epoch": 1.0755698711595638, "grad_norm": 0.48948994278907776, "learning_rate": 8.098568851214019e-06, "loss": 0.4364, "step": 1447 }, { "epoch": 1.0763131813676907, "grad_norm": 0.4462527930736542, "learning_rate": 8.095172470718757e-06, "loss": 0.4135, "step": 1448 }, { "epoch": 1.0770564915758176, "grad_norm": 0.43565213680267334, "learning_rate": 8.09177377319127e-06, "loss": 0.4082, "step": 1449 }, { "epoch": 1.0777998017839445, "grad_norm": 0.4580375552177429, "learning_rate": 8.088372761175803e-06, "loss": 0.434, "step": 1450 }, { "epoch": 1.0785431119920714, "grad_norm": 0.48926839232444763, "learning_rate": 8.08496943721834e-06, "loss": 0.4328, "step": 1451 }, { "epoch": 1.079286422200198, "grad_norm": 0.5281175374984741, "learning_rate": 8.081563803866594e-06, "loss": 0.439, "step": 1452 }, { "epoch": 1.080029732408325, "grad_norm": 0.5018773078918457, "learning_rate": 8.078155863670008e-06, "loss": 0.4242, "step": 1453 }, { "epoch": 1.080773042616452, "grad_norm": 0.4479289650917053, "learning_rate": 8.07474561917975e-06, "loss": 0.4032, "step": 1454 }, { "epoch": 1.0815163528245788, "grad_norm": 0.5113857984542847, "learning_rate": 8.071333072948712e-06, "loss": 0.4282, "step": 1455 }, { "epoch": 1.0822596630327057, "grad_norm": 0.4347362816333771, "learning_rate": 8.067918227531515e-06, "loss": 0.3655, "step": 1456 }, { "epoch": 1.0830029732408324, "grad_norm": 0.44490939378738403, "learning_rate": 8.064501085484493e-06, "loss": 0.4117, "step": 1457 }, { "epoch": 1.0837462834489593, "grad_norm": 0.521420955657959, "learning_rate": 8.061081649365704e-06, "loss": 0.477, "step": 1458 }, { "epoch": 1.0844895936570862, "grad_norm": 0.4735782742500305, "learning_rate": 8.057659921734924e-06, "loss": 0.4315, "step": 1459 }, { "epoch": 1.085232903865213, "grad_norm": 0.45422035455703735, "learning_rate": 8.054235905153643e-06, "loss": 0.4306, "step": 1460 }, { "epoch": 1.08597621407334, "grad_norm": 0.4684211313724518, "learning_rate": 8.050809602185064e-06, "loss": 0.4246, "step": 1461 }, { "epoch": 1.086719524281467, "grad_norm": 0.499252051115036, "learning_rate": 8.0473810153941e-06, "loss": 0.4332, "step": 1462 }, { "epoch": 1.0874628344895936, "grad_norm": 0.38376662135124207, "learning_rate": 8.043950147347378e-06, "loss": 0.3394, "step": 1463 }, { "epoch": 1.0882061446977205, "grad_norm": 0.5398609042167664, "learning_rate": 8.040517000613228e-06, "loss": 0.4691, "step": 1464 }, { "epoch": 1.0889494549058474, "grad_norm": 0.44059860706329346, "learning_rate": 8.037081577761691e-06, "loss": 0.42, "step": 1465 }, { "epoch": 1.0896927651139743, "grad_norm": 0.4870191216468811, "learning_rate": 8.033643881364509e-06, "loss": 0.4327, "step": 1466 }, { "epoch": 1.0904360753221012, "grad_norm": 0.4051401913166046, "learning_rate": 8.030203913995123e-06, "loss": 0.3802, "step": 1467 }, { "epoch": 1.0911793855302279, "grad_norm": 0.49437689781188965, "learning_rate": 8.026761678228682e-06, "loss": 0.4029, "step": 1468 }, { "epoch": 1.0919226957383548, "grad_norm": 0.445360392332077, "learning_rate": 8.023317176642024e-06, "loss": 0.4059, "step": 1469 }, { "epoch": 1.0926660059464817, "grad_norm": 0.40590834617614746, "learning_rate": 8.019870411813689e-06, "loss": 0.3662, "step": 1470 }, { "epoch": 1.0934093161546086, "grad_norm": 0.5108782052993774, "learning_rate": 8.016421386323912e-06, "loss": 0.4416, "step": 1471 }, { "epoch": 1.0941526263627355, "grad_norm": 0.4600057303905487, "learning_rate": 8.012970102754614e-06, "loss": 0.39, "step": 1472 }, { "epoch": 1.0948959365708621, "grad_norm": 0.4246031939983368, "learning_rate": 8.009516563689412e-06, "loss": 0.4026, "step": 1473 }, { "epoch": 1.095639246778989, "grad_norm": 0.5706141591072083, "learning_rate": 8.006060771713609e-06, "loss": 0.4422, "step": 1474 }, { "epoch": 1.096382556987116, "grad_norm": 0.5377442836761475, "learning_rate": 8.002602729414197e-06, "loss": 0.4277, "step": 1475 }, { "epoch": 1.0971258671952429, "grad_norm": 0.6136458516120911, "learning_rate": 7.999142439379852e-06, "loss": 0.4449, "step": 1476 }, { "epoch": 1.0978691774033698, "grad_norm": 0.48079386353492737, "learning_rate": 7.995679904200928e-06, "loss": 0.3943, "step": 1477 }, { "epoch": 1.0986124876114964, "grad_norm": 0.4914160370826721, "learning_rate": 7.992215126469467e-06, "loss": 0.4431, "step": 1478 }, { "epoch": 1.0993557978196233, "grad_norm": 0.5313761234283447, "learning_rate": 7.988748108779183e-06, "loss": 0.4241, "step": 1479 }, { "epoch": 1.1000991080277502, "grad_norm": 0.4762108623981476, "learning_rate": 7.985278853725471e-06, "loss": 0.398, "step": 1480 }, { "epoch": 1.1008424182358771, "grad_norm": 0.5081911683082581, "learning_rate": 7.981807363905401e-06, "loss": 0.3966, "step": 1481 }, { "epoch": 1.101585728444004, "grad_norm": 0.4200124144554138, "learning_rate": 7.978333641917712e-06, "loss": 0.4554, "step": 1482 }, { "epoch": 1.1023290386521307, "grad_norm": 0.5775662660598755, "learning_rate": 7.974857690362819e-06, "loss": 0.434, "step": 1483 }, { "epoch": 1.1030723488602576, "grad_norm": 0.48887744545936584, "learning_rate": 7.971379511842803e-06, "loss": 0.44, "step": 1484 }, { "epoch": 1.1038156590683845, "grad_norm": 0.49518898129463196, "learning_rate": 7.967899108961415e-06, "loss": 0.4554, "step": 1485 }, { "epoch": 1.1045589692765114, "grad_norm": 0.44963574409484863, "learning_rate": 7.964416484324064e-06, "loss": 0.3605, "step": 1486 }, { "epoch": 1.1053022794846383, "grad_norm": 0.533632755279541, "learning_rate": 7.96093164053783e-06, "loss": 0.4278, "step": 1487 }, { "epoch": 1.1060455896927652, "grad_norm": 0.47612059116363525, "learning_rate": 7.957444580211455e-06, "loss": 0.4144, "step": 1488 }, { "epoch": 1.106788899900892, "grad_norm": 0.5420659184455872, "learning_rate": 7.95395530595533e-06, "loss": 0.455, "step": 1489 }, { "epoch": 1.1075322101090188, "grad_norm": 0.44573959708213806, "learning_rate": 7.950463820381513e-06, "loss": 0.429, "step": 1490 }, { "epoch": 1.1082755203171457, "grad_norm": 0.4272538423538208, "learning_rate": 7.946970126103717e-06, "loss": 0.3854, "step": 1491 }, { "epoch": 1.1090188305252726, "grad_norm": 0.44926419854164124, "learning_rate": 7.943474225737303e-06, "loss": 0.4565, "step": 1492 }, { "epoch": 1.1097621407333995, "grad_norm": 0.4446429908275604, "learning_rate": 7.939976121899286e-06, "loss": 0.462, "step": 1493 }, { "epoch": 1.1105054509415262, "grad_norm": 0.4739059507846832, "learning_rate": 7.936475817208334e-06, "loss": 0.4437, "step": 1494 }, { "epoch": 1.111248761149653, "grad_norm": 0.4290979206562042, "learning_rate": 7.932973314284754e-06, "loss": 0.392, "step": 1495 }, { "epoch": 1.11199207135778, "grad_norm": 0.48193463683128357, "learning_rate": 7.92946861575051e-06, "loss": 0.4452, "step": 1496 }, { "epoch": 1.112735381565907, "grad_norm": 0.4432992935180664, "learning_rate": 7.925961724229199e-06, "loss": 0.4388, "step": 1497 }, { "epoch": 1.1134786917740338, "grad_norm": 0.47636693716049194, "learning_rate": 7.922452642346066e-06, "loss": 0.3983, "step": 1498 }, { "epoch": 1.1142220019821605, "grad_norm": 0.4737434387207031, "learning_rate": 7.918941372727995e-06, "loss": 0.4552, "step": 1499 }, { "epoch": 1.1149653121902874, "grad_norm": 0.43561455607414246, "learning_rate": 7.915427918003504e-06, "loss": 0.4204, "step": 1500 }, { "epoch": 1.1157086223984143, "grad_norm": 0.39559879899024963, "learning_rate": 7.911912280802753e-06, "loss": 0.3499, "step": 1501 }, { "epoch": 1.1164519326065412, "grad_norm": 0.5094627141952515, "learning_rate": 7.908394463757529e-06, "loss": 0.4599, "step": 1502 }, { "epoch": 1.117195242814668, "grad_norm": 0.4845057725906372, "learning_rate": 7.904874469501255e-06, "loss": 0.3988, "step": 1503 }, { "epoch": 1.1179385530227948, "grad_norm": 0.47989439964294434, "learning_rate": 7.901352300668987e-06, "loss": 0.3922, "step": 1504 }, { "epoch": 1.1186818632309217, "grad_norm": 0.4819275736808777, "learning_rate": 7.897827959897397e-06, "loss": 0.4039, "step": 1505 }, { "epoch": 1.1194251734390486, "grad_norm": 0.49351099133491516, "learning_rate": 7.894301449824796e-06, "loss": 0.4337, "step": 1506 }, { "epoch": 1.1201684836471755, "grad_norm": 0.426597535610199, "learning_rate": 7.890772773091114e-06, "loss": 0.3829, "step": 1507 }, { "epoch": 1.1209117938553024, "grad_norm": 0.41440436244010925, "learning_rate": 7.8872419323379e-06, "loss": 0.405, "step": 1508 }, { "epoch": 1.121655104063429, "grad_norm": 0.4914392828941345, "learning_rate": 7.883708930208324e-06, "loss": 0.4738, "step": 1509 }, { "epoch": 1.122398414271556, "grad_norm": 0.43346962332725525, "learning_rate": 7.88017376934718e-06, "loss": 0.4191, "step": 1510 }, { "epoch": 1.1231417244796829, "grad_norm": 0.46064046025276184, "learning_rate": 7.876636452400872e-06, "loss": 0.4428, "step": 1511 }, { "epoch": 1.1238850346878098, "grad_norm": 0.5018149018287659, "learning_rate": 7.873096982017419e-06, "loss": 0.4026, "step": 1512 }, { "epoch": 1.1246283448959367, "grad_norm": 0.4671659469604492, "learning_rate": 7.869555360846453e-06, "loss": 0.3998, "step": 1513 }, { "epoch": 1.1253716551040633, "grad_norm": 0.39233914017677307, "learning_rate": 7.866011591539215e-06, "loss": 0.4174, "step": 1514 }, { "epoch": 1.1261149653121902, "grad_norm": 0.5263728499412537, "learning_rate": 7.862465676748556e-06, "loss": 0.435, "step": 1515 }, { "epoch": 1.1268582755203171, "grad_norm": 0.5160808563232422, "learning_rate": 7.858917619128931e-06, "loss": 0.3969, "step": 1516 }, { "epoch": 1.127601585728444, "grad_norm": 0.48587483167648315, "learning_rate": 7.855367421336401e-06, "loss": 0.4175, "step": 1517 }, { "epoch": 1.128344895936571, "grad_norm": 0.5019778609275818, "learning_rate": 7.851815086028628e-06, "loss": 0.4633, "step": 1518 }, { "epoch": 1.1290882061446976, "grad_norm": 0.4425671696662903, "learning_rate": 7.848260615864874e-06, "loss": 0.3809, "step": 1519 }, { "epoch": 1.1298315163528245, "grad_norm": 0.43146878480911255, "learning_rate": 7.844704013505997e-06, "loss": 0.3987, "step": 1520 }, { "epoch": 1.1305748265609514, "grad_norm": 0.5423685312271118, "learning_rate": 7.841145281614457e-06, "loss": 0.4778, "step": 1521 }, { "epoch": 1.1313181367690783, "grad_norm": 0.4428235590457916, "learning_rate": 7.837584422854303e-06, "loss": 0.4321, "step": 1522 }, { "epoch": 1.1320614469772052, "grad_norm": 0.48538869619369507, "learning_rate": 7.83402143989118e-06, "loss": 0.432, "step": 1523 }, { "epoch": 1.132804757185332, "grad_norm": 0.4153958857059479, "learning_rate": 7.830456335392316e-06, "loss": 0.3894, "step": 1524 }, { "epoch": 1.1335480673934588, "grad_norm": 0.47815579175949097, "learning_rate": 7.826889112026539e-06, "loss": 0.398, "step": 1525 }, { "epoch": 1.1342913776015857, "grad_norm": 0.48136502504348755, "learning_rate": 7.823319772464253e-06, "loss": 0.4658, "step": 1526 }, { "epoch": 1.1350346878097126, "grad_norm": 0.4485211670398712, "learning_rate": 7.819748319377448e-06, "loss": 0.4377, "step": 1527 }, { "epoch": 1.1357779980178395, "grad_norm": 0.49260276556015015, "learning_rate": 7.8161747554397e-06, "loss": 0.4381, "step": 1528 }, { "epoch": 1.1365213082259662, "grad_norm": 0.5833133459091187, "learning_rate": 7.812599083326162e-06, "loss": 0.4234, "step": 1529 }, { "epoch": 1.137264618434093, "grad_norm": 0.470417320728302, "learning_rate": 7.809021305713567e-06, "loss": 0.4257, "step": 1530 }, { "epoch": 1.13800792864222, "grad_norm": 0.46740663051605225, "learning_rate": 7.805441425280225e-06, "loss": 0.4372, "step": 1531 }, { "epoch": 1.138751238850347, "grad_norm": 0.3878171443939209, "learning_rate": 7.801859444706013e-06, "loss": 0.3637, "step": 1532 }, { "epoch": 1.1394945490584738, "grad_norm": 0.5232115983963013, "learning_rate": 7.79827536667239e-06, "loss": 0.4275, "step": 1533 }, { "epoch": 1.1402378592666005, "grad_norm": 0.44349950551986694, "learning_rate": 7.794689193862379e-06, "loss": 0.4221, "step": 1534 }, { "epoch": 1.1409811694747274, "grad_norm": 0.4091913104057312, "learning_rate": 7.791100928960573e-06, "loss": 0.4373, "step": 1535 }, { "epoch": 1.1417244796828543, "grad_norm": 0.4578135311603546, "learning_rate": 7.787510574653132e-06, "loss": 0.3969, "step": 1536 }, { "epoch": 1.1424677898909812, "grad_norm": 0.4802708327770233, "learning_rate": 7.78391813362778e-06, "loss": 0.4669, "step": 1537 }, { "epoch": 1.143211100099108, "grad_norm": 0.41274282336235046, "learning_rate": 7.7803236085738e-06, "loss": 0.3943, "step": 1538 }, { "epoch": 1.1439544103072348, "grad_norm": 0.41281116008758545, "learning_rate": 7.776727002182037e-06, "loss": 0.4062, "step": 1539 }, { "epoch": 1.1446977205153617, "grad_norm": 0.5124258399009705, "learning_rate": 7.773128317144895e-06, "loss": 0.4199, "step": 1540 }, { "epoch": 1.1454410307234886, "grad_norm": 0.47686606645584106, "learning_rate": 7.769527556156336e-06, "loss": 0.4289, "step": 1541 }, { "epoch": 1.1461843409316155, "grad_norm": 0.47101709246635437, "learning_rate": 7.76592472191187e-06, "loss": 0.4451, "step": 1542 }, { "epoch": 1.1469276511397424, "grad_norm": 0.454825758934021, "learning_rate": 7.762319817108566e-06, "loss": 0.3978, "step": 1543 }, { "epoch": 1.147670961347869, "grad_norm": 0.4426186680793762, "learning_rate": 7.758712844445035e-06, "loss": 0.4334, "step": 1544 }, { "epoch": 1.148414271555996, "grad_norm": 0.4025944173336029, "learning_rate": 7.755103806621445e-06, "loss": 0.3656, "step": 1545 }, { "epoch": 1.1491575817641229, "grad_norm": 0.4712747037410736, "learning_rate": 7.751492706339502e-06, "loss": 0.4499, "step": 1546 }, { "epoch": 1.1499008919722498, "grad_norm": 0.5514136552810669, "learning_rate": 7.747879546302463e-06, "loss": 0.4257, "step": 1547 }, { "epoch": 1.1506442021803767, "grad_norm": 0.3884260058403015, "learning_rate": 7.744264329215121e-06, "loss": 0.4088, "step": 1548 }, { "epoch": 1.1513875123885036, "grad_norm": 0.5369802117347717, "learning_rate": 7.740647057783812e-06, "loss": 0.4044, "step": 1549 }, { "epoch": 1.1521308225966302, "grad_norm": 0.4209352135658264, "learning_rate": 7.737027734716408e-06, "loss": 0.4159, "step": 1550 }, { "epoch": 1.1528741328047571, "grad_norm": 0.5335579514503479, "learning_rate": 7.733406362722321e-06, "loss": 0.4755, "step": 1551 }, { "epoch": 1.153617443012884, "grad_norm": 0.4724455773830414, "learning_rate": 7.729782944512491e-06, "loss": 0.3956, "step": 1552 }, { "epoch": 1.154360753221011, "grad_norm": 0.443497896194458, "learning_rate": 7.726157482799393e-06, "loss": 0.4177, "step": 1553 }, { "epoch": 1.1551040634291379, "grad_norm": 0.46354684233665466, "learning_rate": 7.722529980297033e-06, "loss": 0.4104, "step": 1554 }, { "epoch": 1.1558473736372645, "grad_norm": 0.5023008584976196, "learning_rate": 7.718900439720942e-06, "loss": 0.42, "step": 1555 }, { "epoch": 1.1565906838453914, "grad_norm": 0.4276632070541382, "learning_rate": 7.715268863788177e-06, "loss": 0.3748, "step": 1556 }, { "epoch": 1.1573339940535183, "grad_norm": 0.43845516443252563, "learning_rate": 7.71163525521732e-06, "loss": 0.3881, "step": 1557 }, { "epoch": 1.1580773042616452, "grad_norm": 0.5466856360435486, "learning_rate": 7.707999616728475e-06, "loss": 0.4605, "step": 1558 }, { "epoch": 1.1588206144697721, "grad_norm": 0.4921121299266815, "learning_rate": 7.704361951043264e-06, "loss": 0.4133, "step": 1559 }, { "epoch": 1.1595639246778988, "grad_norm": 0.5178412199020386, "learning_rate": 7.700722260884827e-06, "loss": 0.4374, "step": 1560 }, { "epoch": 1.1603072348860257, "grad_norm": 0.5638749003410339, "learning_rate": 7.69708054897782e-06, "loss": 0.4511, "step": 1561 }, { "epoch": 1.1610505450941526, "grad_norm": 0.44455310702323914, "learning_rate": 7.693436818048412e-06, "loss": 0.3957, "step": 1562 }, { "epoch": 1.1617938553022795, "grad_norm": 0.518449604511261, "learning_rate": 7.689791070824283e-06, "loss": 0.4379, "step": 1563 }, { "epoch": 1.1625371655104064, "grad_norm": 0.5956539511680603, "learning_rate": 7.686143310034623e-06, "loss": 0.5043, "step": 1564 }, { "epoch": 1.1632804757185333, "grad_norm": 0.47566136717796326, "learning_rate": 7.682493538410134e-06, "loss": 0.4178, "step": 1565 }, { "epoch": 1.16402378592666, "grad_norm": 0.6306418776512146, "learning_rate": 7.678841758683012e-06, "loss": 0.4353, "step": 1566 }, { "epoch": 1.164767096134787, "grad_norm": 0.4228046238422394, "learning_rate": 7.675187973586966e-06, "loss": 0.4076, "step": 1567 }, { "epoch": 1.1655104063429138, "grad_norm": 0.5097529888153076, "learning_rate": 7.671532185857203e-06, "loss": 0.4185, "step": 1568 }, { "epoch": 1.1662537165510407, "grad_norm": 0.48076409101486206, "learning_rate": 7.66787439823043e-06, "loss": 0.438, "step": 1569 }, { "epoch": 1.1669970267591676, "grad_norm": 0.5065842270851135, "learning_rate": 7.664214613444849e-06, "loss": 0.4162, "step": 1570 }, { "epoch": 1.1677403369672943, "grad_norm": 0.5576058626174927, "learning_rate": 7.660552834240157e-06, "loss": 0.4523, "step": 1571 }, { "epoch": 1.1684836471754212, "grad_norm": 0.3772413730621338, "learning_rate": 7.65688906335755e-06, "loss": 0.3806, "step": 1572 }, { "epoch": 1.169226957383548, "grad_norm": 0.5111067891120911, "learning_rate": 7.653223303539708e-06, "loss": 0.4434, "step": 1573 }, { "epoch": 1.169970267591675, "grad_norm": 0.567559540271759, "learning_rate": 7.649555557530802e-06, "loss": 0.4345, "step": 1574 }, { "epoch": 1.170713577799802, "grad_norm": 0.46122029423713684, "learning_rate": 7.64588582807649e-06, "loss": 0.3828, "step": 1575 }, { "epoch": 1.1714568880079286, "grad_norm": 0.5663712620735168, "learning_rate": 7.642214117923916e-06, "loss": 0.4331, "step": 1576 }, { "epoch": 1.1722001982160555, "grad_norm": 0.49965253472328186, "learning_rate": 7.638540429821705e-06, "loss": 0.4611, "step": 1577 }, { "epoch": 1.1729435084241824, "grad_norm": 0.5218018293380737, "learning_rate": 7.634864766519964e-06, "loss": 0.3489, "step": 1578 }, { "epoch": 1.1736868186323093, "grad_norm": 0.7232185006141663, "learning_rate": 7.631187130770276e-06, "loss": 0.4595, "step": 1579 }, { "epoch": 1.1744301288404362, "grad_norm": 0.45076096057891846, "learning_rate": 7.627507525325708e-06, "loss": 0.3926, "step": 1580 }, { "epoch": 1.1751734390485629, "grad_norm": 0.7499317526817322, "learning_rate": 7.623825952940791e-06, "loss": 0.435, "step": 1581 }, { "epoch": 1.1759167492566898, "grad_norm": 0.48166128993034363, "learning_rate": 7.620142416371534e-06, "loss": 0.3927, "step": 1582 }, { "epoch": 1.1766600594648167, "grad_norm": 0.566220760345459, "learning_rate": 7.6164569183754165e-06, "loss": 0.4317, "step": 1583 }, { "epoch": 1.1774033696729436, "grad_norm": 0.6157498359680176, "learning_rate": 7.612769461711389e-06, "loss": 0.4006, "step": 1584 }, { "epoch": 1.1781466798810705, "grad_norm": 0.43571949005126953, "learning_rate": 7.609080049139861e-06, "loss": 0.438, "step": 1585 }, { "epoch": 1.1788899900891971, "grad_norm": 0.4864311218261719, "learning_rate": 7.605388683422711e-06, "loss": 0.397, "step": 1586 }, { "epoch": 1.179633300297324, "grad_norm": 0.49223315715789795, "learning_rate": 7.6016953673232805e-06, "loss": 0.4054, "step": 1587 }, { "epoch": 1.180376610505451, "grad_norm": 0.48922744393348694, "learning_rate": 7.598000103606369e-06, "loss": 0.4536, "step": 1588 }, { "epoch": 1.1811199207135779, "grad_norm": 0.5033111572265625, "learning_rate": 7.594302895038231e-06, "loss": 0.4248, "step": 1589 }, { "epoch": 1.1818632309217048, "grad_norm": 0.5250124335289001, "learning_rate": 7.590603744386584e-06, "loss": 0.4024, "step": 1590 }, { "epoch": 1.1826065411298314, "grad_norm": 0.5092789530754089, "learning_rate": 7.586902654420596e-06, "loss": 0.4374, "step": 1591 }, { "epoch": 1.1833498513379583, "grad_norm": 0.4744127690792084, "learning_rate": 7.583199627910882e-06, "loss": 0.4319, "step": 1592 }, { "epoch": 1.1840931615460852, "grad_norm": 0.5051032900810242, "learning_rate": 7.579494667629516e-06, "loss": 0.4052, "step": 1593 }, { "epoch": 1.1848364717542121, "grad_norm": 0.529804527759552, "learning_rate": 7.575787776350012e-06, "loss": 0.404, "step": 1594 }, { "epoch": 1.185579781962339, "grad_norm": 0.4685680866241455, "learning_rate": 7.572078956847332e-06, "loss": 0.449, "step": 1595 }, { "epoch": 1.1863230921704657, "grad_norm": 0.43893980979919434, "learning_rate": 7.568368211897883e-06, "loss": 0.4288, "step": 1596 }, { "epoch": 1.1870664023785926, "grad_norm": 0.4060453474521637, "learning_rate": 7.56465554427951e-06, "loss": 0.4032, "step": 1597 }, { "epoch": 1.1878097125867195, "grad_norm": 0.4465078115463257, "learning_rate": 7.560940956771502e-06, "loss": 0.3963, "step": 1598 }, { "epoch": 1.1885530227948464, "grad_norm": 0.44518935680389404, "learning_rate": 7.557224452154581e-06, "loss": 0.4459, "step": 1599 }, { "epoch": 1.1892963330029733, "grad_norm": 0.4477979242801666, "learning_rate": 7.553506033210904e-06, "loss": 0.4257, "step": 1600 }, { "epoch": 1.1900396432111, "grad_norm": 0.4096951484680176, "learning_rate": 7.549785702724068e-06, "loss": 0.3831, "step": 1601 }, { "epoch": 1.190782953419227, "grad_norm": 0.43308043479919434, "learning_rate": 7.54606346347909e-06, "loss": 0.4267, "step": 1602 }, { "epoch": 1.1915262636273538, "grad_norm": 0.4346524477005005, "learning_rate": 7.542339318262424e-06, "loss": 0.4187, "step": 1603 }, { "epoch": 1.1922695738354807, "grad_norm": 0.4351028501987457, "learning_rate": 7.53861326986195e-06, "loss": 0.3883, "step": 1604 }, { "epoch": 1.1930128840436076, "grad_norm": 0.4379247725009918, "learning_rate": 7.534885321066967e-06, "loss": 0.4526, "step": 1605 }, { "epoch": 1.1937561942517343, "grad_norm": 0.4008922576904297, "learning_rate": 7.531155474668204e-06, "loss": 0.3826, "step": 1606 }, { "epoch": 1.1944995044598612, "grad_norm": 0.43739989399909973, "learning_rate": 7.5274237334578075e-06, "loss": 0.4174, "step": 1607 }, { "epoch": 1.195242814667988, "grad_norm": 0.4363674819469452, "learning_rate": 7.5236901002293415e-06, "loss": 0.4315, "step": 1608 }, { "epoch": 1.195986124876115, "grad_norm": 0.41167163848876953, "learning_rate": 7.519954577777785e-06, "loss": 0.4156, "step": 1609 }, { "epoch": 1.196729435084242, "grad_norm": 0.46986114978790283, "learning_rate": 7.516217168899535e-06, "loss": 0.423, "step": 1610 }, { "epoch": 1.1974727452923686, "grad_norm": 0.4279041588306427, "learning_rate": 7.512477876392402e-06, "loss": 0.3919, "step": 1611 }, { "epoch": 1.1982160555004955, "grad_norm": 0.46141570806503296, "learning_rate": 7.508736703055599e-06, "loss": 0.3803, "step": 1612 }, { "epoch": 1.1989593657086224, "grad_norm": 0.48577046394348145, "learning_rate": 7.504993651689752e-06, "loss": 0.4126, "step": 1613 }, { "epoch": 1.1997026759167493, "grad_norm": 0.47686344385147095, "learning_rate": 7.501248725096897e-06, "loss": 0.4022, "step": 1614 }, { "epoch": 1.2004459861248762, "grad_norm": 0.45497384667396545, "learning_rate": 7.497501926080464e-06, "loss": 0.4832, "step": 1615 }, { "epoch": 1.2011892963330029, "grad_norm": 0.45185956358909607, "learning_rate": 7.493753257445292e-06, "loss": 0.4021, "step": 1616 }, { "epoch": 1.2019326065411298, "grad_norm": 0.520775556564331, "learning_rate": 7.490002721997618e-06, "loss": 0.4025, "step": 1617 }, { "epoch": 1.2026759167492567, "grad_norm": 0.5013459920883179, "learning_rate": 7.486250322545075e-06, "loss": 0.4772, "step": 1618 }, { "epoch": 1.2034192269573836, "grad_norm": 0.49351629614830017, "learning_rate": 7.482496061896693e-06, "loss": 0.4029, "step": 1619 }, { "epoch": 1.2041625371655105, "grad_norm": 0.5818110704421997, "learning_rate": 7.478739942862895e-06, "loss": 0.4216, "step": 1620 }, { "epoch": 1.2049058473736372, "grad_norm": 0.4631431996822357, "learning_rate": 7.474981968255493e-06, "loss": 0.4528, "step": 1621 }, { "epoch": 1.205649157581764, "grad_norm": 0.5481220483779907, "learning_rate": 7.471222140887693e-06, "loss": 0.4006, "step": 1622 }, { "epoch": 1.206392467789891, "grad_norm": 0.5382815003395081, "learning_rate": 7.467460463574084e-06, "loss": 0.3877, "step": 1623 }, { "epoch": 1.2071357779980179, "grad_norm": 0.4140545427799225, "learning_rate": 7.463696939130638e-06, "loss": 0.4143, "step": 1624 }, { "epoch": 1.2078790882061448, "grad_norm": 0.5805398225784302, "learning_rate": 7.459931570374717e-06, "loss": 0.4434, "step": 1625 }, { "epoch": 1.2086223984142714, "grad_norm": 0.5023841261863708, "learning_rate": 7.4561643601250544e-06, "loss": 0.4114, "step": 1626 }, { "epoch": 1.2093657086223983, "grad_norm": 0.41420823335647583, "learning_rate": 7.45239531120177e-06, "loss": 0.3858, "step": 1627 }, { "epoch": 1.2101090188305252, "grad_norm": 0.5077630877494812, "learning_rate": 7.448624426426356e-06, "loss": 0.4305, "step": 1628 }, { "epoch": 1.2108523290386521, "grad_norm": 0.4641924798488617, "learning_rate": 7.444851708621681e-06, "loss": 0.4094, "step": 1629 }, { "epoch": 1.211595639246779, "grad_norm": 0.5219717621803284, "learning_rate": 7.441077160611982e-06, "loss": 0.4575, "step": 1630 }, { "epoch": 1.2123389494549057, "grad_norm": 0.4740281403064728, "learning_rate": 7.437300785222873e-06, "loss": 0.4271, "step": 1631 }, { "epoch": 1.2130822596630326, "grad_norm": 0.44335100054740906, "learning_rate": 7.433522585281326e-06, "loss": 0.3949, "step": 1632 }, { "epoch": 1.2138255698711595, "grad_norm": 0.5357722043991089, "learning_rate": 7.429742563615689e-06, "loss": 0.4473, "step": 1633 }, { "epoch": 1.2145688800792864, "grad_norm": 0.48270538449287415, "learning_rate": 7.425960723055668e-06, "loss": 0.4525, "step": 1634 }, { "epoch": 1.2153121902874133, "grad_norm": 0.45047691464424133, "learning_rate": 7.422177066432332e-06, "loss": 0.4274, "step": 1635 }, { "epoch": 1.2160555004955402, "grad_norm": 0.4785202443599701, "learning_rate": 7.418391596578107e-06, "loss": 0.3826, "step": 1636 }, { "epoch": 1.216798810703667, "grad_norm": 0.4432569742202759, "learning_rate": 7.414604316326784e-06, "loss": 0.4524, "step": 1637 }, { "epoch": 1.2175421209117938, "grad_norm": 0.4819079041481018, "learning_rate": 7.4108152285134985e-06, "loss": 0.4272, "step": 1638 }, { "epoch": 1.2182854311199207, "grad_norm": 0.5113914608955383, "learning_rate": 7.407024335974747e-06, "loss": 0.4263, "step": 1639 }, { "epoch": 1.2190287413280476, "grad_norm": 0.45190224051475525, "learning_rate": 7.403231641548374e-06, "loss": 0.4313, "step": 1640 }, { "epoch": 1.2197720515361745, "grad_norm": 0.41534554958343506, "learning_rate": 7.399437148073576e-06, "loss": 0.3843, "step": 1641 }, { "epoch": 1.2205153617443012, "grad_norm": 0.43253183364868164, "learning_rate": 7.395640858390891e-06, "loss": 0.4235, "step": 1642 }, { "epoch": 1.221258671952428, "grad_norm": 0.4972347021102905, "learning_rate": 7.391842775342206e-06, "loss": 0.3883, "step": 1643 }, { "epoch": 1.222001982160555, "grad_norm": 0.5595928430557251, "learning_rate": 7.388042901770747e-06, "loss": 0.4858, "step": 1644 }, { "epoch": 1.222745292368682, "grad_norm": 0.4145627021789551, "learning_rate": 7.384241240521085e-06, "loss": 0.3415, "step": 1645 }, { "epoch": 1.2234886025768088, "grad_norm": 0.5362351536750793, "learning_rate": 7.380437794439124e-06, "loss": 0.4048, "step": 1646 }, { "epoch": 1.2242319127849355, "grad_norm": 0.4570310115814209, "learning_rate": 7.376632566372108e-06, "loss": 0.3886, "step": 1647 }, { "epoch": 1.2249752229930624, "grad_norm": 0.5231766700744629, "learning_rate": 7.372825559168615e-06, "loss": 0.4756, "step": 1648 }, { "epoch": 1.2257185332011893, "grad_norm": 0.5085083842277527, "learning_rate": 7.36901677567855e-06, "loss": 0.4131, "step": 1649 }, { "epoch": 1.2264618434093162, "grad_norm": 0.4178345203399658, "learning_rate": 7.3652062187531534e-06, "loss": 0.3866, "step": 1650 }, { "epoch": 1.227205153617443, "grad_norm": 0.5787187218666077, "learning_rate": 7.361393891244992e-06, "loss": 0.4322, "step": 1651 }, { "epoch": 1.22794846382557, "grad_norm": 0.49332624673843384, "learning_rate": 7.357579796007955e-06, "loss": 0.4144, "step": 1652 }, { "epoch": 1.2286917740336967, "grad_norm": 0.49500468373298645, "learning_rate": 7.353763935897259e-06, "loss": 0.429, "step": 1653 }, { "epoch": 1.2294350842418236, "grad_norm": 0.6007659435272217, "learning_rate": 7.349946313769439e-06, "loss": 0.4254, "step": 1654 }, { "epoch": 1.2301783944499505, "grad_norm": 0.46610432863235474, "learning_rate": 7.346126932482348e-06, "loss": 0.4323, "step": 1655 }, { "epoch": 1.2309217046580774, "grad_norm": 0.5249744057655334, "learning_rate": 7.342305794895159e-06, "loss": 0.4932, "step": 1656 }, { "epoch": 1.2316650148662043, "grad_norm": 0.42752963304519653, "learning_rate": 7.33848290386836e-06, "loss": 0.3928, "step": 1657 }, { "epoch": 1.232408325074331, "grad_norm": 0.4828933775424957, "learning_rate": 7.334658262263749e-06, "loss": 0.3815, "step": 1658 }, { "epoch": 1.2331516352824579, "grad_norm": 0.44464412331581116, "learning_rate": 7.330831872944433e-06, "loss": 0.4363, "step": 1659 }, { "epoch": 1.2338949454905848, "grad_norm": 0.42582762241363525, "learning_rate": 7.327003738774834e-06, "loss": 0.4502, "step": 1660 }, { "epoch": 1.2346382556987117, "grad_norm": 0.48937031626701355, "learning_rate": 7.323173862620675e-06, "loss": 0.4138, "step": 1661 }, { "epoch": 1.2353815659068386, "grad_norm": 0.45850881934165955, "learning_rate": 7.319342247348982e-06, "loss": 0.3782, "step": 1662 }, { "epoch": 1.2361248761149652, "grad_norm": 0.4777900278568268, "learning_rate": 7.315508895828085e-06, "loss": 0.4296, "step": 1663 }, { "epoch": 1.2368681863230921, "grad_norm": 0.4582613706588745, "learning_rate": 7.3116738109276174e-06, "loss": 0.3751, "step": 1664 }, { "epoch": 1.237611496531219, "grad_norm": 0.5451620817184448, "learning_rate": 7.307836995518504e-06, "loss": 0.4969, "step": 1665 }, { "epoch": 1.238354806739346, "grad_norm": 0.49423128366470337, "learning_rate": 7.303998452472967e-06, "loss": 0.4108, "step": 1666 }, { "epoch": 1.2390981169474728, "grad_norm": 0.44766509532928467, "learning_rate": 7.300158184664523e-06, "loss": 0.432, "step": 1667 }, { "epoch": 1.2398414271555995, "grad_norm": 0.46275150775909424, "learning_rate": 7.296316194967981e-06, "loss": 0.4218, "step": 1668 }, { "epoch": 1.2405847373637264, "grad_norm": 0.5833042860031128, "learning_rate": 7.292472486259434e-06, "loss": 0.4347, "step": 1669 }, { "epoch": 1.2413280475718533, "grad_norm": 0.5436184406280518, "learning_rate": 7.288627061416269e-06, "loss": 0.4681, "step": 1670 }, { "epoch": 1.2420713577799802, "grad_norm": 0.44607028365135193, "learning_rate": 7.28477992331715e-06, "loss": 0.3776, "step": 1671 }, { "epoch": 1.2428146679881071, "grad_norm": 0.5187134742736816, "learning_rate": 7.28093107484203e-06, "loss": 0.421, "step": 1672 }, { "epoch": 1.2435579781962338, "grad_norm": 0.5457763671875, "learning_rate": 7.277080518872138e-06, "loss": 0.4403, "step": 1673 }, { "epoch": 1.2443012884043607, "grad_norm": 0.4562883675098419, "learning_rate": 7.273228258289986e-06, "loss": 0.4411, "step": 1674 }, { "epoch": 1.2450445986124876, "grad_norm": 0.46269628405570984, "learning_rate": 7.269374295979355e-06, "loss": 0.4308, "step": 1675 }, { "epoch": 1.2457879088206145, "grad_norm": 0.5433219075202942, "learning_rate": 7.265518634825305e-06, "loss": 0.4464, "step": 1676 }, { "epoch": 1.2465312190287414, "grad_norm": 0.5261064767837524, "learning_rate": 7.2616612777141685e-06, "loss": 0.4199, "step": 1677 }, { "epoch": 1.247274529236868, "grad_norm": 0.4050852060317993, "learning_rate": 7.257802227533543e-06, "loss": 0.3524, "step": 1678 }, { "epoch": 1.248017839444995, "grad_norm": 0.45647957921028137, "learning_rate": 7.253941487172298e-06, "loss": 0.4701, "step": 1679 }, { "epoch": 1.248761149653122, "grad_norm": 0.5021121501922607, "learning_rate": 7.250079059520566e-06, "loss": 0.4577, "step": 1680 }, { "epoch": 1.2495044598612488, "grad_norm": 0.4434012472629547, "learning_rate": 7.246214947469745e-06, "loss": 0.4147, "step": 1681 }, { "epoch": 1.2502477700693757, "grad_norm": 0.4954187870025635, "learning_rate": 7.2423491539124866e-06, "loss": 0.4074, "step": 1682 }, { "epoch": 1.2509910802775024, "grad_norm": 0.4127700626850128, "learning_rate": 7.2384816817427125e-06, "loss": 0.3605, "step": 1683 }, { "epoch": 1.2517343904856293, "grad_norm": 0.445029616355896, "learning_rate": 7.234612533855593e-06, "loss": 0.3749, "step": 1684 }, { "epoch": 1.2524777006937562, "grad_norm": 0.537829577922821, "learning_rate": 7.230741713147553e-06, "loss": 0.4749, "step": 1685 }, { "epoch": 1.253221010901883, "grad_norm": 0.5274674892425537, "learning_rate": 7.226869222516271e-06, "loss": 0.4935, "step": 1686 }, { "epoch": 1.25396432111001, "grad_norm": 0.4154290556907654, "learning_rate": 7.222995064860682e-06, "loss": 0.4031, "step": 1687 }, { "epoch": 1.2547076313181367, "grad_norm": 0.4790441691875458, "learning_rate": 7.2191192430809565e-06, "loss": 0.3851, "step": 1688 }, { "epoch": 1.2554509415262636, "grad_norm": 0.556878387928009, "learning_rate": 7.21524176007852e-06, "loss": 0.4306, "step": 1689 }, { "epoch": 1.2561942517343905, "grad_norm": 0.4063442647457123, "learning_rate": 7.2113626187560386e-06, "loss": 0.3971, "step": 1690 }, { "epoch": 1.2569375619425174, "grad_norm": 0.5563206672668457, "learning_rate": 7.2074818220174204e-06, "loss": 0.4261, "step": 1691 }, { "epoch": 1.2576808721506443, "grad_norm": 0.5149734020233154, "learning_rate": 7.2035993727678114e-06, "loss": 0.426, "step": 1692 }, { "epoch": 1.258424182358771, "grad_norm": 0.46129775047302246, "learning_rate": 7.199715273913597e-06, "loss": 0.4091, "step": 1693 }, { "epoch": 1.2591674925668979, "grad_norm": 0.4400854706764221, "learning_rate": 7.195829528362396e-06, "loss": 0.4361, "step": 1694 }, { "epoch": 1.2599108027750248, "grad_norm": 0.44859445095062256, "learning_rate": 7.191942139023059e-06, "loss": 0.4053, "step": 1695 }, { "epoch": 1.2606541129831517, "grad_norm": 0.5355749130249023, "learning_rate": 7.188053108805669e-06, "loss": 0.4694, "step": 1696 }, { "epoch": 1.2613974231912786, "grad_norm": 0.46287932991981506, "learning_rate": 7.184162440621538e-06, "loss": 0.4346, "step": 1697 }, { "epoch": 1.2621407333994052, "grad_norm": 0.4262552559375763, "learning_rate": 7.1802701373831995e-06, "loss": 0.3898, "step": 1698 }, { "epoch": 1.2628840436075321, "grad_norm": 0.4965451955795288, "learning_rate": 7.176376202004417e-06, "loss": 0.4546, "step": 1699 }, { "epoch": 1.263627353815659, "grad_norm": 0.5016241669654846, "learning_rate": 7.172480637400173e-06, "loss": 0.3854, "step": 1700 }, { "epoch": 1.264370664023786, "grad_norm": 0.5179436206817627, "learning_rate": 7.168583446486668e-06, "loss": 0.485, "step": 1701 }, { "epoch": 1.2651139742319129, "grad_norm": 0.5148916840553284, "learning_rate": 7.1646846321813205e-06, "loss": 0.3911, "step": 1702 }, { "epoch": 1.2658572844400395, "grad_norm": 0.5256156325340271, "learning_rate": 7.160784197402768e-06, "loss": 0.404, "step": 1703 }, { "epoch": 1.2666005946481664, "grad_norm": 0.4914431571960449, "learning_rate": 7.15688214507086e-06, "loss": 0.4347, "step": 1704 }, { "epoch": 1.2673439048562933, "grad_norm": 0.43560120463371277, "learning_rate": 7.152978478106652e-06, "loss": 0.3779, "step": 1705 }, { "epoch": 1.2680872150644202, "grad_norm": 0.5557543039321899, "learning_rate": 7.149073199432412e-06, "loss": 0.4723, "step": 1706 }, { "epoch": 1.2688305252725471, "grad_norm": 0.4670959413051605, "learning_rate": 7.145166311971616e-06, "loss": 0.4356, "step": 1707 }, { "epoch": 1.2695738354806738, "grad_norm": 0.4587242305278778, "learning_rate": 7.141257818648941e-06, "loss": 0.3957, "step": 1708 }, { "epoch": 1.2703171456888007, "grad_norm": 0.4451809227466583, "learning_rate": 7.1373477223902666e-06, "loss": 0.431, "step": 1709 }, { "epoch": 1.2710604558969276, "grad_norm": 0.47745558619499207, "learning_rate": 7.133436026122675e-06, "loss": 0.4742, "step": 1710 }, { "epoch": 1.2718037661050545, "grad_norm": 0.40389153361320496, "learning_rate": 7.129522732774446e-06, "loss": 0.3917, "step": 1711 }, { "epoch": 1.2725470763131814, "grad_norm": 0.4864920675754547, "learning_rate": 7.125607845275049e-06, "loss": 0.4105, "step": 1712 }, { "epoch": 1.273290386521308, "grad_norm": 0.5160154700279236, "learning_rate": 7.121691366555155e-06, "loss": 0.4321, "step": 1713 }, { "epoch": 1.274033696729435, "grad_norm": 0.47538453340530396, "learning_rate": 7.1177732995466205e-06, "loss": 0.4418, "step": 1714 }, { "epoch": 1.274777006937562, "grad_norm": 0.4405979514122009, "learning_rate": 7.113853647182494e-06, "loss": 0.4417, "step": 1715 }, { "epoch": 1.2755203171456888, "grad_norm": 0.5541965365409851, "learning_rate": 7.109932412397009e-06, "loss": 0.4079, "step": 1716 }, { "epoch": 1.2762636273538157, "grad_norm": 0.4809340536594391, "learning_rate": 7.106009598125585e-06, "loss": 0.4259, "step": 1717 }, { "epoch": 1.2770069375619424, "grad_norm": 0.44729137420654297, "learning_rate": 7.102085207304822e-06, "loss": 0.4002, "step": 1718 }, { "epoch": 1.2777502477700695, "grad_norm": 0.47616174817085266, "learning_rate": 7.098159242872502e-06, "loss": 0.4031, "step": 1719 }, { "epoch": 1.2784935579781962, "grad_norm": 0.4468039274215698, "learning_rate": 7.094231707767585e-06, "loss": 0.4435, "step": 1720 }, { "epoch": 1.279236868186323, "grad_norm": 0.44620102643966675, "learning_rate": 7.090302604930203e-06, "loss": 0.4226, "step": 1721 }, { "epoch": 1.27998017839445, "grad_norm": 0.4534017741680145, "learning_rate": 7.0863719373016706e-06, "loss": 0.4655, "step": 1722 }, { "epoch": 1.2807234886025767, "grad_norm": 0.38617628812789917, "learning_rate": 7.082439707824462e-06, "loss": 0.3907, "step": 1723 }, { "epoch": 1.2814667988107038, "grad_norm": 0.445035457611084, "learning_rate": 7.078505919442231e-06, "loss": 0.4538, "step": 1724 }, { "epoch": 1.2822101090188305, "grad_norm": 0.46405166387557983, "learning_rate": 7.07457057509979e-06, "loss": 0.4158, "step": 1725 }, { "epoch": 1.2829534192269574, "grad_norm": 0.4499397873878479, "learning_rate": 7.070633677743124e-06, "loss": 0.4286, "step": 1726 }, { "epoch": 1.2836967294350843, "grad_norm": 0.4319382309913635, "learning_rate": 7.066695230319376e-06, "loss": 0.3699, "step": 1727 }, { "epoch": 1.284440039643211, "grad_norm": 0.42349135875701904, "learning_rate": 7.062755235776848e-06, "loss": 0.3912, "step": 1728 }, { "epoch": 1.285183349851338, "grad_norm": 0.42229217290878296, "learning_rate": 7.058813697065001e-06, "loss": 0.4243, "step": 1729 }, { "epoch": 1.2859266600594648, "grad_norm": 0.3717700242996216, "learning_rate": 7.0548706171344585e-06, "loss": 0.4298, "step": 1730 }, { "epoch": 1.2866699702675917, "grad_norm": 0.430707186460495, "learning_rate": 7.050925998936988e-06, "loss": 0.4714, "step": 1731 }, { "epoch": 1.2874132804757186, "grad_norm": 0.39822953939437866, "learning_rate": 7.046979845425513e-06, "loss": 0.3972, "step": 1732 }, { "epoch": 1.2881565906838455, "grad_norm": 0.4186343252658844, "learning_rate": 7.043032159554111e-06, "loss": 0.4207, "step": 1733 }, { "epoch": 1.2888999008919724, "grad_norm": 0.4482424259185791, "learning_rate": 7.039082944277997e-06, "loss": 0.4138, "step": 1734 }, { "epoch": 1.289643211100099, "grad_norm": 0.46234074234962463, "learning_rate": 7.035132202553537e-06, "loss": 0.4685, "step": 1735 }, { "epoch": 1.290386521308226, "grad_norm": 0.39252355694770813, "learning_rate": 7.03117993733824e-06, "loss": 0.4173, "step": 1736 }, { "epoch": 1.2911298315163529, "grad_norm": 0.4255271553993225, "learning_rate": 7.027226151590751e-06, "loss": 0.4169, "step": 1737 }, { "epoch": 1.2918731417244798, "grad_norm": 0.4422372579574585, "learning_rate": 7.02327084827086e-06, "loss": 0.433, "step": 1738 }, { "epoch": 1.2926164519326067, "grad_norm": 0.46733400225639343, "learning_rate": 7.019314030339488e-06, "loss": 0.4718, "step": 1739 }, { "epoch": 1.2933597621407333, "grad_norm": 0.48382893204689026, "learning_rate": 7.01535570075869e-06, "loss": 0.4491, "step": 1740 }, { "epoch": 1.2941030723488602, "grad_norm": 0.453948050737381, "learning_rate": 7.011395862491654e-06, "loss": 0.3789, "step": 1741 }, { "epoch": 1.2948463825569871, "grad_norm": 0.45614978671073914, "learning_rate": 7.007434518502698e-06, "loss": 0.4394, "step": 1742 }, { "epoch": 1.295589692765114, "grad_norm": 0.39718538522720337, "learning_rate": 7.003471671757267e-06, "loss": 0.4086, "step": 1743 }, { "epoch": 1.296333002973241, "grad_norm": 0.45345285534858704, "learning_rate": 6.999507325221928e-06, "loss": 0.4577, "step": 1744 }, { "epoch": 1.2970763131813676, "grad_norm": 0.4486315846443176, "learning_rate": 6.995541481864375e-06, "loss": 0.4181, "step": 1745 }, { "epoch": 1.2978196233894945, "grad_norm": 0.3996640145778656, "learning_rate": 6.99157414465342e-06, "loss": 0.3609, "step": 1746 }, { "epoch": 1.2985629335976214, "grad_norm": 0.5057870149612427, "learning_rate": 6.987605316558995e-06, "loss": 0.4398, "step": 1747 }, { "epoch": 1.2993062438057483, "grad_norm": 0.41990792751312256, "learning_rate": 6.983635000552144e-06, "loss": 0.4395, "step": 1748 }, { "epoch": 1.3000495540138752, "grad_norm": 0.3740553557872772, "learning_rate": 6.979663199605029e-06, "loss": 0.3781, "step": 1749 }, { "epoch": 1.300792864222002, "grad_norm": 0.5275457501411438, "learning_rate": 6.975689916690925e-06, "loss": 0.4711, "step": 1750 }, { "epoch": 1.3015361744301288, "grad_norm": 0.41287529468536377, "learning_rate": 6.971715154784211e-06, "loss": 0.4198, "step": 1751 }, { "epoch": 1.3022794846382557, "grad_norm": 0.48055392503738403, "learning_rate": 6.967738916860374e-06, "loss": 0.4395, "step": 1752 }, { "epoch": 1.3030227948463826, "grad_norm": 0.5216841697692871, "learning_rate": 6.963761205896013e-06, "loss": 0.391, "step": 1753 }, { "epoch": 1.3037661050545095, "grad_norm": 0.4064953923225403, "learning_rate": 6.959782024868822e-06, "loss": 0.401, "step": 1754 }, { "epoch": 1.3045094152626362, "grad_norm": 0.5900090932846069, "learning_rate": 6.955801376757596e-06, "loss": 0.4462, "step": 1755 }, { "epoch": 1.305252725470763, "grad_norm": 0.4536752998828888, "learning_rate": 6.951819264542235e-06, "loss": 0.4073, "step": 1756 }, { "epoch": 1.30599603567889, "grad_norm": 0.44217249751091003, "learning_rate": 6.947835691203725e-06, "loss": 0.4088, "step": 1757 }, { "epoch": 1.306739345887017, "grad_norm": 0.5840710997581482, "learning_rate": 6.943850659724155e-06, "loss": 0.4737, "step": 1758 }, { "epoch": 1.3074826560951438, "grad_norm": 0.40752413868904114, "learning_rate": 6.939864173086699e-06, "loss": 0.3856, "step": 1759 }, { "epoch": 1.3082259663032705, "grad_norm": 0.4931902587413788, "learning_rate": 6.935876234275627e-06, "loss": 0.4007, "step": 1760 }, { "epoch": 1.3089692765113974, "grad_norm": 0.5690209269523621, "learning_rate": 6.931886846276289e-06, "loss": 0.4275, "step": 1761 }, { "epoch": 1.3097125867195243, "grad_norm": 0.39357423782348633, "learning_rate": 6.927896012075124e-06, "loss": 0.4001, "step": 1762 }, { "epoch": 1.3104558969276512, "grad_norm": 0.5477890372276306, "learning_rate": 6.9239037346596516e-06, "loss": 0.4259, "step": 1763 }, { "epoch": 1.311199207135778, "grad_norm": 0.4412565529346466, "learning_rate": 6.919910017018472e-06, "loss": 0.394, "step": 1764 }, { "epoch": 1.3119425173439048, "grad_norm": 0.44399118423461914, "learning_rate": 6.915914862141267e-06, "loss": 0.4218, "step": 1765 }, { "epoch": 1.3126858275520317, "grad_norm": 0.5589495301246643, "learning_rate": 6.911918273018791e-06, "loss": 0.4148, "step": 1766 }, { "epoch": 1.3134291377601586, "grad_norm": 0.4724481403827667, "learning_rate": 6.90792025264287e-06, "loss": 0.4177, "step": 1767 }, { "epoch": 1.3141724479682855, "grad_norm": 0.42922085523605347, "learning_rate": 6.9039208040064075e-06, "loss": 0.4303, "step": 1768 }, { "epoch": 1.3149157581764124, "grad_norm": 0.631611168384552, "learning_rate": 6.899919930103372e-06, "loss": 0.4277, "step": 1769 }, { "epoch": 1.315659068384539, "grad_norm": 0.3945373594760895, "learning_rate": 6.895917633928799e-06, "loss": 0.3881, "step": 1770 }, { "epoch": 1.316402378592666, "grad_norm": 0.44546666741371155, "learning_rate": 6.891913918478788e-06, "loss": 0.4213, "step": 1771 }, { "epoch": 1.3171456888007929, "grad_norm": 0.5438833236694336, "learning_rate": 6.887908786750504e-06, "loss": 0.4437, "step": 1772 }, { "epoch": 1.3178889990089198, "grad_norm": 0.4902245104312897, "learning_rate": 6.883902241742173e-06, "loss": 0.4956, "step": 1773 }, { "epoch": 1.3186323092170467, "grad_norm": 0.3686160743236542, "learning_rate": 6.879894286453073e-06, "loss": 0.408, "step": 1774 }, { "epoch": 1.3193756194251733, "grad_norm": 0.484271764755249, "learning_rate": 6.875884923883541e-06, "loss": 0.4625, "step": 1775 }, { "epoch": 1.3201189296333002, "grad_norm": 0.4144751727581024, "learning_rate": 6.871874157034973e-06, "loss": 0.4119, "step": 1776 }, { "epoch": 1.3208622398414271, "grad_norm": 0.500171959400177, "learning_rate": 6.867861988909805e-06, "loss": 0.4322, "step": 1777 }, { "epoch": 1.321605550049554, "grad_norm": 0.4355669915676117, "learning_rate": 6.863848422511531e-06, "loss": 0.4008, "step": 1778 }, { "epoch": 1.322348860257681, "grad_norm": 0.42593351006507874, "learning_rate": 6.859833460844688e-06, "loss": 0.4247, "step": 1779 }, { "epoch": 1.3230921704658076, "grad_norm": 0.45131915807724, "learning_rate": 6.8558171069148605e-06, "loss": 0.4363, "step": 1780 }, { "epoch": 1.3238354806739345, "grad_norm": 0.4344847798347473, "learning_rate": 6.851799363728669e-06, "loss": 0.3868, "step": 1781 }, { "epoch": 1.3245787908820614, "grad_norm": 0.42660826444625854, "learning_rate": 6.847780234293782e-06, "loss": 0.4212, "step": 1782 }, { "epoch": 1.3253221010901883, "grad_norm": 0.4391469359397888, "learning_rate": 6.8437597216189e-06, "loss": 0.4384, "step": 1783 }, { "epoch": 1.3260654112983152, "grad_norm": 0.4856177270412445, "learning_rate": 6.83973782871376e-06, "loss": 0.4148, "step": 1784 }, { "epoch": 1.326808721506442, "grad_norm": 0.4700339436531067, "learning_rate": 6.835714558589135e-06, "loss": 0.4199, "step": 1785 }, { "epoch": 1.3275520317145688, "grad_norm": 0.4325474798679352, "learning_rate": 6.8316899142568236e-06, "loss": 0.4223, "step": 1786 }, { "epoch": 1.3282953419226957, "grad_norm": 0.43795445561408997, "learning_rate": 6.8276638987296604e-06, "loss": 0.4131, "step": 1787 }, { "epoch": 1.3290386521308226, "grad_norm": 0.3726143538951874, "learning_rate": 6.823636515021501e-06, "loss": 0.4035, "step": 1788 }, { "epoch": 1.3297819623389495, "grad_norm": 0.47775915265083313, "learning_rate": 6.819607766147225e-06, "loss": 0.4875, "step": 1789 }, { "epoch": 1.3305252725470762, "grad_norm": 0.39970114827156067, "learning_rate": 6.815577655122738e-06, "loss": 0.3725, "step": 1790 }, { "epoch": 1.331268582755203, "grad_norm": 0.4415883421897888, "learning_rate": 6.811546184964963e-06, "loss": 0.4593, "step": 1791 }, { "epoch": 1.33201189296333, "grad_norm": 0.4013579785823822, "learning_rate": 6.80751335869184e-06, "loss": 0.4138, "step": 1792 }, { "epoch": 1.332755203171457, "grad_norm": 0.45532479882240295, "learning_rate": 6.803479179322325e-06, "loss": 0.4257, "step": 1793 }, { "epoch": 1.3334985133795838, "grad_norm": 0.4121997356414795, "learning_rate": 6.799443649876385e-06, "loss": 0.4083, "step": 1794 }, { "epoch": 1.3342418235877105, "grad_norm": 0.40054017305374146, "learning_rate": 6.795406773374999e-06, "loss": 0.3907, "step": 1795 }, { "epoch": 1.3349851337958374, "grad_norm": 0.43727728724479675, "learning_rate": 6.7913685528401594e-06, "loss": 0.4214, "step": 1796 }, { "epoch": 1.3357284440039643, "grad_norm": 0.43006226420402527, "learning_rate": 6.787328991294855e-06, "loss": 0.4277, "step": 1797 }, { "epoch": 1.3364717542120912, "grad_norm": 0.4096756875514984, "learning_rate": 6.783288091763083e-06, "loss": 0.4149, "step": 1798 }, { "epoch": 1.337215064420218, "grad_norm": 0.4461374878883362, "learning_rate": 6.779245857269846e-06, "loss": 0.4049, "step": 1799 }, { "epoch": 1.3379583746283448, "grad_norm": 0.4930255115032196, "learning_rate": 6.775202290841143e-06, "loss": 0.4835, "step": 1800 }, { "epoch": 1.3387016848364717, "grad_norm": 0.4464060962200165, "learning_rate": 6.771157395503967e-06, "loss": 0.4629, "step": 1801 }, { "epoch": 1.3394449950445986, "grad_norm": 0.3925186097621918, "learning_rate": 6.767111174286307e-06, "loss": 0.376, "step": 1802 }, { "epoch": 1.3401883052527255, "grad_norm": 0.41812530159950256, "learning_rate": 6.763063630217152e-06, "loss": 0.3848, "step": 1803 }, { "epoch": 1.3409316154608524, "grad_norm": 0.4446137845516205, "learning_rate": 6.759014766326471e-06, "loss": 0.3994, "step": 1804 }, { "epoch": 1.341674925668979, "grad_norm": 0.41367265582084656, "learning_rate": 6.754964585645225e-06, "loss": 0.4003, "step": 1805 }, { "epoch": 1.3424182358771062, "grad_norm": 0.42683491110801697, "learning_rate": 6.750913091205363e-06, "loss": 0.4368, "step": 1806 }, { "epoch": 1.3431615460852329, "grad_norm": 0.4822109341621399, "learning_rate": 6.746860286039815e-06, "loss": 0.4349, "step": 1807 }, { "epoch": 1.3439048562933598, "grad_norm": 0.41680747270584106, "learning_rate": 6.742806173182493e-06, "loss": 0.4104, "step": 1808 }, { "epoch": 1.3446481665014867, "grad_norm": 0.38790205121040344, "learning_rate": 6.738750755668286e-06, "loss": 0.452, "step": 1809 }, { "epoch": 1.3453914767096133, "grad_norm": 0.405434787273407, "learning_rate": 6.7346940365330624e-06, "loss": 0.3995, "step": 1810 }, { "epoch": 1.3461347869177405, "grad_norm": 0.44290295243263245, "learning_rate": 6.730636018813666e-06, "loss": 0.4161, "step": 1811 }, { "epoch": 1.3468780971258671, "grad_norm": 0.47078534960746765, "learning_rate": 6.726576705547908e-06, "loss": 0.4905, "step": 1812 }, { "epoch": 1.347621407333994, "grad_norm": 0.3846130073070526, "learning_rate": 6.722516099774573e-06, "loss": 0.3529, "step": 1813 }, { "epoch": 1.348364717542121, "grad_norm": 0.41631484031677246, "learning_rate": 6.7184542045334135e-06, "loss": 0.4025, "step": 1814 }, { "epoch": 1.3491080277502479, "grad_norm": 0.41790786385536194, "learning_rate": 6.714391022865143e-06, "loss": 0.4221, "step": 1815 }, { "epoch": 1.3498513379583748, "grad_norm": 0.4047812819480896, "learning_rate": 6.710326557811445e-06, "loss": 0.4535, "step": 1816 }, { "epoch": 1.3505946481665014, "grad_norm": 0.41276365518569946, "learning_rate": 6.706260812414956e-06, "loss": 0.4319, "step": 1817 }, { "epoch": 1.3513379583746283, "grad_norm": 0.3532952070236206, "learning_rate": 6.702193789719276e-06, "loss": 0.3753, "step": 1818 }, { "epoch": 1.3520812685827552, "grad_norm": 0.4265666604042053, "learning_rate": 6.698125492768961e-06, "loss": 0.4527, "step": 1819 }, { "epoch": 1.3528245787908821, "grad_norm": 0.3822106719017029, "learning_rate": 6.694055924609521e-06, "loss": 0.3903, "step": 1820 }, { "epoch": 1.353567888999009, "grad_norm": 0.44343236088752747, "learning_rate": 6.689985088287411e-06, "loss": 0.4454, "step": 1821 }, { "epoch": 1.3543111992071357, "grad_norm": 0.4658934473991394, "learning_rate": 6.6859129868500485e-06, "loss": 0.4425, "step": 1822 }, { "epoch": 1.3550545094152626, "grad_norm": 0.36385905742645264, "learning_rate": 6.681839623345785e-06, "loss": 0.3757, "step": 1823 }, { "epoch": 1.3557978196233895, "grad_norm": 0.4100513756275177, "learning_rate": 6.677765000823923e-06, "loss": 0.4828, "step": 1824 }, { "epoch": 1.3565411298315164, "grad_norm": 0.389748752117157, "learning_rate": 6.673689122334707e-06, "loss": 0.3746, "step": 1825 }, { "epoch": 1.3572844400396433, "grad_norm": 0.42346879839897156, "learning_rate": 6.669611990929323e-06, "loss": 0.4665, "step": 1826 }, { "epoch": 1.35802775024777, "grad_norm": 0.3856436610221863, "learning_rate": 6.665533609659891e-06, "loss": 0.3701, "step": 1827 }, { "epoch": 1.358771060455897, "grad_norm": 0.4223726689815521, "learning_rate": 6.66145398157947e-06, "loss": 0.4304, "step": 1828 }, { "epoch": 1.3595143706640238, "grad_norm": 0.39447924494743347, "learning_rate": 6.657373109742051e-06, "loss": 0.3865, "step": 1829 }, { "epoch": 1.3602576808721507, "grad_norm": 0.4784727990627289, "learning_rate": 6.653290997202555e-06, "loss": 0.4329, "step": 1830 }, { "epoch": 1.3610009910802776, "grad_norm": 0.3967188894748688, "learning_rate": 6.649207647016837e-06, "loss": 0.4337, "step": 1831 }, { "epoch": 1.3617443012884043, "grad_norm": 0.412923127412796, "learning_rate": 6.64512306224167e-06, "loss": 0.407, "step": 1832 }, { "epoch": 1.3624876114965312, "grad_norm": 0.430698424577713, "learning_rate": 6.641037245934758e-06, "loss": 0.4172, "step": 1833 }, { "epoch": 1.363230921704658, "grad_norm": 0.43696674704551697, "learning_rate": 6.636950201154726e-06, "loss": 0.4573, "step": 1834 }, { "epoch": 1.363974231912785, "grad_norm": 0.3868977725505829, "learning_rate": 6.632861930961117e-06, "loss": 0.3893, "step": 1835 }, { "epoch": 1.364717542120912, "grad_norm": 0.4225427210330963, "learning_rate": 6.628772438414389e-06, "loss": 0.4396, "step": 1836 }, { "epoch": 1.3654608523290386, "grad_norm": 0.4184199273586273, "learning_rate": 6.62468172657592e-06, "loss": 0.4326, "step": 1837 }, { "epoch": 1.3662041625371655, "grad_norm": 0.42789679765701294, "learning_rate": 6.620589798507999e-06, "loss": 0.4072, "step": 1838 }, { "epoch": 1.3669474727452924, "grad_norm": 0.37301042675971985, "learning_rate": 6.616496657273823e-06, "loss": 0.4048, "step": 1839 }, { "epoch": 1.3676907829534193, "grad_norm": 0.4864712953567505, "learning_rate": 6.612402305937502e-06, "loss": 0.4735, "step": 1840 }, { "epoch": 1.3684340931615462, "grad_norm": 0.4229707717895508, "learning_rate": 6.608306747564046e-06, "loss": 0.3936, "step": 1841 }, { "epoch": 1.3691774033696729, "grad_norm": 0.4137800335884094, "learning_rate": 6.604209985219374e-06, "loss": 0.4282, "step": 1842 }, { "epoch": 1.3699207135777998, "grad_norm": 0.3706273138523102, "learning_rate": 6.600112021970303e-06, "loss": 0.3849, "step": 1843 }, { "epoch": 1.3706640237859267, "grad_norm": 0.4560408294200897, "learning_rate": 6.5960128608845486e-06, "loss": 0.4457, "step": 1844 }, { "epoch": 1.3714073339940536, "grad_norm": 0.3899238109588623, "learning_rate": 6.591912505030726e-06, "loss": 0.3878, "step": 1845 }, { "epoch": 1.3721506442021805, "grad_norm": 0.4010252356529236, "learning_rate": 6.587810957478345e-06, "loss": 0.3835, "step": 1846 }, { "epoch": 1.3728939544103071, "grad_norm": 0.42207714915275574, "learning_rate": 6.583708221297802e-06, "loss": 0.3961, "step": 1847 }, { "epoch": 1.373637264618434, "grad_norm": 0.4367213547229767, "learning_rate": 6.579604299560387e-06, "loss": 0.4376, "step": 1848 }, { "epoch": 1.374380574826561, "grad_norm": 0.40724679827690125, "learning_rate": 6.575499195338281e-06, "loss": 0.4211, "step": 1849 }, { "epoch": 1.3751238850346879, "grad_norm": 0.4042099118232727, "learning_rate": 6.571392911704544e-06, "loss": 0.4341, "step": 1850 }, { "epoch": 1.3758671952428148, "grad_norm": 0.3401530981063843, "learning_rate": 6.567285451733121e-06, "loss": 0.3577, "step": 1851 }, { "epoch": 1.3766105054509414, "grad_norm": 0.45088455080986023, "learning_rate": 6.563176818498839e-06, "loss": 0.4679, "step": 1852 }, { "epoch": 1.3773538156590683, "grad_norm": 0.3991081416606903, "learning_rate": 6.5590670150774035e-06, "loss": 0.4473, "step": 1853 }, { "epoch": 1.3780971258671952, "grad_norm": 0.39745357632637024, "learning_rate": 6.5549560445453945e-06, "loss": 0.4215, "step": 1854 }, { "epoch": 1.3788404360753221, "grad_norm": 0.4223230183124542, "learning_rate": 6.550843909980264e-06, "loss": 0.3686, "step": 1855 }, { "epoch": 1.379583746283449, "grad_norm": 0.5138887763023376, "learning_rate": 6.546730614460339e-06, "loss": 0.4662, "step": 1856 }, { "epoch": 1.3803270564915757, "grad_norm": 0.4209325611591339, "learning_rate": 6.542616161064815e-06, "loss": 0.4077, "step": 1857 }, { "epoch": 1.3810703666997026, "grad_norm": 0.45988887548446655, "learning_rate": 6.538500552873752e-06, "loss": 0.432, "step": 1858 }, { "epoch": 1.3818136769078295, "grad_norm": 0.5228784084320068, "learning_rate": 6.534383792968077e-06, "loss": 0.3875, "step": 1859 }, { "epoch": 1.3825569871159564, "grad_norm": 0.41887831687927246, "learning_rate": 6.5302658844295765e-06, "loss": 0.3817, "step": 1860 }, { "epoch": 1.3833002973240833, "grad_norm": 0.4424154758453369, "learning_rate": 6.526146830340899e-06, "loss": 0.4587, "step": 1861 }, { "epoch": 1.38404360753221, "grad_norm": 0.4794953465461731, "learning_rate": 6.52202663378555e-06, "loss": 0.3952, "step": 1862 }, { "epoch": 1.384786917740337, "grad_norm": 0.4338167905807495, "learning_rate": 6.517905297847891e-06, "loss": 0.4416, "step": 1863 }, { "epoch": 1.3855302279484638, "grad_norm": 0.43496960401535034, "learning_rate": 6.513782825613134e-06, "loss": 0.4054, "step": 1864 }, { "epoch": 1.3862735381565907, "grad_norm": 0.4992745518684387, "learning_rate": 6.509659220167344e-06, "loss": 0.428, "step": 1865 }, { "epoch": 1.3870168483647176, "grad_norm": 0.5046466588973999, "learning_rate": 6.505534484597434e-06, "loss": 0.3954, "step": 1866 }, { "epoch": 1.3877601585728443, "grad_norm": 0.4467533826828003, "learning_rate": 6.501408621991161e-06, "loss": 0.4506, "step": 1867 }, { "epoch": 1.3885034687809712, "grad_norm": 0.47989532351493835, "learning_rate": 6.497281635437129e-06, "loss": 0.4285, "step": 1868 }, { "epoch": 1.389246778989098, "grad_norm": 0.526138961315155, "learning_rate": 6.493153528024782e-06, "loss": 0.4286, "step": 1869 }, { "epoch": 1.389990089197225, "grad_norm": 0.4362803101539612, "learning_rate": 6.489024302844402e-06, "loss": 0.3445, "step": 1870 }, { "epoch": 1.390733399405352, "grad_norm": 0.5179764032363892, "learning_rate": 6.484893962987105e-06, "loss": 0.4518, "step": 1871 }, { "epoch": 1.3914767096134786, "grad_norm": 0.5225485563278198, "learning_rate": 6.480762511544851e-06, "loss": 0.4241, "step": 1872 }, { "epoch": 1.3922200198216055, "grad_norm": 0.4177126884460449, "learning_rate": 6.476629951610423e-06, "loss": 0.4283, "step": 1873 }, { "epoch": 1.3929633300297324, "grad_norm": 0.3901465833187103, "learning_rate": 6.472496286277435e-06, "loss": 0.4166, "step": 1874 }, { "epoch": 1.3937066402378593, "grad_norm": 0.47079333662986755, "learning_rate": 6.468361518640332e-06, "loss": 0.4243, "step": 1875 }, { "epoch": 1.3944499504459862, "grad_norm": 0.465628445148468, "learning_rate": 6.464225651794385e-06, "loss": 0.4142, "step": 1876 }, { "epoch": 1.3951932606541129, "grad_norm": 0.41782253980636597, "learning_rate": 6.460088688835681e-06, "loss": 0.4286, "step": 1877 }, { "epoch": 1.3959365708622398, "grad_norm": 0.3754199743270874, "learning_rate": 6.4559506328611345e-06, "loss": 0.3821, "step": 1878 }, { "epoch": 1.3966798810703667, "grad_norm": 0.42996594309806824, "learning_rate": 6.451811486968476e-06, "loss": 0.3912, "step": 1879 }, { "epoch": 1.3974231912784936, "grad_norm": 0.4544013440608978, "learning_rate": 6.447671254256249e-06, "loss": 0.4259, "step": 1880 }, { "epoch": 1.3981665014866205, "grad_norm": 0.4304277002811432, "learning_rate": 6.443529937823816e-06, "loss": 0.4425, "step": 1881 }, { "epoch": 1.3989098116947472, "grad_norm": 0.39461714029312134, "learning_rate": 6.439387540771346e-06, "loss": 0.3984, "step": 1882 }, { "epoch": 1.399653121902874, "grad_norm": 0.4039512872695923, "learning_rate": 6.4352440661998195e-06, "loss": 0.4145, "step": 1883 }, { "epoch": 1.400396432111001, "grad_norm": 0.4169312119483948, "learning_rate": 6.431099517211024e-06, "loss": 0.4149, "step": 1884 }, { "epoch": 1.4011397423191279, "grad_norm": 0.4487173557281494, "learning_rate": 6.426953896907548e-06, "loss": 0.4007, "step": 1885 }, { "epoch": 1.4018830525272548, "grad_norm": 0.47855785489082336, "learning_rate": 6.4228072083927874e-06, "loss": 0.4283, "step": 1886 }, { "epoch": 1.4026263627353814, "grad_norm": 0.3957831859588623, "learning_rate": 6.41865945477093e-06, "loss": 0.3987, "step": 1887 }, { "epoch": 1.4033696729435086, "grad_norm": 0.4032670855522156, "learning_rate": 6.414510639146969e-06, "loss": 0.424, "step": 1888 }, { "epoch": 1.4041129831516352, "grad_norm": 0.41958409547805786, "learning_rate": 6.410360764626691e-06, "loss": 0.416, "step": 1889 }, { "epoch": 1.4048562933597621, "grad_norm": 0.44960543513298035, "learning_rate": 6.406209834316668e-06, "loss": 0.3995, "step": 1890 }, { "epoch": 1.405599603567889, "grad_norm": 0.4091561734676361, "learning_rate": 6.402057851324271e-06, "loss": 0.4298, "step": 1891 }, { "epoch": 1.4063429137760157, "grad_norm": 0.3768632113933563, "learning_rate": 6.397904818757659e-06, "loss": 0.3774, "step": 1892 }, { "epoch": 1.4070862239841428, "grad_norm": 0.42361122369766235, "learning_rate": 6.393750739725767e-06, "loss": 0.3999, "step": 1893 }, { "epoch": 1.4078295341922695, "grad_norm": 0.45774874091148376, "learning_rate": 6.389595617338323e-06, "loss": 0.4277, "step": 1894 }, { "epoch": 1.4085728444003964, "grad_norm": 0.45465925335884094, "learning_rate": 6.385439454705835e-06, "loss": 0.3999, "step": 1895 }, { "epoch": 1.4093161546085233, "grad_norm": 0.5113004446029663, "learning_rate": 6.3812822549395844e-06, "loss": 0.3776, "step": 1896 }, { "epoch": 1.41005946481665, "grad_norm": 0.45368966460227966, "learning_rate": 6.377124021151634e-06, "loss": 0.4237, "step": 1897 }, { "epoch": 1.4108027750247771, "grad_norm": 0.46319419145584106, "learning_rate": 6.372964756454818e-06, "loss": 0.3858, "step": 1898 }, { "epoch": 1.4115460852329038, "grad_norm": 0.4369020164012909, "learning_rate": 6.368804463962746e-06, "loss": 0.4182, "step": 1899 }, { "epoch": 1.4122893954410307, "grad_norm": 0.4620474576950073, "learning_rate": 6.364643146789791e-06, "loss": 0.4727, "step": 1900 }, { "epoch": 1.4130327056491576, "grad_norm": 0.4568532705307007, "learning_rate": 6.360480808051096e-06, "loss": 0.3673, "step": 1901 }, { "epoch": 1.4137760158572845, "grad_norm": 0.46185097098350525, "learning_rate": 6.356317450862573e-06, "loss": 0.427, "step": 1902 }, { "epoch": 1.4145193260654114, "grad_norm": 0.4097491502761841, "learning_rate": 6.352153078340888e-06, "loss": 0.3933, "step": 1903 }, { "epoch": 1.415262636273538, "grad_norm": 0.44725579023361206, "learning_rate": 6.347987693603476e-06, "loss": 0.4105, "step": 1904 }, { "epoch": 1.416005946481665, "grad_norm": 0.5354040265083313, "learning_rate": 6.343821299768522e-06, "loss": 0.4322, "step": 1905 }, { "epoch": 1.416749256689792, "grad_norm": 0.3785301446914673, "learning_rate": 6.3396538999549706e-06, "loss": 0.4192, "step": 1906 }, { "epoch": 1.4174925668979188, "grad_norm": 0.4605170786380768, "learning_rate": 6.335485497282519e-06, "loss": 0.388, "step": 1907 }, { "epoch": 1.4182358771060457, "grad_norm": 0.47205549478530884, "learning_rate": 6.3313160948716155e-06, "loss": 0.4401, "step": 1908 }, { "epoch": 1.4189791873141724, "grad_norm": 0.39987990260124207, "learning_rate": 6.327145695843455e-06, "loss": 0.3935, "step": 1909 }, { "epoch": 1.4197224975222993, "grad_norm": 0.47949981689453125, "learning_rate": 6.3229743033199785e-06, "loss": 0.4321, "step": 1910 }, { "epoch": 1.4204658077304262, "grad_norm": 0.423931747674942, "learning_rate": 6.318801920423874e-06, "loss": 0.4075, "step": 1911 }, { "epoch": 1.421209117938553, "grad_norm": 0.42605218291282654, "learning_rate": 6.314628550278569e-06, "loss": 0.4602, "step": 1912 }, { "epoch": 1.42195242814668, "grad_norm": 0.5748387575149536, "learning_rate": 6.310454196008226e-06, "loss": 0.4534, "step": 1913 }, { "epoch": 1.4226957383548067, "grad_norm": 0.37014859914779663, "learning_rate": 6.306278860737749e-06, "loss": 0.3542, "step": 1914 }, { "epoch": 1.4234390485629336, "grad_norm": 0.45925194025039673, "learning_rate": 6.3021025475927784e-06, "loss": 0.4646, "step": 1915 }, { "epoch": 1.4241823587710605, "grad_norm": 0.37345755100250244, "learning_rate": 6.297925259699682e-06, "loss": 0.3923, "step": 1916 }, { "epoch": 1.4249256689791874, "grad_norm": 0.4085337519645691, "learning_rate": 6.293747000185555e-06, "loss": 0.4731, "step": 1917 }, { "epoch": 1.4256689791873143, "grad_norm": 0.364019513130188, "learning_rate": 6.289567772178229e-06, "loss": 0.3508, "step": 1918 }, { "epoch": 1.426412289395441, "grad_norm": 0.4515548646450043, "learning_rate": 6.285387578806255e-06, "loss": 0.4279, "step": 1919 }, { "epoch": 1.4271555996035679, "grad_norm": 0.4057188630104065, "learning_rate": 6.281206423198902e-06, "loss": 0.4721, "step": 1920 }, { "epoch": 1.4278989098116948, "grad_norm": 0.4205494821071625, "learning_rate": 6.27702430848617e-06, "loss": 0.4134, "step": 1921 }, { "epoch": 1.4286422200198217, "grad_norm": 0.5088273286819458, "learning_rate": 6.272841237798768e-06, "loss": 0.4266, "step": 1922 }, { "epoch": 1.4293855302279486, "grad_norm": 0.38572433590888977, "learning_rate": 6.268657214268125e-06, "loss": 0.4394, "step": 1923 }, { "epoch": 1.4301288404360752, "grad_norm": 0.5105190277099609, "learning_rate": 6.264472241026383e-06, "loss": 0.4201, "step": 1924 }, { "epoch": 1.4308721506442021, "grad_norm": 0.4331299066543579, "learning_rate": 6.260286321206391e-06, "loss": 0.39, "step": 1925 }, { "epoch": 1.431615460852329, "grad_norm": 0.3952659070491791, "learning_rate": 6.256099457941714e-06, "loss": 0.4138, "step": 1926 }, { "epoch": 1.432358771060456, "grad_norm": 0.4451560974121094, "learning_rate": 6.251911654366616e-06, "loss": 0.4012, "step": 1927 }, { "epoch": 1.4331020812685829, "grad_norm": 0.47172266244888306, "learning_rate": 6.24772291361607e-06, "loss": 0.4412, "step": 1928 }, { "epoch": 1.4338453914767095, "grad_norm": 0.382525771856308, "learning_rate": 6.243533238825747e-06, "loss": 0.4092, "step": 1929 }, { "epoch": 1.4345887016848364, "grad_norm": 0.5003975629806519, "learning_rate": 6.239342633132021e-06, "loss": 0.417, "step": 1930 }, { "epoch": 1.4353320118929633, "grad_norm": 0.4920903742313385, "learning_rate": 6.235151099671957e-06, "loss": 0.3995, "step": 1931 }, { "epoch": 1.4360753221010902, "grad_norm": 0.42302098870277405, "learning_rate": 6.230958641583322e-06, "loss": 0.4125, "step": 1932 }, { "epoch": 1.4368186323092171, "grad_norm": 0.4514380097389221, "learning_rate": 6.226765262004568e-06, "loss": 0.382, "step": 1933 }, { "epoch": 1.4375619425173438, "grad_norm": 0.4494574964046478, "learning_rate": 6.222570964074842e-06, "loss": 0.4781, "step": 1934 }, { "epoch": 1.4383052527254707, "grad_norm": 0.46605294942855835, "learning_rate": 6.218375750933978e-06, "loss": 0.4074, "step": 1935 }, { "epoch": 1.4390485629335976, "grad_norm": 0.44783660769462585, "learning_rate": 6.214179625722491e-06, "loss": 0.4244, "step": 1936 }, { "epoch": 1.4397918731417245, "grad_norm": 0.3858300447463989, "learning_rate": 6.20998259158158e-06, "loss": 0.3759, "step": 1937 }, { "epoch": 1.4405351833498514, "grad_norm": 0.486003577709198, "learning_rate": 6.205784651653131e-06, "loss": 0.409, "step": 1938 }, { "epoch": 1.441278493557978, "grad_norm": 0.41201290488243103, "learning_rate": 6.201585809079702e-06, "loss": 0.4299, "step": 1939 }, { "epoch": 1.442021803766105, "grad_norm": 0.39496514201164246, "learning_rate": 6.197386067004524e-06, "loss": 0.4034, "step": 1940 }, { "epoch": 1.442765113974232, "grad_norm": 0.5514125823974609, "learning_rate": 6.193185428571506e-06, "loss": 0.4188, "step": 1941 }, { "epoch": 1.4435084241823588, "grad_norm": 0.4441640079021454, "learning_rate": 6.1889838969252314e-06, "loss": 0.4273, "step": 1942 }, { "epoch": 1.4442517343904857, "grad_norm": 0.44146931171417236, "learning_rate": 6.184781475210944e-06, "loss": 0.4347, "step": 1943 }, { "epoch": 1.4449950445986124, "grad_norm": 0.49088913202285767, "learning_rate": 6.180578166574556e-06, "loss": 0.4346, "step": 1944 }, { "epoch": 1.4457383548067393, "grad_norm": 0.41241633892059326, "learning_rate": 6.176373974162652e-06, "loss": 0.3887, "step": 1945 }, { "epoch": 1.4464816650148662, "grad_norm": 0.5127524137496948, "learning_rate": 6.172168901122466e-06, "loss": 0.4473, "step": 1946 }, { "epoch": 1.447224975222993, "grad_norm": 0.3979187607765198, "learning_rate": 6.167962950601899e-06, "loss": 0.3792, "step": 1947 }, { "epoch": 1.44796828543112, "grad_norm": 0.42240723967552185, "learning_rate": 6.163756125749505e-06, "loss": 0.4392, "step": 1948 }, { "epoch": 1.4487115956392467, "grad_norm": 0.4657686948776245, "learning_rate": 6.159548429714497e-06, "loss": 0.4176, "step": 1949 }, { "epoch": 1.4494549058473736, "grad_norm": 0.4308370351791382, "learning_rate": 6.1553398656467345e-06, "loss": 0.4127, "step": 1950 }, { "epoch": 1.4501982160555005, "grad_norm": 0.41759511828422546, "learning_rate": 6.151130436696731e-06, "loss": 0.425, "step": 1951 }, { "epoch": 1.4509415262636274, "grad_norm": 0.4653950035572052, "learning_rate": 6.146920146015647e-06, "loss": 0.4422, "step": 1952 }, { "epoch": 1.4516848364717543, "grad_norm": 0.4306773245334625, "learning_rate": 6.142708996755286e-06, "loss": 0.4254, "step": 1953 }, { "epoch": 1.452428146679881, "grad_norm": 0.3879142701625824, "learning_rate": 6.138496992068096e-06, "loss": 0.4003, "step": 1954 }, { "epoch": 1.4531714568880079, "grad_norm": 0.47474315762519836, "learning_rate": 6.134284135107166e-06, "loss": 0.4558, "step": 1955 }, { "epoch": 1.4539147670961348, "grad_norm": 0.3769921064376831, "learning_rate": 6.130070429026218e-06, "loss": 0.3766, "step": 1956 }, { "epoch": 1.4546580773042617, "grad_norm": 0.49857115745544434, "learning_rate": 6.125855876979616e-06, "loss": 0.4605, "step": 1957 }, { "epoch": 1.4554013875123886, "grad_norm": 0.4324010908603668, "learning_rate": 6.121640482122355e-06, "loss": 0.4228, "step": 1958 }, { "epoch": 1.4561446977205152, "grad_norm": 0.4626833498477936, "learning_rate": 6.117424247610062e-06, "loss": 0.3683, "step": 1959 }, { "epoch": 1.4568880079286421, "grad_norm": 0.4351122975349426, "learning_rate": 6.113207176598988e-06, "loss": 0.4564, "step": 1960 }, { "epoch": 1.457631318136769, "grad_norm": 0.3824023902416229, "learning_rate": 6.108989272246015e-06, "loss": 0.3976, "step": 1961 }, { "epoch": 1.458374628344896, "grad_norm": 0.4835955500602722, "learning_rate": 6.104770537708649e-06, "loss": 0.4661, "step": 1962 }, { "epoch": 1.4591179385530229, "grad_norm": 0.4098982512950897, "learning_rate": 6.100550976145012e-06, "loss": 0.3727, "step": 1963 }, { "epoch": 1.4598612487611495, "grad_norm": 0.46400171518325806, "learning_rate": 6.096330590713851e-06, "loss": 0.3918, "step": 1964 }, { "epoch": 1.4606045589692764, "grad_norm": 0.43964841961860657, "learning_rate": 6.0921093845745296e-06, "loss": 0.4055, "step": 1965 }, { "epoch": 1.4613478691774033, "grad_norm": 0.4594268798828125, "learning_rate": 6.08788736088702e-06, "loss": 0.4501, "step": 1966 }, { "epoch": 1.4620911793855302, "grad_norm": 0.39923977851867676, "learning_rate": 6.083664522811912e-06, "loss": 0.4057, "step": 1967 }, { "epoch": 1.4628344895936571, "grad_norm": 0.43097594380378723, "learning_rate": 6.0794408735104035e-06, "loss": 0.4139, "step": 1968 }, { "epoch": 1.4635777998017838, "grad_norm": 0.44130006432533264, "learning_rate": 6.075216416144298e-06, "loss": 0.414, "step": 1969 }, { "epoch": 1.4643211100099107, "grad_norm": 0.49078330397605896, "learning_rate": 6.070991153876008e-06, "loss": 0.4481, "step": 1970 }, { "epoch": 1.4650644202180376, "grad_norm": 0.38245877623558044, "learning_rate": 6.066765089868543e-06, "loss": 0.423, "step": 1971 }, { "epoch": 1.4658077304261645, "grad_norm": 0.36880117654800415, "learning_rate": 6.0625382272855186e-06, "loss": 0.3655, "step": 1972 }, { "epoch": 1.4665510406342914, "grad_norm": 0.47624123096466064, "learning_rate": 6.058310569291141e-06, "loss": 0.46, "step": 1973 }, { "epoch": 1.467294350842418, "grad_norm": 0.43014785647392273, "learning_rate": 6.05408211905022e-06, "loss": 0.4238, "step": 1974 }, { "epoch": 1.4680376610505452, "grad_norm": 0.3617027997970581, "learning_rate": 6.049852879728153e-06, "loss": 0.413, "step": 1975 }, { "epoch": 1.468780971258672, "grad_norm": 0.39451462030410767, "learning_rate": 6.045622854490929e-06, "loss": 0.4116, "step": 1976 }, { "epoch": 1.4695242814667988, "grad_norm": 0.48546773195266724, "learning_rate": 6.041392046505126e-06, "loss": 0.4219, "step": 1977 }, { "epoch": 1.4702675916749257, "grad_norm": 0.43347233533859253, "learning_rate": 6.0371604589379085e-06, "loss": 0.4042, "step": 1978 }, { "epoch": 1.4710109018830524, "grad_norm": 0.41691622138023376, "learning_rate": 6.032928094957024e-06, "loss": 0.4319, "step": 1979 }, { "epoch": 1.4717542120911795, "grad_norm": 0.4044100046157837, "learning_rate": 6.0286949577308e-06, "loss": 0.3866, "step": 1980 }, { "epoch": 1.4724975222993062, "grad_norm": 0.4096701443195343, "learning_rate": 6.024461050428147e-06, "loss": 0.4283, "step": 1981 }, { "epoch": 1.473240832507433, "grad_norm": 0.40992486476898193, "learning_rate": 6.020226376218547e-06, "loss": 0.3704, "step": 1982 }, { "epoch": 1.47398414271556, "grad_norm": 0.4620095193386078, "learning_rate": 6.015990938272055e-06, "loss": 0.4433, "step": 1983 }, { "epoch": 1.474727452923687, "grad_norm": 0.39133474230766296, "learning_rate": 6.011754739759306e-06, "loss": 0.4094, "step": 1984 }, { "epoch": 1.4754707631318138, "grad_norm": 0.49676817655563354, "learning_rate": 6.007517783851499e-06, "loss": 0.4353, "step": 1985 }, { "epoch": 1.4762140733399405, "grad_norm": 0.4054776430130005, "learning_rate": 6.003280073720395e-06, "loss": 0.3985, "step": 1986 }, { "epoch": 1.4769573835480674, "grad_norm": 0.46305498480796814, "learning_rate": 5.99904161253833e-06, "loss": 0.4242, "step": 1987 }, { "epoch": 1.4777006937561943, "grad_norm": 0.4087692201137543, "learning_rate": 5.994802403478195e-06, "loss": 0.4204, "step": 1988 }, { "epoch": 1.4784440039643212, "grad_norm": 0.3980127274990082, "learning_rate": 5.990562449713443e-06, "loss": 0.3853, "step": 1989 }, { "epoch": 1.479187314172448, "grad_norm": 0.49483025074005127, "learning_rate": 5.986321754418084e-06, "loss": 0.4524, "step": 1990 }, { "epoch": 1.4799306243805748, "grad_norm": 0.37994396686553955, "learning_rate": 5.982080320766685e-06, "loss": 0.3953, "step": 1991 }, { "epoch": 1.4806739345887017, "grad_norm": 0.4117732644081116, "learning_rate": 5.977838151934362e-06, "loss": 0.3932, "step": 1992 }, { "epoch": 1.4814172447968286, "grad_norm": 0.46926695108413696, "learning_rate": 5.973595251096784e-06, "loss": 0.4324, "step": 1993 }, { "epoch": 1.4821605550049555, "grad_norm": 0.42456915974617004, "learning_rate": 5.96935162143017e-06, "loss": 0.3984, "step": 1994 }, { "epoch": 1.4829038652130824, "grad_norm": 0.39602959156036377, "learning_rate": 5.9651072661112795e-06, "loss": 0.4307, "step": 1995 }, { "epoch": 1.483647175421209, "grad_norm": 0.37617185711860657, "learning_rate": 5.960862188317419e-06, "loss": 0.446, "step": 1996 }, { "epoch": 1.484390485629336, "grad_norm": 0.42307987809181213, "learning_rate": 5.9566163912264345e-06, "loss": 0.4252, "step": 1997 }, { "epoch": 1.4851337958374629, "grad_norm": 0.42810678482055664, "learning_rate": 5.95236987801671e-06, "loss": 0.3978, "step": 1998 }, { "epoch": 1.4858771060455898, "grad_norm": 0.42109906673431396, "learning_rate": 5.948122651867166e-06, "loss": 0.4512, "step": 1999 }, { "epoch": 1.4866204162537167, "grad_norm": 0.3957291841506958, "learning_rate": 5.943874715957259e-06, "loss": 0.3732, "step": 2000 }, { "epoch": 1.4873637264618433, "grad_norm": 0.4324089586734772, "learning_rate": 5.939626073466973e-06, "loss": 0.4627, "step": 2001 }, { "epoch": 1.4881070366699702, "grad_norm": 0.3614172041416168, "learning_rate": 5.9353767275768234e-06, "loss": 0.4165, "step": 2002 }, { "epoch": 1.4888503468780971, "grad_norm": 0.4405874013900757, "learning_rate": 5.931126681467851e-06, "loss": 0.4592, "step": 2003 }, { "epoch": 1.489593657086224, "grad_norm": 0.4531208872795105, "learning_rate": 5.926875938321622e-06, "loss": 0.441, "step": 2004 }, { "epoch": 1.490336967294351, "grad_norm": 0.3657870590686798, "learning_rate": 5.922624501320225e-06, "loss": 0.3681, "step": 2005 }, { "epoch": 1.4910802775024776, "grad_norm": 0.45104703307151794, "learning_rate": 5.918372373646262e-06, "loss": 0.4303, "step": 2006 }, { "epoch": 1.4918235877106045, "grad_norm": 0.4686802923679352, "learning_rate": 5.9141195584828594e-06, "loss": 0.4415, "step": 2007 }, { "epoch": 1.4925668979187314, "grad_norm": 0.41881290078163147, "learning_rate": 5.909866059013658e-06, "loss": 0.3996, "step": 2008 }, { "epoch": 1.4933102081268583, "grad_norm": 0.4513864517211914, "learning_rate": 5.905611878422804e-06, "loss": 0.4293, "step": 2009 }, { "epoch": 1.4940535183349852, "grad_norm": 0.48860523104667664, "learning_rate": 5.901357019894961e-06, "loss": 0.392, "step": 2010 }, { "epoch": 1.494796828543112, "grad_norm": 0.4230043888092041, "learning_rate": 5.897101486615296e-06, "loss": 0.4444, "step": 2011 }, { "epoch": 1.4955401387512388, "grad_norm": 0.42088502645492554, "learning_rate": 5.892845281769483e-06, "loss": 0.4079, "step": 2012 }, { "epoch": 1.4962834489593657, "grad_norm": 0.48691606521606445, "learning_rate": 5.888588408543696e-06, "loss": 0.44, "step": 2013 }, { "epoch": 1.4970267591674926, "grad_norm": 0.40322452783584595, "learning_rate": 5.884330870124611e-06, "loss": 0.3878, "step": 2014 }, { "epoch": 1.4977700693756195, "grad_norm": 0.46527573466300964, "learning_rate": 5.880072669699407e-06, "loss": 0.4592, "step": 2015 }, { "epoch": 1.4985133795837462, "grad_norm": 0.5015386939048767, "learning_rate": 5.875813810455747e-06, "loss": 0.4181, "step": 2016 }, { "epoch": 1.499256689791873, "grad_norm": 0.43517833948135376, "learning_rate": 5.871554295581799e-06, "loss": 0.3873, "step": 2017 }, { "epoch": 1.5, "grad_norm": 0.4421725273132324, "learning_rate": 5.8672941282662136e-06, "loss": 0.3951, "step": 2018 }, { "epoch": 1.500743310208127, "grad_norm": 0.4579140841960907, "learning_rate": 5.8630333116981346e-06, "loss": 0.4342, "step": 2019 }, { "epoch": 1.5014866204162538, "grad_norm": 0.4623957574367523, "learning_rate": 5.85877184906719e-06, "loss": 0.4179, "step": 2020 }, { "epoch": 1.5022299306243805, "grad_norm": 0.44817492365837097, "learning_rate": 5.85450974356349e-06, "loss": 0.4104, "step": 2021 }, { "epoch": 1.5029732408325074, "grad_norm": 0.43137919902801514, "learning_rate": 5.85024699837763e-06, "loss": 0.4007, "step": 2022 }, { "epoch": 1.5037165510406343, "grad_norm": 0.39318278431892395, "learning_rate": 5.84598361670068e-06, "loss": 0.409, "step": 2023 }, { "epoch": 1.5044598612487612, "grad_norm": 0.4408240020275116, "learning_rate": 5.841719601724191e-06, "loss": 0.445, "step": 2024 }, { "epoch": 1.505203171456888, "grad_norm": 0.43823832273483276, "learning_rate": 5.8374549566401825e-06, "loss": 0.415, "step": 2025 }, { "epoch": 1.5059464816650148, "grad_norm": 0.4339882731437683, "learning_rate": 5.8331896846411495e-06, "loss": 0.4342, "step": 2026 }, { "epoch": 1.5066897918731417, "grad_norm": 0.43916526436805725, "learning_rate": 5.828923788920058e-06, "loss": 0.4322, "step": 2027 }, { "epoch": 1.5074331020812686, "grad_norm": 0.40633058547973633, "learning_rate": 5.8246572726703354e-06, "loss": 0.4035, "step": 2028 }, { "epoch": 1.5081764122893955, "grad_norm": 0.3926868438720703, "learning_rate": 5.8203901390858766e-06, "loss": 0.3767, "step": 2029 }, { "epoch": 1.5089197224975224, "grad_norm": 0.4054635167121887, "learning_rate": 5.81612239136104e-06, "loss": 0.459, "step": 2030 }, { "epoch": 1.509663032705649, "grad_norm": 0.3724508583545685, "learning_rate": 5.811854032690644e-06, "loss": 0.3795, "step": 2031 }, { "epoch": 1.5104063429137762, "grad_norm": 0.4197691082954407, "learning_rate": 5.807585066269959e-06, "loss": 0.4247, "step": 2032 }, { "epoch": 1.5111496531219029, "grad_norm": 0.40540367364883423, "learning_rate": 5.803315495294715e-06, "loss": 0.4209, "step": 2033 }, { "epoch": 1.5118929633300298, "grad_norm": 0.4242480397224426, "learning_rate": 5.799045322961096e-06, "loss": 0.4153, "step": 2034 }, { "epoch": 1.5126362735381567, "grad_norm": 0.46160274744033813, "learning_rate": 5.794774552465732e-06, "loss": 0.4546, "step": 2035 }, { "epoch": 1.5133795837462833, "grad_norm": 0.38798975944519043, "learning_rate": 5.790503187005704e-06, "loss": 0.4208, "step": 2036 }, { "epoch": 1.5141228939544105, "grad_norm": 0.4211389720439911, "learning_rate": 5.786231229778534e-06, "loss": 0.414, "step": 2037 }, { "epoch": 1.5148662041625371, "grad_norm": 0.41599494218826294, "learning_rate": 5.781958683982194e-06, "loss": 0.4143, "step": 2038 }, { "epoch": 1.515609514370664, "grad_norm": 0.3825068175792694, "learning_rate": 5.7776855528150895e-06, "loss": 0.4256, "step": 2039 }, { "epoch": 1.516352824578791, "grad_norm": 0.4246884286403656, "learning_rate": 5.77341183947607e-06, "loss": 0.4163, "step": 2040 }, { "epoch": 1.5170961347869176, "grad_norm": 0.41877269744873047, "learning_rate": 5.769137547164416e-06, "loss": 0.4157, "step": 2041 }, { "epoch": 1.5178394449950448, "grad_norm": 0.41030749678611755, "learning_rate": 5.764862679079844e-06, "loss": 0.4149, "step": 2042 }, { "epoch": 1.5185827552031714, "grad_norm": 0.361771821975708, "learning_rate": 5.760587238422502e-06, "loss": 0.4031, "step": 2043 }, { "epoch": 1.5193260654112983, "grad_norm": 0.4446527063846588, "learning_rate": 5.756311228392965e-06, "loss": 0.4196, "step": 2044 }, { "epoch": 1.5200693756194252, "grad_norm": 0.4454541802406311, "learning_rate": 5.752034652192233e-06, "loss": 0.4807, "step": 2045 }, { "epoch": 1.520812685827552, "grad_norm": 0.38385242223739624, "learning_rate": 5.7477575130217355e-06, "loss": 0.3491, "step": 2046 }, { "epoch": 1.521555996035679, "grad_norm": 0.3819950222969055, "learning_rate": 5.743479814083318e-06, "loss": 0.4106, "step": 2047 }, { "epoch": 1.5222993062438057, "grad_norm": 0.42612424492836, "learning_rate": 5.739201558579246e-06, "loss": 0.4449, "step": 2048 }, { "epoch": 1.5230426164519326, "grad_norm": 0.37456271052360535, "learning_rate": 5.734922749712204e-06, "loss": 0.3912, "step": 2049 }, { "epoch": 1.5237859266600595, "grad_norm": 0.4402252733707428, "learning_rate": 5.730643390685287e-06, "loss": 0.3707, "step": 2050 }, { "epoch": 1.5245292368681862, "grad_norm": 0.45792070031166077, "learning_rate": 5.7263634847020065e-06, "loss": 0.4339, "step": 2051 }, { "epoch": 1.5252725470763133, "grad_norm": 0.42348408699035645, "learning_rate": 5.722083034966278e-06, "loss": 0.4373, "step": 2052 }, { "epoch": 1.52601585728444, "grad_norm": 0.41169843077659607, "learning_rate": 5.717802044682429e-06, "loss": 0.4093, "step": 2053 }, { "epoch": 1.526759167492567, "grad_norm": 0.42453575134277344, "learning_rate": 5.713520517055189e-06, "loss": 0.3762, "step": 2054 }, { "epoch": 1.5275024777006938, "grad_norm": 0.4431438446044922, "learning_rate": 5.709238455289692e-06, "loss": 0.4566, "step": 2055 }, { "epoch": 1.5282457879088205, "grad_norm": 0.3745981752872467, "learning_rate": 5.704955862591467e-06, "loss": 0.3597, "step": 2056 }, { "epoch": 1.5289890981169476, "grad_norm": 0.4656010866165161, "learning_rate": 5.700672742166445e-06, "loss": 0.4707, "step": 2057 }, { "epoch": 1.5297324083250743, "grad_norm": 0.43779194355010986, "learning_rate": 5.6963890972209536e-06, "loss": 0.4413, "step": 2058 }, { "epoch": 1.5304757185332012, "grad_norm": 0.41010764241218567, "learning_rate": 5.692104930961706e-06, "loss": 0.3945, "step": 2059 }, { "epoch": 1.531219028741328, "grad_norm": 0.42761102318763733, "learning_rate": 5.687820246595811e-06, "loss": 0.4039, "step": 2060 }, { "epoch": 1.5319623389494548, "grad_norm": 0.4090751111507416, "learning_rate": 5.683535047330767e-06, "loss": 0.3738, "step": 2061 }, { "epoch": 1.532705649157582, "grad_norm": 0.46122264862060547, "learning_rate": 5.679249336374451e-06, "loss": 0.4813, "step": 2062 }, { "epoch": 1.5334489593657086, "grad_norm": 0.4500584602355957, "learning_rate": 5.674963116935129e-06, "loss": 0.4529, "step": 2063 }, { "epoch": 1.5341922695738355, "grad_norm": 0.38057905435562134, "learning_rate": 5.6706763922214454e-06, "loss": 0.3952, "step": 2064 }, { "epoch": 1.5349355797819624, "grad_norm": 0.44161441922187805, "learning_rate": 5.666389165442423e-06, "loss": 0.4003, "step": 2065 }, { "epoch": 1.535678889990089, "grad_norm": 0.4360629916191101, "learning_rate": 5.662101439807461e-06, "loss": 0.4005, "step": 2066 }, { "epoch": 1.5364222001982162, "grad_norm": 0.44629666209220886, "learning_rate": 5.65781321852633e-06, "loss": 0.4259, "step": 2067 }, { "epoch": 1.5371655104063429, "grad_norm": 0.425461083650589, "learning_rate": 5.653524504809175e-06, "loss": 0.3898, "step": 2068 }, { "epoch": 1.5379088206144698, "grad_norm": 0.4843215048313141, "learning_rate": 5.649235301866507e-06, "loss": 0.4259, "step": 2069 }, { "epoch": 1.5386521308225967, "grad_norm": 0.3940296173095703, "learning_rate": 5.644945612909204e-06, "loss": 0.3864, "step": 2070 }, { "epoch": 1.5393954410307233, "grad_norm": 0.4364657402038574, "learning_rate": 5.640655441148509e-06, "loss": 0.449, "step": 2071 }, { "epoch": 1.5401387512388505, "grad_norm": 0.4364665448665619, "learning_rate": 5.636364789796023e-06, "loss": 0.3887, "step": 2072 }, { "epoch": 1.5408820614469771, "grad_norm": 0.39972710609436035, "learning_rate": 5.632073662063711e-06, "loss": 0.4093, "step": 2073 }, { "epoch": 1.541625371655104, "grad_norm": 0.4331281781196594, "learning_rate": 5.62778206116389e-06, "loss": 0.432, "step": 2074 }, { "epoch": 1.542368681863231, "grad_norm": 0.44006749987602234, "learning_rate": 5.623489990309235e-06, "loss": 0.4043, "step": 2075 }, { "epoch": 1.5431119920713576, "grad_norm": 0.4676348865032196, "learning_rate": 5.619197452712769e-06, "loss": 0.4541, "step": 2076 }, { "epoch": 1.5438553022794848, "grad_norm": 0.41005393862724304, "learning_rate": 5.614904451587868e-06, "loss": 0.4099, "step": 2077 }, { "epoch": 1.5445986124876114, "grad_norm": 0.4057479500770569, "learning_rate": 5.610610990148253e-06, "loss": 0.3718, "step": 2078 }, { "epoch": 1.5453419226957383, "grad_norm": 0.4518084228038788, "learning_rate": 5.606317071607988e-06, "loss": 0.4416, "step": 2079 }, { "epoch": 1.5460852329038652, "grad_norm": 0.3796696662902832, "learning_rate": 5.6020226991814855e-06, "loss": 0.4178, "step": 2080 }, { "epoch": 1.546828543111992, "grad_norm": 0.4541732966899872, "learning_rate": 5.597727876083491e-06, "loss": 0.4347, "step": 2081 }, { "epoch": 1.547571853320119, "grad_norm": 0.4140625, "learning_rate": 5.593432605529087e-06, "loss": 0.3551, "step": 2082 }, { "epoch": 1.5483151635282457, "grad_norm": 0.4188442528247833, "learning_rate": 5.589136890733698e-06, "loss": 0.4314, "step": 2083 }, { "epoch": 1.5490584737363726, "grad_norm": 0.4254947602748871, "learning_rate": 5.584840734913075e-06, "loss": 0.4321, "step": 2084 }, { "epoch": 1.5498017839444995, "grad_norm": 0.40538832545280457, "learning_rate": 5.580544141283301e-06, "loss": 0.3858, "step": 2085 }, { "epoch": 1.5505450941526262, "grad_norm": 0.39302700757980347, "learning_rate": 5.5762471130607875e-06, "loss": 0.4177, "step": 2086 }, { "epoch": 1.5512884043607533, "grad_norm": 0.3892432451248169, "learning_rate": 5.5719496534622695e-06, "loss": 0.434, "step": 2087 }, { "epoch": 1.55203171456888, "grad_norm": 0.5014067888259888, "learning_rate": 5.567651765704807e-06, "loss": 0.446, "step": 2088 }, { "epoch": 1.552775024777007, "grad_norm": 0.4150072932243347, "learning_rate": 5.563353453005779e-06, "loss": 0.4139, "step": 2089 }, { "epoch": 1.5535183349851338, "grad_norm": 0.38799527287483215, "learning_rate": 5.559054718582882e-06, "loss": 0.3754, "step": 2090 }, { "epoch": 1.5542616451932605, "grad_norm": 0.4248225688934326, "learning_rate": 5.554755565654131e-06, "loss": 0.4105, "step": 2091 }, { "epoch": 1.5550049554013876, "grad_norm": 0.4428665041923523, "learning_rate": 5.550455997437852e-06, "loss": 0.4372, "step": 2092 }, { "epoch": 1.5557482656095143, "grad_norm": 0.3831041753292084, "learning_rate": 5.5461560171526815e-06, "loss": 0.4088, "step": 2093 }, { "epoch": 1.5564915758176412, "grad_norm": 0.378036767244339, "learning_rate": 5.541855628017568e-06, "loss": 0.3969, "step": 2094 }, { "epoch": 1.557234886025768, "grad_norm": 0.4373740553855896, "learning_rate": 5.537554833251761e-06, "loss": 0.4041, "step": 2095 }, { "epoch": 1.557978196233895, "grad_norm": 0.4257183074951172, "learning_rate": 5.5332536360748176e-06, "loss": 0.4369, "step": 2096 }, { "epoch": 1.558721506442022, "grad_norm": 0.4269067645072937, "learning_rate": 5.528952039706594e-06, "loss": 0.4379, "step": 2097 }, { "epoch": 1.5594648166501486, "grad_norm": 0.46070247888565063, "learning_rate": 5.524650047367247e-06, "loss": 0.3947, "step": 2098 }, { "epoch": 1.5602081268582755, "grad_norm": 0.4246279299259186, "learning_rate": 5.520347662277227e-06, "loss": 0.387, "step": 2099 }, { "epoch": 1.5609514370664024, "grad_norm": 0.42073190212249756, "learning_rate": 5.516044887657281e-06, "loss": 0.4049, "step": 2100 }, { "epoch": 1.5616947472745293, "grad_norm": 0.46981826424598694, "learning_rate": 5.5117417267284475e-06, "loss": 0.3926, "step": 2101 }, { "epoch": 1.5624380574826562, "grad_norm": 0.4356699287891388, "learning_rate": 5.507438182712051e-06, "loss": 0.4165, "step": 2102 }, { "epoch": 1.5631813676907829, "grad_norm": 0.4305039048194885, "learning_rate": 5.503134258829709e-06, "loss": 0.4275, "step": 2103 }, { "epoch": 1.5639246778989098, "grad_norm": 0.35729438066482544, "learning_rate": 5.498829958303318e-06, "loss": 0.4183, "step": 2104 }, { "epoch": 1.5646679881070367, "grad_norm": 0.3918493986129761, "learning_rate": 5.494525284355057e-06, "loss": 0.4272, "step": 2105 }, { "epoch": 1.5654112983151636, "grad_norm": 0.4214198887348175, "learning_rate": 5.490220240207386e-06, "loss": 0.4227, "step": 2106 }, { "epoch": 1.5661546085232905, "grad_norm": 0.382312536239624, "learning_rate": 5.485914829083044e-06, "loss": 0.4062, "step": 2107 }, { "epoch": 1.5668979187314171, "grad_norm": 0.3888411223888397, "learning_rate": 5.481609054205038e-06, "loss": 0.4092, "step": 2108 }, { "epoch": 1.567641228939544, "grad_norm": 0.4384445548057556, "learning_rate": 5.4773029187966555e-06, "loss": 0.4162, "step": 2109 }, { "epoch": 1.568384539147671, "grad_norm": 0.4139658510684967, "learning_rate": 5.472996426081449e-06, "loss": 0.4213, "step": 2110 }, { "epoch": 1.5691278493557979, "grad_norm": 0.3897598385810852, "learning_rate": 5.46868957928324e-06, "loss": 0.4124, "step": 2111 }, { "epoch": 1.5698711595639248, "grad_norm": 0.4600202441215515, "learning_rate": 5.464382381626113e-06, "loss": 0.4324, "step": 2112 }, { "epoch": 1.5706144697720514, "grad_norm": 0.43716612458229065, "learning_rate": 5.460074836334419e-06, "loss": 0.4299, "step": 2113 }, { "epoch": 1.5713577799801786, "grad_norm": 0.3979136347770691, "learning_rate": 5.455766946632767e-06, "loss": 0.3759, "step": 2114 }, { "epoch": 1.5721010901883052, "grad_norm": 0.43478575348854065, "learning_rate": 5.451458715746022e-06, "loss": 0.4168, "step": 2115 }, { "epoch": 1.5728444003964321, "grad_norm": 0.3912346065044403, "learning_rate": 5.447150146899308e-06, "loss": 0.3829, "step": 2116 }, { "epoch": 1.573587710604559, "grad_norm": 0.38430848717689514, "learning_rate": 5.442841243318e-06, "loss": 0.416, "step": 2117 }, { "epoch": 1.5743310208126857, "grad_norm": 0.4861869215965271, "learning_rate": 5.438532008227723e-06, "loss": 0.4216, "step": 2118 }, { "epoch": 1.5750743310208128, "grad_norm": 0.4318416118621826, "learning_rate": 5.434222444854352e-06, "loss": 0.4379, "step": 2119 }, { "epoch": 1.5758176412289395, "grad_norm": 0.3870198726654053, "learning_rate": 5.429912556424007e-06, "loss": 0.3734, "step": 2120 }, { "epoch": 1.5765609514370664, "grad_norm": 0.40975457429885864, "learning_rate": 5.4256023461630505e-06, "loss": 0.4511, "step": 2121 }, { "epoch": 1.5773042616451933, "grad_norm": 0.36713525652885437, "learning_rate": 5.421291817298086e-06, "loss": 0.376, "step": 2122 }, { "epoch": 1.57804757185332, "grad_norm": 0.44694721698760986, "learning_rate": 5.416980973055958e-06, "loss": 0.4631, "step": 2123 }, { "epoch": 1.5787908820614471, "grad_norm": 0.3660389184951782, "learning_rate": 5.412669816663744e-06, "loss": 0.3408, "step": 2124 }, { "epoch": 1.5795341922695738, "grad_norm": 0.4522136449813843, "learning_rate": 5.408358351348755e-06, "loss": 0.4844, "step": 2125 }, { "epoch": 1.5802775024777007, "grad_norm": 0.4416705369949341, "learning_rate": 5.404046580338536e-06, "loss": 0.3984, "step": 2126 }, { "epoch": 1.5810208126858276, "grad_norm": 0.4651661217212677, "learning_rate": 5.3997345068608605e-06, "loss": 0.4148, "step": 2127 }, { "epoch": 1.5817641228939543, "grad_norm": 0.45959070324897766, "learning_rate": 5.395422134143726e-06, "loss": 0.4316, "step": 2128 }, { "epoch": 1.5825074331020814, "grad_norm": 0.4436878263950348, "learning_rate": 5.391109465415354e-06, "loss": 0.4124, "step": 2129 }, { "epoch": 1.583250743310208, "grad_norm": 0.44854286313056946, "learning_rate": 5.386796503904194e-06, "loss": 0.4021, "step": 2130 }, { "epoch": 1.583994053518335, "grad_norm": 0.4250687062740326, "learning_rate": 5.3824832528389035e-06, "loss": 0.442, "step": 2131 }, { "epoch": 1.584737363726462, "grad_norm": 0.3944947421550751, "learning_rate": 5.378169715448366e-06, "loss": 0.3777, "step": 2132 }, { "epoch": 1.5854806739345886, "grad_norm": 0.43662506341934204, "learning_rate": 5.373855894961674e-06, "loss": 0.3806, "step": 2133 }, { "epoch": 1.5862239841427157, "grad_norm": 0.46514639258384705, "learning_rate": 5.36954179460814e-06, "loss": 0.469, "step": 2134 }, { "epoch": 1.5869672943508424, "grad_norm": 0.3872693181037903, "learning_rate": 5.365227417617273e-06, "loss": 0.3909, "step": 2135 }, { "epoch": 1.5877106045589693, "grad_norm": 0.46188077330589294, "learning_rate": 5.360912767218801e-06, "loss": 0.4046, "step": 2136 }, { "epoch": 1.5884539147670962, "grad_norm": 0.43129363656044006, "learning_rate": 5.35659784664265e-06, "loss": 0.3945, "step": 2137 }, { "epoch": 1.5891972249752229, "grad_norm": 0.39116406440734863, "learning_rate": 5.352282659118952e-06, "loss": 0.4465, "step": 2138 }, { "epoch": 1.58994053518335, "grad_norm": 0.41459301114082336, "learning_rate": 5.347967207878033e-06, "loss": 0.3879, "step": 2139 }, { "epoch": 1.5906838453914767, "grad_norm": 0.4543687701225281, "learning_rate": 5.343651496150425e-06, "loss": 0.4233, "step": 2140 }, { "epoch": 1.5914271555996036, "grad_norm": 0.36266592144966125, "learning_rate": 5.3393355271668465e-06, "loss": 0.3782, "step": 2141 }, { "epoch": 1.5921704658077305, "grad_norm": 0.4225417375564575, "learning_rate": 5.335019304158217e-06, "loss": 0.4243, "step": 2142 }, { "epoch": 1.5929137760158572, "grad_norm": 0.4929162561893463, "learning_rate": 5.330702830355638e-06, "loss": 0.4167, "step": 2143 }, { "epoch": 1.5936570862239843, "grad_norm": 0.4026085138320923, "learning_rate": 5.326386108990404e-06, "loss": 0.3662, "step": 2144 }, { "epoch": 1.594400396432111, "grad_norm": 0.4433020055294037, "learning_rate": 5.3220691432939905e-06, "loss": 0.4029, "step": 2145 }, { "epoch": 1.5951437066402379, "grad_norm": 0.5278528332710266, "learning_rate": 5.317751936498063e-06, "loss": 0.391, "step": 2146 }, { "epoch": 1.5958870168483648, "grad_norm": 0.40749067068099976, "learning_rate": 5.313434491834463e-06, "loss": 0.4645, "step": 2147 }, { "epoch": 1.5966303270564914, "grad_norm": 0.3522327244281769, "learning_rate": 5.3091168125352055e-06, "loss": 0.3467, "step": 2148 }, { "epoch": 1.5973736372646186, "grad_norm": 0.5018125772476196, "learning_rate": 5.304798901832488e-06, "loss": 0.4514, "step": 2149 }, { "epoch": 1.5981169474727452, "grad_norm": 0.4250851571559906, "learning_rate": 5.300480762958683e-06, "loss": 0.3788, "step": 2150 }, { "epoch": 1.5988602576808721, "grad_norm": 0.40425845980644226, "learning_rate": 5.296162399146325e-06, "loss": 0.43, "step": 2151 }, { "epoch": 1.599603567888999, "grad_norm": 0.46340930461883545, "learning_rate": 5.2918438136281235e-06, "loss": 0.4298, "step": 2152 }, { "epoch": 1.6003468780971257, "grad_norm": 0.41610080003738403, "learning_rate": 5.287525009636951e-06, "loss": 0.3892, "step": 2153 }, { "epoch": 1.6010901883052528, "grad_norm": 0.44058769941329956, "learning_rate": 5.283205990405849e-06, "loss": 0.4403, "step": 2154 }, { "epoch": 1.6018334985133795, "grad_norm": 0.4151749610900879, "learning_rate": 5.27888675916801e-06, "loss": 0.4053, "step": 2155 }, { "epoch": 1.6025768087215064, "grad_norm": 0.4595803916454315, "learning_rate": 5.274567319156795e-06, "loss": 0.4025, "step": 2156 }, { "epoch": 1.6033201189296333, "grad_norm": 0.4769136607646942, "learning_rate": 5.270247673605717e-06, "loss": 0.4503, "step": 2157 }, { "epoch": 1.60406342913776, "grad_norm": 0.42937713861465454, "learning_rate": 5.265927825748444e-06, "loss": 0.3808, "step": 2158 }, { "epoch": 1.6048067393458871, "grad_norm": 0.4367390275001526, "learning_rate": 5.261607778818791e-06, "loss": 0.4229, "step": 2159 }, { "epoch": 1.6055500495540138, "grad_norm": 0.44080793857574463, "learning_rate": 5.257287536050729e-06, "loss": 0.4222, "step": 2160 }, { "epoch": 1.6062933597621407, "grad_norm": 0.4327227771282196, "learning_rate": 5.252967100678373e-06, "loss": 0.3809, "step": 2161 }, { "epoch": 1.6070366699702676, "grad_norm": 0.4301125407218933, "learning_rate": 5.2486464759359775e-06, "loss": 0.4286, "step": 2162 }, { "epoch": 1.6077799801783943, "grad_norm": 0.4050564765930176, "learning_rate": 5.244325665057947e-06, "loss": 0.376, "step": 2163 }, { "epoch": 1.6085232903865214, "grad_norm": 0.4453825056552887, "learning_rate": 5.24000467127882e-06, "loss": 0.3914, "step": 2164 }, { "epoch": 1.609266600594648, "grad_norm": 0.4382590055465698, "learning_rate": 5.235683497833272e-06, "loss": 0.4698, "step": 2165 }, { "epoch": 1.610009910802775, "grad_norm": 0.4083161950111389, "learning_rate": 5.231362147956115e-06, "loss": 0.4253, "step": 2166 }, { "epoch": 1.610753221010902, "grad_norm": 0.42056047916412354, "learning_rate": 5.227040624882293e-06, "loss": 0.4395, "step": 2167 }, { "epoch": 1.6114965312190286, "grad_norm": 0.39196091890335083, "learning_rate": 5.2227189318468756e-06, "loss": 0.3872, "step": 2168 }, { "epoch": 1.6122398414271557, "grad_norm": 0.4113222062587738, "learning_rate": 5.218397072085065e-06, "loss": 0.3892, "step": 2169 }, { "epoch": 1.6129831516352824, "grad_norm": 0.4262596368789673, "learning_rate": 5.214075048832188e-06, "loss": 0.4213, "step": 2170 }, { "epoch": 1.6137264618434093, "grad_norm": 0.4097088575363159, "learning_rate": 5.209752865323689e-06, "loss": 0.3705, "step": 2171 }, { "epoch": 1.6144697720515362, "grad_norm": 0.42867326736450195, "learning_rate": 5.205430524795133e-06, "loss": 0.4497, "step": 2172 }, { "epoch": 1.6152130822596629, "grad_norm": 0.39015012979507446, "learning_rate": 5.2011080304822105e-06, "loss": 0.4131, "step": 2173 }, { "epoch": 1.61595639246779, "grad_norm": 0.3832240104675293, "learning_rate": 5.196785385620718e-06, "loss": 0.4336, "step": 2174 }, { "epoch": 1.6166997026759167, "grad_norm": 0.4399740695953369, "learning_rate": 5.192462593446569e-06, "loss": 0.4295, "step": 2175 }, { "epoch": 1.6174430128840436, "grad_norm": 0.4347476065158844, "learning_rate": 5.1881396571957846e-06, "loss": 0.4084, "step": 2176 }, { "epoch": 1.6181863230921705, "grad_norm": 0.4226624071598053, "learning_rate": 5.1838165801044974e-06, "loss": 0.4148, "step": 2177 }, { "epoch": 1.6189296333002974, "grad_norm": 0.4204042851924896, "learning_rate": 5.179493365408943e-06, "loss": 0.4243, "step": 2178 }, { "epoch": 1.6196729435084243, "grad_norm": 0.36873993277549744, "learning_rate": 5.1751700163454595e-06, "loss": 0.4165, "step": 2179 }, { "epoch": 1.620416253716551, "grad_norm": 0.4046187698841095, "learning_rate": 5.170846536150489e-06, "loss": 0.4087, "step": 2180 }, { "epoch": 1.6211595639246779, "grad_norm": 0.46586260199546814, "learning_rate": 5.166522928060567e-06, "loss": 0.4501, "step": 2181 }, { "epoch": 1.6219028741328048, "grad_norm": 0.40411660075187683, "learning_rate": 5.162199195312327e-06, "loss": 0.4536, "step": 2182 }, { "epoch": 1.6226461843409317, "grad_norm": 0.37741678953170776, "learning_rate": 5.1578753411424985e-06, "loss": 0.3658, "step": 2183 }, { "epoch": 1.6233894945490586, "grad_norm": 0.42570361495018005, "learning_rate": 5.153551368787898e-06, "loss": 0.4277, "step": 2184 }, { "epoch": 1.6241328047571852, "grad_norm": 0.39834660291671753, "learning_rate": 5.14922728148543e-06, "loss": 0.3894, "step": 2185 }, { "epoch": 1.6248761149653121, "grad_norm": 0.45791807770729065, "learning_rate": 5.14490308247209e-06, "loss": 0.4317, "step": 2186 }, { "epoch": 1.625619425173439, "grad_norm": 0.4148871898651123, "learning_rate": 5.140578774984951e-06, "loss": 0.4062, "step": 2187 }, { "epoch": 1.626362735381566, "grad_norm": 0.38540077209472656, "learning_rate": 5.136254362261173e-06, "loss": 0.3408, "step": 2188 }, { "epoch": 1.6271060455896929, "grad_norm": 0.4043302834033966, "learning_rate": 5.13192984753799e-06, "loss": 0.4648, "step": 2189 }, { "epoch": 1.6278493557978195, "grad_norm": 0.42527738213539124, "learning_rate": 5.127605234052716e-06, "loss": 0.4192, "step": 2190 }, { "epoch": 1.6285926660059464, "grad_norm": 0.38460883498191833, "learning_rate": 5.123280525042735e-06, "loss": 0.412, "step": 2191 }, { "epoch": 1.6293359762140733, "grad_norm": 0.38325172662734985, "learning_rate": 5.118955723745504e-06, "loss": 0.4074, "step": 2192 }, { "epoch": 1.6300792864222002, "grad_norm": 0.41759416460990906, "learning_rate": 5.114630833398555e-06, "loss": 0.4012, "step": 2193 }, { "epoch": 1.6308225966303271, "grad_norm": 0.3485237956047058, "learning_rate": 5.110305857239478e-06, "loss": 0.4122, "step": 2194 }, { "epoch": 1.6315659068384538, "grad_norm": 0.3935968279838562, "learning_rate": 5.105980798505929e-06, "loss": 0.3999, "step": 2195 }, { "epoch": 1.6323092170465807, "grad_norm": 0.36025726795196533, "learning_rate": 5.101655660435632e-06, "loss": 0.3574, "step": 2196 }, { "epoch": 1.6330525272547076, "grad_norm": 0.3888029158115387, "learning_rate": 5.097330446266363e-06, "loss": 0.4167, "step": 2197 }, { "epoch": 1.6337958374628345, "grad_norm": 0.390180766582489, "learning_rate": 5.093005159235958e-06, "loss": 0.4558, "step": 2198 }, { "epoch": 1.6345391476709614, "grad_norm": 0.3582594096660614, "learning_rate": 5.088679802582306e-06, "loss": 0.382, "step": 2199 }, { "epoch": 1.635282457879088, "grad_norm": 0.4142029285430908, "learning_rate": 5.0843543795433545e-06, "loss": 0.4369, "step": 2200 }, { "epoch": 1.6360257680872152, "grad_norm": 0.4430461525917053, "learning_rate": 5.0800288933570915e-06, "loss": 0.44, "step": 2201 }, { "epoch": 1.636769078295342, "grad_norm": 0.3676953613758087, "learning_rate": 5.0757033472615595e-06, "loss": 0.4081, "step": 2202 }, { "epoch": 1.6375123885034688, "grad_norm": 0.4515434205532074, "learning_rate": 5.07137774449484e-06, "loss": 0.4545, "step": 2203 }, { "epoch": 1.6382556987115957, "grad_norm": 0.3695361018180847, "learning_rate": 5.067052088295061e-06, "loss": 0.3422, "step": 2204 }, { "epoch": 1.6389990089197224, "grad_norm": 0.38656869530677795, "learning_rate": 5.062726381900389e-06, "loss": 0.3988, "step": 2205 }, { "epoch": 1.6397423191278495, "grad_norm": 0.41857606172561646, "learning_rate": 5.0584006285490305e-06, "loss": 0.3857, "step": 2206 }, { "epoch": 1.6404856293359762, "grad_norm": 0.4160768389701843, "learning_rate": 5.054074831479222e-06, "loss": 0.4137, "step": 2207 }, { "epoch": 1.641228939544103, "grad_norm": 0.4356304109096527, "learning_rate": 5.049748993929238e-06, "loss": 0.4714, "step": 2208 }, { "epoch": 1.64197224975223, "grad_norm": 0.43891581892967224, "learning_rate": 5.04542311913738e-06, "loss": 0.455, "step": 2209 }, { "epoch": 1.6427155599603567, "grad_norm": 0.42782747745513916, "learning_rate": 5.041097210341978e-06, "loss": 0.3825, "step": 2210 }, { "epoch": 1.6434588701684838, "grad_norm": 0.4465388357639313, "learning_rate": 5.0367712707813896e-06, "loss": 0.42, "step": 2211 }, { "epoch": 1.6442021803766105, "grad_norm": 0.43098685145378113, "learning_rate": 5.0324453036939936e-06, "loss": 0.3954, "step": 2212 }, { "epoch": 1.6449454905847374, "grad_norm": 0.4337514042854309, "learning_rate": 5.028119312318187e-06, "loss": 0.363, "step": 2213 }, { "epoch": 1.6456888007928643, "grad_norm": 0.4142025113105774, "learning_rate": 5.023793299892391e-06, "loss": 0.4181, "step": 2214 }, { "epoch": 1.646432111000991, "grad_norm": 0.4291398525238037, "learning_rate": 5.0194672696550366e-06, "loss": 0.433, "step": 2215 }, { "epoch": 1.647175421209118, "grad_norm": 0.4361360967159271, "learning_rate": 5.0151412248445716e-06, "loss": 0.4035, "step": 2216 }, { "epoch": 1.6479187314172448, "grad_norm": 0.43298181891441345, "learning_rate": 5.010815168699455e-06, "loss": 0.4359, "step": 2217 }, { "epoch": 1.6486620416253717, "grad_norm": 0.43526414036750793, "learning_rate": 5.006489104458149e-06, "loss": 0.4035, "step": 2218 }, { "epoch": 1.6494053518334986, "grad_norm": 0.4031701385974884, "learning_rate": 5.002163035359129e-06, "loss": 0.421, "step": 2219 }, { "epoch": 1.6501486620416252, "grad_norm": 0.44850027561187744, "learning_rate": 4.9978369646408716e-06, "loss": 0.4131, "step": 2220 }, { "epoch": 1.6508919722497524, "grad_norm": 0.41679391264915466, "learning_rate": 4.993510895541852e-06, "loss": 0.4368, "step": 2221 }, { "epoch": 1.651635282457879, "grad_norm": 0.40705057978630066, "learning_rate": 4.989184831300547e-06, "loss": 0.3757, "step": 2222 }, { "epoch": 1.652378592666006, "grad_norm": 0.48560595512390137, "learning_rate": 4.98485877515543e-06, "loss": 0.4483, "step": 2223 }, { "epoch": 1.6531219028741329, "grad_norm": 0.4500993490219116, "learning_rate": 4.980532730344965e-06, "loss": 0.4232, "step": 2224 }, { "epoch": 1.6538652130822595, "grad_norm": 0.38134971261024475, "learning_rate": 4.9762067001076105e-06, "loss": 0.3919, "step": 2225 }, { "epoch": 1.6546085232903867, "grad_norm": 0.4851958453655243, "learning_rate": 4.971880687681814e-06, "loss": 0.4788, "step": 2226 }, { "epoch": 1.6553518334985133, "grad_norm": 0.38186854124069214, "learning_rate": 4.96755469630601e-06, "loss": 0.3832, "step": 2227 }, { "epoch": 1.6560951437066402, "grad_norm": 0.41007357835769653, "learning_rate": 4.9632287292186096e-06, "loss": 0.4033, "step": 2228 }, { "epoch": 1.6568384539147671, "grad_norm": 0.40079402923583984, "learning_rate": 4.958902789658022e-06, "loss": 0.3797, "step": 2229 }, { "epoch": 1.6575817641228938, "grad_norm": 0.49411338567733765, "learning_rate": 4.954576880862622e-06, "loss": 0.4456, "step": 2230 }, { "epoch": 1.658325074331021, "grad_norm": 0.4250369071960449, "learning_rate": 4.950251006070764e-06, "loss": 0.4103, "step": 2231 }, { "epoch": 1.6590683845391476, "grad_norm": 0.5160863995552063, "learning_rate": 4.945925168520779e-06, "loss": 0.4639, "step": 2232 }, { "epoch": 1.6598116947472745, "grad_norm": 0.4620717465877533, "learning_rate": 4.941599371450971e-06, "loss": 0.4113, "step": 2233 }, { "epoch": 1.6605550049554014, "grad_norm": 0.4980165362358093, "learning_rate": 4.937273618099613e-06, "loss": 0.4388, "step": 2234 }, { "epoch": 1.661298315163528, "grad_norm": 0.4594234526157379, "learning_rate": 4.9329479117049394e-06, "loss": 0.3731, "step": 2235 }, { "epoch": 1.6620416253716552, "grad_norm": 0.522476077079773, "learning_rate": 4.928622255505162e-06, "loss": 0.3908, "step": 2236 }, { "epoch": 1.662784935579782, "grad_norm": 0.4189070761203766, "learning_rate": 4.924296652738442e-06, "loss": 0.3781, "step": 2237 }, { "epoch": 1.6635282457879088, "grad_norm": 0.4641875624656677, "learning_rate": 4.919971106642909e-06, "loss": 0.4576, "step": 2238 }, { "epoch": 1.6642715559960357, "grad_norm": 0.49737483263015747, "learning_rate": 4.915645620456646e-06, "loss": 0.3888, "step": 2239 }, { "epoch": 1.6650148662041624, "grad_norm": 0.45880579948425293, "learning_rate": 4.911320197417695e-06, "loss": 0.4091, "step": 2240 }, { "epoch": 1.6657581764122895, "grad_norm": 0.37169110774993896, "learning_rate": 4.906994840764045e-06, "loss": 0.398, "step": 2241 }, { "epoch": 1.6665014866204162, "grad_norm": 0.467021107673645, "learning_rate": 4.90266955373364e-06, "loss": 0.4192, "step": 2242 }, { "epoch": 1.667244796828543, "grad_norm": 0.4364836812019348, "learning_rate": 4.898344339564369e-06, "loss": 0.4353, "step": 2243 }, { "epoch": 1.66798810703667, "grad_norm": 0.3617345094680786, "learning_rate": 4.8940192014940715e-06, "loss": 0.3745, "step": 2244 }, { "epoch": 1.6687314172447967, "grad_norm": 0.4784591495990753, "learning_rate": 4.889694142760523e-06, "loss": 0.4852, "step": 2245 }, { "epoch": 1.6694747274529238, "grad_norm": 0.39927008748054504, "learning_rate": 4.885369166601446e-06, "loss": 0.4293, "step": 2246 }, { "epoch": 1.6702180376610505, "grad_norm": 0.392924427986145, "learning_rate": 4.881044276254497e-06, "loss": 0.3904, "step": 2247 }, { "epoch": 1.6709613478691774, "grad_norm": 0.42454683780670166, "learning_rate": 4.876719474957268e-06, "loss": 0.388, "step": 2248 }, { "epoch": 1.6717046580773043, "grad_norm": 0.43385177850723267, "learning_rate": 4.872394765947287e-06, "loss": 0.4267, "step": 2249 }, { "epoch": 1.672447968285431, "grad_norm": 0.427409291267395, "learning_rate": 4.868070152462013e-06, "loss": 0.4349, "step": 2250 }, { "epoch": 1.673191278493558, "grad_norm": 0.43509846925735474, "learning_rate": 4.863745637738828e-06, "loss": 0.399, "step": 2251 }, { "epoch": 1.6739345887016848, "grad_norm": 0.4348894953727722, "learning_rate": 4.859421225015048e-06, "loss": 0.434, "step": 2252 }, { "epoch": 1.6746778989098117, "grad_norm": 0.4135676920413971, "learning_rate": 4.855096917527912e-06, "loss": 0.4193, "step": 2253 }, { "epoch": 1.6754212091179386, "grad_norm": 0.4014666974544525, "learning_rate": 4.8507727185145705e-06, "loss": 0.3726, "step": 2254 }, { "epoch": 1.6761645193260652, "grad_norm": 0.4325551688671112, "learning_rate": 4.846448631212104e-06, "loss": 0.3905, "step": 2255 }, { "epoch": 1.6769078295341924, "grad_norm": 0.46552571654319763, "learning_rate": 4.842124658857502e-06, "loss": 0.4311, "step": 2256 }, { "epoch": 1.677651139742319, "grad_norm": 0.4013361632823944, "learning_rate": 4.837800804687674e-06, "loss": 0.4102, "step": 2257 }, { "epoch": 1.678394449950446, "grad_norm": 0.4491037428379059, "learning_rate": 4.833477071939436e-06, "loss": 0.3965, "step": 2258 }, { "epoch": 1.6791377601585729, "grad_norm": 0.453352153301239, "learning_rate": 4.829153463849513e-06, "loss": 0.4256, "step": 2259 }, { "epoch": 1.6798810703666998, "grad_norm": 0.39380764961242676, "learning_rate": 4.824829983654541e-06, "loss": 0.3792, "step": 2260 }, { "epoch": 1.6806243805748267, "grad_norm": 0.4616633653640747, "learning_rate": 4.820506634591059e-06, "loss": 0.477, "step": 2261 }, { "epoch": 1.6813676907829533, "grad_norm": 0.4298695921897888, "learning_rate": 4.816183419895503e-06, "loss": 0.3881, "step": 2262 }, { "epoch": 1.6821110009910802, "grad_norm": 0.4957587718963623, "learning_rate": 4.811860342804218e-06, "loss": 0.4534, "step": 2263 }, { "epoch": 1.6828543111992071, "grad_norm": 0.39646533131599426, "learning_rate": 4.807537406553433e-06, "loss": 0.4391, "step": 2264 }, { "epoch": 1.683597621407334, "grad_norm": 0.37748798727989197, "learning_rate": 4.803214614379284e-06, "loss": 0.3859, "step": 2265 }, { "epoch": 1.684340931615461, "grad_norm": 0.4757707715034485, "learning_rate": 4.79889196951779e-06, "loss": 0.4566, "step": 2266 }, { "epoch": 1.6850842418235876, "grad_norm": 0.3949195444583893, "learning_rate": 4.794569475204867e-06, "loss": 0.3805, "step": 2267 }, { "epoch": 1.6858275520317145, "grad_norm": 0.37908071279525757, "learning_rate": 4.790247134676313e-06, "loss": 0.3973, "step": 2268 }, { "epoch": 1.6865708622398414, "grad_norm": 0.4571331739425659, "learning_rate": 4.785924951167813e-06, "loss": 0.4099, "step": 2269 }, { "epoch": 1.6873141724479683, "grad_norm": 0.4376804232597351, "learning_rate": 4.7816029279149356e-06, "loss": 0.435, "step": 2270 }, { "epoch": 1.6880574826560952, "grad_norm": 0.3794824481010437, "learning_rate": 4.777281068153125e-06, "loss": 0.4309, "step": 2271 }, { "epoch": 1.688800792864222, "grad_norm": 0.41324061155319214, "learning_rate": 4.772959375117709e-06, "loss": 0.4165, "step": 2272 }, { "epoch": 1.6895441030723488, "grad_norm": 0.44033780694007874, "learning_rate": 4.768637852043886e-06, "loss": 0.4006, "step": 2273 }, { "epoch": 1.6902874132804757, "grad_norm": 0.41549918055534363, "learning_rate": 4.764316502166728e-06, "loss": 0.458, "step": 2274 }, { "epoch": 1.6910307234886026, "grad_norm": 0.36153876781463623, "learning_rate": 4.75999532872118e-06, "loss": 0.3865, "step": 2275 }, { "epoch": 1.6917740336967295, "grad_norm": 0.447512686252594, "learning_rate": 4.7556743349420534e-06, "loss": 0.4165, "step": 2276 }, { "epoch": 1.6925173439048562, "grad_norm": 0.45725172758102417, "learning_rate": 4.751353524064023e-06, "loss": 0.4179, "step": 2277 }, { "epoch": 1.693260654112983, "grad_norm": 0.3750922679901123, "learning_rate": 4.747032899321629e-06, "loss": 0.4402, "step": 2278 }, { "epoch": 1.69400396432111, "grad_norm": 0.39037710428237915, "learning_rate": 4.742712463949272e-06, "loss": 0.4457, "step": 2279 }, { "epoch": 1.694747274529237, "grad_norm": 0.3849135637283325, "learning_rate": 4.738392221181211e-06, "loss": 0.361, "step": 2280 }, { "epoch": 1.6954905847373638, "grad_norm": 0.5384837985038757, "learning_rate": 4.73407217425156e-06, "loss": 0.4815, "step": 2281 }, { "epoch": 1.6962338949454905, "grad_norm": 0.40932977199554443, "learning_rate": 4.729752326394284e-06, "loss": 0.4058, "step": 2282 }, { "epoch": 1.6969772051536176, "grad_norm": 0.3760969340801239, "learning_rate": 4.725432680843206e-06, "loss": 0.3622, "step": 2283 }, { "epoch": 1.6977205153617443, "grad_norm": 0.3968771696090698, "learning_rate": 4.721113240831991e-06, "loss": 0.3876, "step": 2284 }, { "epoch": 1.6984638255698712, "grad_norm": 0.4327227771282196, "learning_rate": 4.716794009594153e-06, "loss": 0.4437, "step": 2285 }, { "epoch": 1.699207135777998, "grad_norm": 0.39265021681785583, "learning_rate": 4.7124749903630505e-06, "loss": 0.395, "step": 2286 }, { "epoch": 1.6999504459861248, "grad_norm": 0.39978349208831787, "learning_rate": 4.708156186371879e-06, "loss": 0.403, "step": 2287 }, { "epoch": 1.700693756194252, "grad_norm": 0.40435874462127686, "learning_rate": 4.703837600853678e-06, "loss": 0.383, "step": 2288 }, { "epoch": 1.7014370664023786, "grad_norm": 0.4386213719844818, "learning_rate": 4.699519237041318e-06, "loss": 0.4245, "step": 2289 }, { "epoch": 1.7021803766105055, "grad_norm": 0.3730771541595459, "learning_rate": 4.695201098167512e-06, "loss": 0.4435, "step": 2290 }, { "epoch": 1.7029236868186324, "grad_norm": 0.39570382237434387, "learning_rate": 4.690883187464796e-06, "loss": 0.4061, "step": 2291 }, { "epoch": 1.703666997026759, "grad_norm": 0.4041057825088501, "learning_rate": 4.686565508165539e-06, "loss": 0.3976, "step": 2292 }, { "epoch": 1.7044103072348862, "grad_norm": 0.40635159611701965, "learning_rate": 4.682248063501938e-06, "loss": 0.4126, "step": 2293 }, { "epoch": 1.7051536174430129, "grad_norm": 0.3778231143951416, "learning_rate": 4.67793085670601e-06, "loss": 0.3508, "step": 2294 }, { "epoch": 1.7058969276511398, "grad_norm": 0.49317416548728943, "learning_rate": 4.673613891009599e-06, "loss": 0.4833, "step": 2295 }, { "epoch": 1.7066402378592667, "grad_norm": 0.4360116124153137, "learning_rate": 4.669297169644365e-06, "loss": 0.3966, "step": 2296 }, { "epoch": 1.7073835480673933, "grad_norm": 0.3960617780685425, "learning_rate": 4.664980695841784e-06, "loss": 0.4081, "step": 2297 }, { "epoch": 1.7081268582755205, "grad_norm": 0.4313351809978485, "learning_rate": 4.660664472833153e-06, "loss": 0.4408, "step": 2298 }, { "epoch": 1.7088701684836471, "grad_norm": 0.42066583037376404, "learning_rate": 4.656348503849577e-06, "loss": 0.3953, "step": 2299 }, { "epoch": 1.709613478691774, "grad_norm": 0.4811610281467438, "learning_rate": 4.6520327921219686e-06, "loss": 0.4451, "step": 2300 }, { "epoch": 1.710356788899901, "grad_norm": 0.40710368752479553, "learning_rate": 4.647717340881051e-06, "loss": 0.385, "step": 2301 }, { "epoch": 1.7111000991080276, "grad_norm": 0.4117491841316223, "learning_rate": 4.643402153357351e-06, "loss": 0.4241, "step": 2302 }, { "epoch": 1.7118434093161548, "grad_norm": 0.37708568572998047, "learning_rate": 4.639087232781201e-06, "loss": 0.4064, "step": 2303 }, { "epoch": 1.7125867195242814, "grad_norm": 0.4085550606250763, "learning_rate": 4.634772582382728e-06, "loss": 0.4119, "step": 2304 }, { "epoch": 1.7133300297324083, "grad_norm": 0.42342427372932434, "learning_rate": 4.630458205391862e-06, "loss": 0.4141, "step": 2305 }, { "epoch": 1.7140733399405352, "grad_norm": 0.3875895142555237, "learning_rate": 4.6261441050383256e-06, "loss": 0.4191, "step": 2306 }, { "epoch": 1.714816650148662, "grad_norm": 0.3952023386955261, "learning_rate": 4.621830284551636e-06, "loss": 0.4342, "step": 2307 }, { "epoch": 1.715559960356789, "grad_norm": 0.41098007559776306, "learning_rate": 4.617516747161098e-06, "loss": 0.4111, "step": 2308 }, { "epoch": 1.7163032705649157, "grad_norm": 0.4024622440338135, "learning_rate": 4.613203496095808e-06, "loss": 0.4386, "step": 2309 }, { "epoch": 1.7170465807730426, "grad_norm": 0.3850974440574646, "learning_rate": 4.608890534584647e-06, "loss": 0.4262, "step": 2310 }, { "epoch": 1.7177898909811695, "grad_norm": 0.3982198238372803, "learning_rate": 4.604577865856277e-06, "loss": 0.3915, "step": 2311 }, { "epoch": 1.7185332011892962, "grad_norm": 0.37293022871017456, "learning_rate": 4.600265493139142e-06, "loss": 0.3831, "step": 2312 }, { "epoch": 1.7192765113974233, "grad_norm": 0.4257054030895233, "learning_rate": 4.595953419661464e-06, "loss": 0.4354, "step": 2313 }, { "epoch": 1.72001982160555, "grad_norm": 0.38073110580444336, "learning_rate": 4.591641648651246e-06, "loss": 0.4056, "step": 2314 }, { "epoch": 1.720763131813677, "grad_norm": 0.4175257086753845, "learning_rate": 4.587330183336258e-06, "loss": 0.4449, "step": 2315 }, { "epoch": 1.7215064420218038, "grad_norm": 0.42345041036605835, "learning_rate": 4.583019026944045e-06, "loss": 0.4313, "step": 2316 }, { "epoch": 1.7222497522299305, "grad_norm": 0.35453447699546814, "learning_rate": 4.578708182701916e-06, "loss": 0.3701, "step": 2317 }, { "epoch": 1.7229930624380576, "grad_norm": 0.39999791979789734, "learning_rate": 4.574397653836952e-06, "loss": 0.4757, "step": 2318 }, { "epoch": 1.7237363726461843, "grad_norm": 0.38838568329811096, "learning_rate": 4.570087443575995e-06, "loss": 0.398, "step": 2319 }, { "epoch": 1.7244796828543112, "grad_norm": 0.3515084385871887, "learning_rate": 4.5657775551456486e-06, "loss": 0.3903, "step": 2320 }, { "epoch": 1.725222993062438, "grad_norm": 0.42430341243743896, "learning_rate": 4.561467991772278e-06, "loss": 0.3934, "step": 2321 }, { "epoch": 1.7259663032705648, "grad_norm": 0.4910987317562103, "learning_rate": 4.557158756682002e-06, "loss": 0.4488, "step": 2322 }, { "epoch": 1.726709613478692, "grad_norm": 0.3561475872993469, "learning_rate": 4.552849853100694e-06, "loss": 0.3831, "step": 2323 }, { "epoch": 1.7274529236868186, "grad_norm": 0.4117140471935272, "learning_rate": 4.5485412842539794e-06, "loss": 0.455, "step": 2324 }, { "epoch": 1.7281962338949455, "grad_norm": 0.40547409653663635, "learning_rate": 4.5442330533672345e-06, "loss": 0.3889, "step": 2325 }, { "epoch": 1.7289395441030724, "grad_norm": 0.3879833221435547, "learning_rate": 4.5399251636655835e-06, "loss": 0.3588, "step": 2326 }, { "epoch": 1.729682854311199, "grad_norm": 0.46359914541244507, "learning_rate": 4.535617618373889e-06, "loss": 0.4572, "step": 2327 }, { "epoch": 1.7304261645193262, "grad_norm": 0.46390601992607117, "learning_rate": 4.53131042071676e-06, "loss": 0.431, "step": 2328 }, { "epoch": 1.7311694747274529, "grad_norm": 0.3582051396369934, "learning_rate": 4.527003573918552e-06, "loss": 0.362, "step": 2329 }, { "epoch": 1.7319127849355798, "grad_norm": 0.44516998529434204, "learning_rate": 4.522697081203346e-06, "loss": 0.4703, "step": 2330 }, { "epoch": 1.7326560951437067, "grad_norm": 0.4461948573589325, "learning_rate": 4.518390945794964e-06, "loss": 0.4201, "step": 2331 }, { "epoch": 1.7333994053518333, "grad_norm": 0.4247529208660126, "learning_rate": 4.514085170916959e-06, "loss": 0.362, "step": 2332 }, { "epoch": 1.7341427155599605, "grad_norm": 0.39945584535598755, "learning_rate": 4.509779759792616e-06, "loss": 0.4258, "step": 2333 }, { "epoch": 1.7348860257680871, "grad_norm": 0.3945949673652649, "learning_rate": 4.5054747156449455e-06, "loss": 0.4313, "step": 2334 }, { "epoch": 1.735629335976214, "grad_norm": 0.42227864265441895, "learning_rate": 4.501170041696684e-06, "loss": 0.4181, "step": 2335 }, { "epoch": 1.736372646184341, "grad_norm": 0.43482717871665955, "learning_rate": 4.496865741170291e-06, "loss": 0.4556, "step": 2336 }, { "epoch": 1.7371159563924676, "grad_norm": 0.4166463315486908, "learning_rate": 4.492561817287949e-06, "loss": 0.4088, "step": 2337 }, { "epoch": 1.7378592666005948, "grad_norm": 0.3962649703025818, "learning_rate": 4.488258273271554e-06, "loss": 0.4052, "step": 2338 }, { "epoch": 1.7386025768087214, "grad_norm": 0.3853617012500763, "learning_rate": 4.483955112342721e-06, "loss": 0.3915, "step": 2339 }, { "epoch": 1.7393458870168483, "grad_norm": 0.41941678524017334, "learning_rate": 4.479652337722775e-06, "loss": 0.4172, "step": 2340 }, { "epoch": 1.7400891972249752, "grad_norm": 0.44487687945365906, "learning_rate": 4.475349952632755e-06, "loss": 0.3916, "step": 2341 }, { "epoch": 1.740832507433102, "grad_norm": 0.47390395402908325, "learning_rate": 4.471047960293407e-06, "loss": 0.4537, "step": 2342 }, { "epoch": 1.741575817641229, "grad_norm": 0.41293010115623474, "learning_rate": 4.466746363925182e-06, "loss": 0.3847, "step": 2343 }, { "epoch": 1.7423191278493557, "grad_norm": 0.48612338304519653, "learning_rate": 4.462445166748239e-06, "loss": 0.4516, "step": 2344 }, { "epoch": 1.7430624380574826, "grad_norm": 0.3733048737049103, "learning_rate": 4.4581443719824335e-06, "loss": 0.4094, "step": 2345 }, { "epoch": 1.7438057482656095, "grad_norm": 0.4243617653846741, "learning_rate": 4.453843982847319e-06, "loss": 0.4586, "step": 2346 }, { "epoch": 1.7445490584737364, "grad_norm": 0.428244411945343, "learning_rate": 4.449544002562149e-06, "loss": 0.4151, "step": 2347 }, { "epoch": 1.7452923686818633, "grad_norm": 0.38949277997016907, "learning_rate": 4.445244434345871e-06, "loss": 0.3806, "step": 2348 }, { "epoch": 1.74603567888999, "grad_norm": 0.47684165835380554, "learning_rate": 4.440945281417119e-06, "loss": 0.4226, "step": 2349 }, { "epoch": 1.746778989098117, "grad_norm": 0.4248225688934326, "learning_rate": 4.4366465469942245e-06, "loss": 0.4481, "step": 2350 }, { "epoch": 1.7475222993062438, "grad_norm": 0.41822585463523865, "learning_rate": 4.432348234295194e-06, "loss": 0.4184, "step": 2351 }, { "epoch": 1.7482656095143707, "grad_norm": 0.4726165533065796, "learning_rate": 4.428050346537731e-06, "loss": 0.4144, "step": 2352 }, { "epoch": 1.7490089197224976, "grad_norm": 0.35095781087875366, "learning_rate": 4.423752886939214e-06, "loss": 0.3739, "step": 2353 }, { "epoch": 1.7497522299306243, "grad_norm": 0.4208243787288666, "learning_rate": 4.4194558587167e-06, "loss": 0.415, "step": 2354 }, { "epoch": 1.7504955401387512, "grad_norm": 0.4147305488586426, "learning_rate": 4.415159265086926e-06, "loss": 0.4168, "step": 2355 }, { "epoch": 1.751238850346878, "grad_norm": 0.46193885803222656, "learning_rate": 4.410863109266304e-06, "loss": 0.4266, "step": 2356 }, { "epoch": 1.751982160555005, "grad_norm": 0.4180683195590973, "learning_rate": 4.406567394470915e-06, "loss": 0.4108, "step": 2357 }, { "epoch": 1.752725470763132, "grad_norm": 0.4351331293582916, "learning_rate": 4.4022721239165125e-06, "loss": 0.4096, "step": 2358 }, { "epoch": 1.7534687809712586, "grad_norm": 0.4668525457382202, "learning_rate": 4.397977300818515e-06, "loss": 0.4011, "step": 2359 }, { "epoch": 1.7542120911793855, "grad_norm": 0.417341023683548, "learning_rate": 4.393682928392011e-06, "loss": 0.4265, "step": 2360 }, { "epoch": 1.7549554013875124, "grad_norm": 0.3857533931732178, "learning_rate": 4.389389009851748e-06, "loss": 0.4057, "step": 2361 }, { "epoch": 1.7556987115956393, "grad_norm": 0.3960522413253784, "learning_rate": 4.385095548412134e-06, "loss": 0.3778, "step": 2362 }, { "epoch": 1.7564420218037662, "grad_norm": 0.4690462350845337, "learning_rate": 4.380802547287233e-06, "loss": 0.4391, "step": 2363 }, { "epoch": 1.7571853320118929, "grad_norm": 0.441043883562088, "learning_rate": 4.376510009690767e-06, "loss": 0.4451, "step": 2364 }, { "epoch": 1.75792864222002, "grad_norm": 0.3871294856071472, "learning_rate": 4.372217938836112e-06, "loss": 0.4166, "step": 2365 }, { "epoch": 1.7586719524281467, "grad_norm": 0.36544665694236755, "learning_rate": 4.36792633793629e-06, "loss": 0.406, "step": 2366 }, { "epoch": 1.7594152626362736, "grad_norm": 0.46386706829071045, "learning_rate": 4.363635210203978e-06, "loss": 0.4646, "step": 2367 }, { "epoch": 1.7601585728444005, "grad_norm": 0.36668410897254944, "learning_rate": 4.3593445588514926e-06, "loss": 0.3586, "step": 2368 }, { "epoch": 1.7609018830525272, "grad_norm": 0.4578559100627899, "learning_rate": 4.355054387090798e-06, "loss": 0.4424, "step": 2369 }, { "epoch": 1.7616451932606543, "grad_norm": 0.3851667642593384, "learning_rate": 4.350764698133495e-06, "loss": 0.3857, "step": 2370 }, { "epoch": 1.762388503468781, "grad_norm": 0.436710923910141, "learning_rate": 4.346475495190828e-06, "loss": 0.4519, "step": 2371 }, { "epoch": 1.7631318136769079, "grad_norm": 0.4153221845626831, "learning_rate": 4.342186781473672e-06, "loss": 0.403, "step": 2372 }, { "epoch": 1.7638751238850348, "grad_norm": 0.37711572647094727, "learning_rate": 4.337898560192542e-06, "loss": 0.3702, "step": 2373 }, { "epoch": 1.7646184340931614, "grad_norm": 0.3811671733856201, "learning_rate": 4.333610834557578e-06, "loss": 0.408, "step": 2374 }, { "epoch": 1.7653617443012886, "grad_norm": 0.3953961431980133, "learning_rate": 4.329323607778556e-06, "loss": 0.4302, "step": 2375 }, { "epoch": 1.7661050545094152, "grad_norm": 0.3952876031398773, "learning_rate": 4.325036883064872e-06, "loss": 0.3827, "step": 2376 }, { "epoch": 1.7668483647175421, "grad_norm": 0.3944253623485565, "learning_rate": 4.32075066362555e-06, "loss": 0.4128, "step": 2377 }, { "epoch": 1.767591674925669, "grad_norm": 0.376057893037796, "learning_rate": 4.316464952669234e-06, "loss": 0.4072, "step": 2378 }, { "epoch": 1.7683349851337957, "grad_norm": 0.41779452562332153, "learning_rate": 4.312179753404189e-06, "loss": 0.4414, "step": 2379 }, { "epoch": 1.7690782953419228, "grad_norm": 0.38898739218711853, "learning_rate": 4.307895069038296e-06, "loss": 0.3889, "step": 2380 }, { "epoch": 1.7698216055500495, "grad_norm": 0.42090553045272827, "learning_rate": 4.303610902779048e-06, "loss": 0.4434, "step": 2381 }, { "epoch": 1.7705649157581764, "grad_norm": 0.4166044592857361, "learning_rate": 4.299327257833555e-06, "loss": 0.4162, "step": 2382 }, { "epoch": 1.7713082259663033, "grad_norm": 0.3876262307167053, "learning_rate": 4.295044137408534e-06, "loss": 0.3923, "step": 2383 }, { "epoch": 1.77205153617443, "grad_norm": 0.40042349696159363, "learning_rate": 4.290761544710309e-06, "loss": 0.4062, "step": 2384 }, { "epoch": 1.7727948463825571, "grad_norm": 0.4465305209159851, "learning_rate": 4.286479482944812e-06, "loss": 0.4281, "step": 2385 }, { "epoch": 1.7735381565906838, "grad_norm": 0.3832791745662689, "learning_rate": 4.282197955317573e-06, "loss": 0.395, "step": 2386 }, { "epoch": 1.7742814667988107, "grad_norm": 0.42082563042640686, "learning_rate": 4.2779169650337235e-06, "loss": 0.4338, "step": 2387 }, { "epoch": 1.7750247770069376, "grad_norm": 0.34179967641830444, "learning_rate": 4.273636515297996e-06, "loss": 0.3788, "step": 2388 }, { "epoch": 1.7757680872150643, "grad_norm": 0.436021089553833, "learning_rate": 4.269356609314716e-06, "loss": 0.4566, "step": 2389 }, { "epoch": 1.7765113974231914, "grad_norm": 0.3983279764652252, "learning_rate": 4.265077250287797e-06, "loss": 0.3982, "step": 2390 }, { "epoch": 1.777254707631318, "grad_norm": 0.3713359534740448, "learning_rate": 4.260798441420754e-06, "loss": 0.405, "step": 2391 }, { "epoch": 1.777998017839445, "grad_norm": 0.3822384774684906, "learning_rate": 4.256520185916683e-06, "loss": 0.3772, "step": 2392 }, { "epoch": 1.778741328047572, "grad_norm": 0.46758437156677246, "learning_rate": 4.252242486978265e-06, "loss": 0.4657, "step": 2393 }, { "epoch": 1.7794846382556986, "grad_norm": 0.4823459982872009, "learning_rate": 4.2479653478077675e-06, "loss": 0.4713, "step": 2394 }, { "epoch": 1.7802279484638257, "grad_norm": 0.38685643672943115, "learning_rate": 4.243688771607038e-06, "loss": 0.3947, "step": 2395 }, { "epoch": 1.7809712586719524, "grad_norm": 0.3985477387905121, "learning_rate": 4.239412761577501e-06, "loss": 0.3891, "step": 2396 }, { "epoch": 1.7817145688800793, "grad_norm": 0.3997265100479126, "learning_rate": 4.235137320920157e-06, "loss": 0.4141, "step": 2397 }, { "epoch": 1.7824578790882062, "grad_norm": 0.4686138927936554, "learning_rate": 4.2308624528355855e-06, "loss": 0.4574, "step": 2398 }, { "epoch": 1.7832011892963329, "grad_norm": 0.34362855553627014, "learning_rate": 4.226588160523931e-06, "loss": 0.3294, "step": 2399 }, { "epoch": 1.78394449950446, "grad_norm": 0.420723557472229, "learning_rate": 4.222314447184911e-06, "loss": 0.4331, "step": 2400 }, { "epoch": 1.7846878097125867, "grad_norm": 0.4487960934638977, "learning_rate": 4.218041316017808e-06, "loss": 0.4261, "step": 2401 }, { "epoch": 1.7854311199207136, "grad_norm": 0.46004602313041687, "learning_rate": 4.213768770221468e-06, "loss": 0.384, "step": 2402 }, { "epoch": 1.7861744301288405, "grad_norm": 0.4284772276878357, "learning_rate": 4.209496812994299e-06, "loss": 0.4634, "step": 2403 }, { "epoch": 1.7869177403369672, "grad_norm": 0.4036477208137512, "learning_rate": 4.20522544753427e-06, "loss": 0.4377, "step": 2404 }, { "epoch": 1.7876610505450943, "grad_norm": 0.43693307042121887, "learning_rate": 4.200954677038905e-06, "loss": 0.4238, "step": 2405 }, { "epoch": 1.788404360753221, "grad_norm": 0.41561374068260193, "learning_rate": 4.196684504705286e-06, "loss": 0.3918, "step": 2406 }, { "epoch": 1.7891476709613479, "grad_norm": 0.4323495030403137, "learning_rate": 4.192414933730043e-06, "loss": 0.4233, "step": 2407 }, { "epoch": 1.7898909811694748, "grad_norm": 0.37901005148887634, "learning_rate": 4.188145967309359e-06, "loss": 0.4513, "step": 2408 }, { "epoch": 1.7906342913776014, "grad_norm": 0.39327335357666016, "learning_rate": 4.183877608638962e-06, "loss": 0.4505, "step": 2409 }, { "epoch": 1.7913776015857286, "grad_norm": 0.40934517979621887, "learning_rate": 4.179609860914126e-06, "loss": 0.42, "step": 2410 }, { "epoch": 1.7921209117938552, "grad_norm": 0.43494105339050293, "learning_rate": 4.175342727329667e-06, "loss": 0.4426, "step": 2411 }, { "epoch": 1.7928642220019821, "grad_norm": 0.3877786695957184, "learning_rate": 4.171076211079944e-06, "loss": 0.3908, "step": 2412 }, { "epoch": 1.793607532210109, "grad_norm": 0.3984208106994629, "learning_rate": 4.1668103153588505e-06, "loss": 0.432, "step": 2413 }, { "epoch": 1.7943508424182357, "grad_norm": 0.42250946164131165, "learning_rate": 4.162545043359818e-06, "loss": 0.4805, "step": 2414 }, { "epoch": 1.7950941526263628, "grad_norm": 0.42477330565452576, "learning_rate": 4.158280398275811e-06, "loss": 0.4043, "step": 2415 }, { "epoch": 1.7958374628344895, "grad_norm": 0.40224334597587585, "learning_rate": 4.1540163832993205e-06, "loss": 0.4494, "step": 2416 }, { "epoch": 1.7965807730426164, "grad_norm": 0.34055182337760925, "learning_rate": 4.149753001622372e-06, "loss": 0.3584, "step": 2417 }, { "epoch": 1.7973240832507433, "grad_norm": 0.3733159899711609, "learning_rate": 4.145490256436511e-06, "loss": 0.4213, "step": 2418 }, { "epoch": 1.79806739345887, "grad_norm": 0.3826321065425873, "learning_rate": 4.141228150932814e-06, "loss": 0.3793, "step": 2419 }, { "epoch": 1.7988107036669971, "grad_norm": 0.44171345233917236, "learning_rate": 4.136966688301866e-06, "loss": 0.4585, "step": 2420 }, { "epoch": 1.7995540138751238, "grad_norm": 0.3931301236152649, "learning_rate": 4.132705871733788e-06, "loss": 0.4404, "step": 2421 }, { "epoch": 1.8002973240832507, "grad_norm": 0.36934152245521545, "learning_rate": 4.128445704418202e-06, "loss": 0.3769, "step": 2422 }, { "epoch": 1.8010406342913776, "grad_norm": 0.35664981603622437, "learning_rate": 4.124186189544253e-06, "loss": 0.3767, "step": 2423 }, { "epoch": 1.8017839444995043, "grad_norm": 0.335744172334671, "learning_rate": 4.119927330300595e-06, "loss": 0.3781, "step": 2424 }, { "epoch": 1.8025272547076314, "grad_norm": 0.3964659571647644, "learning_rate": 4.1156691298753895e-06, "loss": 0.458, "step": 2425 }, { "epoch": 1.803270564915758, "grad_norm": 0.3590914309024811, "learning_rate": 4.111411591456306e-06, "loss": 0.4023, "step": 2426 }, { "epoch": 1.804013875123885, "grad_norm": 0.3618118464946747, "learning_rate": 4.10715471823052e-06, "loss": 0.3846, "step": 2427 }, { "epoch": 1.804757185332012, "grad_norm": 0.4127982258796692, "learning_rate": 4.102898513384705e-06, "loss": 0.4168, "step": 2428 }, { "epoch": 1.8055004955401388, "grad_norm": 0.3646968901157379, "learning_rate": 4.09864298010504e-06, "loss": 0.3796, "step": 2429 }, { "epoch": 1.8062438057482657, "grad_norm": 0.3878285586833954, "learning_rate": 4.094388121577197e-06, "loss": 0.4304, "step": 2430 }, { "epoch": 1.8069871159563924, "grad_norm": 0.37601253390312195, "learning_rate": 4.0901339409863435e-06, "loss": 0.4157, "step": 2431 }, { "epoch": 1.8077304261645193, "grad_norm": 0.37866705656051636, "learning_rate": 4.085880441517141e-06, "loss": 0.3868, "step": 2432 }, { "epoch": 1.8084737363726462, "grad_norm": 0.38868701457977295, "learning_rate": 4.0816276263537405e-06, "loss": 0.4431, "step": 2433 }, { "epoch": 1.809217046580773, "grad_norm": 0.4037517309188843, "learning_rate": 4.077375498679778e-06, "loss": 0.4467, "step": 2434 }, { "epoch": 1.8099603567889, "grad_norm": 0.38520053029060364, "learning_rate": 4.073124061678379e-06, "loss": 0.3905, "step": 2435 }, { "epoch": 1.8107036669970267, "grad_norm": 0.3713292181491852, "learning_rate": 4.0688733185321486e-06, "loss": 0.3993, "step": 2436 }, { "epoch": 1.8114469772051536, "grad_norm": 0.3958156108856201, "learning_rate": 4.064623272423177e-06, "loss": 0.4315, "step": 2437 }, { "epoch": 1.8121902874132805, "grad_norm": 0.38287490606307983, "learning_rate": 4.060373926533028e-06, "loss": 0.4157, "step": 2438 }, { "epoch": 1.8129335976214074, "grad_norm": 0.3950175344944, "learning_rate": 4.056125284042742e-06, "loss": 0.3875, "step": 2439 }, { "epoch": 1.8136769078295343, "grad_norm": 0.3922598958015442, "learning_rate": 4.051877348132836e-06, "loss": 0.4019, "step": 2440 }, { "epoch": 1.814420218037661, "grad_norm": 0.3987888693809509, "learning_rate": 4.047630121983292e-06, "loss": 0.3904, "step": 2441 }, { "epoch": 1.8151635282457879, "grad_norm": 0.4071506857872009, "learning_rate": 4.043383608773569e-06, "loss": 0.3973, "step": 2442 }, { "epoch": 1.8159068384539148, "grad_norm": 0.408986359834671, "learning_rate": 4.039137811682584e-06, "loss": 0.3936, "step": 2443 }, { "epoch": 1.8166501486620417, "grad_norm": 0.3797813653945923, "learning_rate": 4.034892733888721e-06, "loss": 0.3932, "step": 2444 }, { "epoch": 1.8173934588701686, "grad_norm": 0.3910747170448303, "learning_rate": 4.030648378569831e-06, "loss": 0.4142, "step": 2445 }, { "epoch": 1.8181367690782952, "grad_norm": 0.4121123254299164, "learning_rate": 4.026404748903217e-06, "loss": 0.3931, "step": 2446 }, { "epoch": 1.8188800792864221, "grad_norm": 0.4254898428916931, "learning_rate": 4.02216184806564e-06, "loss": 0.4383, "step": 2447 }, { "epoch": 1.819623389494549, "grad_norm": 0.44102412462234497, "learning_rate": 4.017919679233317e-06, "loss": 0.3855, "step": 2448 }, { "epoch": 1.820366699702676, "grad_norm": 0.4054432809352875, "learning_rate": 4.013678245581918e-06, "loss": 0.4254, "step": 2449 }, { "epoch": 1.8211100099108029, "grad_norm": 0.39295005798339844, "learning_rate": 4.00943755028656e-06, "loss": 0.3747, "step": 2450 }, { "epoch": 1.8218533201189295, "grad_norm": 0.4171927273273468, "learning_rate": 4.005197596521806e-06, "loss": 0.4362, "step": 2451 }, { "epoch": 1.8225966303270567, "grad_norm": 0.4123203158378601, "learning_rate": 4.0009583874616705e-06, "loss": 0.3917, "step": 2452 }, { "epoch": 1.8233399405351833, "grad_norm": 0.420540452003479, "learning_rate": 3.9967199262796054e-06, "loss": 0.4237, "step": 2453 }, { "epoch": 1.8240832507433102, "grad_norm": 0.3914930522441864, "learning_rate": 3.992482216148504e-06, "loss": 0.4178, "step": 2454 }, { "epoch": 1.8248265609514371, "grad_norm": 0.45156580209732056, "learning_rate": 3.988245260240695e-06, "loss": 0.431, "step": 2455 }, { "epoch": 1.8255698711595638, "grad_norm": 0.3621576130390167, "learning_rate": 3.984009061727946e-06, "loss": 0.3262, "step": 2456 }, { "epoch": 1.826313181367691, "grad_norm": 0.3961043655872345, "learning_rate": 3.979773623781456e-06, "loss": 0.4511, "step": 2457 }, { "epoch": 1.8270564915758176, "grad_norm": 0.4676054120063782, "learning_rate": 3.975538949571855e-06, "loss": 0.4042, "step": 2458 }, { "epoch": 1.8277998017839445, "grad_norm": 0.48152875900268555, "learning_rate": 3.9713050422692e-06, "loss": 0.447, "step": 2459 }, { "epoch": 1.8285431119920714, "grad_norm": 0.39356642961502075, "learning_rate": 3.9670719050429765e-06, "loss": 0.4085, "step": 2460 }, { "epoch": 1.829286422200198, "grad_norm": 0.35796216130256653, "learning_rate": 3.962839541062093e-06, "loss": 0.4124, "step": 2461 }, { "epoch": 1.8300297324083252, "grad_norm": 0.41410133242607117, "learning_rate": 3.958607953494875e-06, "loss": 0.3848, "step": 2462 }, { "epoch": 1.830773042616452, "grad_norm": 0.44007208943367004, "learning_rate": 3.954377145509073e-06, "loss": 0.3498, "step": 2463 }, { "epoch": 1.8315163528245788, "grad_norm": 0.4125143587589264, "learning_rate": 3.950147120271849e-06, "loss": 0.3915, "step": 2464 }, { "epoch": 1.8322596630327057, "grad_norm": 0.39900678396224976, "learning_rate": 3.9459178809497825e-06, "loss": 0.4357, "step": 2465 }, { "epoch": 1.8330029732408324, "grad_norm": 0.39099788665771484, "learning_rate": 3.941689430708861e-06, "loss": 0.3822, "step": 2466 }, { "epoch": 1.8337462834489595, "grad_norm": 0.4734610319137573, "learning_rate": 3.937461772714482e-06, "loss": 0.4296, "step": 2467 }, { "epoch": 1.8344895936570862, "grad_norm": 0.4313727915287018, "learning_rate": 3.9332349101314575e-06, "loss": 0.4416, "step": 2468 }, { "epoch": 1.835232903865213, "grad_norm": 0.45699480175971985, "learning_rate": 3.929008846123993e-06, "loss": 0.4205, "step": 2469 }, { "epoch": 1.83597621407334, "grad_norm": 0.3652292490005493, "learning_rate": 3.924783583855703e-06, "loss": 0.3721, "step": 2470 }, { "epoch": 1.8367195242814667, "grad_norm": 0.4090847671031952, "learning_rate": 3.920559126489598e-06, "loss": 0.4457, "step": 2471 }, { "epoch": 1.8374628344895938, "grad_norm": 0.3847059905529022, "learning_rate": 3.91633547718809e-06, "loss": 0.366, "step": 2472 }, { "epoch": 1.8382061446977205, "grad_norm": 0.41357359290122986, "learning_rate": 3.912112639112982e-06, "loss": 0.3594, "step": 2473 }, { "epoch": 1.8389494549058474, "grad_norm": 0.3891028165817261, "learning_rate": 3.907890615425472e-06, "loss": 0.3969, "step": 2474 }, { "epoch": 1.8396927651139743, "grad_norm": 0.4166219234466553, "learning_rate": 3.90366940928615e-06, "loss": 0.4223, "step": 2475 }, { "epoch": 1.840436075322101, "grad_norm": 0.4242033362388611, "learning_rate": 3.899449023854989e-06, "loss": 0.4466, "step": 2476 }, { "epoch": 1.841179385530228, "grad_norm": 0.38428786396980286, "learning_rate": 3.895229462291352e-06, "loss": 0.4011, "step": 2477 }, { "epoch": 1.8419226957383548, "grad_norm": 0.38183507323265076, "learning_rate": 3.8910107277539865e-06, "loss": 0.3843, "step": 2478 }, { "epoch": 1.8426660059464817, "grad_norm": 0.3977887034416199, "learning_rate": 3.886792823401014e-06, "loss": 0.4059, "step": 2479 }, { "epoch": 1.8434093161546086, "grad_norm": 0.3959232568740845, "learning_rate": 3.88257575238994e-06, "loss": 0.3712, "step": 2480 }, { "epoch": 1.8441526263627352, "grad_norm": 0.4431343376636505, "learning_rate": 3.878359517877646e-06, "loss": 0.4433, "step": 2481 }, { "epoch": 1.8448959365708624, "grad_norm": 0.3537960946559906, "learning_rate": 3.874144123020384e-06, "loss": 0.3623, "step": 2482 }, { "epoch": 1.845639246778989, "grad_norm": 0.41793856024742126, "learning_rate": 3.8699295709737824e-06, "loss": 0.4492, "step": 2483 }, { "epoch": 1.846382556987116, "grad_norm": 0.4227231442928314, "learning_rate": 3.8657158648928355e-06, "loss": 0.4101, "step": 2484 }, { "epoch": 1.8471258671952429, "grad_norm": 0.3869159519672394, "learning_rate": 3.861503007931905e-06, "loss": 0.4096, "step": 2485 }, { "epoch": 1.8478691774033695, "grad_norm": 0.4330957531929016, "learning_rate": 3.857291003244715e-06, "loss": 0.4433, "step": 2486 }, { "epoch": 1.8486124876114967, "grad_norm": 0.3570367097854614, "learning_rate": 3.853079853984354e-06, "loss": 0.3944, "step": 2487 }, { "epoch": 1.8493557978196233, "grad_norm": 0.40329962968826294, "learning_rate": 3.8488695633032695e-06, "loss": 0.4299, "step": 2488 }, { "epoch": 1.8500991080277502, "grad_norm": 0.40518268942832947, "learning_rate": 3.844660134353267e-06, "loss": 0.372, "step": 2489 }, { "epoch": 1.8508424182358771, "grad_norm": 0.3954118490219116, "learning_rate": 3.840451570285504e-06, "loss": 0.3804, "step": 2490 }, { "epoch": 1.8515857284440038, "grad_norm": 0.4109381139278412, "learning_rate": 3.836243874250496e-06, "loss": 0.4297, "step": 2491 }, { "epoch": 1.852329038652131, "grad_norm": 0.47205254435539246, "learning_rate": 3.832037049398103e-06, "loss": 0.4135, "step": 2492 }, { "epoch": 1.8530723488602576, "grad_norm": 0.4039191007614136, "learning_rate": 3.827831098877535e-06, "loss": 0.4057, "step": 2493 }, { "epoch": 1.8538156590683845, "grad_norm": 0.39348921179771423, "learning_rate": 3.823626025837349e-06, "loss": 0.4615, "step": 2494 }, { "epoch": 1.8545589692765114, "grad_norm": 0.4513653814792633, "learning_rate": 3.8194218334254444e-06, "loss": 0.3844, "step": 2495 }, { "epoch": 1.855302279484638, "grad_norm": 0.408113032579422, "learning_rate": 3.815218524789059e-06, "loss": 0.4108, "step": 2496 }, { "epoch": 1.8560455896927652, "grad_norm": 0.3676633834838867, "learning_rate": 3.811016103074771e-06, "loss": 0.4339, "step": 2497 }, { "epoch": 1.856788899900892, "grad_norm": 0.3838540017604828, "learning_rate": 3.806814571428494e-06, "loss": 0.4243, "step": 2498 }, { "epoch": 1.8575322101090188, "grad_norm": 0.445507287979126, "learning_rate": 3.8026139329954775e-06, "loss": 0.471, "step": 2499 }, { "epoch": 1.8582755203171457, "grad_norm": 0.37023383378982544, "learning_rate": 3.7984141909203e-06, "loss": 0.3711, "step": 2500 }, { "epoch": 1.8590188305252724, "grad_norm": 0.35302916169166565, "learning_rate": 3.7942153483468704e-06, "loss": 0.4033, "step": 2501 }, { "epoch": 1.8597621407333995, "grad_norm": 0.42220765352249146, "learning_rate": 3.7900174084184205e-06, "loss": 0.4427, "step": 2502 }, { "epoch": 1.8605054509415262, "grad_norm": 0.43083974719047546, "learning_rate": 3.785820374277512e-06, "loss": 0.4034, "step": 2503 }, { "epoch": 1.861248761149653, "grad_norm": 0.38413119316101074, "learning_rate": 3.7816242490660242e-06, "loss": 0.3798, "step": 2504 }, { "epoch": 1.86199207135778, "grad_norm": 0.41753315925598145, "learning_rate": 3.777429035925158e-06, "loss": 0.4519, "step": 2505 }, { "epoch": 1.8627353815659067, "grad_norm": 0.42626431584358215, "learning_rate": 3.7732347379954325e-06, "loss": 0.4577, "step": 2506 }, { "epoch": 1.8634786917740338, "grad_norm": 0.39528611302375793, "learning_rate": 3.769041358416679e-06, "loss": 0.405, "step": 2507 }, { "epoch": 1.8642220019821605, "grad_norm": 0.41489458084106445, "learning_rate": 3.7648489003280443e-06, "loss": 0.4501, "step": 2508 }, { "epoch": 1.8649653121902874, "grad_norm": 0.37363821268081665, "learning_rate": 3.7606573668679813e-06, "loss": 0.3686, "step": 2509 }, { "epoch": 1.8657086223984143, "grad_norm": 0.37972575426101685, "learning_rate": 3.7564667611742537e-06, "loss": 0.3877, "step": 2510 }, { "epoch": 1.866451932606541, "grad_norm": 0.42790383100509644, "learning_rate": 3.7522770863839314e-06, "loss": 0.4232, "step": 2511 }, { "epoch": 1.867195242814668, "grad_norm": 0.3737665116786957, "learning_rate": 3.748088345633386e-06, "loss": 0.3874, "step": 2512 }, { "epoch": 1.8679385530227948, "grad_norm": 0.390821635723114, "learning_rate": 3.7439005420582864e-06, "loss": 0.3589, "step": 2513 }, { "epoch": 1.8686818632309217, "grad_norm": 0.42118629813194275, "learning_rate": 3.7397136787936104e-06, "loss": 0.4246, "step": 2514 }, { "epoch": 1.8694251734390486, "grad_norm": 0.43074849247932434, "learning_rate": 3.7355277589736192e-06, "loss": 0.4435, "step": 2515 }, { "epoch": 1.8701684836471755, "grad_norm": 0.3641786277294159, "learning_rate": 3.731342785731876e-06, "loss": 0.3542, "step": 2516 }, { "epoch": 1.8709117938553024, "grad_norm": 0.3937283754348755, "learning_rate": 3.727158762201233e-06, "loss": 0.3926, "step": 2517 }, { "epoch": 1.871655104063429, "grad_norm": 0.41430041193962097, "learning_rate": 3.722975691513833e-06, "loss": 0.3911, "step": 2518 }, { "epoch": 1.872398414271556, "grad_norm": 0.3797227144241333, "learning_rate": 3.7187935768010995e-06, "loss": 0.4147, "step": 2519 }, { "epoch": 1.8731417244796829, "grad_norm": 0.3618844151496887, "learning_rate": 3.714612421193748e-06, "loss": 0.4119, "step": 2520 }, { "epoch": 1.8738850346878098, "grad_norm": 0.40511247515678406, "learning_rate": 3.710432227821771e-06, "loss": 0.446, "step": 2521 }, { "epoch": 1.8746283448959367, "grad_norm": 0.4371126592159271, "learning_rate": 3.706252999814445e-06, "loss": 0.3798, "step": 2522 }, { "epoch": 1.8753716551040633, "grad_norm": 0.45317840576171875, "learning_rate": 3.7020747403003197e-06, "loss": 0.4091, "step": 2523 }, { "epoch": 1.8761149653121902, "grad_norm": 0.42834511399269104, "learning_rate": 3.6978974524072224e-06, "loss": 0.4157, "step": 2524 }, { "epoch": 1.8768582755203171, "grad_norm": 0.4191945791244507, "learning_rate": 3.6937211392622524e-06, "loss": 0.4329, "step": 2525 }, { "epoch": 1.877601585728444, "grad_norm": 0.42015597224235535, "learning_rate": 3.6895458039917763e-06, "loss": 0.4453, "step": 2526 }, { "epoch": 1.878344895936571, "grad_norm": 0.37357112765312195, "learning_rate": 3.6853714497214345e-06, "loss": 0.3914, "step": 2527 }, { "epoch": 1.8790882061446976, "grad_norm": 0.38392141461372375, "learning_rate": 3.681198079576126e-06, "loss": 0.4213, "step": 2528 }, { "epoch": 1.8798315163528245, "grad_norm": 0.36563801765441895, "learning_rate": 3.677025696680021e-06, "loss": 0.377, "step": 2529 }, { "epoch": 1.8805748265609514, "grad_norm": 0.41312187910079956, "learning_rate": 3.6728543041565455e-06, "loss": 0.4512, "step": 2530 }, { "epoch": 1.8813181367690783, "grad_norm": 0.430073618888855, "learning_rate": 3.668683905128386e-06, "loss": 0.4297, "step": 2531 }, { "epoch": 1.8820614469772052, "grad_norm": 0.35714682936668396, "learning_rate": 3.6645145027174823e-06, "loss": 0.4182, "step": 2532 }, { "epoch": 1.882804757185332, "grad_norm": 0.3420628607273102, "learning_rate": 3.6603461000450302e-06, "loss": 0.3493, "step": 2533 }, { "epoch": 1.883548067393459, "grad_norm": 0.40284886956214905, "learning_rate": 3.6561787002314798e-06, "loss": 0.4126, "step": 2534 }, { "epoch": 1.8842913776015857, "grad_norm": 0.3899948298931122, "learning_rate": 3.652012306396527e-06, "loss": 0.4358, "step": 2535 }, { "epoch": 1.8850346878097126, "grad_norm": 0.43159669637680054, "learning_rate": 3.647846921659112e-06, "loss": 0.4278, "step": 2536 }, { "epoch": 1.8857779980178395, "grad_norm": 0.38283583521842957, "learning_rate": 3.643682549137429e-06, "loss": 0.4094, "step": 2537 }, { "epoch": 1.8865213082259662, "grad_norm": 0.3556538224220276, "learning_rate": 3.6395191919489046e-06, "loss": 0.3799, "step": 2538 }, { "epoch": 1.8872646184340933, "grad_norm": 0.3566563129425049, "learning_rate": 3.635356853210211e-06, "loss": 0.3908, "step": 2539 }, { "epoch": 1.88800792864222, "grad_norm": 0.40816664695739746, "learning_rate": 3.6311955360372563e-06, "loss": 0.4433, "step": 2540 }, { "epoch": 1.888751238850347, "grad_norm": 0.34159472584724426, "learning_rate": 3.627035243545184e-06, "loss": 0.377, "step": 2541 }, { "epoch": 1.8894945490584738, "grad_norm": 0.42881518602371216, "learning_rate": 3.6228759788483683e-06, "loss": 0.4298, "step": 2542 }, { "epoch": 1.8902378592666005, "grad_norm": 0.39481455087661743, "learning_rate": 3.6187177450604177e-06, "loss": 0.3958, "step": 2543 }, { "epoch": 1.8909811694747276, "grad_norm": 0.41073980927467346, "learning_rate": 3.614560545294166e-06, "loss": 0.4494, "step": 2544 }, { "epoch": 1.8917244796828543, "grad_norm": 0.36723533272743225, "learning_rate": 3.6104043826616774e-06, "loss": 0.3744, "step": 2545 }, { "epoch": 1.8924677898909812, "grad_norm": 0.44761958718299866, "learning_rate": 3.606249260274234e-06, "loss": 0.4373, "step": 2546 }, { "epoch": 1.893211100099108, "grad_norm": 0.3929520845413208, "learning_rate": 3.6020951812423433e-06, "loss": 0.3794, "step": 2547 }, { "epoch": 1.8939544103072348, "grad_norm": 0.3550293445587158, "learning_rate": 3.5979421486757293e-06, "loss": 0.3992, "step": 2548 }, { "epoch": 1.894697720515362, "grad_norm": 0.39545923471450806, "learning_rate": 3.5937901656833337e-06, "loss": 0.4139, "step": 2549 }, { "epoch": 1.8954410307234886, "grad_norm": 0.3783950209617615, "learning_rate": 3.5896392353733116e-06, "loss": 0.4019, "step": 2550 }, { "epoch": 1.8961843409316155, "grad_norm": 0.4143766760826111, "learning_rate": 3.5854893608530305e-06, "loss": 0.4344, "step": 2551 }, { "epoch": 1.8969276511397424, "grad_norm": 0.39108210802078247, "learning_rate": 3.5813405452290707e-06, "loss": 0.4246, "step": 2552 }, { "epoch": 1.897670961347869, "grad_norm": 0.3701939284801483, "learning_rate": 3.5771927916072147e-06, "loss": 0.3964, "step": 2553 }, { "epoch": 1.8984142715559962, "grad_norm": 0.38883310556411743, "learning_rate": 3.573046103092454e-06, "loss": 0.4091, "step": 2554 }, { "epoch": 1.8991575817641229, "grad_norm": 0.3912850618362427, "learning_rate": 3.5689004827889783e-06, "loss": 0.4017, "step": 2555 }, { "epoch": 1.8999008919722498, "grad_norm": 0.3483099341392517, "learning_rate": 3.5647559338001826e-06, "loss": 0.383, "step": 2556 }, { "epoch": 1.9006442021803767, "grad_norm": 0.38705649971961975, "learning_rate": 3.5606124592286564e-06, "loss": 0.4231, "step": 2557 }, { "epoch": 1.9013875123885033, "grad_norm": 0.39683109521865845, "learning_rate": 3.5564700621761873e-06, "loss": 0.4287, "step": 2558 }, { "epoch": 1.9021308225966305, "grad_norm": 0.4402178227901459, "learning_rate": 3.5523287457437515e-06, "loss": 0.4531, "step": 2559 }, { "epoch": 1.9028741328047571, "grad_norm": 0.3918790817260742, "learning_rate": 3.5481885130315265e-06, "loss": 0.3815, "step": 2560 }, { "epoch": 1.903617443012884, "grad_norm": 0.3636537194252014, "learning_rate": 3.544049367138867e-06, "loss": 0.4191, "step": 2561 }, { "epoch": 1.904360753221011, "grad_norm": 0.4153100550174713, "learning_rate": 3.53991131116432e-06, "loss": 0.4201, "step": 2562 }, { "epoch": 1.9051040634291376, "grad_norm": 0.4225628077983856, "learning_rate": 3.5357743482056175e-06, "loss": 0.4201, "step": 2563 }, { "epoch": 1.9058473736372648, "grad_norm": 0.43407905101776123, "learning_rate": 3.531638481359669e-06, "loss": 0.4243, "step": 2564 }, { "epoch": 1.9065906838453914, "grad_norm": 0.3706410527229309, "learning_rate": 3.5275037137225677e-06, "loss": 0.3962, "step": 2565 }, { "epoch": 1.9073339940535183, "grad_norm": 0.42837825417518616, "learning_rate": 3.5233700483895807e-06, "loss": 0.4382, "step": 2566 }, { "epoch": 1.9080773042616452, "grad_norm": 0.42476537823677063, "learning_rate": 3.51923748845515e-06, "loss": 0.4404, "step": 2567 }, { "epoch": 1.908820614469772, "grad_norm": 0.3576662540435791, "learning_rate": 3.5151060370128946e-06, "loss": 0.3805, "step": 2568 }, { "epoch": 1.909563924677899, "grad_norm": 0.3604731261730194, "learning_rate": 3.5109756971555997e-06, "loss": 0.3982, "step": 2569 }, { "epoch": 1.9103072348860257, "grad_norm": 0.3928928077220917, "learning_rate": 3.5068464719752184e-06, "loss": 0.416, "step": 2570 }, { "epoch": 1.9110505450941526, "grad_norm": 0.42880168557167053, "learning_rate": 3.5027183645628717e-06, "loss": 0.4515, "step": 2571 }, { "epoch": 1.9117938553022795, "grad_norm": 0.3838127553462982, "learning_rate": 3.4985913780088398e-06, "loss": 0.4175, "step": 2572 }, { "epoch": 1.9125371655104062, "grad_norm": 0.43903952836990356, "learning_rate": 3.4944655154025673e-06, "loss": 0.4581, "step": 2573 }, { "epoch": 1.9132804757185333, "grad_norm": 0.37452974915504456, "learning_rate": 3.490340779832657e-06, "loss": 0.3658, "step": 2574 }, { "epoch": 1.91402378592666, "grad_norm": 0.37635332345962524, "learning_rate": 3.4862171743868666e-06, "loss": 0.4004, "step": 2575 }, { "epoch": 1.914767096134787, "grad_norm": 0.3706459701061249, "learning_rate": 3.48209470215211e-06, "loss": 0.3899, "step": 2576 }, { "epoch": 1.9155104063429138, "grad_norm": 0.4272243082523346, "learning_rate": 3.477973366214451e-06, "loss": 0.4294, "step": 2577 }, { "epoch": 1.9162537165510405, "grad_norm": 0.40280959010124207, "learning_rate": 3.4738531696591028e-06, "loss": 0.394, "step": 2578 }, { "epoch": 1.9169970267591676, "grad_norm": 0.3966800570487976, "learning_rate": 3.4697341155704252e-06, "loss": 0.4264, "step": 2579 }, { "epoch": 1.9177403369672943, "grad_norm": 0.3458070158958435, "learning_rate": 3.465616207031924e-06, "loss": 0.4015, "step": 2580 }, { "epoch": 1.9184836471754212, "grad_norm": 0.39867404103279114, "learning_rate": 3.46149944712625e-06, "loss": 0.4196, "step": 2581 }, { "epoch": 1.919226957383548, "grad_norm": 0.4083695709705353, "learning_rate": 3.457383838935185e-06, "loss": 0.4323, "step": 2582 }, { "epoch": 1.9199702675916748, "grad_norm": 0.3708915710449219, "learning_rate": 3.45326938553966e-06, "loss": 0.3874, "step": 2583 }, { "epoch": 1.920713577799802, "grad_norm": 0.42775484919548035, "learning_rate": 3.4491560900197373e-06, "loss": 0.4203, "step": 2584 }, { "epoch": 1.9214568880079286, "grad_norm": 0.3690681457519531, "learning_rate": 3.445043955454607e-06, "loss": 0.4207, "step": 2585 }, { "epoch": 1.9222001982160555, "grad_norm": 0.39235132932662964, "learning_rate": 3.4409329849225974e-06, "loss": 0.3852, "step": 2586 }, { "epoch": 1.9229435084241824, "grad_norm": 0.38236692547798157, "learning_rate": 3.4368231815011614e-06, "loss": 0.4148, "step": 2587 }, { "epoch": 1.923686818632309, "grad_norm": 0.40981265902519226, "learning_rate": 3.432714548266881e-06, "loss": 0.4213, "step": 2588 }, { "epoch": 1.9244301288404362, "grad_norm": 0.3907361328601837, "learning_rate": 3.428607088295459e-06, "loss": 0.3629, "step": 2589 }, { "epoch": 1.9251734390485629, "grad_norm": 0.4290710985660553, "learning_rate": 3.42450080466172e-06, "loss": 0.4396, "step": 2590 }, { "epoch": 1.9259167492566898, "grad_norm": 0.4012486934661865, "learning_rate": 3.420395700439614e-06, "loss": 0.4149, "step": 2591 }, { "epoch": 1.9266600594648167, "grad_norm": 0.39175093173980713, "learning_rate": 3.4162917787022e-06, "loss": 0.4119, "step": 2592 }, { "epoch": 1.9274033696729433, "grad_norm": 0.39177966117858887, "learning_rate": 3.4121890425216562e-06, "loss": 0.4223, "step": 2593 }, { "epoch": 1.9281466798810705, "grad_norm": 0.3827507495880127, "learning_rate": 3.408087494969275e-06, "loss": 0.3774, "step": 2594 }, { "epoch": 1.9288899900891971, "grad_norm": 0.4082188606262207, "learning_rate": 3.4039871391154527e-06, "loss": 0.456, "step": 2595 }, { "epoch": 1.929633300297324, "grad_norm": 0.3613107204437256, "learning_rate": 3.3998879780296987e-06, "loss": 0.3932, "step": 2596 }, { "epoch": 1.930376610505451, "grad_norm": 0.4088954031467438, "learning_rate": 3.395790014780628e-06, "loss": 0.3897, "step": 2597 }, { "epoch": 1.9311199207135779, "grad_norm": 0.3820333778858185, "learning_rate": 3.391693252435954e-06, "loss": 0.4379, "step": 2598 }, { "epoch": 1.9318632309217048, "grad_norm": 0.3789059817790985, "learning_rate": 3.387597694062499e-06, "loss": 0.3965, "step": 2599 }, { "epoch": 1.9326065411298314, "grad_norm": 0.3882884681224823, "learning_rate": 3.383503342726178e-06, "loss": 0.4286, "step": 2600 }, { "epoch": 1.9333498513379583, "grad_norm": 0.37733030319213867, "learning_rate": 3.3794102014920027e-06, "loss": 0.3623, "step": 2601 }, { "epoch": 1.9340931615460852, "grad_norm": 0.39019209146499634, "learning_rate": 3.375318273424081e-06, "loss": 0.3801, "step": 2602 }, { "epoch": 1.9348364717542121, "grad_norm": 0.39982932806015015, "learning_rate": 3.3712275615856127e-06, "loss": 0.4498, "step": 2603 }, { "epoch": 1.935579781962339, "grad_norm": 0.33657586574554443, "learning_rate": 3.3671380690388863e-06, "loss": 0.362, "step": 2604 }, { "epoch": 1.9363230921704657, "grad_norm": 0.37388408184051514, "learning_rate": 3.363049798845274e-06, "loss": 0.4287, "step": 2605 }, { "epoch": 1.9370664023785926, "grad_norm": 0.4293311536312103, "learning_rate": 3.3589627540652416e-06, "loss": 0.4542, "step": 2606 }, { "epoch": 1.9378097125867195, "grad_norm": 0.3655772805213928, "learning_rate": 3.354876937758331e-06, "loss": 0.3698, "step": 2607 }, { "epoch": 1.9385530227948464, "grad_norm": 0.33671969175338745, "learning_rate": 3.3507923529831655e-06, "loss": 0.4295, "step": 2608 }, { "epoch": 1.9392963330029733, "grad_norm": 0.3881883919239044, "learning_rate": 3.346709002797446e-06, "loss": 0.409, "step": 2609 }, { "epoch": 1.9400396432111, "grad_norm": 0.3650270700454712, "learning_rate": 3.3426268902579507e-06, "loss": 0.3879, "step": 2610 }, { "epoch": 1.940782953419227, "grad_norm": 0.3898342251777649, "learning_rate": 3.3385460184205325e-06, "loss": 0.4218, "step": 2611 }, { "epoch": 1.9415262636273538, "grad_norm": 0.36267927289009094, "learning_rate": 3.3344663903401115e-06, "loss": 0.363, "step": 2612 }, { "epoch": 1.9422695738354807, "grad_norm": 0.39540258049964905, "learning_rate": 3.3303880090706785e-06, "loss": 0.4352, "step": 2613 }, { "epoch": 1.9430128840436076, "grad_norm": 0.37097233533859253, "learning_rate": 3.326310877665293e-06, "loss": 0.4152, "step": 2614 }, { "epoch": 1.9437561942517343, "grad_norm": 0.3519951105117798, "learning_rate": 3.3222349991760784e-06, "loss": 0.4301, "step": 2615 }, { "epoch": 1.9444995044598612, "grad_norm": 0.36968138813972473, "learning_rate": 3.3181603766542157e-06, "loss": 0.3972, "step": 2616 }, { "epoch": 1.945242814667988, "grad_norm": 0.378135085105896, "learning_rate": 3.3140870131499536e-06, "loss": 0.4106, "step": 2617 }, { "epoch": 1.945986124876115, "grad_norm": 0.487020343542099, "learning_rate": 3.310014911712589e-06, "loss": 0.4778, "step": 2618 }, { "epoch": 1.946729435084242, "grad_norm": 0.361870139837265, "learning_rate": 3.3059440753904816e-06, "loss": 0.4239, "step": 2619 }, { "epoch": 1.9474727452923686, "grad_norm": 0.3933032155036926, "learning_rate": 3.3018745072310397e-06, "loss": 0.3985, "step": 2620 }, { "epoch": 1.9482160555004957, "grad_norm": 0.44639238715171814, "learning_rate": 3.297806210280724e-06, "loss": 0.4207, "step": 2621 }, { "epoch": 1.9489593657086224, "grad_norm": 0.4159824252128601, "learning_rate": 3.2937391875850455e-06, "loss": 0.4379, "step": 2622 }, { "epoch": 1.9497026759167493, "grad_norm": 0.42074882984161377, "learning_rate": 3.2896734421885568e-06, "loss": 0.4108, "step": 2623 }, { "epoch": 1.9504459861248762, "grad_norm": 0.38110819458961487, "learning_rate": 3.285608977134858e-06, "loss": 0.4022, "step": 2624 }, { "epoch": 1.9511892963330029, "grad_norm": 0.37523502111434937, "learning_rate": 3.2815457954665886e-06, "loss": 0.3753, "step": 2625 }, { "epoch": 1.95193260654113, "grad_norm": 0.41602185368537903, "learning_rate": 3.2774839002254277e-06, "loss": 0.4183, "step": 2626 }, { "epoch": 1.9526759167492567, "grad_norm": 0.4401800334453583, "learning_rate": 3.2734232944520927e-06, "loss": 0.4239, "step": 2627 }, { "epoch": 1.9534192269573836, "grad_norm": 0.3961527645587921, "learning_rate": 3.2693639811863363e-06, "loss": 0.3792, "step": 2628 }, { "epoch": 1.9541625371655105, "grad_norm": 0.3915402889251709, "learning_rate": 3.265305963466937e-06, "loss": 0.4113, "step": 2629 }, { "epoch": 1.9549058473736372, "grad_norm": 0.405513733625412, "learning_rate": 3.2612492443317156e-06, "loss": 0.396, "step": 2630 }, { "epoch": 1.9556491575817643, "grad_norm": 0.40256017446517944, "learning_rate": 3.25719382681751e-06, "loss": 0.3912, "step": 2631 }, { "epoch": 1.956392467789891, "grad_norm": 0.42153042554855347, "learning_rate": 3.2531397139601866e-06, "loss": 0.4236, "step": 2632 }, { "epoch": 1.9571357779980179, "grad_norm": 0.3998820185661316, "learning_rate": 3.2490869087946388e-06, "loss": 0.4128, "step": 2633 }, { "epoch": 1.9578790882061448, "grad_norm": 0.3584192991256714, "learning_rate": 3.2450354143547774e-06, "loss": 0.3799, "step": 2634 }, { "epoch": 1.9586223984142714, "grad_norm": 0.38776129484176636, "learning_rate": 3.240985233673532e-06, "loss": 0.389, "step": 2635 }, { "epoch": 1.9593657086223986, "grad_norm": 0.3767891228199005, "learning_rate": 3.2369363697828494e-06, "loss": 0.4244, "step": 2636 }, { "epoch": 1.9601090188305252, "grad_norm": 0.3617686629295349, "learning_rate": 3.2328888257136926e-06, "loss": 0.3848, "step": 2637 }, { "epoch": 1.9608523290386521, "grad_norm": 0.4014440178871155, "learning_rate": 3.228842604496035e-06, "loss": 0.4098, "step": 2638 }, { "epoch": 1.961595639246779, "grad_norm": 0.3633176386356354, "learning_rate": 3.2247977091588586e-06, "loss": 0.3962, "step": 2639 }, { "epoch": 1.9623389494549057, "grad_norm": 0.4639827311038971, "learning_rate": 3.2207541427301547e-06, "loss": 0.4416, "step": 2640 }, { "epoch": 1.9630822596630328, "grad_norm": 0.36297765374183655, "learning_rate": 3.216711908236918e-06, "loss": 0.407, "step": 2641 }, { "epoch": 1.9638255698711595, "grad_norm": 0.4124203026294708, "learning_rate": 3.212671008705148e-06, "loss": 0.4324, "step": 2642 }, { "epoch": 1.9645688800792864, "grad_norm": 0.3937511444091797, "learning_rate": 3.2086314471598435e-06, "loss": 0.3712, "step": 2643 }, { "epoch": 1.9653121902874133, "grad_norm": 0.39009469747543335, "learning_rate": 3.204593226625e-06, "loss": 0.4325, "step": 2644 }, { "epoch": 1.96605550049554, "grad_norm": 0.3697218894958496, "learning_rate": 3.2005563501236157e-06, "loss": 0.4203, "step": 2645 }, { "epoch": 1.9667988107036671, "grad_norm": 0.35337522625923157, "learning_rate": 3.1965208206776767e-06, "loss": 0.4227, "step": 2646 }, { "epoch": 1.9675421209117938, "grad_norm": 0.33605730533599854, "learning_rate": 3.1924866413081612e-06, "loss": 0.3457, "step": 2647 }, { "epoch": 1.9682854311199207, "grad_norm": 0.3908374309539795, "learning_rate": 3.1884538150350374e-06, "loss": 0.4621, "step": 2648 }, { "epoch": 1.9690287413280476, "grad_norm": 0.3797048330307007, "learning_rate": 3.184422344877263e-06, "loss": 0.3924, "step": 2649 }, { "epoch": 1.9697720515361743, "grad_norm": 0.37100526690483093, "learning_rate": 3.180392233852776e-06, "loss": 0.4444, "step": 2650 }, { "epoch": 1.9705153617443014, "grad_norm": 0.375336617231369, "learning_rate": 3.176363484978502e-06, "loss": 0.437, "step": 2651 }, { "epoch": 1.971258671952428, "grad_norm": 0.3926558792591095, "learning_rate": 3.1723361012703404e-06, "loss": 0.4513, "step": 2652 }, { "epoch": 1.972001982160555, "grad_norm": 0.3875398337841034, "learning_rate": 3.168310085743177e-06, "loss": 0.4121, "step": 2653 }, { "epoch": 1.972745292368682, "grad_norm": 0.3859836757183075, "learning_rate": 3.164285441410867e-06, "loss": 0.4371, "step": 2654 }, { "epoch": 1.9734886025768086, "grad_norm": 0.31421369314193726, "learning_rate": 3.1602621712862414e-06, "loss": 0.3643, "step": 2655 }, { "epoch": 1.9742319127849357, "grad_norm": 0.4093247950077057, "learning_rate": 3.156240278381102e-06, "loss": 0.4262, "step": 2656 }, { "epoch": 1.9749752229930624, "grad_norm": 0.4403294026851654, "learning_rate": 3.15221976570622e-06, "loss": 0.3776, "step": 2657 }, { "epoch": 1.9757185332011893, "grad_norm": 0.3638985753059387, "learning_rate": 3.1482006362713326e-06, "loss": 0.4187, "step": 2658 }, { "epoch": 1.9764618434093162, "grad_norm": 0.3793680965900421, "learning_rate": 3.1441828930851404e-06, "loss": 0.447, "step": 2659 }, { "epoch": 1.9772051536174429, "grad_norm": 0.3894979655742645, "learning_rate": 3.1401665391553127e-06, "loss": 0.3949, "step": 2660 }, { "epoch": 1.97794846382557, "grad_norm": 0.4232301414012909, "learning_rate": 3.13615157748847e-06, "loss": 0.3995, "step": 2661 }, { "epoch": 1.9786917740336967, "grad_norm": 0.3831191956996918, "learning_rate": 3.1321380110901967e-06, "loss": 0.3869, "step": 2662 }, { "epoch": 1.9794350842418236, "grad_norm": 0.3797236979007721, "learning_rate": 3.128125842965029e-06, "loss": 0.4046, "step": 2663 }, { "epoch": 1.9801783944499505, "grad_norm": 0.36481085419654846, "learning_rate": 3.12411507611646e-06, "loss": 0.3799, "step": 2664 }, { "epoch": 1.9809217046580772, "grad_norm": 0.44812580943107605, "learning_rate": 3.12010571354693e-06, "loss": 0.43, "step": 2665 }, { "epoch": 1.9816650148662043, "grad_norm": 0.4363606572151184, "learning_rate": 3.1160977582578294e-06, "loss": 0.4412, "step": 2666 }, { "epoch": 1.982408325074331, "grad_norm": 0.3686516284942627, "learning_rate": 3.112091213249496e-06, "loss": 0.3638, "step": 2667 }, { "epoch": 1.9831516352824579, "grad_norm": 0.3304034173488617, "learning_rate": 3.1080860815212134e-06, "loss": 0.4047, "step": 2668 }, { "epoch": 1.9838949454905848, "grad_norm": 0.3681882619857788, "learning_rate": 3.1040823660712027e-06, "loss": 0.4046, "step": 2669 }, { "epoch": 1.9846382556987114, "grad_norm": 0.3806927800178528, "learning_rate": 3.1000800698966303e-06, "loss": 0.451, "step": 2670 }, { "epoch": 1.9853815659068386, "grad_norm": 0.3884618878364563, "learning_rate": 3.0960791959935937e-06, "loss": 0.396, "step": 2671 }, { "epoch": 1.9861248761149652, "grad_norm": 0.3801078200340271, "learning_rate": 3.0920797473571306e-06, "loss": 0.4176, "step": 2672 }, { "epoch": 1.9868681863230921, "grad_norm": 0.34976571798324585, "learning_rate": 3.0880817269812107e-06, "loss": 0.4205, "step": 2673 }, { "epoch": 1.987611496531219, "grad_norm": 0.3440803289413452, "learning_rate": 3.084085137858735e-06, "loss": 0.4142, "step": 2674 }, { "epoch": 1.9883548067393457, "grad_norm": 0.37666183710098267, "learning_rate": 3.0800899829815276e-06, "loss": 0.3795, "step": 2675 }, { "epoch": 1.9890981169474728, "grad_norm": 0.433723509311676, "learning_rate": 3.0760962653403505e-06, "loss": 0.4382, "step": 2676 }, { "epoch": 1.9898414271555995, "grad_norm": 0.37976592779159546, "learning_rate": 3.072103987924878e-06, "loss": 0.3896, "step": 2677 }, { "epoch": 1.9905847373637264, "grad_norm": 0.40233078598976135, "learning_rate": 3.068113153723712e-06, "loss": 0.4363, "step": 2678 }, { "epoch": 1.9913280475718533, "grad_norm": 0.41558173298835754, "learning_rate": 3.064123765724374e-06, "loss": 0.376, "step": 2679 }, { "epoch": 1.99207135777998, "grad_norm": 0.4538148045539856, "learning_rate": 3.0601358269133017e-06, "loss": 0.4435, "step": 2680 }, { "epoch": 1.9928146679881071, "grad_norm": 0.374266654253006, "learning_rate": 3.0561493402758474e-06, "loss": 0.4301, "step": 2681 }, { "epoch": 1.9935579781962338, "grad_norm": 0.33728882670402527, "learning_rate": 3.052164308796277e-06, "loss": 0.3338, "step": 2682 }, { "epoch": 1.9943012884043607, "grad_norm": 0.43682315945625305, "learning_rate": 3.0481807354577675e-06, "loss": 0.4351, "step": 2683 }, { "epoch": 1.9950445986124876, "grad_norm": 0.37409523129463196, "learning_rate": 3.044198623242405e-06, "loss": 0.3829, "step": 2684 }, { "epoch": 1.9957879088206145, "grad_norm": 0.3486426770687103, "learning_rate": 3.04021797513118e-06, "loss": 0.3992, "step": 2685 }, { "epoch": 1.9965312190287414, "grad_norm": 0.362394779920578, "learning_rate": 3.0362387941039883e-06, "loss": 0.4399, "step": 2686 }, { "epoch": 1.997274529236868, "grad_norm": 0.38569769263267517, "learning_rate": 3.032261083139627e-06, "loss": 0.3883, "step": 2687 }, { "epoch": 1.998017839444995, "grad_norm": 0.36952686309814453, "learning_rate": 3.028284845215792e-06, "loss": 0.4088, "step": 2688 }, { "epoch": 1.998761149653122, "grad_norm": 0.38241684436798096, "learning_rate": 3.024310083309077e-06, "loss": 0.4365, "step": 2689 }, { "epoch": 1.9995044598612488, "grad_norm": 0.36260727047920227, "learning_rate": 3.020336800394971e-06, "loss": 0.3891, "step": 2690 }, { "epoch": 2.0002477700693757, "grad_norm": 0.8722225427627563, "learning_rate": 3.0163649994478566e-06, "loss": 0.6741, "step": 2691 }, { "epoch": 2.0009910802775024, "grad_norm": 0.38477885723114014, "learning_rate": 3.012394683441006e-06, "loss": 0.3719, "step": 2692 }, { "epoch": 2.0017343904856295, "grad_norm": 0.3932250142097473, "learning_rate": 3.0084258553465806e-06, "loss": 0.4036, "step": 2693 }, { "epoch": 2.002477700693756, "grad_norm": 0.4281269907951355, "learning_rate": 3.0044585181356255e-06, "loss": 0.4109, "step": 2694 }, { "epoch": 2.003221010901883, "grad_norm": 0.4193592965602875, "learning_rate": 3.000492674778073e-06, "loss": 0.3764, "step": 2695 }, { "epoch": 2.00396432111001, "grad_norm": 0.35552456974983215, "learning_rate": 2.996528328242735e-06, "loss": 0.3794, "step": 2696 }, { "epoch": 2.0047076313181367, "grad_norm": 0.39270731806755066, "learning_rate": 2.9925654814973036e-06, "loss": 0.4245, "step": 2697 }, { "epoch": 2.005450941526264, "grad_norm": 0.3822779059410095, "learning_rate": 2.988604137508346e-06, "loss": 0.3958, "step": 2698 }, { "epoch": 2.0061942517343905, "grad_norm": 0.41489654779434204, "learning_rate": 2.9846442992413104e-06, "loss": 0.3843, "step": 2699 }, { "epoch": 2.006937561942517, "grad_norm": 0.41857776045799255, "learning_rate": 2.980685969660514e-06, "loss": 0.352, "step": 2700 }, { "epoch": 2.0076808721506443, "grad_norm": 0.3571939170360565, "learning_rate": 2.976729151729141e-06, "loss": 0.3973, "step": 2701 }, { "epoch": 2.008424182358771, "grad_norm": 0.39849191904067993, "learning_rate": 2.9727738484092496e-06, "loss": 0.3765, "step": 2702 }, { "epoch": 2.009167492566898, "grad_norm": 0.3458631932735443, "learning_rate": 2.968820062661762e-06, "loss": 0.3437, "step": 2703 }, { "epoch": 2.0099108027750248, "grad_norm": 0.40418168902397156, "learning_rate": 2.9648677974464657e-06, "loss": 0.41, "step": 2704 }, { "epoch": 2.0106541129831514, "grad_norm": 0.3920042812824249, "learning_rate": 2.960917055722006e-06, "loss": 0.3588, "step": 2705 }, { "epoch": 2.0113974231912786, "grad_norm": 0.3453976511955261, "learning_rate": 2.95696784044589e-06, "loss": 0.3697, "step": 2706 }, { "epoch": 2.0121407333994052, "grad_norm": 0.3947224020957947, "learning_rate": 2.9530201545744865e-06, "loss": 0.4096, "step": 2707 }, { "epoch": 2.0128840436075324, "grad_norm": 0.39062613248825073, "learning_rate": 2.949074001063013e-06, "loss": 0.388, "step": 2708 }, { "epoch": 2.013627353815659, "grad_norm": 0.3678574562072754, "learning_rate": 2.9451293828655424e-06, "loss": 0.3582, "step": 2709 }, { "epoch": 2.0143706640237857, "grad_norm": 0.37443751096725464, "learning_rate": 2.941186302935e-06, "loss": 0.3743, "step": 2710 }, { "epoch": 2.015113974231913, "grad_norm": 0.3416731059551239, "learning_rate": 2.937244764223155e-06, "loss": 0.3843, "step": 2711 }, { "epoch": 2.0158572844400395, "grad_norm": 0.37432199716567993, "learning_rate": 2.933304769680626e-06, "loss": 0.3652, "step": 2712 }, { "epoch": 2.0166005946481667, "grad_norm": 0.37264859676361084, "learning_rate": 2.9293663222568757e-06, "loss": 0.4073, "step": 2713 }, { "epoch": 2.0173439048562933, "grad_norm": 0.3549811542034149, "learning_rate": 2.9254294249002092e-06, "loss": 0.3905, "step": 2714 }, { "epoch": 2.01808721506442, "grad_norm": 0.35239627957344055, "learning_rate": 2.9214940805577695e-06, "loss": 0.3558, "step": 2715 }, { "epoch": 2.018830525272547, "grad_norm": 0.35523712635040283, "learning_rate": 2.917560292175538e-06, "loss": 0.4215, "step": 2716 }, { "epoch": 2.019573835480674, "grad_norm": 0.368066668510437, "learning_rate": 2.9136280626983315e-06, "loss": 0.3908, "step": 2717 }, { "epoch": 2.020317145688801, "grad_norm": 0.374389111995697, "learning_rate": 2.9096973950697987e-06, "loss": 0.3787, "step": 2718 }, { "epoch": 2.0210604558969276, "grad_norm": 0.3615299165248871, "learning_rate": 2.9057682922324177e-06, "loss": 0.3728, "step": 2719 }, { "epoch": 2.0218037661050543, "grad_norm": 0.3740482032299042, "learning_rate": 2.9018407571275007e-06, "loss": 0.3249, "step": 2720 }, { "epoch": 2.0225470763131814, "grad_norm": 0.3766627609729767, "learning_rate": 2.89791479269518e-06, "loss": 0.374, "step": 2721 }, { "epoch": 2.023290386521308, "grad_norm": 0.35779061913490295, "learning_rate": 2.893990401874416e-06, "loss": 0.3665, "step": 2722 }, { "epoch": 2.0240336967294352, "grad_norm": 0.3545067012310028, "learning_rate": 2.8900675876029925e-06, "loss": 0.3757, "step": 2723 }, { "epoch": 2.024777006937562, "grad_norm": 0.3135336935520172, "learning_rate": 2.886146352817506e-06, "loss": 0.3917, "step": 2724 }, { "epoch": 2.0255203171456886, "grad_norm": 0.3452955186367035, "learning_rate": 2.8822267004533803e-06, "loss": 0.3819, "step": 2725 }, { "epoch": 2.0262636273538157, "grad_norm": 0.41933825612068176, "learning_rate": 2.8783086334448472e-06, "loss": 0.394, "step": 2726 }, { "epoch": 2.0270069375619424, "grad_norm": 0.3684336543083191, "learning_rate": 2.874392154724952e-06, "loss": 0.3784, "step": 2727 }, { "epoch": 2.0277502477700695, "grad_norm": 0.3842262625694275, "learning_rate": 2.8704772672255565e-06, "loss": 0.3748, "step": 2728 }, { "epoch": 2.028493557978196, "grad_norm": 0.3567473590373993, "learning_rate": 2.8665639738773253e-06, "loss": 0.3787, "step": 2729 }, { "epoch": 2.029236868186323, "grad_norm": 0.36489763855934143, "learning_rate": 2.862652277609733e-06, "loss": 0.3997, "step": 2730 }, { "epoch": 2.02998017839445, "grad_norm": 0.39444705843925476, "learning_rate": 2.8587421813510607e-06, "loss": 0.3964, "step": 2731 }, { "epoch": 2.0307234886025767, "grad_norm": 0.34987857937812805, "learning_rate": 2.8548336880283866e-06, "loss": 0.3481, "step": 2732 }, { "epoch": 2.031466798810704, "grad_norm": 0.41317880153656006, "learning_rate": 2.8509268005675882e-06, "loss": 0.3883, "step": 2733 }, { "epoch": 2.0322101090188305, "grad_norm": 0.362121045589447, "learning_rate": 2.8470215218933505e-06, "loss": 0.3993, "step": 2734 }, { "epoch": 2.032953419226957, "grad_norm": 0.3682557940483093, "learning_rate": 2.8431178549291427e-06, "loss": 0.413, "step": 2735 }, { "epoch": 2.0336967294350843, "grad_norm": 0.35502922534942627, "learning_rate": 2.8392158025972326e-06, "loss": 0.3542, "step": 2736 }, { "epoch": 2.034440039643211, "grad_norm": 0.38731008768081665, "learning_rate": 2.8353153678186795e-06, "loss": 0.3811, "step": 2737 }, { "epoch": 2.035183349851338, "grad_norm": 0.40730100870132446, "learning_rate": 2.8314165535133353e-06, "loss": 0.4212, "step": 2738 }, { "epoch": 2.0359266600594648, "grad_norm": 0.3490102291107178, "learning_rate": 2.8275193625998287e-06, "loss": 0.2971, "step": 2739 }, { "epoch": 2.036669970267592, "grad_norm": 0.36691778898239136, "learning_rate": 2.8236237979955845e-06, "loss": 0.3975, "step": 2740 }, { "epoch": 2.0374132804757186, "grad_norm": 0.35474470257759094, "learning_rate": 2.819729862616801e-06, "loss": 0.3298, "step": 2741 }, { "epoch": 2.0381565906838452, "grad_norm": 0.4081672132015228, "learning_rate": 2.815837559378464e-06, "loss": 0.4303, "step": 2742 }, { "epoch": 2.0388999008919724, "grad_norm": 0.35814255475997925, "learning_rate": 2.8119468911943327e-06, "loss": 0.421, "step": 2743 }, { "epoch": 2.039643211100099, "grad_norm": 0.34571847319602966, "learning_rate": 2.8080578609769423e-06, "loss": 0.3611, "step": 2744 }, { "epoch": 2.040386521308226, "grad_norm": 0.33362165093421936, "learning_rate": 2.8041704716376044e-06, "loss": 0.3723, "step": 2745 }, { "epoch": 2.041129831516353, "grad_norm": 0.36522647738456726, "learning_rate": 2.800284726086404e-06, "loss": 0.4243, "step": 2746 }, { "epoch": 2.0418731417244795, "grad_norm": 0.3754381537437439, "learning_rate": 2.7964006272321885e-06, "loss": 0.363, "step": 2747 }, { "epoch": 2.0426164519326067, "grad_norm": 0.391512006521225, "learning_rate": 2.7925181779825816e-06, "loss": 0.3781, "step": 2748 }, { "epoch": 2.0433597621407333, "grad_norm": 0.34283703565597534, "learning_rate": 2.788637381243964e-06, "loss": 0.3522, "step": 2749 }, { "epoch": 2.0441030723488605, "grad_norm": 0.35414373874664307, "learning_rate": 2.784758239921482e-06, "loss": 0.3646, "step": 2750 }, { "epoch": 2.044846382556987, "grad_norm": 0.3972248136997223, "learning_rate": 2.780880756919046e-06, "loss": 0.3842, "step": 2751 }, { "epoch": 2.045589692765114, "grad_norm": 0.4105724096298218, "learning_rate": 2.77700493513932e-06, "loss": 0.3733, "step": 2752 }, { "epoch": 2.046333002973241, "grad_norm": 0.3981853127479553, "learning_rate": 2.773130777483728e-06, "loss": 0.4241, "step": 2753 }, { "epoch": 2.0470763131813676, "grad_norm": 0.3310334384441376, "learning_rate": 2.769258286852449e-06, "loss": 0.3323, "step": 2754 }, { "epoch": 2.0478196233894947, "grad_norm": 0.35570934414863586, "learning_rate": 2.7653874661444097e-06, "loss": 0.3702, "step": 2755 }, { "epoch": 2.0485629335976214, "grad_norm": 0.34551554918289185, "learning_rate": 2.761518318257288e-06, "loss": 0.3363, "step": 2756 }, { "epoch": 2.049306243805748, "grad_norm": 0.4050469994544983, "learning_rate": 2.757650846087514e-06, "loss": 0.4337, "step": 2757 }, { "epoch": 2.0500495540138752, "grad_norm": 0.3801851272583008, "learning_rate": 2.7537850525302585e-06, "loss": 0.4028, "step": 2758 }, { "epoch": 2.050792864222002, "grad_norm": 0.36662715673446655, "learning_rate": 2.749920940479435e-06, "loss": 0.3553, "step": 2759 }, { "epoch": 2.051536174430129, "grad_norm": 0.3608556389808655, "learning_rate": 2.7460585128277025e-06, "loss": 0.4036, "step": 2760 }, { "epoch": 2.0522794846382557, "grad_norm": 0.41125965118408203, "learning_rate": 2.742197772466459e-06, "loss": 0.4342, "step": 2761 }, { "epoch": 2.0530227948463824, "grad_norm": 0.3543540835380554, "learning_rate": 2.738338722285833e-06, "loss": 0.3725, "step": 2762 }, { "epoch": 2.0537661050545095, "grad_norm": 0.3787147104740143, "learning_rate": 2.734481365174697e-06, "loss": 0.3887, "step": 2763 }, { "epoch": 2.054509415262636, "grad_norm": 0.3651859164237976, "learning_rate": 2.7306257040206462e-06, "loss": 0.3789, "step": 2764 }, { "epoch": 2.0552527254707633, "grad_norm": 0.4003317058086395, "learning_rate": 2.7267717417100167e-06, "loss": 0.3806, "step": 2765 }, { "epoch": 2.05599603567889, "grad_norm": 0.40265217423439026, "learning_rate": 2.7229194811278635e-06, "loss": 0.3671, "step": 2766 }, { "epoch": 2.0567393458870167, "grad_norm": 0.35323479771614075, "learning_rate": 2.7190689251579714e-06, "loss": 0.3583, "step": 2767 }, { "epoch": 2.057482656095144, "grad_norm": 0.4391591548919678, "learning_rate": 2.7152200766828503e-06, "loss": 0.3767, "step": 2768 }, { "epoch": 2.0582259663032705, "grad_norm": 0.34374409914016724, "learning_rate": 2.711372938583733e-06, "loss": 0.3464, "step": 2769 }, { "epoch": 2.0589692765113976, "grad_norm": 0.36582520604133606, "learning_rate": 2.7075275137405656e-06, "loss": 0.3587, "step": 2770 }, { "epoch": 2.0597125867195243, "grad_norm": 0.3936537206172943, "learning_rate": 2.703683805032021e-06, "loss": 0.3824, "step": 2771 }, { "epoch": 2.060455896927651, "grad_norm": 0.3234668970108032, "learning_rate": 2.699841815335479e-06, "loss": 0.3442, "step": 2772 }, { "epoch": 2.061199207135778, "grad_norm": 0.4252116084098816, "learning_rate": 2.6960015475270344e-06, "loss": 0.4389, "step": 2773 }, { "epoch": 2.0619425173439048, "grad_norm": 0.4038064777851105, "learning_rate": 2.6921630044814983e-06, "loss": 0.3685, "step": 2774 }, { "epoch": 2.062685827552032, "grad_norm": 0.3642488420009613, "learning_rate": 2.6883261890723834e-06, "loss": 0.3565, "step": 2775 }, { "epoch": 2.0634291377601586, "grad_norm": 0.39021575450897217, "learning_rate": 2.6844911041719144e-06, "loss": 0.3596, "step": 2776 }, { "epoch": 2.0641724479682853, "grad_norm": 0.43542391061782837, "learning_rate": 2.68065775265102e-06, "loss": 0.4008, "step": 2777 }, { "epoch": 2.0649157581764124, "grad_norm": 0.41722628474235535, "learning_rate": 2.6768261373793282e-06, "loss": 0.3881, "step": 2778 }, { "epoch": 2.065659068384539, "grad_norm": 0.3924551010131836, "learning_rate": 2.6729962612251663e-06, "loss": 0.4184, "step": 2779 }, { "epoch": 2.066402378592666, "grad_norm": 0.3753860294818878, "learning_rate": 2.6691681270555682e-06, "loss": 0.3331, "step": 2780 }, { "epoch": 2.067145688800793, "grad_norm": 0.3566315174102783, "learning_rate": 2.6653417377362523e-06, "loss": 0.387, "step": 2781 }, { "epoch": 2.0678889990089195, "grad_norm": 0.3737882077693939, "learning_rate": 2.6615170961316417e-06, "loss": 0.3994, "step": 2782 }, { "epoch": 2.0686323092170467, "grad_norm": 0.3826865553855896, "learning_rate": 2.657694205104841e-06, "loss": 0.3979, "step": 2783 }, { "epoch": 2.0693756194251733, "grad_norm": 0.383865624666214, "learning_rate": 2.653873067517654e-06, "loss": 0.3918, "step": 2784 }, { "epoch": 2.0701189296333005, "grad_norm": 0.3841201961040497, "learning_rate": 2.6500536862305625e-06, "loss": 0.384, "step": 2785 }, { "epoch": 2.070862239841427, "grad_norm": 0.38439416885375977, "learning_rate": 2.6462360641027423e-06, "loss": 0.415, "step": 2786 }, { "epoch": 2.071605550049554, "grad_norm": 0.342563658952713, "learning_rate": 2.642420203992044e-06, "loss": 0.3691, "step": 2787 }, { "epoch": 2.072348860257681, "grad_norm": 0.36511844396591187, "learning_rate": 2.638606108755009e-06, "loss": 0.3763, "step": 2788 }, { "epoch": 2.0730921704658076, "grad_norm": 0.3965713381767273, "learning_rate": 2.6347937812468482e-06, "loss": 0.3987, "step": 2789 }, { "epoch": 2.0738354806739348, "grad_norm": 0.41564661264419556, "learning_rate": 2.630983224321451e-06, "loss": 0.3589, "step": 2790 }, { "epoch": 2.0745787908820614, "grad_norm": 0.3490481376647949, "learning_rate": 2.6271744408313858e-06, "loss": 0.3666, "step": 2791 }, { "epoch": 2.075322101090188, "grad_norm": 0.3459349572658539, "learning_rate": 2.6233674336278925e-06, "loss": 0.385, "step": 2792 }, { "epoch": 2.0760654112983152, "grad_norm": 0.38481754064559937, "learning_rate": 2.619562205560876e-06, "loss": 0.4246, "step": 2793 }, { "epoch": 2.076808721506442, "grad_norm": 0.3800375759601593, "learning_rate": 2.6157587594789167e-06, "loss": 0.3618, "step": 2794 }, { "epoch": 2.077552031714569, "grad_norm": 0.3802664279937744, "learning_rate": 2.6119570982292547e-06, "loss": 0.4269, "step": 2795 }, { "epoch": 2.0782953419226957, "grad_norm": 0.38551899790763855, "learning_rate": 2.6081572246577957e-06, "loss": 0.3817, "step": 2796 }, { "epoch": 2.0790386521308224, "grad_norm": 0.3612678349018097, "learning_rate": 2.6043591416091106e-06, "loss": 0.4004, "step": 2797 }, { "epoch": 2.0797819623389495, "grad_norm": 0.3591584861278534, "learning_rate": 2.6005628519264247e-06, "loss": 0.3962, "step": 2798 }, { "epoch": 2.080525272547076, "grad_norm": 0.3682560324668884, "learning_rate": 2.5967683584516253e-06, "loss": 0.3712, "step": 2799 }, { "epoch": 2.0812685827552033, "grad_norm": 0.35942795872688293, "learning_rate": 2.592975664025254e-06, "loss": 0.3674, "step": 2800 }, { "epoch": 2.08201189296333, "grad_norm": 0.3936978280544281, "learning_rate": 2.5891847714865036e-06, "loss": 0.4074, "step": 2801 }, { "epoch": 2.0827552031714567, "grad_norm": 0.3620539903640747, "learning_rate": 2.5853956836732175e-06, "loss": 0.3689, "step": 2802 }, { "epoch": 2.083498513379584, "grad_norm": 0.3531515896320343, "learning_rate": 2.5816084034218936e-06, "loss": 0.3921, "step": 2803 }, { "epoch": 2.0842418235877105, "grad_norm": 0.3772771954536438, "learning_rate": 2.57782293356767e-06, "loss": 0.3826, "step": 2804 }, { "epoch": 2.0849851337958376, "grad_norm": 0.32709801197052, "learning_rate": 2.5740392769443333e-06, "loss": 0.3409, "step": 2805 }, { "epoch": 2.0857284440039643, "grad_norm": 0.41868603229522705, "learning_rate": 2.5702574363843115e-06, "loss": 0.3617, "step": 2806 }, { "epoch": 2.086471754212091, "grad_norm": 0.4187427759170532, "learning_rate": 2.5664774147186748e-06, "loss": 0.3796, "step": 2807 }, { "epoch": 2.087215064420218, "grad_norm": 0.3602338433265686, "learning_rate": 2.562699214777128e-06, "loss": 0.3929, "step": 2808 }, { "epoch": 2.0879583746283448, "grad_norm": 0.3760814666748047, "learning_rate": 2.5589228393880183e-06, "loss": 0.3871, "step": 2809 }, { "epoch": 2.088701684836472, "grad_norm": 0.30677342414855957, "learning_rate": 2.55514829137832e-06, "loss": 0.337, "step": 2810 }, { "epoch": 2.0894449950445986, "grad_norm": 0.36291512846946716, "learning_rate": 2.5513755735736446e-06, "loss": 0.3925, "step": 2811 }, { "epoch": 2.0901883052527257, "grad_norm": 0.38321012258529663, "learning_rate": 2.547604688798232e-06, "loss": 0.4475, "step": 2812 }, { "epoch": 2.0909316154608524, "grad_norm": 0.31314191222190857, "learning_rate": 2.543835639874947e-06, "loss": 0.3227, "step": 2813 }, { "epoch": 2.091674925668979, "grad_norm": 0.37030288577079773, "learning_rate": 2.540068429625284e-06, "loss": 0.3698, "step": 2814 }, { "epoch": 2.092418235877106, "grad_norm": 0.36467838287353516, "learning_rate": 2.5363030608693627e-06, "loss": 0.4067, "step": 2815 }, { "epoch": 2.093161546085233, "grad_norm": 0.3390768766403198, "learning_rate": 2.532539536425917e-06, "loss": 0.37, "step": 2816 }, { "epoch": 2.09390485629336, "grad_norm": 0.3832557797431946, "learning_rate": 2.528777859112308e-06, "loss": 0.3843, "step": 2817 }, { "epoch": 2.0946481665014867, "grad_norm": 0.3595709204673767, "learning_rate": 2.5250180317445083e-06, "loss": 0.3722, "step": 2818 }, { "epoch": 2.0953914767096133, "grad_norm": 0.359115868806839, "learning_rate": 2.5212600571371073e-06, "loss": 0.3765, "step": 2819 }, { "epoch": 2.0961347869177405, "grad_norm": 0.37207865715026855, "learning_rate": 2.5175039381033094e-06, "loss": 0.3797, "step": 2820 }, { "epoch": 2.096878097125867, "grad_norm": 0.37981462478637695, "learning_rate": 2.5137496774549264e-06, "loss": 0.3781, "step": 2821 }, { "epoch": 2.0976214073339943, "grad_norm": 0.37989556789398193, "learning_rate": 2.509997278002383e-06, "loss": 0.4302, "step": 2822 }, { "epoch": 2.098364717542121, "grad_norm": 0.3058462142944336, "learning_rate": 2.5062467425547098e-06, "loss": 0.3236, "step": 2823 }, { "epoch": 2.0991080277502476, "grad_norm": 0.3855479955673218, "learning_rate": 2.502498073919539e-06, "loss": 0.3903, "step": 2824 }, { "epoch": 2.0998513379583748, "grad_norm": 0.385749489068985, "learning_rate": 2.498751274903105e-06, "loss": 0.4034, "step": 2825 }, { "epoch": 2.1005946481665014, "grad_norm": 0.3514871299266815, "learning_rate": 2.4950063483102487e-06, "loss": 0.3759, "step": 2826 }, { "epoch": 2.1013379583746286, "grad_norm": 0.37180885672569275, "learning_rate": 2.4912632969444024e-06, "loss": 0.3712, "step": 2827 }, { "epoch": 2.1020812685827552, "grad_norm": 0.38678425550460815, "learning_rate": 2.4875221236076002e-06, "loss": 0.395, "step": 2828 }, { "epoch": 2.102824578790882, "grad_norm": 0.39979809522628784, "learning_rate": 2.483782831100465e-06, "loss": 0.3313, "step": 2829 }, { "epoch": 2.103567888999009, "grad_norm": 0.3598143458366394, "learning_rate": 2.4800454222222163e-06, "loss": 0.3807, "step": 2830 }, { "epoch": 2.1043111992071357, "grad_norm": 0.35318103432655334, "learning_rate": 2.4763098997706597e-06, "loss": 0.3583, "step": 2831 }, { "epoch": 2.105054509415263, "grad_norm": 0.3878331184387207, "learning_rate": 2.472576266542194e-06, "loss": 0.3764, "step": 2832 }, { "epoch": 2.1057978196233895, "grad_norm": 0.3821086585521698, "learning_rate": 2.468844525331796e-06, "loss": 0.3871, "step": 2833 }, { "epoch": 2.106541129831516, "grad_norm": 0.39045387506484985, "learning_rate": 2.4651146789330343e-06, "loss": 0.4111, "step": 2834 }, { "epoch": 2.1072844400396433, "grad_norm": 0.3508051633834839, "learning_rate": 2.4613867301380534e-06, "loss": 0.3643, "step": 2835 }, { "epoch": 2.10802775024777, "grad_norm": 0.3747502267360687, "learning_rate": 2.457660681737577e-06, "loss": 0.394, "step": 2836 }, { "epoch": 2.108771060455897, "grad_norm": 0.3501208424568176, "learning_rate": 2.4539365365209108e-06, "loss": 0.4021, "step": 2837 }, { "epoch": 2.109514370664024, "grad_norm": 0.3647649586200714, "learning_rate": 2.450214297275934e-06, "loss": 0.3635, "step": 2838 }, { "epoch": 2.1102576808721505, "grad_norm": 0.3565928637981415, "learning_rate": 2.446493966789095e-06, "loss": 0.3455, "step": 2839 }, { "epoch": 2.1110009910802776, "grad_norm": 0.37984979152679443, "learning_rate": 2.4427755478454207e-06, "loss": 0.3945, "step": 2840 }, { "epoch": 2.1117443012884043, "grad_norm": 0.35525941848754883, "learning_rate": 2.4390590432285005e-06, "loss": 0.3778, "step": 2841 }, { "epoch": 2.1124876114965314, "grad_norm": 0.3516775965690613, "learning_rate": 2.4353444557204915e-06, "loss": 0.3642, "step": 2842 }, { "epoch": 2.113230921704658, "grad_norm": 0.3471482992172241, "learning_rate": 2.4316317881021202e-06, "loss": 0.3779, "step": 2843 }, { "epoch": 2.1139742319127848, "grad_norm": 0.3474150598049164, "learning_rate": 2.4279210431526695e-06, "loss": 0.3682, "step": 2844 }, { "epoch": 2.114717542120912, "grad_norm": 0.35941407084465027, "learning_rate": 2.424212223649989e-06, "loss": 0.3875, "step": 2845 }, { "epoch": 2.1154608523290386, "grad_norm": 0.3960981070995331, "learning_rate": 2.4205053323704852e-06, "loss": 0.4004, "step": 2846 }, { "epoch": 2.1162041625371657, "grad_norm": 0.3688826858997345, "learning_rate": 2.4168003720891197e-06, "loss": 0.4019, "step": 2847 }, { "epoch": 2.1169474727452924, "grad_norm": 0.36745142936706543, "learning_rate": 2.413097345579406e-06, "loss": 0.3798, "step": 2848 }, { "epoch": 2.117690782953419, "grad_norm": 0.35868510603904724, "learning_rate": 2.4093962556134177e-06, "loss": 0.3632, "step": 2849 }, { "epoch": 2.118434093161546, "grad_norm": 0.3951771855354309, "learning_rate": 2.40569710496177e-06, "loss": 0.3875, "step": 2850 }, { "epoch": 2.119177403369673, "grad_norm": 0.35900944471359253, "learning_rate": 2.401999896393634e-06, "loss": 0.4029, "step": 2851 }, { "epoch": 2.1199207135778, "grad_norm": 0.343338280916214, "learning_rate": 2.3983046326767203e-06, "loss": 0.3901, "step": 2852 }, { "epoch": 2.1206640237859267, "grad_norm": 0.3679533004760742, "learning_rate": 2.3946113165772905e-06, "loss": 0.3727, "step": 2853 }, { "epoch": 2.1214073339940533, "grad_norm": 0.3863505721092224, "learning_rate": 2.39091995086014e-06, "loss": 0.4272, "step": 2854 }, { "epoch": 2.1221506442021805, "grad_norm": 0.33595025539398193, "learning_rate": 2.387230538288613e-06, "loss": 0.3441, "step": 2855 }, { "epoch": 2.122893954410307, "grad_norm": 0.3463905453681946, "learning_rate": 2.383543081624583e-06, "loss": 0.3739, "step": 2856 }, { "epoch": 2.1236372646184343, "grad_norm": 0.39093995094299316, "learning_rate": 2.379857583628468e-06, "loss": 0.4039, "step": 2857 }, { "epoch": 2.124380574826561, "grad_norm": 0.3728741705417633, "learning_rate": 2.3761740470592125e-06, "loss": 0.3808, "step": 2858 }, { "epoch": 2.1251238850346876, "grad_norm": 0.4104476869106293, "learning_rate": 2.372492474674294e-06, "loss": 0.3824, "step": 2859 }, { "epoch": 2.1258671952428148, "grad_norm": 0.3598857522010803, "learning_rate": 2.368812869229723e-06, "loss": 0.4077, "step": 2860 }, { "epoch": 2.1266105054509414, "grad_norm": 0.35541775822639465, "learning_rate": 2.3651352334800375e-06, "loss": 0.4027, "step": 2861 }, { "epoch": 2.1273538156590686, "grad_norm": 0.3560545742511749, "learning_rate": 2.3614595701782956e-06, "loss": 0.3476, "step": 2862 }, { "epoch": 2.1280971258671952, "grad_norm": 0.35028961300849915, "learning_rate": 2.3577858820760855e-06, "loss": 0.3789, "step": 2863 }, { "epoch": 2.128840436075322, "grad_norm": 0.37365972995758057, "learning_rate": 2.3541141719235123e-06, "loss": 0.3877, "step": 2864 }, { "epoch": 2.129583746283449, "grad_norm": 0.3823856711387634, "learning_rate": 2.3504444424691995e-06, "loss": 0.4327, "step": 2865 }, { "epoch": 2.1303270564915757, "grad_norm": 0.3555949032306671, "learning_rate": 2.346776696460294e-06, "loss": 0.3944, "step": 2866 }, { "epoch": 2.131070366699703, "grad_norm": 0.4276350140571594, "learning_rate": 2.3431109366424503e-06, "loss": 0.4288, "step": 2867 }, { "epoch": 2.1318136769078295, "grad_norm": 0.3072110116481781, "learning_rate": 2.3394471657598422e-06, "loss": 0.329, "step": 2868 }, { "epoch": 2.132556987115956, "grad_norm": 0.4030570685863495, "learning_rate": 2.335785386555153e-06, "loss": 0.4105, "step": 2869 }, { "epoch": 2.1333002973240833, "grad_norm": 0.36084961891174316, "learning_rate": 2.3321256017695727e-06, "loss": 0.3774, "step": 2870 }, { "epoch": 2.13404360753221, "grad_norm": 0.3767324984073639, "learning_rate": 2.3284678141427976e-06, "loss": 0.3859, "step": 2871 }, { "epoch": 2.134786917740337, "grad_norm": 0.4075312912464142, "learning_rate": 2.3248120264130357e-06, "loss": 0.3762, "step": 2872 }, { "epoch": 2.135530227948464, "grad_norm": 0.3529856204986572, "learning_rate": 2.3211582413169885e-06, "loss": 0.3958, "step": 2873 }, { "epoch": 2.1362735381565905, "grad_norm": 0.37073764204978943, "learning_rate": 2.3175064615898685e-06, "loss": 0.3491, "step": 2874 }, { "epoch": 2.1370168483647176, "grad_norm": 0.3902513384819031, "learning_rate": 2.3138566899653756e-06, "loss": 0.3987, "step": 2875 }, { "epoch": 2.1377601585728443, "grad_norm": 0.4487965703010559, "learning_rate": 2.310208929175717e-06, "loss": 0.474, "step": 2876 }, { "epoch": 2.1385034687809714, "grad_norm": 0.35557153820991516, "learning_rate": 2.3065631819515892e-06, "loss": 0.3623, "step": 2877 }, { "epoch": 2.139246778989098, "grad_norm": 0.3202385902404785, "learning_rate": 2.302919451022182e-06, "loss": 0.3594, "step": 2878 }, { "epoch": 2.1399900891972248, "grad_norm": 0.34496140480041504, "learning_rate": 2.299277739115174e-06, "loss": 0.3703, "step": 2879 }, { "epoch": 2.140733399405352, "grad_norm": 0.36253198981285095, "learning_rate": 2.295638048956738e-06, "loss": 0.3785, "step": 2880 }, { "epoch": 2.1414767096134786, "grad_norm": 0.35535648465156555, "learning_rate": 2.2920003832715275e-06, "loss": 0.3869, "step": 2881 }, { "epoch": 2.1422200198216057, "grad_norm": 0.34917211532592773, "learning_rate": 2.2883647447826813e-06, "loss": 0.3551, "step": 2882 }, { "epoch": 2.1429633300297324, "grad_norm": 0.373955100774765, "learning_rate": 2.2847311362118237e-06, "loss": 0.3742, "step": 2883 }, { "epoch": 2.143706640237859, "grad_norm": 0.39630112051963806, "learning_rate": 2.28109956027906e-06, "loss": 0.3582, "step": 2884 }, { "epoch": 2.144449950445986, "grad_norm": 0.4058905243873596, "learning_rate": 2.2774700197029675e-06, "loss": 0.3656, "step": 2885 }, { "epoch": 2.145193260654113, "grad_norm": 0.40281206369400024, "learning_rate": 2.273842517200607e-06, "loss": 0.3968, "step": 2886 }, { "epoch": 2.14593657086224, "grad_norm": 0.3755055069923401, "learning_rate": 2.2702170554875107e-06, "loss": 0.3632, "step": 2887 }, { "epoch": 2.1466798810703667, "grad_norm": 0.34261688590049744, "learning_rate": 2.26659363727768e-06, "loss": 0.3685, "step": 2888 }, { "epoch": 2.1474231912784933, "grad_norm": 0.35492342710494995, "learning_rate": 2.262972265283593e-06, "loss": 0.4043, "step": 2889 }, { "epoch": 2.1481665014866205, "grad_norm": 0.3825913071632385, "learning_rate": 2.259352942216189e-06, "loss": 0.3862, "step": 2890 }, { "epoch": 2.148909811694747, "grad_norm": 0.3513205647468567, "learning_rate": 2.255735670784879e-06, "loss": 0.3474, "step": 2891 }, { "epoch": 2.1496531219028743, "grad_norm": 0.36588263511657715, "learning_rate": 2.2521204536975375e-06, "loss": 0.3572, "step": 2892 }, { "epoch": 2.150396432111001, "grad_norm": 0.3500080406665802, "learning_rate": 2.248507293660499e-06, "loss": 0.4061, "step": 2893 }, { "epoch": 2.1511397423191276, "grad_norm": 0.3616887629032135, "learning_rate": 2.2448961933785568e-06, "loss": 0.3923, "step": 2894 }, { "epoch": 2.1518830525272548, "grad_norm": 0.3320009410381317, "learning_rate": 2.241287155554967e-06, "loss": 0.3584, "step": 2895 }, { "epoch": 2.1526263627353814, "grad_norm": 0.35265904664993286, "learning_rate": 2.237680182891436e-06, "loss": 0.3945, "step": 2896 }, { "epoch": 2.1533696729435086, "grad_norm": 0.3567326068878174, "learning_rate": 2.234075278088132e-06, "loss": 0.3904, "step": 2897 }, { "epoch": 2.1541129831516352, "grad_norm": 0.4240482449531555, "learning_rate": 2.2304724438436653e-06, "loss": 0.3764, "step": 2898 }, { "epoch": 2.154856293359762, "grad_norm": 0.3992108404636383, "learning_rate": 2.2268716828551045e-06, "loss": 0.3863, "step": 2899 }, { "epoch": 2.155599603567889, "grad_norm": 0.34499379992485046, "learning_rate": 2.2232729978179646e-06, "loss": 0.3761, "step": 2900 }, { "epoch": 2.1563429137760157, "grad_norm": 0.3726985454559326, "learning_rate": 2.219676391426203e-06, "loss": 0.3889, "step": 2901 }, { "epoch": 2.157086223984143, "grad_norm": 0.3559260368347168, "learning_rate": 2.2160818663722218e-06, "loss": 0.3631, "step": 2902 }, { "epoch": 2.1578295341922695, "grad_norm": 0.38933315873146057, "learning_rate": 2.2124894253468694e-06, "loss": 0.3987, "step": 2903 }, { "epoch": 2.158572844400396, "grad_norm": 0.38876304030418396, "learning_rate": 2.2088990710394292e-06, "loss": 0.3944, "step": 2904 }, { "epoch": 2.1593161546085233, "grad_norm": 0.3771619200706482, "learning_rate": 2.205310806137623e-06, "loss": 0.3838, "step": 2905 }, { "epoch": 2.16005946481665, "grad_norm": 0.3323052227497101, "learning_rate": 2.201724633327611e-06, "loss": 0.3089, "step": 2906 }, { "epoch": 2.160802775024777, "grad_norm": 0.3967815339565277, "learning_rate": 2.1981405552939886e-06, "loss": 0.4637, "step": 2907 }, { "epoch": 2.161546085232904, "grad_norm": 0.3335983455181122, "learning_rate": 2.194558574719777e-06, "loss": 0.3304, "step": 2908 }, { "epoch": 2.1622893954410305, "grad_norm": 0.3838711678981781, "learning_rate": 2.190978694286434e-06, "loss": 0.4218, "step": 2909 }, { "epoch": 2.1630327056491576, "grad_norm": 0.358462393283844, "learning_rate": 2.1874009166738396e-06, "loss": 0.3674, "step": 2910 }, { "epoch": 2.1637760158572843, "grad_norm": 0.3835816979408264, "learning_rate": 2.1838252445603015e-06, "loss": 0.365, "step": 2911 }, { "epoch": 2.1645193260654114, "grad_norm": 0.36234763264656067, "learning_rate": 2.180251680622554e-06, "loss": 0.3463, "step": 2912 }, { "epoch": 2.165262636273538, "grad_norm": 0.37716928124427795, "learning_rate": 2.176680227535749e-06, "loss": 0.3527, "step": 2913 }, { "epoch": 2.166005946481665, "grad_norm": 0.42728328704833984, "learning_rate": 2.173110887973461e-06, "loss": 0.4382, "step": 2914 }, { "epoch": 2.166749256689792, "grad_norm": 0.3667446970939636, "learning_rate": 2.169543664607684e-06, "loss": 0.3732, "step": 2915 }, { "epoch": 2.1674925668979186, "grad_norm": 0.35719141364097595, "learning_rate": 2.165978560108821e-06, "loss": 0.3568, "step": 2916 }, { "epoch": 2.1682358771060457, "grad_norm": 0.3712882697582245, "learning_rate": 2.162415577145698e-06, "loss": 0.3828, "step": 2917 }, { "epoch": 2.1689791873141724, "grad_norm": 0.3181682229042053, "learning_rate": 2.1588547183855453e-06, "loss": 0.3536, "step": 2918 }, { "epoch": 2.1697224975222995, "grad_norm": 0.3748629689216614, "learning_rate": 2.1552959864940044e-06, "loss": 0.4153, "step": 2919 }, { "epoch": 2.170465807730426, "grad_norm": 0.34077832102775574, "learning_rate": 2.151739384135129e-06, "loss": 0.3628, "step": 2920 }, { "epoch": 2.171209117938553, "grad_norm": 0.36609339714050293, "learning_rate": 2.148184913971375e-06, "loss": 0.4097, "step": 2921 }, { "epoch": 2.17195242814668, "grad_norm": 0.38976094126701355, "learning_rate": 2.1446325786635986e-06, "loss": 0.3863, "step": 2922 }, { "epoch": 2.1726957383548067, "grad_norm": 0.34962332248687744, "learning_rate": 2.141082380871069e-06, "loss": 0.3514, "step": 2923 }, { "epoch": 2.173439048562934, "grad_norm": 0.33963191509246826, "learning_rate": 2.137534323251445e-06, "loss": 0.3801, "step": 2924 }, { "epoch": 2.1741823587710605, "grad_norm": 0.3683713376522064, "learning_rate": 2.133988408460785e-06, "loss": 0.3732, "step": 2925 }, { "epoch": 2.174925668979187, "grad_norm": 0.3535967469215393, "learning_rate": 2.1304446391535482e-06, "loss": 0.3736, "step": 2926 }, { "epoch": 2.1756689791873143, "grad_norm": 0.35157492756843567, "learning_rate": 2.1269030179825834e-06, "loss": 0.348, "step": 2927 }, { "epoch": 2.176412289395441, "grad_norm": 0.34551864862442017, "learning_rate": 2.123363547599129e-06, "loss": 0.3936, "step": 2928 }, { "epoch": 2.177155599603568, "grad_norm": 0.3404235541820526, "learning_rate": 2.1198262306528207e-06, "loss": 0.3581, "step": 2929 }, { "epoch": 2.1778989098116948, "grad_norm": 0.3862476944923401, "learning_rate": 2.1162910697916773e-06, "loss": 0.407, "step": 2930 }, { "epoch": 2.1786422200198214, "grad_norm": 0.3574124574661255, "learning_rate": 2.1127580676621023e-06, "loss": 0.3941, "step": 2931 }, { "epoch": 2.1793855302279486, "grad_norm": 0.3443765938282013, "learning_rate": 2.1092272269088887e-06, "loss": 0.3421, "step": 2932 }, { "epoch": 2.1801288404360752, "grad_norm": 0.3902258574962616, "learning_rate": 2.1056985501752063e-06, "loss": 0.4231, "step": 2933 }, { "epoch": 2.1808721506442024, "grad_norm": 0.33318063616752625, "learning_rate": 2.1021720401026047e-06, "loss": 0.3335, "step": 2934 }, { "epoch": 2.181615460852329, "grad_norm": 0.39416030049324036, "learning_rate": 2.0986476993310167e-06, "loss": 0.3863, "step": 2935 }, { "epoch": 2.1823587710604557, "grad_norm": 0.3433258831501007, "learning_rate": 2.095125530498745e-06, "loss": 0.395, "step": 2936 }, { "epoch": 2.183102081268583, "grad_norm": 0.33552631735801697, "learning_rate": 2.091605536242471e-06, "loss": 0.3787, "step": 2937 }, { "epoch": 2.1838453914767095, "grad_norm": 0.4063169062137604, "learning_rate": 2.088087719197248e-06, "loss": 0.4374, "step": 2938 }, { "epoch": 2.1845887016848367, "grad_norm": 0.3598313629627228, "learning_rate": 2.0845720819964955e-06, "loss": 0.3872, "step": 2939 }, { "epoch": 2.1853320118929633, "grad_norm": 0.36598289012908936, "learning_rate": 2.0810586272720067e-06, "loss": 0.4185, "step": 2940 }, { "epoch": 2.18607532210109, "grad_norm": 0.36862504482269287, "learning_rate": 2.077547357653936e-06, "loss": 0.3693, "step": 2941 }, { "epoch": 2.186818632309217, "grad_norm": 0.3361433148384094, "learning_rate": 2.074038275770802e-06, "loss": 0.3755, "step": 2942 }, { "epoch": 2.187561942517344, "grad_norm": 0.33257386088371277, "learning_rate": 2.070531384249493e-06, "loss": 0.3526, "step": 2943 }, { "epoch": 2.188305252725471, "grad_norm": 0.3532215356826782, "learning_rate": 2.067026685715248e-06, "loss": 0.4153, "step": 2944 }, { "epoch": 2.1890485629335976, "grad_norm": 0.3525550365447998, "learning_rate": 2.0635241827916667e-06, "loss": 0.4051, "step": 2945 }, { "epoch": 2.1897918731417243, "grad_norm": 0.3320842981338501, "learning_rate": 2.060023878100714e-06, "loss": 0.3724, "step": 2946 }, { "epoch": 2.1905351833498514, "grad_norm": 0.37983906269073486, "learning_rate": 2.0565257742626986e-06, "loss": 0.4072, "step": 2947 }, { "epoch": 2.191278493557978, "grad_norm": 0.3565690219402313, "learning_rate": 2.0530298738962836e-06, "loss": 0.3856, "step": 2948 }, { "epoch": 2.1920218037661052, "grad_norm": 0.3785719871520996, "learning_rate": 2.0495361796184874e-06, "loss": 0.3939, "step": 2949 }, { "epoch": 2.192765113974232, "grad_norm": 0.3993881046772003, "learning_rate": 2.0460446940446726e-06, "loss": 0.4376, "step": 2950 }, { "epoch": 2.1935084241823586, "grad_norm": 0.35972851514816284, "learning_rate": 2.0425554197885478e-06, "loss": 0.4474, "step": 2951 }, { "epoch": 2.1942517343904857, "grad_norm": 0.33954915404319763, "learning_rate": 2.0390683594621697e-06, "loss": 0.3864, "step": 2952 }, { "epoch": 2.1949950445986124, "grad_norm": 0.3114657402038574, "learning_rate": 2.035583515675938e-06, "loss": 0.345, "step": 2953 }, { "epoch": 2.1957383548067395, "grad_norm": 0.3489568531513214, "learning_rate": 2.032100891038587e-06, "loss": 0.3488, "step": 2954 }, { "epoch": 2.196481665014866, "grad_norm": 0.39245712757110596, "learning_rate": 2.028620488157198e-06, "loss": 0.4374, "step": 2955 }, { "epoch": 2.197224975222993, "grad_norm": 0.3480879068374634, "learning_rate": 2.0251423096371818e-06, "loss": 0.3952, "step": 2956 }, { "epoch": 2.19796828543112, "grad_norm": 0.34766486287117004, "learning_rate": 2.02166635808229e-06, "loss": 0.3898, "step": 2957 }, { "epoch": 2.1987115956392467, "grad_norm": 0.31023725867271423, "learning_rate": 2.0181926360946025e-06, "loss": 0.347, "step": 2958 }, { "epoch": 2.199454905847374, "grad_norm": 0.34676453471183777, "learning_rate": 2.014721146274531e-06, "loss": 0.3849, "step": 2959 }, { "epoch": 2.2001982160555005, "grad_norm": 0.34920036792755127, "learning_rate": 2.0112518912208183e-06, "loss": 0.3518, "step": 2960 }, { "epoch": 2.200941526263627, "grad_norm": 0.36112645268440247, "learning_rate": 2.0077848735305354e-06, "loss": 0.366, "step": 2961 }, { "epoch": 2.2016848364717543, "grad_norm": 0.39478886127471924, "learning_rate": 2.004320095799072e-06, "loss": 0.3566, "step": 2962 }, { "epoch": 2.202428146679881, "grad_norm": 0.358015775680542, "learning_rate": 2.0008575606201496e-06, "loss": 0.4122, "step": 2963 }, { "epoch": 2.203171456888008, "grad_norm": 0.36189454793930054, "learning_rate": 1.997397270585804e-06, "loss": 0.4028, "step": 2964 }, { "epoch": 2.2039147670961348, "grad_norm": 0.3676878809928894, "learning_rate": 1.9939392282863917e-06, "loss": 0.3485, "step": 2965 }, { "epoch": 2.2046580773042614, "grad_norm": 0.35376274585723877, "learning_rate": 1.9904834363105908e-06, "loss": 0.3986, "step": 2966 }, { "epoch": 2.2054013875123886, "grad_norm": 0.38610294461250305, "learning_rate": 1.9870298972453894e-06, "loss": 0.4409, "step": 2967 }, { "epoch": 2.2061446977205152, "grad_norm": 0.34612682461738586, "learning_rate": 1.9835786136760888e-06, "loss": 0.3413, "step": 2968 }, { "epoch": 2.2068880079286424, "grad_norm": 0.3520868420600891, "learning_rate": 1.980129588186311e-06, "loss": 0.4313, "step": 2969 }, { "epoch": 2.207631318136769, "grad_norm": 0.3295358121395111, "learning_rate": 1.976682823357977e-06, "loss": 0.3743, "step": 2970 }, { "epoch": 2.2083746283448957, "grad_norm": 0.3662126064300537, "learning_rate": 1.9732383217713184e-06, "loss": 0.4034, "step": 2971 }, { "epoch": 2.209117938553023, "grad_norm": 0.34287703037261963, "learning_rate": 1.9697960860048774e-06, "loss": 0.3554, "step": 2972 }, { "epoch": 2.2098612487611495, "grad_norm": 0.41447731852531433, "learning_rate": 1.966356118635494e-06, "loss": 0.3837, "step": 2973 }, { "epoch": 2.2106045589692767, "grad_norm": 0.3617353141307831, "learning_rate": 1.96291842223831e-06, "loss": 0.3983, "step": 2974 }, { "epoch": 2.2113478691774033, "grad_norm": 0.3926834762096405, "learning_rate": 1.9594829993867725e-06, "loss": 0.3951, "step": 2975 }, { "epoch": 2.2120911793855305, "grad_norm": 0.35471466183662415, "learning_rate": 1.9560498526526245e-06, "loss": 0.3485, "step": 2976 }, { "epoch": 2.212834489593657, "grad_norm": 0.37934234738349915, "learning_rate": 1.9526189846059013e-06, "loss": 0.3977, "step": 2977 }, { "epoch": 2.213577799801784, "grad_norm": 0.3723660111427307, "learning_rate": 1.9491903978149386e-06, "loss": 0.3669, "step": 2978 }, { "epoch": 2.214321110009911, "grad_norm": 0.36026614904403687, "learning_rate": 1.9457640948463575e-06, "loss": 0.3737, "step": 2979 }, { "epoch": 2.2150644202180376, "grad_norm": 0.32516950368881226, "learning_rate": 1.942340078265076e-06, "loss": 0.382, "step": 2980 }, { "epoch": 2.2158077304261647, "grad_norm": 0.42014896869659424, "learning_rate": 1.938918350634297e-06, "loss": 0.3874, "step": 2981 }, { "epoch": 2.2165510406342914, "grad_norm": 0.36264267563819885, "learning_rate": 1.9354989145155077e-06, "loss": 0.3631, "step": 2982 }, { "epoch": 2.217294350842418, "grad_norm": 0.35048025846481323, "learning_rate": 1.9320817724684855e-06, "loss": 0.4009, "step": 2983 }, { "epoch": 2.2180376610505452, "grad_norm": 0.46670055389404297, "learning_rate": 1.928666927051288e-06, "loss": 0.4419, "step": 2984 }, { "epoch": 2.218780971258672, "grad_norm": 0.38125115633010864, "learning_rate": 1.9252543808202507e-06, "loss": 0.3426, "step": 2985 }, { "epoch": 2.219524281466799, "grad_norm": 0.37375977635383606, "learning_rate": 1.9218441363299926e-06, "loss": 0.377, "step": 2986 }, { "epoch": 2.2202675916749257, "grad_norm": 0.3899598717689514, "learning_rate": 1.9184361961334074e-06, "loss": 0.3734, "step": 2987 }, { "epoch": 2.2210109018830524, "grad_norm": 0.37753960490226746, "learning_rate": 1.915030562781661e-06, "loss": 0.44, "step": 2988 }, { "epoch": 2.2217542120911795, "grad_norm": 0.32407626509666443, "learning_rate": 1.9116272388241994e-06, "loss": 0.3568, "step": 2989 }, { "epoch": 2.222497522299306, "grad_norm": 0.41458556056022644, "learning_rate": 1.9082262268087338e-06, "loss": 0.4566, "step": 2990 }, { "epoch": 2.2232408325074333, "grad_norm": 0.3278855085372925, "learning_rate": 1.9048275292812424e-06, "loss": 0.3174, "step": 2991 }, { "epoch": 2.22398414271556, "grad_norm": 0.3701920509338379, "learning_rate": 1.901431148785982e-06, "loss": 0.379, "step": 2992 }, { "epoch": 2.2247274529236867, "grad_norm": 0.3735508322715759, "learning_rate": 1.8980370878654646e-06, "loss": 0.4097, "step": 2993 }, { "epoch": 2.225470763131814, "grad_norm": 0.3575340211391449, "learning_rate": 1.8946453490604677e-06, "loss": 0.3928, "step": 2994 }, { "epoch": 2.2262140733399405, "grad_norm": 0.3711152970790863, "learning_rate": 1.8912559349100347e-06, "loss": 0.3925, "step": 2995 }, { "epoch": 2.2269573835480676, "grad_norm": 0.367652028799057, "learning_rate": 1.8878688479514634e-06, "loss": 0.3838, "step": 2996 }, { "epoch": 2.2277006937561943, "grad_norm": 0.34006860852241516, "learning_rate": 1.8844840907203115e-06, "loss": 0.3459, "step": 2997 }, { "epoch": 2.228444003964321, "grad_norm": 0.3509629964828491, "learning_rate": 1.881101665750396e-06, "loss": 0.3688, "step": 2998 }, { "epoch": 2.229187314172448, "grad_norm": 0.3914438486099243, "learning_rate": 1.877721575573782e-06, "loss": 0.3355, "step": 2999 }, { "epoch": 2.2299306243805748, "grad_norm": 0.43060705065727234, "learning_rate": 1.8743438227207905e-06, "loss": 0.3768, "step": 3000 }, { "epoch": 2.230673934588702, "grad_norm": 0.3788752257823944, "learning_rate": 1.8709684097199948e-06, "loss": 0.4559, "step": 3001 }, { "epoch": 2.2314172447968286, "grad_norm": 0.3214876353740692, "learning_rate": 1.8675953390982103e-06, "loss": 0.3351, "step": 3002 }, { "epoch": 2.2321605550049552, "grad_norm": 0.37666741013526917, "learning_rate": 1.864224613380506e-06, "loss": 0.4133, "step": 3003 }, { "epoch": 2.2329038652130824, "grad_norm": 0.3471716046333313, "learning_rate": 1.8608562350901893e-06, "loss": 0.3749, "step": 3004 }, { "epoch": 2.233647175421209, "grad_norm": 0.35595592856407166, "learning_rate": 1.8574902067488132e-06, "loss": 0.3899, "step": 3005 }, { "epoch": 2.234390485629336, "grad_norm": 0.346516489982605, "learning_rate": 1.854126530876172e-06, "loss": 0.3685, "step": 3006 }, { "epoch": 2.235133795837463, "grad_norm": 0.4056026041507721, "learning_rate": 1.8507652099903001e-06, "loss": 0.4174, "step": 3007 }, { "epoch": 2.2358771060455895, "grad_norm": 0.34890490770339966, "learning_rate": 1.8474062466074637e-06, "loss": 0.3454, "step": 3008 }, { "epoch": 2.2366204162537167, "grad_norm": 0.3402920365333557, "learning_rate": 1.8440496432421723e-06, "loss": 0.3746, "step": 3009 }, { "epoch": 2.2373637264618433, "grad_norm": 0.3678801655769348, "learning_rate": 1.840695402407161e-06, "loss": 0.413, "step": 3010 }, { "epoch": 2.2381070366699705, "grad_norm": 0.3456621766090393, "learning_rate": 1.8373435266133983e-06, "loss": 0.3484, "step": 3011 }, { "epoch": 2.238850346878097, "grad_norm": 0.34502989053726196, "learning_rate": 1.8339940183700872e-06, "loss": 0.3565, "step": 3012 }, { "epoch": 2.239593657086224, "grad_norm": 0.379826158285141, "learning_rate": 1.8306468801846522e-06, "loss": 0.3677, "step": 3013 }, { "epoch": 2.240336967294351, "grad_norm": 0.35751911997795105, "learning_rate": 1.8273021145627423e-06, "loss": 0.4215, "step": 3014 }, { "epoch": 2.2410802775024776, "grad_norm": 0.32938477396965027, "learning_rate": 1.8239597240082419e-06, "loss": 0.3314, "step": 3015 }, { "epoch": 2.2418235877106047, "grad_norm": 0.34948059916496277, "learning_rate": 1.8206197110232454e-06, "loss": 0.3602, "step": 3016 }, { "epoch": 2.2425668979187314, "grad_norm": 0.38026997447013855, "learning_rate": 1.8172820781080697e-06, "loss": 0.3949, "step": 3017 }, { "epoch": 2.243310208126858, "grad_norm": 0.35310155153274536, "learning_rate": 1.8139468277612559e-06, "loss": 0.4031, "step": 3018 }, { "epoch": 2.2440535183349852, "grad_norm": 0.3136982023715973, "learning_rate": 1.8106139624795537e-06, "loss": 0.3716, "step": 3019 }, { "epoch": 2.244796828543112, "grad_norm": 0.4144144654273987, "learning_rate": 1.8072834847579347e-06, "loss": 0.4111, "step": 3020 }, { "epoch": 2.245540138751239, "grad_norm": 0.3507081866264343, "learning_rate": 1.8039553970895774e-06, "loss": 0.3155, "step": 3021 }, { "epoch": 2.2462834489593657, "grad_norm": 0.37943992018699646, "learning_rate": 1.800629701965872e-06, "loss": 0.4293, "step": 3022 }, { "epoch": 2.2470267591674924, "grad_norm": 0.3741309344768524, "learning_rate": 1.7973064018764203e-06, "loss": 0.4061, "step": 3023 }, { "epoch": 2.2477700693756195, "grad_norm": 0.3463471233844757, "learning_rate": 1.7939854993090316e-06, "loss": 0.3617, "step": 3024 }, { "epoch": 2.248513379583746, "grad_norm": 0.3280263841152191, "learning_rate": 1.7906669967497158e-06, "loss": 0.3495, "step": 3025 }, { "epoch": 2.2492566897918733, "grad_norm": 0.3527916967868805, "learning_rate": 1.7873508966826918e-06, "loss": 0.4229, "step": 3026 }, { "epoch": 2.25, "grad_norm": 0.3204686641693115, "learning_rate": 1.7840372015903762e-06, "loss": 0.3311, "step": 3027 }, { "epoch": 2.2507433102081267, "grad_norm": 0.3695390820503235, "learning_rate": 1.780725913953384e-06, "loss": 0.4099, "step": 3028 }, { "epoch": 2.251486620416254, "grad_norm": 0.3965820372104645, "learning_rate": 1.7774170362505321e-06, "loss": 0.3785, "step": 3029 }, { "epoch": 2.2522299306243805, "grad_norm": 0.3598635494709015, "learning_rate": 1.7741105709588336e-06, "loss": 0.3921, "step": 3030 }, { "epoch": 2.2529732408325076, "grad_norm": 0.34786489605903625, "learning_rate": 1.7708065205534896e-06, "loss": 0.3775, "step": 3031 }, { "epoch": 2.2537165510406343, "grad_norm": 0.35848209261894226, "learning_rate": 1.767504887507901e-06, "loss": 0.4171, "step": 3032 }, { "epoch": 2.254459861248761, "grad_norm": 0.36310237646102905, "learning_rate": 1.7642056742936536e-06, "loss": 0.3727, "step": 3033 }, { "epoch": 2.255203171456888, "grad_norm": 0.3490123748779297, "learning_rate": 1.7609088833805227e-06, "loss": 0.4107, "step": 3034 }, { "epoch": 2.2559464816650148, "grad_norm": 0.38054898381233215, "learning_rate": 1.757614517236474e-06, "loss": 0.4081, "step": 3035 }, { "epoch": 2.256689791873142, "grad_norm": 0.35038718581199646, "learning_rate": 1.754322578327653e-06, "loss": 0.3824, "step": 3036 }, { "epoch": 2.2574331020812686, "grad_norm": 0.38290804624557495, "learning_rate": 1.751033069118388e-06, "loss": 0.4048, "step": 3037 }, { "epoch": 2.2581764122893953, "grad_norm": 0.3326457142829895, "learning_rate": 1.747745992071197e-06, "loss": 0.3654, "step": 3038 }, { "epoch": 2.2589197224975224, "grad_norm": 0.35597726702690125, "learning_rate": 1.7444613496467684e-06, "loss": 0.3691, "step": 3039 }, { "epoch": 2.259663032705649, "grad_norm": 0.37539756298065186, "learning_rate": 1.7411791443039683e-06, "loss": 0.391, "step": 3040 }, { "epoch": 2.260406342913776, "grad_norm": 0.367281973361969, "learning_rate": 1.7378993784998443e-06, "loss": 0.3542, "step": 3041 }, { "epoch": 2.261149653121903, "grad_norm": 0.36887624859809875, "learning_rate": 1.734622054689612e-06, "loss": 0.4219, "step": 3042 }, { "epoch": 2.2618929633300295, "grad_norm": 0.34298792481422424, "learning_rate": 1.7313471753266637e-06, "loss": 0.3394, "step": 3043 }, { "epoch": 2.2626362735381567, "grad_norm": 0.3660745918750763, "learning_rate": 1.7280747428625577e-06, "loss": 0.3773, "step": 3044 }, { "epoch": 2.2633795837462833, "grad_norm": 0.3429892957210541, "learning_rate": 1.7248047597470214e-06, "loss": 0.3708, "step": 3045 }, { "epoch": 2.2641228939544105, "grad_norm": 0.35026153922080994, "learning_rate": 1.7215372284279502e-06, "loss": 0.3751, "step": 3046 }, { "epoch": 2.264866204162537, "grad_norm": 0.38039958477020264, "learning_rate": 1.7182721513514056e-06, "loss": 0.3726, "step": 3047 }, { "epoch": 2.265609514370664, "grad_norm": 0.3806914985179901, "learning_rate": 1.715009530961606e-06, "loss": 0.4167, "step": 3048 }, { "epoch": 2.266352824578791, "grad_norm": 0.38965511322021484, "learning_rate": 1.7117493697009385e-06, "loss": 0.4057, "step": 3049 }, { "epoch": 2.2670961347869176, "grad_norm": 0.36990174651145935, "learning_rate": 1.708491670009943e-06, "loss": 0.3763, "step": 3050 }, { "epoch": 2.2678394449950448, "grad_norm": 0.3407042324542999, "learning_rate": 1.7052364343273164e-06, "loss": 0.3534, "step": 3051 }, { "epoch": 2.2685827552031714, "grad_norm": 0.4041297435760498, "learning_rate": 1.7019836650899186e-06, "loss": 0.4122, "step": 3052 }, { "epoch": 2.269326065411298, "grad_norm": 0.3337760865688324, "learning_rate": 1.698733364732753e-06, "loss": 0.3794, "step": 3053 }, { "epoch": 2.2700693756194252, "grad_norm": 0.36151987314224243, "learning_rate": 1.6954855356889826e-06, "loss": 0.3963, "step": 3054 }, { "epoch": 2.270812685827552, "grad_norm": 0.3709504008293152, "learning_rate": 1.6922401803899197e-06, "loss": 0.3979, "step": 3055 }, { "epoch": 2.271555996035679, "grad_norm": 0.3495272397994995, "learning_rate": 1.6889973012650196e-06, "loss": 0.3681, "step": 3056 }, { "epoch": 2.2722993062438057, "grad_norm": 0.37867283821105957, "learning_rate": 1.685756900741886e-06, "loss": 0.3716, "step": 3057 }, { "epoch": 2.2730426164519324, "grad_norm": 0.3382737934589386, "learning_rate": 1.6825189812462722e-06, "loss": 0.3982, "step": 3058 }, { "epoch": 2.2737859266600595, "grad_norm": 0.3405672013759613, "learning_rate": 1.6792835452020668e-06, "loss": 0.3362, "step": 3059 }, { "epoch": 2.274529236868186, "grad_norm": 0.34836679697036743, "learning_rate": 1.6760505950313033e-06, "loss": 0.4456, "step": 3060 }, { "epoch": 2.2752725470763133, "grad_norm": 0.3520950376987457, "learning_rate": 1.672820133154156e-06, "loss": 0.3777, "step": 3061 }, { "epoch": 2.27601585728444, "grad_norm": 0.3398626446723938, "learning_rate": 1.6695921619889327e-06, "loss": 0.3839, "step": 3062 }, { "epoch": 2.2767591674925667, "grad_norm": 0.3894604742527008, "learning_rate": 1.6663666839520765e-06, "loss": 0.4313, "step": 3063 }, { "epoch": 2.277502477700694, "grad_norm": 0.34793993830680847, "learning_rate": 1.6631437014581686e-06, "loss": 0.3476, "step": 3064 }, { "epoch": 2.2782457879088205, "grad_norm": 0.3718230128288269, "learning_rate": 1.6599232169199164e-06, "loss": 0.3685, "step": 3065 }, { "epoch": 2.2789890981169476, "grad_norm": 0.380293607711792, "learning_rate": 1.6567052327481635e-06, "loss": 0.4069, "step": 3066 }, { "epoch": 2.2797324083250743, "grad_norm": 0.3828063905239105, "learning_rate": 1.6534897513518767e-06, "loss": 0.3676, "step": 3067 }, { "epoch": 2.280475718533201, "grad_norm": 0.32160040736198425, "learning_rate": 1.6502767751381488e-06, "loss": 0.334, "step": 3068 }, { "epoch": 2.281219028741328, "grad_norm": 0.36721158027648926, "learning_rate": 1.6470663065122017e-06, "loss": 0.4192, "step": 3069 }, { "epoch": 2.2819623389494548, "grad_norm": 0.3427736163139343, "learning_rate": 1.643858347877379e-06, "loss": 0.3653, "step": 3070 }, { "epoch": 2.282705649157582, "grad_norm": 0.34863707423210144, "learning_rate": 1.6406529016351413e-06, "loss": 0.3329, "step": 3071 }, { "epoch": 2.2834489593657086, "grad_norm": 0.38619932532310486, "learning_rate": 1.6374499701850737e-06, "loss": 0.4308, "step": 3072 }, { "epoch": 2.2841922695738353, "grad_norm": 0.33924251794815063, "learning_rate": 1.6342495559248762e-06, "loss": 0.3319, "step": 3073 }, { "epoch": 2.2849355797819624, "grad_norm": 0.34128355979919434, "learning_rate": 1.6310516612503614e-06, "loss": 0.397, "step": 3074 }, { "epoch": 2.285678889990089, "grad_norm": 0.3492671549320221, "learning_rate": 1.6278562885554628e-06, "loss": 0.4137, "step": 3075 }, { "epoch": 2.286422200198216, "grad_norm": 0.32725223898887634, "learning_rate": 1.6246634402322188e-06, "loss": 0.3509, "step": 3076 }, { "epoch": 2.287165510406343, "grad_norm": 0.35813966393470764, "learning_rate": 1.6214731186707833e-06, "loss": 0.4128, "step": 3077 }, { "epoch": 2.2879088206144695, "grad_norm": 0.34484022855758667, "learning_rate": 1.6182853262594173e-06, "loss": 0.4116, "step": 3078 }, { "epoch": 2.2886521308225967, "grad_norm": 0.35129314661026, "learning_rate": 1.6151000653844879e-06, "loss": 0.3905, "step": 3079 }, { "epoch": 2.2893954410307233, "grad_norm": 0.33847907185554504, "learning_rate": 1.6119173384304638e-06, "loss": 0.3612, "step": 3080 }, { "epoch": 2.2901387512388505, "grad_norm": 0.3704979121685028, "learning_rate": 1.6087371477799241e-06, "loss": 0.3877, "step": 3081 }, { "epoch": 2.290882061446977, "grad_norm": 0.3986649215221405, "learning_rate": 1.6055594958135424e-06, "loss": 0.3607, "step": 3082 }, { "epoch": 2.291625371655104, "grad_norm": 0.4069364070892334, "learning_rate": 1.6023843849100956e-06, "loss": 0.4047, "step": 3083 }, { "epoch": 2.292368681863231, "grad_norm": 0.3357767164707184, "learning_rate": 1.59921181744646e-06, "loss": 0.3533, "step": 3084 }, { "epoch": 2.2931119920713576, "grad_norm": 0.34654900431632996, "learning_rate": 1.5960417957976026e-06, "loss": 0.401, "step": 3085 }, { "epoch": 2.2938553022794848, "grad_norm": 0.37481167912483215, "learning_rate": 1.5928743223365866e-06, "loss": 0.3483, "step": 3086 }, { "epoch": 2.2945986124876114, "grad_norm": 0.4279390275478363, "learning_rate": 1.5897093994345714e-06, "loss": 0.4121, "step": 3087 }, { "epoch": 2.295341922695738, "grad_norm": 0.39790138602256775, "learning_rate": 1.586547029460801e-06, "loss": 0.37, "step": 3088 }, { "epoch": 2.2960852329038652, "grad_norm": 0.3519779443740845, "learning_rate": 1.5833872147826141e-06, "loss": 0.3911, "step": 3089 }, { "epoch": 2.296828543111992, "grad_norm": 0.3450671434402466, "learning_rate": 1.5802299577654328e-06, "loss": 0.3899, "step": 3090 }, { "epoch": 2.297571853320119, "grad_norm": 0.3359356224536896, "learning_rate": 1.5770752607727625e-06, "loss": 0.3415, "step": 3091 }, { "epoch": 2.2983151635282457, "grad_norm": 0.390878289937973, "learning_rate": 1.573923126166201e-06, "loss": 0.3956, "step": 3092 }, { "epoch": 2.2990584737363724, "grad_norm": 0.3704856336116791, "learning_rate": 1.5707735563054188e-06, "loss": 0.3507, "step": 3093 }, { "epoch": 2.2998017839444995, "grad_norm": 0.4059693515300751, "learning_rate": 1.5676265535481683e-06, "loss": 0.4545, "step": 3094 }, { "epoch": 2.300545094152626, "grad_norm": 0.3400088846683502, "learning_rate": 1.5644821202502857e-06, "loss": 0.3398, "step": 3095 }, { "epoch": 2.3012884043607533, "grad_norm": 0.37118858098983765, "learning_rate": 1.5613402587656772e-06, "loss": 0.349, "step": 3096 }, { "epoch": 2.30203171456888, "grad_norm": 0.41319602727890015, "learning_rate": 1.5582009714463247e-06, "loss": 0.4224, "step": 3097 }, { "epoch": 2.302775024777007, "grad_norm": 0.3502465784549713, "learning_rate": 1.5550642606422882e-06, "loss": 0.3571, "step": 3098 }, { "epoch": 2.303518334985134, "grad_norm": 0.37589937448501587, "learning_rate": 1.551930128701692e-06, "loss": 0.3871, "step": 3099 }, { "epoch": 2.3042616451932605, "grad_norm": 0.3180294334888458, "learning_rate": 1.5487985779707348e-06, "loss": 0.3468, "step": 3100 }, { "epoch": 2.3050049554013876, "grad_norm": 0.38744163513183594, "learning_rate": 1.5456696107936825e-06, "loss": 0.4711, "step": 3101 }, { "epoch": 2.3057482656095143, "grad_norm": 0.31219395995140076, "learning_rate": 1.542543229512865e-06, "loss": 0.3082, "step": 3102 }, { "epoch": 2.3064915758176414, "grad_norm": 0.3518911898136139, "learning_rate": 1.5394194364686754e-06, "loss": 0.3894, "step": 3103 }, { "epoch": 2.307234886025768, "grad_norm": 0.32586538791656494, "learning_rate": 1.5362982339995741e-06, "loss": 0.354, "step": 3104 }, { "epoch": 2.3079781962338948, "grad_norm": 0.3605780303478241, "learning_rate": 1.5331796244420766e-06, "loss": 0.4213, "step": 3105 }, { "epoch": 2.308721506442022, "grad_norm": 0.33747872710227966, "learning_rate": 1.5300636101307631e-06, "loss": 0.3519, "step": 3106 }, { "epoch": 2.3094648166501486, "grad_norm": 0.3243147134780884, "learning_rate": 1.5269501933982645e-06, "loss": 0.3893, "step": 3107 }, { "epoch": 2.3102081268582757, "grad_norm": 0.352943480014801, "learning_rate": 1.523839376575274e-06, "loss": 0.4334, "step": 3108 }, { "epoch": 2.3109514370664024, "grad_norm": 0.3445689082145691, "learning_rate": 1.520731161990532e-06, "loss": 0.3576, "step": 3109 }, { "epoch": 2.311694747274529, "grad_norm": 0.3927154541015625, "learning_rate": 1.5176255519708367e-06, "loss": 0.3782, "step": 3110 }, { "epoch": 2.312438057482656, "grad_norm": 0.3276536464691162, "learning_rate": 1.514522548841031e-06, "loss": 0.3842, "step": 3111 }, { "epoch": 2.313181367690783, "grad_norm": 0.329365611076355, "learning_rate": 1.5114221549240127e-06, "loss": 0.4048, "step": 3112 }, { "epoch": 2.31392467789891, "grad_norm": 0.3497539460659027, "learning_rate": 1.5083243725407203e-06, "loss": 0.3705, "step": 3113 }, { "epoch": 2.3146679881070367, "grad_norm": 0.3610522747039795, "learning_rate": 1.5052292040101374e-06, "loss": 0.3755, "step": 3114 }, { "epoch": 2.3154112983151633, "grad_norm": 0.309767484664917, "learning_rate": 1.502136651649299e-06, "loss": 0.3267, "step": 3115 }, { "epoch": 2.3161546085232905, "grad_norm": 0.33955416083335876, "learning_rate": 1.4990467177732732e-06, "loss": 0.3945, "step": 3116 }, { "epoch": 2.316897918731417, "grad_norm": 0.3293681740760803, "learning_rate": 1.4959594046951687e-06, "loss": 0.3665, "step": 3117 }, { "epoch": 2.3176412289395443, "grad_norm": 0.3608019948005676, "learning_rate": 1.492874714726138e-06, "loss": 0.4354, "step": 3118 }, { "epoch": 2.318384539147671, "grad_norm": 0.32620951533317566, "learning_rate": 1.489792650175364e-06, "loss": 0.3615, "step": 3119 }, { "epoch": 2.3191278493557976, "grad_norm": 0.36166560649871826, "learning_rate": 1.486713213350065e-06, "loss": 0.4188, "step": 3120 }, { "epoch": 2.3198711595639248, "grad_norm": 0.3196386396884918, "learning_rate": 1.483636406555497e-06, "loss": 0.3325, "step": 3121 }, { "epoch": 2.3206144697720514, "grad_norm": 0.3563154935836792, "learning_rate": 1.4805622320949404e-06, "loss": 0.41, "step": 3122 }, { "epoch": 2.3213577799801786, "grad_norm": 0.34884417057037354, "learning_rate": 1.4774906922697096e-06, "loss": 0.3585, "step": 3123 }, { "epoch": 2.3221010901883052, "grad_norm": 0.3606015145778656, "learning_rate": 1.474421789379148e-06, "loss": 0.4044, "step": 3124 }, { "epoch": 2.322844400396432, "grad_norm": 0.3262064754962921, "learning_rate": 1.4713555257206202e-06, "loss": 0.3885, "step": 3125 }, { "epoch": 2.323587710604559, "grad_norm": 0.37529534101486206, "learning_rate": 1.468291903589516e-06, "loss": 0.399, "step": 3126 }, { "epoch": 2.3243310208126857, "grad_norm": 0.33005237579345703, "learning_rate": 1.4652309252792524e-06, "loss": 0.3914, "step": 3127 }, { "epoch": 2.325074331020813, "grad_norm": 0.32938408851623535, "learning_rate": 1.462172593081261e-06, "loss": 0.3711, "step": 3128 }, { "epoch": 2.3258176412289395, "grad_norm": 0.3298938274383545, "learning_rate": 1.4591169092849988e-06, "loss": 0.3802, "step": 3129 }, { "epoch": 2.3265609514370666, "grad_norm": 0.3582051992416382, "learning_rate": 1.4560638761779334e-06, "loss": 0.3935, "step": 3130 }, { "epoch": 2.3273042616451933, "grad_norm": 0.3601090610027313, "learning_rate": 1.4530134960455534e-06, "loss": 0.3746, "step": 3131 }, { "epoch": 2.32804757185332, "grad_norm": 0.35663560032844543, "learning_rate": 1.4499657711713617e-06, "loss": 0.3838, "step": 3132 }, { "epoch": 2.328790882061447, "grad_norm": 0.37814247608184814, "learning_rate": 1.4469207038368693e-06, "loss": 0.3782, "step": 3133 }, { "epoch": 2.329534192269574, "grad_norm": 0.3168559968471527, "learning_rate": 1.4438782963215992e-06, "loss": 0.3519, "step": 3134 }, { "epoch": 2.330277502477701, "grad_norm": 0.36925992369651794, "learning_rate": 1.4408385509030859e-06, "loss": 0.4227, "step": 3135 }, { "epoch": 2.3310208126858276, "grad_norm": 0.3699934184551239, "learning_rate": 1.4378014698568686e-06, "loss": 0.3936, "step": 3136 }, { "epoch": 2.3317641228939543, "grad_norm": 0.33775594830513, "learning_rate": 1.4347670554564896e-06, "loss": 0.3832, "step": 3137 }, { "epoch": 2.3325074331020814, "grad_norm": 0.3980458378791809, "learning_rate": 1.4317353099735033e-06, "loss": 0.4069, "step": 3138 }, { "epoch": 2.333250743310208, "grad_norm": 0.32051020860671997, "learning_rate": 1.4287062356774578e-06, "loss": 0.3312, "step": 3139 }, { "epoch": 2.333994053518335, "grad_norm": 0.36102551221847534, "learning_rate": 1.4256798348359036e-06, "loss": 0.3975, "step": 3140 }, { "epoch": 2.334737363726462, "grad_norm": 0.3244664669036865, "learning_rate": 1.4226561097143938e-06, "loss": 0.3763, "step": 3141 }, { "epoch": 2.3354806739345886, "grad_norm": 0.3406892716884613, "learning_rate": 1.4196350625764731e-06, "loss": 0.3651, "step": 3142 }, { "epoch": 2.3362239841427157, "grad_norm": 0.3660113513469696, "learning_rate": 1.416616695683683e-06, "loss": 0.378, "step": 3143 }, { "epoch": 2.3369672943508424, "grad_norm": 0.36611899733543396, "learning_rate": 1.4136010112955623e-06, "loss": 0.3837, "step": 3144 }, { "epoch": 2.3377106045589695, "grad_norm": 0.3658992648124695, "learning_rate": 1.4105880116696358e-06, "loss": 0.4063, "step": 3145 }, { "epoch": 2.338453914767096, "grad_norm": 0.3147423565387726, "learning_rate": 1.4075776990614232e-06, "loss": 0.345, "step": 3146 }, { "epoch": 2.339197224975223, "grad_norm": 0.3901991546154022, "learning_rate": 1.4045700757244317e-06, "loss": 0.3734, "step": 3147 }, { "epoch": 2.33994053518335, "grad_norm": 0.38595277070999146, "learning_rate": 1.4015651439101546e-06, "loss": 0.3739, "step": 3148 }, { "epoch": 2.3406838453914767, "grad_norm": 0.3560275733470917, "learning_rate": 1.3985629058680678e-06, "loss": 0.3683, "step": 3149 }, { "epoch": 2.341427155599604, "grad_norm": 0.3538680374622345, "learning_rate": 1.3955633638456362e-06, "loss": 0.4091, "step": 3150 }, { "epoch": 2.3421704658077305, "grad_norm": 0.34269294142723083, "learning_rate": 1.392566520088301e-06, "loss": 0.3712, "step": 3151 }, { "epoch": 2.342913776015857, "grad_norm": 0.3482663929462433, "learning_rate": 1.3895723768394886e-06, "loss": 0.3787, "step": 3152 }, { "epoch": 2.3436570862239843, "grad_norm": 0.3545241951942444, "learning_rate": 1.3865809363405986e-06, "loss": 0.3355, "step": 3153 }, { "epoch": 2.344400396432111, "grad_norm": 0.3567512035369873, "learning_rate": 1.383592200831011e-06, "loss": 0.3354, "step": 3154 }, { "epoch": 2.345143706640238, "grad_norm": 0.38926199078559875, "learning_rate": 1.3806061725480813e-06, "loss": 0.3917, "step": 3155 }, { "epoch": 2.3458870168483648, "grad_norm": 0.35104435682296753, "learning_rate": 1.3776228537271358e-06, "loss": 0.3895, "step": 3156 }, { "epoch": 2.3466303270564914, "grad_norm": 0.3312903940677643, "learning_rate": 1.374642246601472e-06, "loss": 0.3677, "step": 3157 }, { "epoch": 2.3473736372646186, "grad_norm": 0.3663668930530548, "learning_rate": 1.3716643534023621e-06, "loss": 0.3903, "step": 3158 }, { "epoch": 2.3481169474727452, "grad_norm": 0.3236231207847595, "learning_rate": 1.3686891763590414e-06, "loss": 0.4057, "step": 3159 }, { "epoch": 2.3488602576808724, "grad_norm": 0.3441905081272125, "learning_rate": 1.3657167176987135e-06, "loss": 0.4041, "step": 3160 }, { "epoch": 2.349603567888999, "grad_norm": 0.3621191382408142, "learning_rate": 1.362746979646548e-06, "loss": 0.3693, "step": 3161 }, { "epoch": 2.3503468780971257, "grad_norm": 0.3269801735877991, "learning_rate": 1.3597799644256805e-06, "loss": 0.3704, "step": 3162 }, { "epoch": 2.351090188305253, "grad_norm": 0.33785900473594666, "learning_rate": 1.3568156742572013e-06, "loss": 0.3897, "step": 3163 }, { "epoch": 2.3518334985133795, "grad_norm": 0.32219353318214417, "learning_rate": 1.3538541113601677e-06, "loss": 0.373, "step": 3164 }, { "epoch": 2.3525768087215067, "grad_norm": 0.3284028470516205, "learning_rate": 1.350895277951591e-06, "loss": 0.386, "step": 3165 }, { "epoch": 2.3533201189296333, "grad_norm": 0.3365001082420349, "learning_rate": 1.3479391762464394e-06, "loss": 0.3984, "step": 3166 }, { "epoch": 2.35406342913776, "grad_norm": 0.3571934103965759, "learning_rate": 1.34498580845764e-06, "loss": 0.3832, "step": 3167 }, { "epoch": 2.354806739345887, "grad_norm": 0.3382384181022644, "learning_rate": 1.3420351767960682e-06, "loss": 0.3623, "step": 3168 }, { "epoch": 2.355550049554014, "grad_norm": 0.3502410650253296, "learning_rate": 1.3390872834705543e-06, "loss": 0.3918, "step": 3169 }, { "epoch": 2.356293359762141, "grad_norm": 0.36045780777931213, "learning_rate": 1.3361421306878803e-06, "loss": 0.376, "step": 3170 }, { "epoch": 2.3570366699702676, "grad_norm": 0.35572585463523865, "learning_rate": 1.3331997206527713e-06, "loss": 0.4282, "step": 3171 }, { "epoch": 2.3577799801783943, "grad_norm": 0.3298781216144562, "learning_rate": 1.3302600555679045e-06, "loss": 0.313, "step": 3172 }, { "epoch": 2.3585232903865214, "grad_norm": 0.34611976146698, "learning_rate": 1.3273231376338997e-06, "loss": 0.3923, "step": 3173 }, { "epoch": 2.359266600594648, "grad_norm": 0.33244821429252625, "learning_rate": 1.3243889690493177e-06, "loss": 0.3766, "step": 3174 }, { "epoch": 2.3600099108027752, "grad_norm": 0.36457163095474243, "learning_rate": 1.3214575520106677e-06, "loss": 0.3957, "step": 3175 }, { "epoch": 2.360753221010902, "grad_norm": 0.3432174026966095, "learning_rate": 1.3185288887123925e-06, "loss": 0.3826, "step": 3176 }, { "epoch": 2.3614965312190286, "grad_norm": 0.32993578910827637, "learning_rate": 1.3156029813468774e-06, "loss": 0.3811, "step": 3177 }, { "epoch": 2.3622398414271557, "grad_norm": 0.3437096178531647, "learning_rate": 1.312679832104445e-06, "loss": 0.3871, "step": 3178 }, { "epoch": 2.3629831516352824, "grad_norm": 0.33508598804473877, "learning_rate": 1.30975944317335e-06, "loss": 0.3864, "step": 3179 }, { "epoch": 2.3637264618434095, "grad_norm": 0.3492944538593292, "learning_rate": 1.3068418167397816e-06, "loss": 0.3829, "step": 3180 }, { "epoch": 2.364469772051536, "grad_norm": 0.3550223112106323, "learning_rate": 1.3039269549878652e-06, "loss": 0.3752, "step": 3181 }, { "epoch": 2.365213082259663, "grad_norm": 0.33172938227653503, "learning_rate": 1.3010148600996503e-06, "loss": 0.3601, "step": 3182 }, { "epoch": 2.36595639246779, "grad_norm": 0.3463656008243561, "learning_rate": 1.2981055342551175e-06, "loss": 0.3966, "step": 3183 }, { "epoch": 2.3666997026759167, "grad_norm": 0.39228034019470215, "learning_rate": 1.2951989796321768e-06, "loss": 0.3663, "step": 3184 }, { "epoch": 2.367443012884044, "grad_norm": 0.3546292185783386, "learning_rate": 1.292295198406663e-06, "loss": 0.3965, "step": 3185 }, { "epoch": 2.3681863230921705, "grad_norm": 0.3665103614330292, "learning_rate": 1.2893941927523306e-06, "loss": 0.404, "step": 3186 }, { "epoch": 2.368929633300297, "grad_norm": 0.38429582118988037, "learning_rate": 1.2864959648408626e-06, "loss": 0.3789, "step": 3187 }, { "epoch": 2.3696729435084243, "grad_norm": 0.32124063372612, "learning_rate": 1.283600516841858e-06, "loss": 0.3648, "step": 3188 }, { "epoch": 2.370416253716551, "grad_norm": 0.35684922337532043, "learning_rate": 1.2807078509228343e-06, "loss": 0.373, "step": 3189 }, { "epoch": 2.371159563924678, "grad_norm": 0.3153168261051178, "learning_rate": 1.2778179692492322e-06, "loss": 0.3374, "step": 3190 }, { "epoch": 2.3719028741328048, "grad_norm": 0.36595389246940613, "learning_rate": 1.2749308739844002e-06, "loss": 0.4225, "step": 3191 }, { "epoch": 2.3726461843409314, "grad_norm": 0.3252049386501312, "learning_rate": 1.2720465672896066e-06, "loss": 0.3843, "step": 3192 }, { "epoch": 2.3733894945490586, "grad_norm": 0.3861742913722992, "learning_rate": 1.2691650513240323e-06, "loss": 0.3849, "step": 3193 }, { "epoch": 2.3741328047571852, "grad_norm": 0.33378323912620544, "learning_rate": 1.2662863282447635e-06, "loss": 0.3514, "step": 3194 }, { "epoch": 2.3748761149653124, "grad_norm": 0.3472925126552582, "learning_rate": 1.2634104002068032e-06, "loss": 0.4003, "step": 3195 }, { "epoch": 2.375619425173439, "grad_norm": 0.3379301428794861, "learning_rate": 1.2605372693630564e-06, "loss": 0.3936, "step": 3196 }, { "epoch": 2.3763627353815657, "grad_norm": 0.35978272557258606, "learning_rate": 1.257666937864334e-06, "loss": 0.4527, "step": 3197 }, { "epoch": 2.377106045589693, "grad_norm": 0.3347962498664856, "learning_rate": 1.2547994078593568e-06, "loss": 0.3913, "step": 3198 }, { "epoch": 2.3778493557978195, "grad_norm": 0.3464134931564331, "learning_rate": 1.251934681494742e-06, "loss": 0.3793, "step": 3199 }, { "epoch": 2.3785926660059467, "grad_norm": 0.3315369784832001, "learning_rate": 1.2490727609150121e-06, "loss": 0.3801, "step": 3200 }, { "epoch": 2.3793359762140733, "grad_norm": 0.32183921337127686, "learning_rate": 1.2462136482625897e-06, "loss": 0.3666, "step": 3201 }, { "epoch": 2.3800792864222, "grad_norm": 0.34504377841949463, "learning_rate": 1.2433573456777926e-06, "loss": 0.3759, "step": 3202 }, { "epoch": 2.380822596630327, "grad_norm": 0.3475985825061798, "learning_rate": 1.2405038552988336e-06, "loss": 0.393, "step": 3203 }, { "epoch": 2.381565906838454, "grad_norm": 0.3571268618106842, "learning_rate": 1.2376531792618263e-06, "loss": 0.3766, "step": 3204 }, { "epoch": 2.382309217046581, "grad_norm": 0.31648391485214233, "learning_rate": 1.2348053197007725e-06, "loss": 0.3421, "step": 3205 }, { "epoch": 2.3830525272547076, "grad_norm": 0.39465975761413574, "learning_rate": 1.231960278747566e-06, "loss": 0.4361, "step": 3206 }, { "epoch": 2.3837958374628343, "grad_norm": 0.36206626892089844, "learning_rate": 1.2291180585319923e-06, "loss": 0.363, "step": 3207 }, { "epoch": 2.3845391476709614, "grad_norm": 0.3623907268047333, "learning_rate": 1.2262786611817273e-06, "loss": 0.3935, "step": 3208 }, { "epoch": 2.385282457879088, "grad_norm": 0.35465484857559204, "learning_rate": 1.223442088822327e-06, "loss": 0.3964, "step": 3209 }, { "epoch": 2.3860257680872152, "grad_norm": 0.3463999032974243, "learning_rate": 1.220608343577241e-06, "loss": 0.3679, "step": 3210 }, { "epoch": 2.386769078295342, "grad_norm": 0.3723702132701874, "learning_rate": 1.217777427567795e-06, "loss": 0.3966, "step": 3211 }, { "epoch": 2.3875123885034686, "grad_norm": 0.39343294501304626, "learning_rate": 1.2149493429132003e-06, "loss": 0.3871, "step": 3212 }, { "epoch": 2.3882556987115957, "grad_norm": 0.34892505407333374, "learning_rate": 1.2121240917305505e-06, "loss": 0.3866, "step": 3213 }, { "epoch": 2.3889990089197224, "grad_norm": 0.31823453307151794, "learning_rate": 1.2093016761348137e-06, "loss": 0.3639, "step": 3214 }, { "epoch": 2.3897423191278495, "grad_norm": 0.34014081954956055, "learning_rate": 1.2064820982388375e-06, "loss": 0.3441, "step": 3215 }, { "epoch": 2.390485629335976, "grad_norm": 0.3841252028942108, "learning_rate": 1.2036653601533483e-06, "loss": 0.4456, "step": 3216 }, { "epoch": 2.391228939544103, "grad_norm": 0.34729722142219543, "learning_rate": 1.2008514639869402e-06, "loss": 0.3686, "step": 3217 }, { "epoch": 2.39197224975223, "grad_norm": 0.38662445545196533, "learning_rate": 1.198040411846086e-06, "loss": 0.3681, "step": 3218 }, { "epoch": 2.3927155599603567, "grad_norm": 0.3592659533023834, "learning_rate": 1.1952322058351252e-06, "loss": 0.411, "step": 3219 }, { "epoch": 2.393458870168484, "grad_norm": 0.3408881425857544, "learning_rate": 1.1924268480562667e-06, "loss": 0.3978, "step": 3220 }, { "epoch": 2.3942021803766105, "grad_norm": 0.3738653361797333, "learning_rate": 1.189624340609592e-06, "loss": 0.4295, "step": 3221 }, { "epoch": 2.394945490584737, "grad_norm": 0.3601418137550354, "learning_rate": 1.1868246855930426e-06, "loss": 0.3552, "step": 3222 }, { "epoch": 2.3956888007928643, "grad_norm": 0.38174837827682495, "learning_rate": 1.1840278851024294e-06, "loss": 0.381, "step": 3223 }, { "epoch": 2.396432111000991, "grad_norm": 0.36246728897094727, "learning_rate": 1.1812339412314255e-06, "loss": 0.4393, "step": 3224 }, { "epoch": 2.397175421209118, "grad_norm": 0.3065725266933441, "learning_rate": 1.1784428560715633e-06, "loss": 0.3751, "step": 3225 }, { "epoch": 2.3979187314172448, "grad_norm": 0.37712612748146057, "learning_rate": 1.1756546317122364e-06, "loss": 0.3832, "step": 3226 }, { "epoch": 2.3986620416253714, "grad_norm": 0.345598042011261, "learning_rate": 1.1728692702406985e-06, "loss": 0.4161, "step": 3227 }, { "epoch": 2.3994053518334986, "grad_norm": 0.35029569268226624, "learning_rate": 1.1700867737420578e-06, "loss": 0.3824, "step": 3228 }, { "epoch": 2.4001486620416252, "grad_norm": 0.3376743793487549, "learning_rate": 1.1673071442992772e-06, "loss": 0.408, "step": 3229 }, { "epoch": 2.4008919722497524, "grad_norm": 0.3003757894039154, "learning_rate": 1.1645303839931764e-06, "loss": 0.3377, "step": 3230 }, { "epoch": 2.401635282457879, "grad_norm": 0.36260974407196045, "learning_rate": 1.1617564949024274e-06, "loss": 0.3981, "step": 3231 }, { "epoch": 2.4023785926660057, "grad_norm": 0.3364342749118805, "learning_rate": 1.1589854791035476e-06, "loss": 0.3653, "step": 3232 }, { "epoch": 2.403121902874133, "grad_norm": 0.3695155084133148, "learning_rate": 1.1562173386709107e-06, "loss": 0.3966, "step": 3233 }, { "epoch": 2.4038652130822595, "grad_norm": 0.3396022617816925, "learning_rate": 1.1534520756767304e-06, "loss": 0.3837, "step": 3234 }, { "epoch": 2.4046085232903867, "grad_norm": 0.35342541337013245, "learning_rate": 1.1506896921910738e-06, "loss": 0.3615, "step": 3235 }, { "epoch": 2.4053518334985133, "grad_norm": 0.3336713910102844, "learning_rate": 1.147930190281847e-06, "loss": 0.3645, "step": 3236 }, { "epoch": 2.40609514370664, "grad_norm": 0.33778271079063416, "learning_rate": 1.1451735720147995e-06, "loss": 0.3653, "step": 3237 }, { "epoch": 2.406838453914767, "grad_norm": 0.34298956394195557, "learning_rate": 1.142419839453524e-06, "loss": 0.4094, "step": 3238 }, { "epoch": 2.407581764122894, "grad_norm": 0.35539528727531433, "learning_rate": 1.1396689946594552e-06, "loss": 0.3961, "step": 3239 }, { "epoch": 2.408325074331021, "grad_norm": 0.32189488410949707, "learning_rate": 1.1369210396918595e-06, "loss": 0.3533, "step": 3240 }, { "epoch": 2.4090683845391476, "grad_norm": 0.3759627640247345, "learning_rate": 1.1341759766078465e-06, "loss": 0.4383, "step": 3241 }, { "epoch": 2.4098116947472743, "grad_norm": 0.3274308741092682, "learning_rate": 1.1314338074623565e-06, "loss": 0.3746, "step": 3242 }, { "epoch": 2.4105550049554014, "grad_norm": 0.34865736961364746, "learning_rate": 1.128694534308164e-06, "loss": 0.3765, "step": 3243 }, { "epoch": 2.411298315163528, "grad_norm": 0.37561625242233276, "learning_rate": 1.1259581591958796e-06, "loss": 0.4494, "step": 3244 }, { "epoch": 2.4120416253716552, "grad_norm": 0.31631165742874146, "learning_rate": 1.123224684173938e-06, "loss": 0.3816, "step": 3245 }, { "epoch": 2.412784935579782, "grad_norm": 0.35050728917121887, "learning_rate": 1.120494111288608e-06, "loss": 0.379, "step": 3246 }, { "epoch": 2.4135282457879086, "grad_norm": 0.34235864877700806, "learning_rate": 1.1177664425839857e-06, "loss": 0.391, "step": 3247 }, { "epoch": 2.4142715559960357, "grad_norm": 0.3283383250236511, "learning_rate": 1.115041680101991e-06, "loss": 0.3643, "step": 3248 }, { "epoch": 2.4150148662041624, "grad_norm": 0.38320842385292053, "learning_rate": 1.1123198258823658e-06, "loss": 0.3675, "step": 3249 }, { "epoch": 2.4157581764122895, "grad_norm": 0.3983575999736786, "learning_rate": 1.1096008819626818e-06, "loss": 0.3866, "step": 3250 }, { "epoch": 2.416501486620416, "grad_norm": 0.33409979939460754, "learning_rate": 1.1068848503783269e-06, "loss": 0.3673, "step": 3251 }, { "epoch": 2.417244796828543, "grad_norm": 0.33632954955101013, "learning_rate": 1.1041717331625086e-06, "loss": 0.3988, "step": 3252 }, { "epoch": 2.41798810703667, "grad_norm": 0.332052081823349, "learning_rate": 1.1014615323462552e-06, "loss": 0.3648, "step": 3253 }, { "epoch": 2.4187314172447967, "grad_norm": 0.3362327218055725, "learning_rate": 1.098754249958413e-06, "loss": 0.379, "step": 3254 }, { "epoch": 2.419474727452924, "grad_norm": 0.3339698910713196, "learning_rate": 1.0960498880256387e-06, "loss": 0.4385, "step": 3255 }, { "epoch": 2.4202180376610505, "grad_norm": 0.32616233825683594, "learning_rate": 1.0933484485724077e-06, "loss": 0.3496, "step": 3256 }, { "epoch": 2.420961347869177, "grad_norm": 0.3732871115207672, "learning_rate": 1.090649933621003e-06, "loss": 0.4225, "step": 3257 }, { "epoch": 2.4217046580773043, "grad_norm": 0.33732879161834717, "learning_rate": 1.0879543451915231e-06, "loss": 0.3302, "step": 3258 }, { "epoch": 2.422447968285431, "grad_norm": 0.393576055765152, "learning_rate": 1.0852616853018726e-06, "loss": 0.4147, "step": 3259 }, { "epoch": 2.423191278493558, "grad_norm": 0.35399049520492554, "learning_rate": 1.0825719559677633e-06, "loss": 0.3583, "step": 3260 }, { "epoch": 2.4239345887016848, "grad_norm": 0.33781713247299194, "learning_rate": 1.0798851592027154e-06, "loss": 0.3712, "step": 3261 }, { "epoch": 2.4246778989098114, "grad_norm": 0.3325958847999573, "learning_rate": 1.0772012970180545e-06, "loss": 0.3635, "step": 3262 }, { "epoch": 2.4254212091179386, "grad_norm": 0.3661769926548004, "learning_rate": 1.0745203714229053e-06, "loss": 0.3779, "step": 3263 }, { "epoch": 2.4261645193260652, "grad_norm": 0.3225574791431427, "learning_rate": 1.0718423844241992e-06, "loss": 0.3678, "step": 3264 }, { "epoch": 2.4269078295341924, "grad_norm": 0.3131442964076996, "learning_rate": 1.0691673380266637e-06, "loss": 0.36, "step": 3265 }, { "epoch": 2.427651139742319, "grad_norm": 0.3175901770591736, "learning_rate": 1.0664952342328256e-06, "loss": 0.3477, "step": 3266 }, { "epoch": 2.428394449950446, "grad_norm": 0.37695977091789246, "learning_rate": 1.0638260750430118e-06, "loss": 0.3778, "step": 3267 }, { "epoch": 2.429137760158573, "grad_norm": 0.33714571595191956, "learning_rate": 1.061159862455341e-06, "loss": 0.3641, "step": 3268 }, { "epoch": 2.4298810703666995, "grad_norm": 0.35906925797462463, "learning_rate": 1.0584965984657293e-06, "loss": 0.406, "step": 3269 }, { "epoch": 2.4306243805748267, "grad_norm": 0.3567310869693756, "learning_rate": 1.055836285067885e-06, "loss": 0.3856, "step": 3270 }, { "epoch": 2.4313676907829533, "grad_norm": 0.34132570028305054, "learning_rate": 1.0531789242533053e-06, "loss": 0.405, "step": 3271 }, { "epoch": 2.4321110009910805, "grad_norm": 0.3215070962905884, "learning_rate": 1.0505245180112778e-06, "loss": 0.3518, "step": 3272 }, { "epoch": 2.432854311199207, "grad_norm": 0.3405250310897827, "learning_rate": 1.0478730683288818e-06, "loss": 0.3656, "step": 3273 }, { "epoch": 2.433597621407334, "grad_norm": 0.35017165541648865, "learning_rate": 1.0452245771909774e-06, "loss": 0.4273, "step": 3274 }, { "epoch": 2.434340931615461, "grad_norm": 0.35533493757247925, "learning_rate": 1.0425790465802165e-06, "loss": 0.3455, "step": 3275 }, { "epoch": 2.4350842418235876, "grad_norm": 0.38906610012054443, "learning_rate": 1.0399364784770284e-06, "loss": 0.38, "step": 3276 }, { "epoch": 2.4358275520317147, "grad_norm": 0.3435938358306885, "learning_rate": 1.0372968748596308e-06, "loss": 0.3928, "step": 3277 }, { "epoch": 2.4365708622398414, "grad_norm": 0.31246691942214966, "learning_rate": 1.0346602377040156e-06, "loss": 0.3614, "step": 3278 }, { "epoch": 2.437314172447968, "grad_norm": 0.334517240524292, "learning_rate": 1.032026568983961e-06, "loss": 0.3732, "step": 3279 }, { "epoch": 2.4380574826560952, "grad_norm": 0.39131462574005127, "learning_rate": 1.0293958706710166e-06, "loss": 0.4182, "step": 3280 }, { "epoch": 2.438800792864222, "grad_norm": 0.33603742718696594, "learning_rate": 1.0267681447345145e-06, "loss": 0.3474, "step": 3281 }, { "epoch": 2.439544103072349, "grad_norm": 0.3410196304321289, "learning_rate": 1.0241433931415562e-06, "loss": 0.3967, "step": 3282 }, { "epoch": 2.4402874132804757, "grad_norm": 0.3131977617740631, "learning_rate": 1.0215216178570187e-06, "loss": 0.371, "step": 3283 }, { "epoch": 2.4410307234886024, "grad_norm": 0.3371375799179077, "learning_rate": 1.018902820843552e-06, "loss": 0.3679, "step": 3284 }, { "epoch": 2.4417740336967295, "grad_norm": 0.349528968334198, "learning_rate": 1.0162870040615774e-06, "loss": 0.4087, "step": 3285 }, { "epoch": 2.442517343904856, "grad_norm": 0.29981377720832825, "learning_rate": 1.0136741694692803e-06, "loss": 0.3263, "step": 3286 }, { "epoch": 2.4432606541129833, "grad_norm": 0.35476595163345337, "learning_rate": 1.0110643190226204e-06, "loss": 0.4228, "step": 3287 }, { "epoch": 2.44400396432111, "grad_norm": 0.34923413395881653, "learning_rate": 1.0084574546753184e-06, "loss": 0.3945, "step": 3288 }, { "epoch": 2.4447472745292367, "grad_norm": 0.35318008065223694, "learning_rate": 1.0058535783788604e-06, "loss": 0.352, "step": 3289 }, { "epoch": 2.445490584737364, "grad_norm": 0.3837135136127472, "learning_rate": 1.0032526920824982e-06, "loss": 0.4245, "step": 3290 }, { "epoch": 2.4462338949454905, "grad_norm": 0.3375266492366791, "learning_rate": 1.0006547977332426e-06, "loss": 0.392, "step": 3291 }, { "epoch": 2.4469772051536176, "grad_norm": 0.27478718757629395, "learning_rate": 9.98059897275863e-07, "loss": 0.3316, "step": 3292 }, { "epoch": 2.4477205153617443, "grad_norm": 0.35682839155197144, "learning_rate": 9.954679926528966e-07, "loss": 0.4704, "step": 3293 }, { "epoch": 2.448463825569871, "grad_norm": 0.32918721437454224, "learning_rate": 9.928790858046278e-07, "loss": 0.3306, "step": 3294 }, { "epoch": 2.449207135777998, "grad_norm": 0.3791501522064209, "learning_rate": 9.902931786690994e-07, "loss": 0.4195, "step": 3295 }, { "epoch": 2.4499504459861248, "grad_norm": 0.33196699619293213, "learning_rate": 9.877102731821125e-07, "loss": 0.3808, "step": 3296 }, { "epoch": 2.450693756194252, "grad_norm": 0.37119340896606445, "learning_rate": 9.851303712772164e-07, "loss": 0.3604, "step": 3297 }, { "epoch": 2.4514370664023786, "grad_norm": 0.39634397625923157, "learning_rate": 9.825534748857169e-07, "loss": 0.379, "step": 3298 }, { "epoch": 2.4521803766105057, "grad_norm": 0.3156127333641052, "learning_rate": 9.799795859366633e-07, "loss": 0.3449, "step": 3299 }, { "epoch": 2.4529236868186324, "grad_norm": 0.37200653553009033, "learning_rate": 9.774087063568615e-07, "loss": 0.414, "step": 3300 }, { "epoch": 2.453666997026759, "grad_norm": 0.3384486734867096, "learning_rate": 9.748408380708563e-07, "loss": 0.3614, "step": 3301 }, { "epoch": 2.454410307234886, "grad_norm": 0.32128608226776123, "learning_rate": 9.72275983000946e-07, "loss": 0.3222, "step": 3302 }, { "epoch": 2.455153617443013, "grad_norm": 0.3474943935871124, "learning_rate": 9.697141430671663e-07, "loss": 0.405, "step": 3303 }, { "epoch": 2.45589692765114, "grad_norm": 0.33377400040626526, "learning_rate": 9.671553201873024e-07, "loss": 0.383, "step": 3304 }, { "epoch": 2.4566402378592667, "grad_norm": 0.3706541657447815, "learning_rate": 9.645995162768756e-07, "loss": 0.3891, "step": 3305 }, { "epoch": 2.4573835480673933, "grad_norm": 0.3704281449317932, "learning_rate": 9.620467332491489e-07, "loss": 0.3742, "step": 3306 }, { "epoch": 2.4581268582755205, "grad_norm": 0.33516499400138855, "learning_rate": 9.594969730151244e-07, "loss": 0.3382, "step": 3307 }, { "epoch": 2.458870168483647, "grad_norm": 0.3619866967201233, "learning_rate": 9.569502374835433e-07, "loss": 0.4042, "step": 3308 }, { "epoch": 2.4596134786917743, "grad_norm": 0.34300142526626587, "learning_rate": 9.544065285608778e-07, "loss": 0.3756, "step": 3309 }, { "epoch": 2.460356788899901, "grad_norm": 0.3429133892059326, "learning_rate": 9.518658481513388e-07, "loss": 0.3557, "step": 3310 }, { "epoch": 2.4611000991080276, "grad_norm": 0.3439011871814728, "learning_rate": 9.493281981568675e-07, "loss": 0.4446, "step": 3311 }, { "epoch": 2.4618434093161548, "grad_norm": 0.32877710461616516, "learning_rate": 9.467935804771361e-07, "loss": 0.3747, "step": 3312 }, { "epoch": 2.4625867195242814, "grad_norm": 0.32675549387931824, "learning_rate": 9.442619970095507e-07, "loss": 0.3631, "step": 3313 }, { "epoch": 2.4633300297324086, "grad_norm": 0.31608107686042786, "learning_rate": 9.417334496492403e-07, "loss": 0.3527, "step": 3314 }, { "epoch": 2.4640733399405352, "grad_norm": 0.3768887519836426, "learning_rate": 9.392079402890653e-07, "loss": 0.413, "step": 3315 }, { "epoch": 2.464816650148662, "grad_norm": 0.3530103266239166, "learning_rate": 9.366854708196127e-07, "loss": 0.4286, "step": 3316 }, { "epoch": 2.465559960356789, "grad_norm": 0.3774814307689667, "learning_rate": 9.341660431291899e-07, "loss": 0.3887, "step": 3317 }, { "epoch": 2.4663032705649157, "grad_norm": 0.35296469926834106, "learning_rate": 9.316496591038282e-07, "loss": 0.3847, "step": 3318 }, { "epoch": 2.467046580773043, "grad_norm": 0.3420185148715973, "learning_rate": 9.291363206272841e-07, "loss": 0.3785, "step": 3319 }, { "epoch": 2.4677898909811695, "grad_norm": 0.35034453868865967, "learning_rate": 9.266260295810286e-07, "loss": 0.4072, "step": 3320 }, { "epoch": 2.468533201189296, "grad_norm": 0.3384521007537842, "learning_rate": 9.24118787844257e-07, "loss": 0.3572, "step": 3321 }, { "epoch": 2.4692765113974233, "grad_norm": 0.3435074985027313, "learning_rate": 9.216145972938767e-07, "loss": 0.3653, "step": 3322 }, { "epoch": 2.47001982160555, "grad_norm": 0.36644065380096436, "learning_rate": 9.191134598045159e-07, "loss": 0.358, "step": 3323 }, { "epoch": 2.470763131813677, "grad_norm": 0.2996695637702942, "learning_rate": 9.166153772485126e-07, "loss": 0.3494, "step": 3324 }, { "epoch": 2.471506442021804, "grad_norm": 0.37699082493782043, "learning_rate": 9.14120351495923e-07, "loss": 0.4114, "step": 3325 }, { "epoch": 2.4722497522299305, "grad_norm": 0.35300543904304504, "learning_rate": 9.116283844145085e-07, "loss": 0.3915, "step": 3326 }, { "epoch": 2.4729930624380576, "grad_norm": 0.32319068908691406, "learning_rate": 9.091394778697482e-07, "loss": 0.3586, "step": 3327 }, { "epoch": 2.4737363726461843, "grad_norm": 0.3454497456550598, "learning_rate": 9.066536337248249e-07, "loss": 0.4268, "step": 3328 }, { "epoch": 2.4744796828543114, "grad_norm": 0.30617555975914, "learning_rate": 9.041708538406285e-07, "loss": 0.328, "step": 3329 }, { "epoch": 2.475222993062438, "grad_norm": 0.33670011162757874, "learning_rate": 9.016911400757589e-07, "loss": 0.4219, "step": 3330 }, { "epoch": 2.4759663032705648, "grad_norm": 0.333703875541687, "learning_rate": 8.992144942865194e-07, "loss": 0.3893, "step": 3331 }, { "epoch": 2.476709613478692, "grad_norm": 0.3241724669933319, "learning_rate": 8.967409183269138e-07, "loss": 0.3643, "step": 3332 }, { "epoch": 2.4774529236868186, "grad_norm": 0.3406752943992615, "learning_rate": 8.942704140486524e-07, "loss": 0.3977, "step": 3333 }, { "epoch": 2.4781962338949457, "grad_norm": 0.33454588055610657, "learning_rate": 8.918029833011421e-07, "loss": 0.3966, "step": 3334 }, { "epoch": 2.4789395441030724, "grad_norm": 0.3171430230140686, "learning_rate": 8.893386279314892e-07, "loss": 0.3536, "step": 3335 }, { "epoch": 2.479682854311199, "grad_norm": 0.3209599256515503, "learning_rate": 8.868773497845018e-07, "loss": 0.3744, "step": 3336 }, { "epoch": 2.480426164519326, "grad_norm": 0.3677012324333191, "learning_rate": 8.844191507026794e-07, "loss": 0.3938, "step": 3337 }, { "epoch": 2.481169474727453, "grad_norm": 0.3570628762245178, "learning_rate": 8.819640325262202e-07, "loss": 0.4007, "step": 3338 }, { "epoch": 2.48191278493558, "grad_norm": 0.3047831058502197, "learning_rate": 8.795119970930149e-07, "loss": 0.3079, "step": 3339 }, { "epoch": 2.4826560951437067, "grad_norm": 0.3598984479904175, "learning_rate": 8.77063046238646e-07, "loss": 0.4319, "step": 3340 }, { "epoch": 2.4833994053518333, "grad_norm": 0.2935451865196228, "learning_rate": 8.746171817963856e-07, "loss": 0.3706, "step": 3341 }, { "epoch": 2.4841427155599605, "grad_norm": 0.3339981138706207, "learning_rate": 8.721744055971987e-07, "loss": 0.397, "step": 3342 }, { "epoch": 2.484886025768087, "grad_norm": 0.3689916431903839, "learning_rate": 8.697347194697348e-07, "loss": 0.4057, "step": 3343 }, { "epoch": 2.4856293359762143, "grad_norm": 0.36507648229599, "learning_rate": 8.672981252403345e-07, "loss": 0.3917, "step": 3344 }, { "epoch": 2.486372646184341, "grad_norm": 0.3663923144340515, "learning_rate": 8.648646247330201e-07, "loss": 0.3884, "step": 3345 }, { "epoch": 2.4871159563924676, "grad_norm": 0.34887877106666565, "learning_rate": 8.624342197694962e-07, "loss": 0.3964, "step": 3346 }, { "epoch": 2.4878592666005948, "grad_norm": 0.34639912843704224, "learning_rate": 8.600069121691584e-07, "loss": 0.3974, "step": 3347 }, { "epoch": 2.4886025768087214, "grad_norm": 0.3735826909542084, "learning_rate": 8.575827037490753e-07, "loss": 0.424, "step": 3348 }, { "epoch": 2.4893458870168486, "grad_norm": 0.35757720470428467, "learning_rate": 8.551615963239978e-07, "loss": 0.3446, "step": 3349 }, { "epoch": 2.4900891972249752, "grad_norm": 0.3833138644695282, "learning_rate": 8.527435917063587e-07, "loss": 0.4064, "step": 3350 }, { "epoch": 2.490832507433102, "grad_norm": 0.3060486912727356, "learning_rate": 8.503286917062636e-07, "loss": 0.3021, "step": 3351 }, { "epoch": 2.491575817641229, "grad_norm": 0.38942068815231323, "learning_rate": 8.479168981314945e-07, "loss": 0.3635, "step": 3352 }, { "epoch": 2.4923191278493557, "grad_norm": 0.3920958936214447, "learning_rate": 8.455082127875113e-07, "loss": 0.3826, "step": 3353 }, { "epoch": 2.493062438057483, "grad_norm": 0.36689743399620056, "learning_rate": 8.431026374774453e-07, "loss": 0.3954, "step": 3354 }, { "epoch": 2.4938057482656095, "grad_norm": 0.33964109420776367, "learning_rate": 8.407001740020976e-07, "loss": 0.3504, "step": 3355 }, { "epoch": 2.494549058473736, "grad_norm": 0.3802856504917145, "learning_rate": 8.383008241599439e-07, "loss": 0.4292, "step": 3356 }, { "epoch": 2.4952923686818633, "grad_norm": 0.3372541666030884, "learning_rate": 8.359045897471251e-07, "loss": 0.3598, "step": 3357 }, { "epoch": 2.49603567888999, "grad_norm": 0.4074156582355499, "learning_rate": 8.335114725574505e-07, "loss": 0.4008, "step": 3358 }, { "epoch": 2.496778989098117, "grad_norm": 0.3398004174232483, "learning_rate": 8.311214743823997e-07, "loss": 0.362, "step": 3359 }, { "epoch": 2.497522299306244, "grad_norm": 0.34226763248443604, "learning_rate": 8.287345970111122e-07, "loss": 0.3563, "step": 3360 }, { "epoch": 2.4982656095143705, "grad_norm": 0.3577001094818115, "learning_rate": 8.263508422303946e-07, "loss": 0.4024, "step": 3361 }, { "epoch": 2.4990089197224976, "grad_norm": 0.3875311315059662, "learning_rate": 8.239702118247162e-07, "loss": 0.3737, "step": 3362 }, { "epoch": 2.4997522299306243, "grad_norm": 0.34588783979415894, "learning_rate": 8.215927075762048e-07, "loss": 0.3858, "step": 3363 }, { "epoch": 2.5004955401387514, "grad_norm": 0.3257432281970978, "learning_rate": 8.192183312646485e-07, "loss": 0.3755, "step": 3364 }, { "epoch": 2.501238850346878, "grad_norm": 0.3467220664024353, "learning_rate": 8.168470846674975e-07, "loss": 0.3902, "step": 3365 }, { "epoch": 2.5019821605550048, "grad_norm": 0.3485172390937805, "learning_rate": 8.144789695598538e-07, "loss": 0.4182, "step": 3366 }, { "epoch": 2.502725470763132, "grad_norm": 0.3594188988208771, "learning_rate": 8.121139877144795e-07, "loss": 0.3932, "step": 3367 }, { "epoch": 2.5034687809712586, "grad_norm": 0.312722384929657, "learning_rate": 8.097521409017895e-07, "loss": 0.3384, "step": 3368 }, { "epoch": 2.5042120911793857, "grad_norm": 0.3473462164402008, "learning_rate": 8.073934308898479e-07, "loss": 0.3742, "step": 3369 }, { "epoch": 2.5049554013875124, "grad_norm": 0.35528701543807983, "learning_rate": 8.050378594443798e-07, "loss": 0.3729, "step": 3370 }, { "epoch": 2.505698711595639, "grad_norm": 0.33505916595458984, "learning_rate": 8.026854283287533e-07, "loss": 0.3864, "step": 3371 }, { "epoch": 2.506442021803766, "grad_norm": 0.3379124104976654, "learning_rate": 8.003361393039866e-07, "loss": 0.3637, "step": 3372 }, { "epoch": 2.507185332011893, "grad_norm": 0.3352341651916504, "learning_rate": 7.979899941287488e-07, "loss": 0.3423, "step": 3373 }, { "epoch": 2.50792864222002, "grad_norm": 0.36355865001678467, "learning_rate": 7.956469945593531e-07, "loss": 0.4368, "step": 3374 }, { "epoch": 2.5086719524281467, "grad_norm": 0.32590600848197937, "learning_rate": 7.933071423497568e-07, "loss": 0.3622, "step": 3375 }, { "epoch": 2.5094152626362733, "grad_norm": 0.3485233187675476, "learning_rate": 7.909704392515638e-07, "loss": 0.4108, "step": 3376 }, { "epoch": 2.5101585728444005, "grad_norm": 0.34723156690597534, "learning_rate": 7.88636887014021e-07, "loss": 0.3677, "step": 3377 }, { "epoch": 2.510901883052527, "grad_norm": 0.33610326051712036, "learning_rate": 7.863064873840132e-07, "loss": 0.3555, "step": 3378 }, { "epoch": 2.5116451932606543, "grad_norm": 0.35787495970726013, "learning_rate": 7.839792421060688e-07, "loss": 0.3966, "step": 3379 }, { "epoch": 2.512388503468781, "grad_norm": 0.32354673743247986, "learning_rate": 7.816551529223526e-07, "loss": 0.3984, "step": 3380 }, { "epoch": 2.5131318136769076, "grad_norm": 0.3749980628490448, "learning_rate": 7.793342215726651e-07, "loss": 0.4391, "step": 3381 }, { "epoch": 2.5138751238850348, "grad_norm": 0.35031282901763916, "learning_rate": 7.770164497944488e-07, "loss": 0.3533, "step": 3382 }, { "epoch": 2.5146184340931614, "grad_norm": 0.3619048297405243, "learning_rate": 7.747018393227739e-07, "loss": 0.4185, "step": 3383 }, { "epoch": 2.5153617443012886, "grad_norm": 0.3100355863571167, "learning_rate": 7.723903918903497e-07, "loss": 0.3403, "step": 3384 }, { "epoch": 2.5161050545094152, "grad_norm": 0.3428206145763397, "learning_rate": 7.700821092275151e-07, "loss": 0.4263, "step": 3385 }, { "epoch": 2.516848364717542, "grad_norm": 0.31218090653419495, "learning_rate": 7.677769930622403e-07, "loss": 0.3738, "step": 3386 }, { "epoch": 2.517591674925669, "grad_norm": 0.326463907957077, "learning_rate": 7.654750451201226e-07, "loss": 0.3469, "step": 3387 }, { "epoch": 2.5183349851337957, "grad_norm": 0.32726380228996277, "learning_rate": 7.631762671243931e-07, "loss": 0.4061, "step": 3388 }, { "epoch": 2.519078295341923, "grad_norm": 0.31379199028015137, "learning_rate": 7.608806607959041e-07, "loss": 0.3536, "step": 3389 }, { "epoch": 2.5198216055500495, "grad_norm": 0.33954837918281555, "learning_rate": 7.585882278531381e-07, "loss": 0.3964, "step": 3390 }, { "epoch": 2.520564915758176, "grad_norm": 0.2991616129875183, "learning_rate": 7.562989700121997e-07, "loss": 0.3323, "step": 3391 }, { "epoch": 2.5213082259663033, "grad_norm": 0.323360800743103, "learning_rate": 7.540128889868142e-07, "loss": 0.4352, "step": 3392 }, { "epoch": 2.52205153617443, "grad_norm": 0.3172169327735901, "learning_rate": 7.517299864883365e-07, "loss": 0.4117, "step": 3393 }, { "epoch": 2.522794846382557, "grad_norm": 0.32008862495422363, "learning_rate": 7.494502642257351e-07, "loss": 0.3674, "step": 3394 }, { "epoch": 2.523538156590684, "grad_norm": 0.345503032207489, "learning_rate": 7.471737239055976e-07, "loss": 0.3847, "step": 3395 }, { "epoch": 2.5242814667988105, "grad_norm": 0.34375736117362976, "learning_rate": 7.449003672321359e-07, "loss": 0.3657, "step": 3396 }, { "epoch": 2.5250247770069376, "grad_norm": 0.3376677632331848, "learning_rate": 7.426301959071719e-07, "loss": 0.3838, "step": 3397 }, { "epoch": 2.5257680872150643, "grad_norm": 0.32749414443969727, "learning_rate": 7.403632116301452e-07, "loss": 0.3906, "step": 3398 }, { "epoch": 2.5265113974231914, "grad_norm": 0.32663843035697937, "learning_rate": 7.380994160981108e-07, "loss": 0.3814, "step": 3399 }, { "epoch": 2.527254707631318, "grad_norm": 0.3156656324863434, "learning_rate": 7.35838811005738e-07, "loss": 0.3595, "step": 3400 }, { "epoch": 2.5279980178394448, "grad_norm": 0.3577403426170349, "learning_rate": 7.335813980453027e-07, "loss": 0.3803, "step": 3401 }, { "epoch": 2.528741328047572, "grad_norm": 0.3464386761188507, "learning_rate": 7.313271789066962e-07, "loss": 0.3675, "step": 3402 }, { "epoch": 2.5294846382556986, "grad_norm": 0.31268033385276794, "learning_rate": 7.290761552774156e-07, "loss": 0.4005, "step": 3403 }, { "epoch": 2.5302279484638257, "grad_norm": 0.3154951333999634, "learning_rate": 7.268283288425665e-07, "loss": 0.3603, "step": 3404 }, { "epoch": 2.5309712586719524, "grad_norm": 0.3950781226158142, "learning_rate": 7.245837012848639e-07, "loss": 0.4415, "step": 3405 }, { "epoch": 2.531714568880079, "grad_norm": 0.31421300768852234, "learning_rate": 7.223422742846231e-07, "loss": 0.3517, "step": 3406 }, { "epoch": 2.532457879088206, "grad_norm": 0.31384211778640747, "learning_rate": 7.201040495197681e-07, "loss": 0.3625, "step": 3407 }, { "epoch": 2.533201189296333, "grad_norm": 0.3411378860473633, "learning_rate": 7.178690286658247e-07, "loss": 0.4131, "step": 3408 }, { "epoch": 2.53394449950446, "grad_norm": 0.31339430809020996, "learning_rate": 7.156372133959171e-07, "loss": 0.3762, "step": 3409 }, { "epoch": 2.5346878097125867, "grad_norm": 0.35929059982299805, "learning_rate": 7.134086053807749e-07, "loss": 0.3985, "step": 3410 }, { "epoch": 2.5354311199207133, "grad_norm": 0.31627702713012695, "learning_rate": 7.111832062887225e-07, "loss": 0.3449, "step": 3411 }, { "epoch": 2.5361744301288405, "grad_norm": 0.37606626749038696, "learning_rate": 7.089610177856826e-07, "loss": 0.4132, "step": 3412 }, { "epoch": 2.536917740336967, "grad_norm": 0.36350852251052856, "learning_rate": 7.067420415351784e-07, "loss": 0.3787, "step": 3413 }, { "epoch": 2.5376610505450943, "grad_norm": 0.3320479989051819, "learning_rate": 7.045262791983237e-07, "loss": 0.3596, "step": 3414 }, { "epoch": 2.538404360753221, "grad_norm": 0.3030188977718353, "learning_rate": 7.023137324338258e-07, "loss": 0.351, "step": 3415 }, { "epoch": 2.5391476709613476, "grad_norm": 0.3293921649456024, "learning_rate": 7.00104402897993e-07, "loss": 0.4097, "step": 3416 }, { "epoch": 2.5398909811694748, "grad_norm": 0.34632042050361633, "learning_rate": 6.978982922447158e-07, "loss": 0.3914, "step": 3417 }, { "epoch": 2.5406342913776014, "grad_norm": 0.33734163641929626, "learning_rate": 6.956954021254786e-07, "loss": 0.3945, "step": 3418 }, { "epoch": 2.5413776015857286, "grad_norm": 0.36986759305000305, "learning_rate": 6.934957341893573e-07, "loss": 0.3909, "step": 3419 }, { "epoch": 2.5421209117938552, "grad_norm": 0.3420787751674652, "learning_rate": 6.912992900830118e-07, "loss": 0.3766, "step": 3420 }, { "epoch": 2.542864222001982, "grad_norm": 0.30360302329063416, "learning_rate": 6.891060714506892e-07, "loss": 0.328, "step": 3421 }, { "epoch": 2.543607532210109, "grad_norm": 0.3377675712108612, "learning_rate": 6.869160799342256e-07, "loss": 0.4114, "step": 3422 }, { "epoch": 2.5443508424182357, "grad_norm": 0.32193732261657715, "learning_rate": 6.847293171730368e-07, "loss": 0.365, "step": 3423 }, { "epoch": 2.545094152626363, "grad_norm": 0.311640202999115, "learning_rate": 6.825457848041239e-07, "loss": 0.3757, "step": 3424 }, { "epoch": 2.5458374628344895, "grad_norm": 0.3533160090446472, "learning_rate": 6.803654844620716e-07, "loss": 0.3844, "step": 3425 }, { "epoch": 2.546580773042616, "grad_norm": 0.33108943700790405, "learning_rate": 6.781884177790415e-07, "loss": 0.3359, "step": 3426 }, { "epoch": 2.5473240832507433, "grad_norm": 0.34299591183662415, "learning_rate": 6.760145863847745e-07, "loss": 0.4131, "step": 3427 }, { "epoch": 2.54806739345887, "grad_norm": 0.3578473627567291, "learning_rate": 6.738439919065937e-07, "loss": 0.3465, "step": 3428 }, { "epoch": 2.548810703666997, "grad_norm": 0.383902370929718, "learning_rate": 6.716766359693939e-07, "loss": 0.3768, "step": 3429 }, { "epoch": 2.549554013875124, "grad_norm": 0.33165010809898376, "learning_rate": 6.695125201956487e-07, "loss": 0.3604, "step": 3430 }, { "epoch": 2.5502973240832505, "grad_norm": 0.35626551508903503, "learning_rate": 6.673516462054075e-07, "loss": 0.4143, "step": 3431 }, { "epoch": 2.5510406342913776, "grad_norm": 0.3379501700401306, "learning_rate": 6.651940156162878e-07, "loss": 0.3912, "step": 3432 }, { "epoch": 2.5517839444995043, "grad_norm": 0.35998550057411194, "learning_rate": 6.630396300434855e-07, "loss": 0.3982, "step": 3433 }, { "epoch": 2.5525272547076314, "grad_norm": 0.33589527010917664, "learning_rate": 6.608884910997615e-07, "loss": 0.361, "step": 3434 }, { "epoch": 2.553270564915758, "grad_norm": 0.3706020414829254, "learning_rate": 6.587406003954488e-07, "loss": 0.4258, "step": 3435 }, { "epoch": 2.554013875123885, "grad_norm": 0.32875820994377136, "learning_rate": 6.565959595384502e-07, "loss": 0.364, "step": 3436 }, { "epoch": 2.554757185332012, "grad_norm": 0.3118140995502472, "learning_rate": 6.544545701342331e-07, "loss": 0.3763, "step": 3437 }, { "epoch": 2.555500495540139, "grad_norm": 0.32620179653167725, "learning_rate": 6.523164337858307e-07, "loss": 0.3862, "step": 3438 }, { "epoch": 2.5562438057482657, "grad_norm": 0.3478686809539795, "learning_rate": 6.501815520938459e-07, "loss": 0.4596, "step": 3439 }, { "epoch": 2.5569871159563924, "grad_norm": 0.3434828519821167, "learning_rate": 6.480499266564394e-07, "loss": 0.3732, "step": 3440 }, { "epoch": 2.557730426164519, "grad_norm": 0.3353089392185211, "learning_rate": 6.459215590693358e-07, "loss": 0.3948, "step": 3441 }, { "epoch": 2.558473736372646, "grad_norm": 0.32464438676834106, "learning_rate": 6.437964509258232e-07, "loss": 0.399, "step": 3442 }, { "epoch": 2.5592170465807733, "grad_norm": 0.31091374158859253, "learning_rate": 6.416746038167471e-07, "loss": 0.3317, "step": 3443 }, { "epoch": 2.5599603567889, "grad_norm": 0.3927433490753174, "learning_rate": 6.395560193305117e-07, "loss": 0.4269, "step": 3444 }, { "epoch": 2.5607036669970267, "grad_norm": 0.3389318585395813, "learning_rate": 6.374406990530818e-07, "loss": 0.3582, "step": 3445 }, { "epoch": 2.5614469772051534, "grad_norm": 0.32410937547683716, "learning_rate": 6.353286445679752e-07, "loss": 0.3736, "step": 3446 }, { "epoch": 2.5621902874132805, "grad_norm": 0.3293868899345398, "learning_rate": 6.332198574562664e-07, "loss": 0.3846, "step": 3447 }, { "epoch": 2.5629335976214076, "grad_norm": 0.33777400851249695, "learning_rate": 6.311143392965863e-07, "loss": 0.4219, "step": 3448 }, { "epoch": 2.5636769078295343, "grad_norm": 0.3677363991737366, "learning_rate": 6.290120916651132e-07, "loss": 0.3781, "step": 3449 }, { "epoch": 2.564420218037661, "grad_norm": 0.3549802005290985, "learning_rate": 6.269131161355829e-07, "loss": 0.3634, "step": 3450 }, { "epoch": 2.5651635282457876, "grad_norm": 0.3251866400241852, "learning_rate": 6.248174142792773e-07, "loss": 0.3797, "step": 3451 }, { "epoch": 2.5659068384539148, "grad_norm": 0.35108667612075806, "learning_rate": 6.227249876650293e-07, "loss": 0.3887, "step": 3452 }, { "epoch": 2.566650148662042, "grad_norm": 0.31912630796432495, "learning_rate": 6.206358378592209e-07, "loss": 0.3895, "step": 3453 }, { "epoch": 2.5673934588701686, "grad_norm": 0.37445881962776184, "learning_rate": 6.18549966425781e-07, "loss": 0.3779, "step": 3454 }, { "epoch": 2.5681367690782952, "grad_norm": 0.3612947463989258, "learning_rate": 6.164673749261813e-07, "loss": 0.378, "step": 3455 }, { "epoch": 2.568880079286422, "grad_norm": 0.34581878781318665, "learning_rate": 6.143880649194434e-07, "loss": 0.3991, "step": 3456 }, { "epoch": 2.569623389494549, "grad_norm": 0.3767089545726776, "learning_rate": 6.123120379621272e-07, "loss": 0.4007, "step": 3457 }, { "epoch": 2.570366699702676, "grad_norm": 0.3219103217124939, "learning_rate": 6.102392956083375e-07, "loss": 0.3245, "step": 3458 }, { "epoch": 2.571110009910803, "grad_norm": 0.35060837864875793, "learning_rate": 6.081698394097213e-07, "loss": 0.3632, "step": 3459 }, { "epoch": 2.5718533201189295, "grad_norm": 0.35217031836509705, "learning_rate": 6.061036709154627e-07, "loss": 0.3591, "step": 3460 }, { "epoch": 2.5725966303270567, "grad_norm": 0.33175602555274963, "learning_rate": 6.040407916722851e-07, "loss": 0.3895, "step": 3461 }, { "epoch": 2.5733399405351833, "grad_norm": 0.3303731083869934, "learning_rate": 6.019812032244543e-07, "loss": 0.4204, "step": 3462 }, { "epoch": 2.5740832507433105, "grad_norm": 0.31055209040641785, "learning_rate": 5.999249071137664e-07, "loss": 0.3341, "step": 3463 }, { "epoch": 2.574826560951437, "grad_norm": 0.3594001531600952, "learning_rate": 5.978719048795551e-07, "loss": 0.3764, "step": 3464 }, { "epoch": 2.575569871159564, "grad_norm": 0.3557899296283722, "learning_rate": 5.958221980586909e-07, "loss": 0.4276, "step": 3465 }, { "epoch": 2.576313181367691, "grad_norm": 0.3123757839202881, "learning_rate": 5.937757881855733e-07, "loss": 0.3821, "step": 3466 }, { "epoch": 2.5770564915758176, "grad_norm": 0.34537169337272644, "learning_rate": 5.917326767921355e-07, "loss": 0.3821, "step": 3467 }, { "epoch": 2.5777998017839447, "grad_norm": 0.32632356882095337, "learning_rate": 5.896928654078427e-07, "loss": 0.356, "step": 3468 }, { "epoch": 2.5785431119920714, "grad_norm": 0.3975873291492462, "learning_rate": 5.876563555596865e-07, "loss": 0.4237, "step": 3469 }, { "epoch": 2.579286422200198, "grad_norm": 0.33300405740737915, "learning_rate": 5.856231487721909e-07, "loss": 0.3792, "step": 3470 }, { "epoch": 2.5800297324083252, "grad_norm": 0.3461776077747345, "learning_rate": 5.835932465674061e-07, "loss": 0.344, "step": 3471 }, { "epoch": 2.580773042616452, "grad_norm": 0.35256701707839966, "learning_rate": 5.815666504649048e-07, "loss": 0.345, "step": 3472 }, { "epoch": 2.581516352824579, "grad_norm": 0.3673005700111389, "learning_rate": 5.795433619817914e-07, "loss": 0.4132, "step": 3473 }, { "epoch": 2.5822596630327057, "grad_norm": 0.32880696654319763, "learning_rate": 5.775233826326882e-07, "loss": 0.3801, "step": 3474 }, { "epoch": 2.5830029732408324, "grad_norm": 0.3996683359146118, "learning_rate": 5.755067139297422e-07, "loss": 0.3681, "step": 3475 }, { "epoch": 2.5837462834489595, "grad_norm": 0.333993524312973, "learning_rate": 5.734933573826246e-07, "loss": 0.3782, "step": 3476 }, { "epoch": 2.584489593657086, "grad_norm": 0.34938544034957886, "learning_rate": 5.714833144985227e-07, "loss": 0.4275, "step": 3477 }, { "epoch": 2.5852329038652133, "grad_norm": 0.2915072739124298, "learning_rate": 5.694765867821473e-07, "loss": 0.3218, "step": 3478 }, { "epoch": 2.58597621407334, "grad_norm": 0.3304077088832855, "learning_rate": 5.674731757357265e-07, "loss": 0.3566, "step": 3479 }, { "epoch": 2.5867195242814667, "grad_norm": 0.35124602913856506, "learning_rate": 5.654730828590032e-07, "loss": 0.3791, "step": 3480 }, { "epoch": 2.587462834489594, "grad_norm": 0.34840187430381775, "learning_rate": 5.63476309649238e-07, "loss": 0.3778, "step": 3481 }, { "epoch": 2.5882061446977205, "grad_norm": 0.35325369238853455, "learning_rate": 5.614828576012077e-07, "loss": 0.4154, "step": 3482 }, { "epoch": 2.5889494549058476, "grad_norm": 0.315580278635025, "learning_rate": 5.594927282072004e-07, "loss": 0.3465, "step": 3483 }, { "epoch": 2.5896927651139743, "grad_norm": 0.3714572787284851, "learning_rate": 5.575059229570162e-07, "loss": 0.421, "step": 3484 }, { "epoch": 2.590436075322101, "grad_norm": 0.34046658873558044, "learning_rate": 5.55522443337973e-07, "loss": 0.4019, "step": 3485 }, { "epoch": 2.591179385530228, "grad_norm": 0.3610271215438843, "learning_rate": 5.535422908348925e-07, "loss": 0.4233, "step": 3486 }, { "epoch": 2.5919226957383548, "grad_norm": 0.3479354977607727, "learning_rate": 5.515654669301068e-07, "loss": 0.3916, "step": 3487 }, { "epoch": 2.592666005946482, "grad_norm": 0.29926806688308716, "learning_rate": 5.495919731034594e-07, "loss": 0.3581, "step": 3488 }, { "epoch": 2.5934093161546086, "grad_norm": 0.3429805338382721, "learning_rate": 5.476218108322973e-07, "loss": 0.4252, "step": 3489 }, { "epoch": 2.5941526263627352, "grad_norm": 0.325579971075058, "learning_rate": 5.45654981591477e-07, "loss": 0.3353, "step": 3490 }, { "epoch": 2.5948959365708624, "grad_norm": 0.3551679253578186, "learning_rate": 5.436914868533566e-07, "loss": 0.3415, "step": 3491 }, { "epoch": 2.595639246778989, "grad_norm": 0.3459392786026001, "learning_rate": 5.41731328087799e-07, "loss": 0.3799, "step": 3492 }, { "epoch": 2.596382556987116, "grad_norm": 0.3100513219833374, "learning_rate": 5.397745067621712e-07, "loss": 0.3741, "step": 3493 }, { "epoch": 2.597125867195243, "grad_norm": 0.3313014507293701, "learning_rate": 5.378210243413418e-07, "loss": 0.4074, "step": 3494 }, { "epoch": 2.5978691774033695, "grad_norm": 0.35886573791503906, "learning_rate": 5.358708822876768e-07, "loss": 0.3749, "step": 3495 }, { "epoch": 2.5986124876114967, "grad_norm": 0.3336045742034912, "learning_rate": 5.339240820610459e-07, "loss": 0.4024, "step": 3496 }, { "epoch": 2.5993557978196233, "grad_norm": 0.34622257947921753, "learning_rate": 5.319806251188137e-07, "loss": 0.4086, "step": 3497 }, { "epoch": 2.6000991080277505, "grad_norm": 0.3304833769798279, "learning_rate": 5.300405129158432e-07, "loss": 0.3502, "step": 3498 }, { "epoch": 2.600842418235877, "grad_norm": 0.33020108938217163, "learning_rate": 5.281037469044947e-07, "loss": 0.3444, "step": 3499 }, { "epoch": 2.601585728444004, "grad_norm": 0.3547552824020386, "learning_rate": 5.261703285346209e-07, "loss": 0.4037, "step": 3500 }, { "epoch": 2.602329038652131, "grad_norm": 0.31656402349472046, "learning_rate": 5.242402592535717e-07, "loss": 0.3698, "step": 3501 }, { "epoch": 2.6030723488602576, "grad_norm": 0.32245200872421265, "learning_rate": 5.223135405061891e-07, "loss": 0.3912, "step": 3502 }, { "epoch": 2.6038156590683847, "grad_norm": 0.32255640625953674, "learning_rate": 5.203901737348044e-07, "loss": 0.3528, "step": 3503 }, { "epoch": 2.6045589692765114, "grad_norm": 0.35167503356933594, "learning_rate": 5.184701603792409e-07, "loss": 0.3995, "step": 3504 }, { "epoch": 2.605302279484638, "grad_norm": 0.3546602129936218, "learning_rate": 5.165535018768136e-07, "loss": 0.416, "step": 3505 }, { "epoch": 2.6060455896927652, "grad_norm": 0.3477526903152466, "learning_rate": 5.14640199662324e-07, "loss": 0.3887, "step": 3506 }, { "epoch": 2.606788899900892, "grad_norm": 0.3188173770904541, "learning_rate": 5.127302551680591e-07, "loss": 0.3925, "step": 3507 }, { "epoch": 2.607532210109019, "grad_norm": 0.2982521951198578, "learning_rate": 5.108236698237995e-07, "loss": 0.3524, "step": 3508 }, { "epoch": 2.6082755203171457, "grad_norm": 0.37978363037109375, "learning_rate": 5.089204450568036e-07, "loss": 0.3974, "step": 3509 }, { "epoch": 2.6090188305252724, "grad_norm": 0.3335146903991699, "learning_rate": 5.070205822918162e-07, "loss": 0.3721, "step": 3510 }, { "epoch": 2.6097621407333995, "grad_norm": 0.3348720073699951, "learning_rate": 5.051240829510678e-07, "loss": 0.3648, "step": 3511 }, { "epoch": 2.610505450941526, "grad_norm": 0.3400416970252991, "learning_rate": 5.03230948454268e-07, "loss": 0.3496, "step": 3512 }, { "epoch": 2.6112487611496533, "grad_norm": 0.358188271522522, "learning_rate": 5.013411802186103e-07, "loss": 0.4104, "step": 3513 }, { "epoch": 2.61199207135778, "grad_norm": 0.33877459168434143, "learning_rate": 4.994547796587651e-07, "loss": 0.3749, "step": 3514 }, { "epoch": 2.6127353815659067, "grad_norm": 0.3115633726119995, "learning_rate": 4.975717481868842e-07, "loss": 0.3692, "step": 3515 }, { "epoch": 2.613478691774034, "grad_norm": 0.3263721764087677, "learning_rate": 4.956920872125953e-07, "loss": 0.3975, "step": 3516 }, { "epoch": 2.6142220019821605, "grad_norm": 0.35218945145606995, "learning_rate": 4.93815798143007e-07, "loss": 0.4469, "step": 3517 }, { "epoch": 2.6149653121902876, "grad_norm": 0.32585445046424866, "learning_rate": 4.919428823826983e-07, "loss": 0.3687, "step": 3518 }, { "epoch": 2.6157086223984143, "grad_norm": 0.35160380601882935, "learning_rate": 4.900733413337267e-07, "loss": 0.3626, "step": 3519 }, { "epoch": 2.616451932606541, "grad_norm": 0.3560079336166382, "learning_rate": 4.882071763956225e-07, "loss": 0.399, "step": 3520 }, { "epoch": 2.617195242814668, "grad_norm": 0.3341442346572876, "learning_rate": 4.863443889653868e-07, "loss": 0.4043, "step": 3521 }, { "epoch": 2.6179385530227948, "grad_norm": 0.36564382910728455, "learning_rate": 4.844849804374957e-07, "loss": 0.3572, "step": 3522 }, { "epoch": 2.618681863230922, "grad_norm": 0.3477229177951813, "learning_rate": 4.826289522038924e-07, "loss": 0.3242, "step": 3523 }, { "epoch": 2.6194251734390486, "grad_norm": 0.34585678577423096, "learning_rate": 4.807763056539922e-07, "loss": 0.4151, "step": 3524 }, { "epoch": 2.6201684836471753, "grad_norm": 0.34454283118247986, "learning_rate": 4.789270421746794e-07, "loss": 0.3528, "step": 3525 }, { "epoch": 2.6209117938553024, "grad_norm": 0.34925180673599243, "learning_rate": 4.770811631503025e-07, "loss": 0.3736, "step": 3526 }, { "epoch": 2.621655104063429, "grad_norm": 0.30749955773353577, "learning_rate": 4.752386699626782e-07, "loss": 0.3698, "step": 3527 }, { "epoch": 2.622398414271556, "grad_norm": 0.3584008812904358, "learning_rate": 4.733995639910899e-07, "loss": 0.3709, "step": 3528 }, { "epoch": 2.623141724479683, "grad_norm": 0.3400258421897888, "learning_rate": 4.7156384661228237e-07, "loss": 0.4047, "step": 3529 }, { "epoch": 2.6238850346878095, "grad_norm": 0.34250569343566895, "learning_rate": 4.6973151920046757e-07, "loss": 0.4074, "step": 3530 }, { "epoch": 2.6246283448959367, "grad_norm": 0.3265153169631958, "learning_rate": 4.679025831273154e-07, "loss": 0.3716, "step": 3531 }, { "epoch": 2.6253716551040633, "grad_norm": 0.3445400297641754, "learning_rate": 4.660770397619607e-07, "loss": 0.3932, "step": 3532 }, { "epoch": 2.6261149653121905, "grad_norm": 0.3447023332118988, "learning_rate": 4.6425489047099595e-07, "loss": 0.4219, "step": 3533 }, { "epoch": 2.626858275520317, "grad_norm": 0.2838222086429596, "learning_rate": 4.62436136618476e-07, "loss": 0.3105, "step": 3534 }, { "epoch": 2.627601585728444, "grad_norm": 0.32646414637565613, "learning_rate": 4.6062077956590845e-07, "loss": 0.3854, "step": 3535 }, { "epoch": 2.628344895936571, "grad_norm": 0.37052375078201294, "learning_rate": 4.588088206722652e-07, "loss": 0.4002, "step": 3536 }, { "epoch": 2.6290882061446976, "grad_norm": 0.3694020211696625, "learning_rate": 4.5700026129396837e-07, "loss": 0.3768, "step": 3537 }, { "epoch": 2.6298315163528247, "grad_norm": 0.35985368490219116, "learning_rate": 4.5519510278489664e-07, "loss": 0.3805, "step": 3538 }, { "epoch": 2.6305748265609514, "grad_norm": 0.29933685064315796, "learning_rate": 4.533933464963847e-07, "loss": 0.3455, "step": 3539 }, { "epoch": 2.631318136769078, "grad_norm": 0.36193856596946716, "learning_rate": 4.5159499377722016e-07, "loss": 0.4332, "step": 3540 }, { "epoch": 2.6320614469772052, "grad_norm": 0.349868506193161, "learning_rate": 4.4980004597364015e-07, "loss": 0.3853, "step": 3541 }, { "epoch": 2.632804757185332, "grad_norm": 0.32413697242736816, "learning_rate": 4.480085044293359e-07, "loss": 0.3696, "step": 3542 }, { "epoch": 2.633548067393459, "grad_norm": 0.36126261949539185, "learning_rate": 4.4622037048544676e-07, "loss": 0.4497, "step": 3543 }, { "epoch": 2.6342913776015857, "grad_norm": 0.30248749256134033, "learning_rate": 4.4443564548056097e-07, "loss": 0.3593, "step": 3544 }, { "epoch": 2.6350346878097124, "grad_norm": 0.2806961238384247, "learning_rate": 4.4265433075071693e-07, "loss": 0.3294, "step": 3545 }, { "epoch": 2.6357779980178395, "grad_norm": 0.32598981261253357, "learning_rate": 4.4087642762939754e-07, "loss": 0.3624, "step": 3546 }, { "epoch": 2.636521308225966, "grad_norm": 0.35427048802375793, "learning_rate": 4.3910193744753414e-07, "loss": 0.4268, "step": 3547 }, { "epoch": 2.6372646184340933, "grad_norm": 0.33973097801208496, "learning_rate": 4.3733086153350255e-07, "loss": 0.3635, "step": 3548 }, { "epoch": 2.63800792864222, "grad_norm": 0.3435434103012085, "learning_rate": 4.355632012131217e-07, "loss": 0.3896, "step": 3549 }, { "epoch": 2.6387512388503467, "grad_norm": 0.3411772847175598, "learning_rate": 4.3379895780965355e-07, "loss": 0.3706, "step": 3550 }, { "epoch": 2.639494549058474, "grad_norm": 0.33123400807380676, "learning_rate": 4.320381326438039e-07, "loss": 0.372, "step": 3551 }, { "epoch": 2.6402378592666005, "grad_norm": 0.3330991864204407, "learning_rate": 4.3028072703371673e-07, "loss": 0.3856, "step": 3552 }, { "epoch": 2.6409811694747276, "grad_norm": 0.36670368909835815, "learning_rate": 4.285267422949807e-07, "loss": 0.4283, "step": 3553 }, { "epoch": 2.6417244796828543, "grad_norm": 0.3370797634124756, "learning_rate": 4.2677617974061837e-07, "loss": 0.3552, "step": 3554 }, { "epoch": 2.642467789890981, "grad_norm": 0.35361459851264954, "learning_rate": 4.2502904068109475e-07, "loss": 0.3746, "step": 3555 }, { "epoch": 2.643211100099108, "grad_norm": 0.34603288769721985, "learning_rate": 4.232853264243092e-07, "loss": 0.3661, "step": 3556 }, { "epoch": 2.6439544103072348, "grad_norm": 0.3484894931316376, "learning_rate": 4.2154503827559976e-07, "loss": 0.3882, "step": 3557 }, { "epoch": 2.644697720515362, "grad_norm": 0.3305910527706146, "learning_rate": 4.1980817753773606e-07, "loss": 0.3535, "step": 3558 }, { "epoch": 2.6454410307234886, "grad_norm": 0.3276902735233307, "learning_rate": 4.180747455109269e-07, "loss": 0.3978, "step": 3559 }, { "epoch": 2.6461843409316153, "grad_norm": 0.3290998637676239, "learning_rate": 4.163447434928103e-07, "loss": 0.3699, "step": 3560 }, { "epoch": 2.6469276511397424, "grad_norm": 0.322940468788147, "learning_rate": 4.146181727784576e-07, "loss": 0.3553, "step": 3561 }, { "epoch": 2.647670961347869, "grad_norm": 0.375618577003479, "learning_rate": 4.128950346603722e-07, "loss": 0.4016, "step": 3562 }, { "epoch": 2.648414271555996, "grad_norm": 0.3457200825214386, "learning_rate": 4.111753304284888e-07, "loss": 0.3816, "step": 3563 }, { "epoch": 2.649157581764123, "grad_norm": 0.3070869743824005, "learning_rate": 4.094590613701682e-07, "loss": 0.3835, "step": 3564 }, { "epoch": 2.6499008919722495, "grad_norm": 0.34021544456481934, "learning_rate": 4.077462287702039e-07, "loss": 0.3783, "step": 3565 }, { "epoch": 2.6506442021803767, "grad_norm": 0.3253824710845947, "learning_rate": 4.0603683391081336e-07, "loss": 0.3677, "step": 3566 }, { "epoch": 2.6513875123885033, "grad_norm": 0.3389133810997009, "learning_rate": 4.043308780716415e-07, "loss": 0.3666, "step": 3567 }, { "epoch": 2.6521308225966305, "grad_norm": 0.31851956248283386, "learning_rate": 4.0262836252976054e-07, "loss": 0.3635, "step": 3568 }, { "epoch": 2.652874132804757, "grad_norm": 0.3498193919658661, "learning_rate": 4.0092928855966494e-07, "loss": 0.4054, "step": 3569 }, { "epoch": 2.653617443012884, "grad_norm": 0.3327455222606659, "learning_rate": 3.992336574332739e-07, "loss": 0.437, "step": 3570 }, { "epoch": 2.654360753221011, "grad_norm": 0.3411284387111664, "learning_rate": 3.975414704199304e-07, "loss": 0.3701, "step": 3571 }, { "epoch": 2.6551040634291376, "grad_norm": 0.31761693954467773, "learning_rate": 3.9585272878639723e-07, "loss": 0.3882, "step": 3572 }, { "epoch": 2.6558473736372648, "grad_norm": 0.33142977952957153, "learning_rate": 3.941674337968582e-07, "loss": 0.401, "step": 3573 }, { "epoch": 2.6565906838453914, "grad_norm": 0.3459870219230652, "learning_rate": 3.924855867129185e-07, "loss": 0.4129, "step": 3574 }, { "epoch": 2.657333994053518, "grad_norm": 0.3214836120605469, "learning_rate": 3.908071887936005e-07, "loss": 0.3637, "step": 3575 }, { "epoch": 2.6580773042616452, "grad_norm": 0.337846040725708, "learning_rate": 3.8913224129534686e-07, "loss": 0.3913, "step": 3576 }, { "epoch": 2.658820614469772, "grad_norm": 0.36832350492477417, "learning_rate": 3.874607454720136e-07, "loss": 0.4262, "step": 3577 }, { "epoch": 2.659563924677899, "grad_norm": 0.37006548047065735, "learning_rate": 3.85792702574877e-07, "loss": 0.3668, "step": 3578 }, { "epoch": 2.6603072348860257, "grad_norm": 0.3460327982902527, "learning_rate": 3.841281138526248e-07, "loss": 0.4132, "step": 3579 }, { "epoch": 2.6610505450941524, "grad_norm": 0.30378904938697815, "learning_rate": 3.8246698055136256e-07, "loss": 0.3281, "step": 3580 }, { "epoch": 2.6617938553022795, "grad_norm": 0.35731178522109985, "learning_rate": 3.8080930391460547e-07, "loss": 0.4374, "step": 3581 }, { "epoch": 2.662537165510406, "grad_norm": 0.3303460478782654, "learning_rate": 3.7915508518328424e-07, "loss": 0.336, "step": 3582 }, { "epoch": 2.6632804757185333, "grad_norm": 0.32463324069976807, "learning_rate": 3.7750432559573834e-07, "loss": 0.3942, "step": 3583 }, { "epoch": 2.66402378592666, "grad_norm": 0.33028727769851685, "learning_rate": 3.7585702638771926e-07, "loss": 0.3619, "step": 3584 }, { "epoch": 2.6647670961347867, "grad_norm": 0.3630736470222473, "learning_rate": 3.7421318879238777e-07, "loss": 0.4134, "step": 3585 }, { "epoch": 2.665510406342914, "grad_norm": 0.3041676878929138, "learning_rate": 3.7257281404031453e-07, "loss": 0.3549, "step": 3586 }, { "epoch": 2.6662537165510405, "grad_norm": 0.35206085443496704, "learning_rate": 3.7093590335947504e-07, "loss": 0.4626, "step": 3587 }, { "epoch": 2.6669970267591676, "grad_norm": 0.3527780771255493, "learning_rate": 3.6930245797525523e-07, "loss": 0.3705, "step": 3588 }, { "epoch": 2.6677403369672943, "grad_norm": 0.3249681293964386, "learning_rate": 3.6767247911044304e-07, "loss": 0.3364, "step": 3589 }, { "epoch": 2.668483647175421, "grad_norm": 0.3222861886024475, "learning_rate": 3.660459679852335e-07, "loss": 0.3784, "step": 3590 }, { "epoch": 2.669226957383548, "grad_norm": 0.3488226532936096, "learning_rate": 3.644229258172272e-07, "loss": 0.4161, "step": 3591 }, { "epoch": 2.6699702675916748, "grad_norm": 0.3438178300857544, "learning_rate": 3.6280335382142426e-07, "loss": 0.3369, "step": 3592 }, { "epoch": 2.670713577799802, "grad_norm": 0.35328179597854614, "learning_rate": 3.611872532102301e-07, "loss": 0.3681, "step": 3593 }, { "epoch": 2.6714568880079286, "grad_norm": 0.36612123250961304, "learning_rate": 3.5957462519345066e-07, "loss": 0.4359, "step": 3594 }, { "epoch": 2.6722001982160553, "grad_norm": 0.3379398584365845, "learning_rate": 3.5796547097829203e-07, "loss": 0.3915, "step": 3595 }, { "epoch": 2.6729435084241824, "grad_norm": 0.33357247710227966, "learning_rate": 3.563597917693584e-07, "loss": 0.3749, "step": 3596 }, { "epoch": 2.673686818632309, "grad_norm": 0.32640862464904785, "learning_rate": 3.5475758876865553e-07, "loss": 0.3592, "step": 3597 }, { "epoch": 2.674430128840436, "grad_norm": 0.3702544867992401, "learning_rate": 3.5315886317558445e-07, "loss": 0.3928, "step": 3598 }, { "epoch": 2.675173439048563, "grad_norm": 0.31617268919944763, "learning_rate": 3.5156361618694514e-07, "loss": 0.3583, "step": 3599 }, { "epoch": 2.6759167492566895, "grad_norm": 0.31834882497787476, "learning_rate": 3.499718489969306e-07, "loss": 0.3527, "step": 3600 }, { "epoch": 2.6766600594648167, "grad_norm": 0.3654824197292328, "learning_rate": 3.483835627971327e-07, "loss": 0.4136, "step": 3601 }, { "epoch": 2.6774033696729433, "grad_norm": 0.31783896684646606, "learning_rate": 3.4679875877653313e-07, "loss": 0.3642, "step": 3602 }, { "epoch": 2.6781466798810705, "grad_norm": 0.3088558316230774, "learning_rate": 3.452174381215112e-07, "loss": 0.39, "step": 3603 }, { "epoch": 2.678889990089197, "grad_norm": 0.33577480912208557, "learning_rate": 3.4363960201583444e-07, "loss": 0.3799, "step": 3604 }, { "epoch": 2.679633300297324, "grad_norm": 0.3575844168663025, "learning_rate": 3.420652516406664e-07, "loss": 0.4285, "step": 3605 }, { "epoch": 2.680376610505451, "grad_norm": 0.315159410238266, "learning_rate": 3.404943881745565e-07, "loss": 0.3382, "step": 3606 }, { "epoch": 2.681119920713578, "grad_norm": 0.3395121693611145, "learning_rate": 3.389270127934463e-07, "loss": 0.3726, "step": 3607 }, { "epoch": 2.6818632309217048, "grad_norm": 0.3748883008956909, "learning_rate": 3.373631266706667e-07, "loss": 0.4282, "step": 3608 }, { "epoch": 2.6826065411298314, "grad_norm": 0.339819997549057, "learning_rate": 3.358027309769363e-07, "loss": 0.3377, "step": 3609 }, { "epoch": 2.683349851337958, "grad_norm": 0.3409182131290436, "learning_rate": 3.342458268803589e-07, "loss": 0.367, "step": 3610 }, { "epoch": 2.6840931615460852, "grad_norm": 0.34955671429634094, "learning_rate": 3.326924155464273e-07, "loss": 0.4142, "step": 3611 }, { "epoch": 2.6848364717542124, "grad_norm": 0.3252088725566864, "learning_rate": 3.311424981380179e-07, "loss": 0.3592, "step": 3612 }, { "epoch": 2.685579781962339, "grad_norm": 0.33751818537712097, "learning_rate": 3.2959607581539043e-07, "loss": 0.3645, "step": 3613 }, { "epoch": 2.6863230921704657, "grad_norm": 0.39097681641578674, "learning_rate": 3.280531497361922e-07, "loss": 0.369, "step": 3614 }, { "epoch": 2.6870664023785924, "grad_norm": 0.3499930500984192, "learning_rate": 3.2651372105544866e-07, "loss": 0.4329, "step": 3615 }, { "epoch": 2.6878097125867195, "grad_norm": 0.34028908610343933, "learning_rate": 3.249777909255697e-07, "loss": 0.3415, "step": 3616 }, { "epoch": 2.6885530227948466, "grad_norm": 0.3224200904369354, "learning_rate": 3.2344536049634665e-07, "loss": 0.3698, "step": 3617 }, { "epoch": 2.6892963330029733, "grad_norm": 0.3299586772918701, "learning_rate": 3.219164309149497e-07, "loss": 0.3584, "step": 3618 }, { "epoch": 2.6900396432111, "grad_norm": 0.36553871631622314, "learning_rate": 3.2039100332592775e-07, "loss": 0.4283, "step": 3619 }, { "epoch": 2.6907829534192267, "grad_norm": 0.3147832751274109, "learning_rate": 3.1886907887121064e-07, "loss": 0.376, "step": 3620 }, { "epoch": 2.691526263627354, "grad_norm": 0.34075266122817993, "learning_rate": 3.173506586901026e-07, "loss": 0.3912, "step": 3621 }, { "epoch": 2.692269573835481, "grad_norm": 0.31407400965690613, "learning_rate": 3.1583574391928715e-07, "loss": 0.3585, "step": 3622 }, { "epoch": 2.6930128840436076, "grad_norm": 0.3671194911003113, "learning_rate": 3.1432433569282216e-07, "loss": 0.3835, "step": 3623 }, { "epoch": 2.6937561942517343, "grad_norm": 0.3539622724056244, "learning_rate": 3.1281643514214087e-07, "loss": 0.3634, "step": 3624 }, { "epoch": 2.694499504459861, "grad_norm": 0.3488197922706604, "learning_rate": 3.11312043396052e-07, "loss": 0.384, "step": 3625 }, { "epoch": 2.695242814667988, "grad_norm": 0.39799612760543823, "learning_rate": 3.0981116158073587e-07, "loss": 0.3764, "step": 3626 }, { "epoch": 2.695986124876115, "grad_norm": 0.34554746747016907, "learning_rate": 3.0831379081974533e-07, "loss": 0.3773, "step": 3627 }, { "epoch": 2.696729435084242, "grad_norm": 0.3721977174282074, "learning_rate": 3.06819932234006e-07, "loss": 0.3762, "step": 3628 }, { "epoch": 2.6974727452923686, "grad_norm": 0.318433940410614, "learning_rate": 3.053295869418138e-07, "loss": 0.3827, "step": 3629 }, { "epoch": 2.6982160555004957, "grad_norm": 0.34560152888298035, "learning_rate": 3.0384275605883363e-07, "loss": 0.3898, "step": 3630 }, { "epoch": 2.6989593657086224, "grad_norm": 0.3625716269016266, "learning_rate": 3.023594406981012e-07, "loss": 0.3682, "step": 3631 }, { "epoch": 2.6997026759167495, "grad_norm": 0.3408142924308777, "learning_rate": 3.0087964197001996e-07, "loss": 0.3633, "step": 3632 }, { "epoch": 2.700445986124876, "grad_norm": 0.3476026952266693, "learning_rate": 2.994033609823599e-07, "loss": 0.3743, "step": 3633 }, { "epoch": 2.701189296333003, "grad_norm": 0.3301336467266083, "learning_rate": 2.979305988402592e-07, "loss": 0.3804, "step": 3634 }, { "epoch": 2.70193260654113, "grad_norm": 0.35420072078704834, "learning_rate": 2.9646135664622043e-07, "loss": 0.3993, "step": 3635 }, { "epoch": 2.7026759167492567, "grad_norm": 0.35516223311424255, "learning_rate": 2.949956355001116e-07, "loss": 0.4005, "step": 3636 }, { "epoch": 2.703419226957384, "grad_norm": 0.34228330850601196, "learning_rate": 2.9353343649916544e-07, "loss": 0.374, "step": 3637 }, { "epoch": 2.7041625371655105, "grad_norm": 0.3282659351825714, "learning_rate": 2.9207476073797645e-07, "loss": 0.3784, "step": 3638 }, { "epoch": 2.704905847373637, "grad_norm": 0.3029738664627075, "learning_rate": 2.9061960930850343e-07, "loss": 0.3875, "step": 3639 }, { "epoch": 2.7056491575817643, "grad_norm": 0.3427216708660126, "learning_rate": 2.891679833000671e-07, "loss": 0.3993, "step": 3640 }, { "epoch": 2.706392467789891, "grad_norm": 0.32390889525413513, "learning_rate": 2.877198837993478e-07, "loss": 0.4003, "step": 3641 }, { "epoch": 2.707135777998018, "grad_norm": 0.29045066237449646, "learning_rate": 2.862753118903855e-07, "loss": 0.3462, "step": 3642 }, { "epoch": 2.7078790882061448, "grad_norm": 0.3875015676021576, "learning_rate": 2.848342686545807e-07, "loss": 0.3735, "step": 3643 }, { "epoch": 2.7086223984142714, "grad_norm": 0.3635460138320923, "learning_rate": 2.8339675517069174e-07, "loss": 0.4472, "step": 3644 }, { "epoch": 2.7093657086223986, "grad_norm": 0.3180927634239197, "learning_rate": 2.819627725148355e-07, "loss": 0.3343, "step": 3645 }, { "epoch": 2.7101090188305252, "grad_norm": 0.33353063464164734, "learning_rate": 2.8053232176048327e-07, "loss": 0.3658, "step": 3646 }, { "epoch": 2.7108523290386524, "grad_norm": 0.3308344781398773, "learning_rate": 2.7910540397846574e-07, "loss": 0.3838, "step": 3647 }, { "epoch": 2.711595639246779, "grad_norm": 0.3467325270175934, "learning_rate": 2.776820202369673e-07, "loss": 0.364, "step": 3648 }, { "epoch": 2.7123389494549057, "grad_norm": 0.3387138545513153, "learning_rate": 2.762621716015257e-07, "loss": 0.3975, "step": 3649 }, { "epoch": 2.713082259663033, "grad_norm": 0.34138524532318115, "learning_rate": 2.7484585913503337e-07, "loss": 0.3893, "step": 3650 }, { "epoch": 2.7138255698711595, "grad_norm": 0.3450799286365509, "learning_rate": 2.734330838977356e-07, "loss": 0.3396, "step": 3651 }, { "epoch": 2.7145688800792867, "grad_norm": 0.3230713903903961, "learning_rate": 2.7202384694723027e-07, "loss": 0.419, "step": 3652 }, { "epoch": 2.7153121902874133, "grad_norm": 0.37166258692741394, "learning_rate": 2.706181493384641e-07, "loss": 0.3815, "step": 3653 }, { "epoch": 2.71605550049554, "grad_norm": 0.33326444029808044, "learning_rate": 2.6921599212373695e-07, "loss": 0.3702, "step": 3654 }, { "epoch": 2.716798810703667, "grad_norm": 0.31730496883392334, "learning_rate": 2.678173763526987e-07, "loss": 0.3596, "step": 3655 }, { "epoch": 2.717542120911794, "grad_norm": 0.3109362721443176, "learning_rate": 2.664223030723445e-07, "loss": 0.4028, "step": 3656 }, { "epoch": 2.718285431119921, "grad_norm": 0.3270038664340973, "learning_rate": 2.650307733270224e-07, "loss": 0.418, "step": 3657 }, { "epoch": 2.7190287413280476, "grad_norm": 0.29827332496643066, "learning_rate": 2.636427881584236e-07, "loss": 0.3239, "step": 3658 }, { "epoch": 2.7197720515361743, "grad_norm": 0.3710884749889374, "learning_rate": 2.6225834860558796e-07, "loss": 0.3927, "step": 3659 }, { "epoch": 2.7205153617443014, "grad_norm": 0.3791698217391968, "learning_rate": 2.6087745570490095e-07, "loss": 0.3492, "step": 3660 }, { "epoch": 2.721258671952428, "grad_norm": 0.324626088142395, "learning_rate": 2.595001104900924e-07, "loss": 0.3835, "step": 3661 }, { "epoch": 2.7220019821605552, "grad_norm": 0.34665173292160034, "learning_rate": 2.5812631399223686e-07, "loss": 0.3874, "step": 3662 }, { "epoch": 2.722745292368682, "grad_norm": 0.34162676334381104, "learning_rate": 2.567560672397529e-07, "loss": 0.408, "step": 3663 }, { "epoch": 2.7234886025768086, "grad_norm": 0.3261469602584839, "learning_rate": 2.553893712583999e-07, "loss": 0.3549, "step": 3664 }, { "epoch": 2.7242319127849357, "grad_norm": 0.3212304711341858, "learning_rate": 2.5402622707128177e-07, "loss": 0.3589, "step": 3665 }, { "epoch": 2.7249752229930624, "grad_norm": 0.34278371930122375, "learning_rate": 2.5266663569884164e-07, "loss": 0.4574, "step": 3666 }, { "epoch": 2.7257185332011895, "grad_norm": 0.3061690330505371, "learning_rate": 2.513105981588626e-07, "loss": 0.3569, "step": 3667 }, { "epoch": 2.726461843409316, "grad_norm": 0.34416767954826355, "learning_rate": 2.499581154664693e-07, "loss": 0.3956, "step": 3668 }, { "epoch": 2.727205153617443, "grad_norm": 0.31906992197036743, "learning_rate": 2.486091886341235e-07, "loss": 0.3549, "step": 3669 }, { "epoch": 2.72794846382557, "grad_norm": 0.33836886286735535, "learning_rate": 2.4726381867162586e-07, "loss": 0.4334, "step": 3670 }, { "epoch": 2.7286917740336967, "grad_norm": 0.325720876455307, "learning_rate": 2.459220065861151e-07, "loss": 0.4069, "step": 3671 }, { "epoch": 2.729435084241824, "grad_norm": 0.297094464302063, "learning_rate": 2.4458375338206553e-07, "loss": 0.3507, "step": 3672 }, { "epoch": 2.7301783944499505, "grad_norm": 0.3216373920440674, "learning_rate": 2.4324906006128637e-07, "loss": 0.3741, "step": 3673 }, { "epoch": 2.730921704658077, "grad_norm": 0.34813687205314636, "learning_rate": 2.41917927622925e-07, "loss": 0.4267, "step": 3674 }, { "epoch": 2.7316650148662043, "grad_norm": 0.31913360953330994, "learning_rate": 2.405903570634593e-07, "loss": 0.4059, "step": 3675 }, { "epoch": 2.732408325074331, "grad_norm": 0.30518263578414917, "learning_rate": 2.3926634937670333e-07, "loss": 0.3696, "step": 3676 }, { "epoch": 2.733151635282458, "grad_norm": 0.3252687156200409, "learning_rate": 2.3794590555380248e-07, "loss": 0.3957, "step": 3677 }, { "epoch": 2.7338949454905848, "grad_norm": 0.29841434955596924, "learning_rate": 2.3662902658323717e-07, "loss": 0.3132, "step": 3678 }, { "epoch": 2.7346382556987114, "grad_norm": 0.3687325119972229, "learning_rate": 2.353157134508155e-07, "loss": 0.4346, "step": 3679 }, { "epoch": 2.7353815659068386, "grad_norm": 0.2987310290336609, "learning_rate": 2.3400596713967827e-07, "loss": 0.3482, "step": 3680 }, { "epoch": 2.7361248761149652, "grad_norm": 0.34879541397094727, "learning_rate": 2.3269978863029618e-07, "loss": 0.3846, "step": 3681 }, { "epoch": 2.7368681863230924, "grad_norm": 0.34952250123023987, "learning_rate": 2.3139717890046654e-07, "loss": 0.4305, "step": 3682 }, { "epoch": 2.737611496531219, "grad_norm": 0.29752251505851746, "learning_rate": 2.3009813892531986e-07, "loss": 0.3797, "step": 3683 }, { "epoch": 2.7383548067393457, "grad_norm": 0.31674882769584656, "learning_rate": 2.2880266967730936e-07, "loss": 0.3501, "step": 3684 }, { "epoch": 2.739098116947473, "grad_norm": 0.36155274510383606, "learning_rate": 2.2751077212621876e-07, "loss": 0.4119, "step": 3685 }, { "epoch": 2.7398414271555995, "grad_norm": 0.33380356431007385, "learning_rate": 2.2622244723915722e-07, "loss": 0.3854, "step": 3686 }, { "epoch": 2.7405847373637267, "grad_norm": 0.3228900730609894, "learning_rate": 2.249376959805577e-07, "loss": 0.3862, "step": 3687 }, { "epoch": 2.7413280475718533, "grad_norm": 0.3209807574748993, "learning_rate": 2.2365651931218035e-07, "loss": 0.392, "step": 3688 }, { "epoch": 2.74207135777998, "grad_norm": 0.30914726853370667, "learning_rate": 2.2237891819310798e-07, "loss": 0.3688, "step": 3689 }, { "epoch": 2.742814667988107, "grad_norm": 0.32062268257141113, "learning_rate": 2.2110489357974607e-07, "loss": 0.4109, "step": 3690 }, { "epoch": 2.743557978196234, "grad_norm": 0.35236066579818726, "learning_rate": 2.1983444642582507e-07, "loss": 0.3919, "step": 3691 }, { "epoch": 2.744301288404361, "grad_norm": 0.3580743670463562, "learning_rate": 2.1856757768239479e-07, "loss": 0.404, "step": 3692 }, { "epoch": 2.7450445986124876, "grad_norm": 0.3323100805282593, "learning_rate": 2.1730428829782767e-07, "loss": 0.3351, "step": 3693 }, { "epoch": 2.7457879088206143, "grad_norm": 0.3282054662704468, "learning_rate": 2.160445792178184e-07, "loss": 0.4041, "step": 3694 }, { "epoch": 2.7465312190287414, "grad_norm": 0.32810819149017334, "learning_rate": 2.1478845138537762e-07, "loss": 0.3655, "step": 3695 }, { "epoch": 2.747274529236868, "grad_norm": 0.3312892019748688, "learning_rate": 2.1353590574083705e-07, "loss": 0.4715, "step": 3696 }, { "epoch": 2.7480178394449952, "grad_norm": 0.2774101793766022, "learning_rate": 2.1228694322184773e-07, "loss": 0.3513, "step": 3697 }, { "epoch": 2.748761149653122, "grad_norm": 0.3155681788921356, "learning_rate": 2.110415647633768e-07, "loss": 0.3694, "step": 3698 }, { "epoch": 2.7495044598612486, "grad_norm": 0.3537297248840332, "learning_rate": 2.0979977129770845e-07, "loss": 0.3688, "step": 3699 }, { "epoch": 2.7502477700693757, "grad_norm": 0.32949137687683105, "learning_rate": 2.0856156375444413e-07, "loss": 0.3611, "step": 3700 }, { "epoch": 2.7509910802775024, "grad_norm": 0.3170293867588043, "learning_rate": 2.0732694306050128e-07, "loss": 0.384, "step": 3701 }, { "epoch": 2.7517343904856295, "grad_norm": 0.35876283049583435, "learning_rate": 2.0609591014010998e-07, "loss": 0.3763, "step": 3702 }, { "epoch": 2.752477700693756, "grad_norm": 0.34015464782714844, "learning_rate": 2.0486846591481758e-07, "loss": 0.3858, "step": 3703 }, { "epoch": 2.753221010901883, "grad_norm": 0.33700427412986755, "learning_rate": 2.0364461130348178e-07, "loss": 0.3706, "step": 3704 }, { "epoch": 2.75396432111001, "grad_norm": 0.31501173973083496, "learning_rate": 2.0242434722227587e-07, "loss": 0.3865, "step": 3705 }, { "epoch": 2.7547076313181367, "grad_norm": 0.316914826631546, "learning_rate": 2.0120767458468304e-07, "loss": 0.3581, "step": 3706 }, { "epoch": 2.755450941526264, "grad_norm": 0.3390875458717346, "learning_rate": 1.9999459430149915e-07, "loss": 0.3954, "step": 3707 }, { "epoch": 2.7561942517343905, "grad_norm": 0.32005366683006287, "learning_rate": 1.987851072808311e-07, "loss": 0.342, "step": 3708 }, { "epoch": 2.756937561942517, "grad_norm": 0.3592652678489685, "learning_rate": 1.975792144280958e-07, "loss": 0.4301, "step": 3709 }, { "epoch": 2.7576808721506443, "grad_norm": 0.3370995819568634, "learning_rate": 1.9637691664601832e-07, "loss": 0.391, "step": 3710 }, { "epoch": 2.758424182358771, "grad_norm": 0.3212362229824066, "learning_rate": 1.9517821483463428e-07, "loss": 0.3618, "step": 3711 }, { "epoch": 2.759167492566898, "grad_norm": 0.33853816986083984, "learning_rate": 1.939831098912859e-07, "loss": 0.3744, "step": 3712 }, { "epoch": 2.7599108027750248, "grad_norm": 0.34293466806411743, "learning_rate": 1.9279160271062304e-07, "loss": 0.3727, "step": 3713 }, { "epoch": 2.7606541129831514, "grad_norm": 0.3495626151561737, "learning_rate": 1.9160369418460334e-07, "loss": 0.4045, "step": 3714 }, { "epoch": 2.7613974231912786, "grad_norm": 0.3785433769226074, "learning_rate": 1.9041938520248937e-07, "loss": 0.3821, "step": 3715 }, { "epoch": 2.7621407333994052, "grad_norm": 0.3435104787349701, "learning_rate": 1.8923867665084917e-07, "loss": 0.3784, "step": 3716 }, { "epoch": 2.7628840436075324, "grad_norm": 0.3636457622051239, "learning_rate": 1.880615694135568e-07, "loss": 0.391, "step": 3717 }, { "epoch": 2.763627353815659, "grad_norm": 0.31064629554748535, "learning_rate": 1.8688806437178965e-07, "loss": 0.3424, "step": 3718 }, { "epoch": 2.7643706640237857, "grad_norm": 0.3246710002422333, "learning_rate": 1.8571816240402662e-07, "loss": 0.4004, "step": 3719 }, { "epoch": 2.765113974231913, "grad_norm": 0.3444274067878723, "learning_rate": 1.8455186438605277e-07, "loss": 0.3808, "step": 3720 }, { "epoch": 2.7658572844400395, "grad_norm": 0.33140072226524353, "learning_rate": 1.83389171190953e-07, "loss": 0.3854, "step": 3721 }, { "epoch": 2.7666005946481667, "grad_norm": 0.3463555574417114, "learning_rate": 1.8223008368911387e-07, "loss": 0.3426, "step": 3722 }, { "epoch": 2.7673439048562933, "grad_norm": 0.3460528552532196, "learning_rate": 1.8107460274822296e-07, "loss": 0.3672, "step": 3723 }, { "epoch": 2.76808721506442, "grad_norm": 0.3352486193180084, "learning_rate": 1.7992272923326947e-07, "loss": 0.3803, "step": 3724 }, { "epoch": 2.768830525272547, "grad_norm": 0.3410413861274719, "learning_rate": 1.787744640065392e-07, "loss": 0.3762, "step": 3725 }, { "epoch": 2.769573835480674, "grad_norm": 0.3888305723667145, "learning_rate": 1.7762980792761953e-07, "loss": 0.4095, "step": 3726 }, { "epoch": 2.770317145688801, "grad_norm": 0.31619301438331604, "learning_rate": 1.7648876185339336e-07, "loss": 0.4015, "step": 3727 }, { "epoch": 2.7710604558969276, "grad_norm": 0.3280472457408905, "learning_rate": 1.7535132663804465e-07, "loss": 0.4062, "step": 3728 }, { "epoch": 2.7718037661050543, "grad_norm": 0.32219988107681274, "learning_rate": 1.742175031330512e-07, "loss": 0.3314, "step": 3729 }, { "epoch": 2.7725470763131814, "grad_norm": 0.36063966155052185, "learning_rate": 1.7308729218718734e-07, "loss": 0.4012, "step": 3730 }, { "epoch": 2.773290386521308, "grad_norm": 0.3219292461872101, "learning_rate": 1.719606946465252e-07, "loss": 0.3618, "step": 3731 }, { "epoch": 2.7740336967294352, "grad_norm": 0.3382083773612976, "learning_rate": 1.7083771135443071e-07, "loss": 0.4106, "step": 3732 }, { "epoch": 2.774777006937562, "grad_norm": 0.3206212818622589, "learning_rate": 1.697183431515631e-07, "loss": 0.3702, "step": 3733 }, { "epoch": 2.7755203171456886, "grad_norm": 0.3241714537143707, "learning_rate": 1.6860259087587705e-07, "loss": 0.3627, "step": 3734 }, { "epoch": 2.7762636273538157, "grad_norm": 0.33629587292671204, "learning_rate": 1.6749045536262053e-07, "loss": 0.3966, "step": 3735 }, { "epoch": 2.7770069375619424, "grad_norm": 0.33336764574050903, "learning_rate": 1.6638193744433096e-07, "loss": 0.3699, "step": 3736 }, { "epoch": 2.7777502477700695, "grad_norm": 0.34643295407295227, "learning_rate": 1.6527703795084227e-07, "loss": 0.4042, "step": 3737 }, { "epoch": 2.778493557978196, "grad_norm": 0.3282679617404938, "learning_rate": 1.6417575770927619e-07, "loss": 0.3806, "step": 3738 }, { "epoch": 2.779236868186323, "grad_norm": 0.33272168040275574, "learning_rate": 1.630780975440449e-07, "loss": 0.3986, "step": 3739 }, { "epoch": 2.77998017839445, "grad_norm": 0.31825581192970276, "learning_rate": 1.619840582768545e-07, "loss": 0.382, "step": 3740 }, { "epoch": 2.7807234886025767, "grad_norm": 0.3236848711967468, "learning_rate": 1.6089364072669588e-07, "loss": 0.3603, "step": 3741 }, { "epoch": 2.781466798810704, "grad_norm": 0.3125016689300537, "learning_rate": 1.5980684570985172e-07, "loss": 0.3756, "step": 3742 }, { "epoch": 2.7822101090188305, "grad_norm": 0.31585437059402466, "learning_rate": 1.5872367403989175e-07, "loss": 0.4193, "step": 3743 }, { "epoch": 2.782953419226957, "grad_norm": 0.3737429082393646, "learning_rate": 1.5764412652767237e-07, "loss": 0.4062, "step": 3744 }, { "epoch": 2.7836967294350843, "grad_norm": 0.31965190172195435, "learning_rate": 1.5656820398133987e-07, "loss": 0.3266, "step": 3745 }, { "epoch": 2.784440039643211, "grad_norm": 0.34518736600875854, "learning_rate": 1.5549590720632334e-07, "loss": 0.3832, "step": 3746 }, { "epoch": 2.785183349851338, "grad_norm": 0.34518963098526, "learning_rate": 1.5442723700534067e-07, "loss": 0.3773, "step": 3747 }, { "epoch": 2.7859266600594648, "grad_norm": 0.30497676134109497, "learning_rate": 1.5336219417839248e-07, "loss": 0.3699, "step": 3748 }, { "epoch": 2.7866699702675914, "grad_norm": 0.3388044834136963, "learning_rate": 1.52300779522766e-07, "loss": 0.3619, "step": 3749 }, { "epoch": 2.7874132804757186, "grad_norm": 0.3740440011024475, "learning_rate": 1.512429938330301e-07, "loss": 0.4301, "step": 3750 }, { "epoch": 2.7881565906838452, "grad_norm": 0.33988380432128906, "learning_rate": 1.5018883790104023e-07, "loss": 0.4028, "step": 3751 }, { "epoch": 2.7888999008919724, "grad_norm": 0.31281766295433044, "learning_rate": 1.491383125159318e-07, "loss": 0.3574, "step": 3752 }, { "epoch": 2.789643211100099, "grad_norm": 0.2948988080024719, "learning_rate": 1.48091418464123e-07, "loss": 0.3348, "step": 3753 }, { "epoch": 2.7903865213082257, "grad_norm": 0.32226285338401794, "learning_rate": 1.470481565293147e-07, "loss": 0.3954, "step": 3754 }, { "epoch": 2.791129831516353, "grad_norm": 0.3671989142894745, "learning_rate": 1.4600852749248827e-07, "loss": 0.4296, "step": 3755 }, { "epoch": 2.7918731417244795, "grad_norm": 0.3198404312133789, "learning_rate": 1.4497253213190397e-07, "loss": 0.3896, "step": 3756 }, { "epoch": 2.7926164519326067, "grad_norm": 0.3123762607574463, "learning_rate": 1.4394017122310532e-07, "loss": 0.3742, "step": 3757 }, { "epoch": 2.7933597621407333, "grad_norm": 0.34860214591026306, "learning_rate": 1.429114455389119e-07, "loss": 0.4272, "step": 3758 }, { "epoch": 2.79410307234886, "grad_norm": 0.3223695456981659, "learning_rate": 1.4188635584942269e-07, "loss": 0.351, "step": 3759 }, { "epoch": 2.794846382556987, "grad_norm": 0.3124809265136719, "learning_rate": 1.408649029220166e-07, "loss": 0.3682, "step": 3760 }, { "epoch": 2.795589692765114, "grad_norm": 0.3216976523399353, "learning_rate": 1.3984708752134758e-07, "loss": 0.3888, "step": 3761 }, { "epoch": 2.796333002973241, "grad_norm": 0.32522428035736084, "learning_rate": 1.388329104093472e-07, "loss": 0.3765, "step": 3762 }, { "epoch": 2.7970763131813676, "grad_norm": 0.32932916283607483, "learning_rate": 1.3782237234522543e-07, "loss": 0.3685, "step": 3763 }, { "epoch": 2.7978196233894943, "grad_norm": 0.34168654680252075, "learning_rate": 1.3681547408546602e-07, "loss": 0.3992, "step": 3764 }, { "epoch": 2.7985629335976214, "grad_norm": 0.342421293258667, "learning_rate": 1.3581221638382826e-07, "loss": 0.3535, "step": 3765 }, { "epoch": 2.799306243805748, "grad_norm": 0.34484970569610596, "learning_rate": 1.3481259999134632e-07, "loss": 0.4045, "step": 3766 }, { "epoch": 2.8000495540138752, "grad_norm": 0.3042052686214447, "learning_rate": 1.338166256563278e-07, "loss": 0.3658, "step": 3767 }, { "epoch": 2.800792864222002, "grad_norm": 0.34621384739875793, "learning_rate": 1.3282429412435683e-07, "loss": 0.4025, "step": 3768 }, { "epoch": 2.8015361744301286, "grad_norm": 0.3411814272403717, "learning_rate": 1.3183560613828638e-07, "loss": 0.3665, "step": 3769 }, { "epoch": 2.8022794846382557, "grad_norm": 0.3769450783729553, "learning_rate": 1.308505624382439e-07, "loss": 0.3799, "step": 3770 }, { "epoch": 2.8030227948463824, "grad_norm": 0.3534631133079529, "learning_rate": 1.298691637616295e-07, "loss": 0.3723, "step": 3771 }, { "epoch": 2.8037661050545095, "grad_norm": 0.31069663166999817, "learning_rate": 1.2889141084311385e-07, "loss": 0.3694, "step": 3772 }, { "epoch": 2.804509415262636, "grad_norm": 0.3094932436943054, "learning_rate": 1.2791730441463757e-07, "loss": 0.3721, "step": 3773 }, { "epoch": 2.805252725470763, "grad_norm": 0.372620552778244, "learning_rate": 1.2694684520541346e-07, "loss": 0.4232, "step": 3774 }, { "epoch": 2.80599603567889, "grad_norm": 0.3233175575733185, "learning_rate": 1.2598003394192203e-07, "loss": 0.3496, "step": 3775 }, { "epoch": 2.806739345887017, "grad_norm": 0.3541308045387268, "learning_rate": 1.2501687134791386e-07, "loss": 0.4243, "step": 3776 }, { "epoch": 2.807482656095144, "grad_norm": 0.29624083638191223, "learning_rate": 1.2405735814440878e-07, "loss": 0.3295, "step": 3777 }, { "epoch": 2.8082259663032705, "grad_norm": 0.33311066031455994, "learning_rate": 1.2310149504969392e-07, "loss": 0.3895, "step": 3778 }, { "epoch": 2.808969276511397, "grad_norm": 0.37912705540657043, "learning_rate": 1.2214928277932302e-07, "loss": 0.3823, "step": 3779 }, { "epoch": 2.8097125867195243, "grad_norm": 0.3322674334049225, "learning_rate": 1.2120072204611978e-07, "loss": 0.3913, "step": 3780 }, { "epoch": 2.8104558969276514, "grad_norm": 0.3758462369441986, "learning_rate": 1.2025581356017169e-07, "loss": 0.373, "step": 3781 }, { "epoch": 2.811199207135778, "grad_norm": 0.3585951626300812, "learning_rate": 1.193145580288324e-07, "loss": 0.397, "step": 3782 }, { "epoch": 2.8119425173439048, "grad_norm": 0.35420677065849304, "learning_rate": 1.1837695615672218e-07, "loss": 0.3967, "step": 3783 }, { "epoch": 2.8126858275520314, "grad_norm": 0.3297853171825409, "learning_rate": 1.1744300864572512e-07, "loss": 0.3694, "step": 3784 }, { "epoch": 2.8134291377601586, "grad_norm": 0.35278066992759705, "learning_rate": 1.1651271619499027e-07, "loss": 0.4279, "step": 3785 }, { "epoch": 2.8141724479682857, "grad_norm": 0.31033214926719666, "learning_rate": 1.1558607950093115e-07, "loss": 0.3711, "step": 3786 }, { "epoch": 2.8149157581764124, "grad_norm": 0.36761006712913513, "learning_rate": 1.146630992572234e-07, "loss": 0.4338, "step": 3787 }, { "epoch": 2.815659068384539, "grad_norm": 0.3094707429409027, "learning_rate": 1.1374377615480547e-07, "loss": 0.3363, "step": 3788 }, { "epoch": 2.8164023785926657, "grad_norm": 0.3448610305786133, "learning_rate": 1.1282811088187906e-07, "loss": 0.4172, "step": 3789 }, { "epoch": 2.817145688800793, "grad_norm": 0.33280327916145325, "learning_rate": 1.1191610412390642e-07, "loss": 0.4029, "step": 3790 }, { "epoch": 2.81788899900892, "grad_norm": 0.3206738233566284, "learning_rate": 1.110077565636125e-07, "loss": 0.3444, "step": 3791 }, { "epoch": 2.8186323092170467, "grad_norm": 0.3711231052875519, "learning_rate": 1.1010306888098176e-07, "loss": 0.3744, "step": 3792 }, { "epoch": 2.8193756194251733, "grad_norm": 0.3383576273918152, "learning_rate": 1.092020417532591e-07, "loss": 0.3991, "step": 3793 }, { "epoch": 2.8201189296333, "grad_norm": 0.3202008903026581, "learning_rate": 1.0830467585494941e-07, "loss": 0.3852, "step": 3794 }, { "epoch": 2.820862239841427, "grad_norm": 0.3197800815105438, "learning_rate": 1.0741097185781812e-07, "loss": 0.3713, "step": 3795 }, { "epoch": 2.8216055500495543, "grad_norm": 0.354966938495636, "learning_rate": 1.0652093043088618e-07, "loss": 0.4554, "step": 3796 }, { "epoch": 2.822348860257681, "grad_norm": 0.35659822821617126, "learning_rate": 1.0563455224043562e-07, "loss": 0.3482, "step": 3797 }, { "epoch": 2.8230921704658076, "grad_norm": 0.32030314207077026, "learning_rate": 1.047518379500051e-07, "loss": 0.3598, "step": 3798 }, { "epoch": 2.8238354806739348, "grad_norm": 0.33420777320861816, "learning_rate": 1.0387278822038993e-07, "loss": 0.3744, "step": 3799 }, { "epoch": 2.8245787908820614, "grad_norm": 0.3598559498786926, "learning_rate": 1.0299740370964373e-07, "loss": 0.3834, "step": 3800 }, { "epoch": 2.8253221010901886, "grad_norm": 0.3414316475391388, "learning_rate": 1.0212568507307507e-07, "loss": 0.356, "step": 3801 }, { "epoch": 2.8260654112983152, "grad_norm": 0.3600325882434845, "learning_rate": 1.0125763296324809e-07, "loss": 0.4364, "step": 3802 }, { "epoch": 2.826808721506442, "grad_norm": 0.37809687852859497, "learning_rate": 1.0039324802998351e-07, "loss": 0.3695, "step": 3803 }, { "epoch": 2.827552031714569, "grad_norm": 0.3606187701225281, "learning_rate": 9.953253092035597e-08, "loss": 0.3968, "step": 3804 }, { "epoch": 2.8282953419226957, "grad_norm": 0.32740193605422974, "learning_rate": 9.867548227869339e-08, "loss": 0.3828, "step": 3805 }, { "epoch": 2.829038652130823, "grad_norm": 0.31878820061683655, "learning_rate": 9.782210274657978e-08, "loss": 0.3457, "step": 3806 }, { "epoch": 2.8297819623389495, "grad_norm": 0.3352677822113037, "learning_rate": 9.697239296285022e-08, "loss": 0.3991, "step": 3807 }, { "epoch": 2.830525272547076, "grad_norm": 0.36180925369262695, "learning_rate": 9.612635356359479e-08, "loss": 0.3769, "step": 3808 }, { "epoch": 2.8312685827552033, "grad_norm": 0.325186550617218, "learning_rate": 9.528398518215354e-08, "loss": 0.3509, "step": 3809 }, { "epoch": 2.83201189296333, "grad_norm": 0.37409183382987976, "learning_rate": 9.444528844912093e-08, "loss": 0.4438, "step": 3810 }, { "epoch": 2.832755203171457, "grad_norm": 0.31495949625968933, "learning_rate": 9.36102639923403e-08, "loss": 0.3372, "step": 3811 }, { "epoch": 2.833498513379584, "grad_norm": 0.36320653557777405, "learning_rate": 9.277891243690829e-08, "loss": 0.3692, "step": 3812 }, { "epoch": 2.8342418235877105, "grad_norm": 0.337201327085495, "learning_rate": 9.19512344051704e-08, "loss": 0.3818, "step": 3813 }, { "epoch": 2.8349851337958376, "grad_norm": 0.33606988191604614, "learning_rate": 9.112723051672268e-08, "loss": 0.4076, "step": 3814 }, { "epoch": 2.8357284440039643, "grad_norm": 0.31431248784065247, "learning_rate": 9.030690138841059e-08, "loss": 0.3501, "step": 3815 }, { "epoch": 2.8364717542120914, "grad_norm": 0.36629927158355713, "learning_rate": 8.949024763432957e-08, "loss": 0.4242, "step": 3816 }, { "epoch": 2.837215064420218, "grad_norm": 0.333042711019516, "learning_rate": 8.86772698658217e-08, "loss": 0.3677, "step": 3817 }, { "epoch": 2.8379583746283448, "grad_norm": 0.3328089118003845, "learning_rate": 8.786796869147963e-08, "loss": 0.4061, "step": 3818 }, { "epoch": 2.838701684836472, "grad_norm": 0.3331633508205414, "learning_rate": 8.70623447171426e-08, "loss": 0.3783, "step": 3819 }, { "epoch": 2.8394449950445986, "grad_norm": 0.3353897035121918, "learning_rate": 8.626039854589652e-08, "loss": 0.3637, "step": 3820 }, { "epoch": 2.8401883052527257, "grad_norm": 0.3191912770271301, "learning_rate": 8.546213077807452e-08, "loss": 0.4232, "step": 3821 }, { "epoch": 2.8409316154608524, "grad_norm": 0.3197457492351532, "learning_rate": 8.466754201125693e-08, "loss": 0.3496, "step": 3822 }, { "epoch": 2.841674925668979, "grad_norm": 0.3421800434589386, "learning_rate": 8.387663284026904e-08, "loss": 0.4101, "step": 3823 }, { "epoch": 2.842418235877106, "grad_norm": 0.3245975077152252, "learning_rate": 8.308940385718168e-08, "loss": 0.3776, "step": 3824 }, { "epoch": 2.843161546085233, "grad_norm": 0.3231024742126465, "learning_rate": 8.230585565131121e-08, "loss": 0.3511, "step": 3825 }, { "epoch": 2.84390485629336, "grad_norm": 0.31483033299446106, "learning_rate": 8.152598880921902e-08, "loss": 0.4294, "step": 3826 }, { "epoch": 2.8446481665014867, "grad_norm": 0.3014974892139435, "learning_rate": 8.074980391470866e-08, "loss": 0.3984, "step": 3827 }, { "epoch": 2.8453914767096133, "grad_norm": 0.3461579382419586, "learning_rate": 7.997730154882865e-08, "loss": 0.3535, "step": 3828 }, { "epoch": 2.8461347869177405, "grad_norm": 0.3502555191516876, "learning_rate": 7.920848228987199e-08, "loss": 0.41, "step": 3829 }, { "epoch": 2.846878097125867, "grad_norm": 0.30863603949546814, "learning_rate": 7.844334671337273e-08, "loss": 0.3279, "step": 3830 }, { "epoch": 2.8476214073339943, "grad_norm": 0.3643236756324768, "learning_rate": 7.768189539210713e-08, "loss": 0.4219, "step": 3831 }, { "epoch": 2.848364717542121, "grad_norm": 0.3270271122455597, "learning_rate": 7.692412889609591e-08, "loss": 0.3779, "step": 3832 }, { "epoch": 2.8491080277502476, "grad_norm": 0.3325539827346802, "learning_rate": 7.617004779259807e-08, "loss": 0.3885, "step": 3833 }, { "epoch": 2.8498513379583748, "grad_norm": 0.3375898599624634, "learning_rate": 7.541965264611594e-08, "loss": 0.4136, "step": 3834 }, { "epoch": 2.8505946481665014, "grad_norm": 0.30124691128730774, "learning_rate": 7.467294401839243e-08, "loss": 0.3665, "step": 3835 }, { "epoch": 2.8513379583746286, "grad_norm": 0.35185688734054565, "learning_rate": 7.39299224684098e-08, "loss": 0.3953, "step": 3836 }, { "epoch": 2.8520812685827552, "grad_norm": 0.3095182776451111, "learning_rate": 7.319058855239036e-08, "loss": 0.3685, "step": 3837 }, { "epoch": 2.852824578790882, "grad_norm": 0.35261499881744385, "learning_rate": 7.245494282379695e-08, "loss": 0.421, "step": 3838 }, { "epoch": 2.853567888999009, "grad_norm": 0.3111301362514496, "learning_rate": 7.17229858333296e-08, "loss": 0.3345, "step": 3839 }, { "epoch": 2.8543111992071357, "grad_norm": 0.35448822379112244, "learning_rate": 7.099471812892944e-08, "loss": 0.3904, "step": 3840 }, { "epoch": 2.855054509415263, "grad_norm": 0.34954366087913513, "learning_rate": 7.027014025577316e-08, "loss": 0.415, "step": 3841 }, { "epoch": 2.8557978196233895, "grad_norm": 0.3203752636909485, "learning_rate": 6.954925275627688e-08, "loss": 0.3755, "step": 3842 }, { "epoch": 2.856541129831516, "grad_norm": 0.3749403655529022, "learning_rate": 6.883205617009447e-08, "loss": 0.3905, "step": 3843 }, { "epoch": 2.8572844400396433, "grad_norm": 0.2971706986427307, "learning_rate": 6.81185510341148e-08, "loss": 0.3385, "step": 3844 }, { "epoch": 2.85802775024777, "grad_norm": 0.30548569560050964, "learning_rate": 6.74087378824656e-08, "loss": 0.3492, "step": 3845 }, { "epoch": 2.858771060455897, "grad_norm": 0.3277958035469055, "learning_rate": 6.670261724650906e-08, "loss": 0.3964, "step": 3846 }, { "epoch": 2.859514370664024, "grad_norm": 0.3480546176433563, "learning_rate": 6.600018965484457e-08, "loss": 0.3524, "step": 3847 }, { "epoch": 2.8602576808721505, "grad_norm": 0.3546777367591858, "learning_rate": 6.530145563330592e-08, "loss": 0.4203, "step": 3848 }, { "epoch": 2.8610009910802776, "grad_norm": 0.2999485433101654, "learning_rate": 6.46064157049625e-08, "loss": 0.3693, "step": 3849 }, { "epoch": 2.8617443012884043, "grad_norm": 0.34053683280944824, "learning_rate": 6.391507039011813e-08, "loss": 0.3554, "step": 3850 }, { "epoch": 2.8624876114965314, "grad_norm": 0.3355914056301117, "learning_rate": 6.322742020631045e-08, "loss": 0.388, "step": 3851 }, { "epoch": 2.863230921704658, "grad_norm": 0.3175380527973175, "learning_rate": 6.25434656683116e-08, "loss": 0.3408, "step": 3852 }, { "epoch": 2.8639742319127848, "grad_norm": 0.3518241047859192, "learning_rate": 6.186320728812645e-08, "loss": 0.4239, "step": 3853 }, { "epoch": 2.864717542120912, "grad_norm": 0.31521138548851013, "learning_rate": 6.118664557499376e-08, "loss": 0.3747, "step": 3854 }, { "epoch": 2.8654608523290386, "grad_norm": 0.3258119821548462, "learning_rate": 6.051378103538452e-08, "loss": 0.3682, "step": 3855 }, { "epoch": 2.8662041625371657, "grad_norm": 0.32203030586242676, "learning_rate": 5.984461417300246e-08, "loss": 0.3814, "step": 3856 }, { "epoch": 2.8669474727452924, "grad_norm": 0.3281584084033966, "learning_rate": 5.91791454887819e-08, "loss": 0.3631, "step": 3857 }, { "epoch": 2.867690782953419, "grad_norm": 0.3356817960739136, "learning_rate": 5.851737548089098e-08, "loss": 0.3865, "step": 3858 }, { "epoch": 2.868434093161546, "grad_norm": 0.3504719138145447, "learning_rate": 5.785930464472678e-08, "loss": 0.3961, "step": 3859 }, { "epoch": 2.869177403369673, "grad_norm": 0.3284200131893158, "learning_rate": 5.720493347291911e-08, "loss": 0.3361, "step": 3860 }, { "epoch": 2.8699207135778, "grad_norm": 0.3548144996166229, "learning_rate": 5.6554262455326136e-08, "loss": 0.4459, "step": 3861 }, { "epoch": 2.8706640237859267, "grad_norm": 0.33476972579956055, "learning_rate": 5.590729207903767e-08, "loss": 0.3667, "step": 3862 }, { "epoch": 2.8714073339940533, "grad_norm": 0.31432369351387024, "learning_rate": 5.526402282837351e-08, "loss": 0.3553, "step": 3863 }, { "epoch": 2.8721506442021805, "grad_norm": 0.3124595880508423, "learning_rate": 5.46244551848818e-08, "loss": 0.369, "step": 3864 }, { "epoch": 2.872893954410307, "grad_norm": 0.32317861914634705, "learning_rate": 5.3988589627339e-08, "loss": 0.4033, "step": 3865 }, { "epoch": 2.8736372646184343, "grad_norm": 0.3632037341594696, "learning_rate": 5.335642663175267e-08, "loss": 0.4478, "step": 3866 }, { "epoch": 2.874380574826561, "grad_norm": 0.3291177451610565, "learning_rate": 5.2727966671357047e-08, "loss": 0.3699, "step": 3867 }, { "epoch": 2.8751238850346876, "grad_norm": 0.3434256613254547, "learning_rate": 5.210321021661302e-08, "loss": 0.3801, "step": 3868 }, { "epoch": 2.8758671952428148, "grad_norm": 0.32145461440086365, "learning_rate": 5.1482157735212036e-08, "loss": 0.3784, "step": 3869 }, { "epoch": 2.8766105054509414, "grad_norm": 0.33865225315093994, "learning_rate": 5.086480969206997e-08, "loss": 0.3754, "step": 3870 }, { "epoch": 2.8773538156590686, "grad_norm": 0.3301805257797241, "learning_rate": 5.0251166549331595e-08, "loss": 0.3319, "step": 3871 }, { "epoch": 2.8780971258671952, "grad_norm": 0.33617299795150757, "learning_rate": 4.964122876636779e-08, "loss": 0.3902, "step": 3872 }, { "epoch": 2.878840436075322, "grad_norm": 0.3278627097606659, "learning_rate": 4.9034996799773856e-08, "loss": 0.3464, "step": 3873 }, { "epoch": 2.879583746283449, "grad_norm": 0.3446265757083893, "learning_rate": 4.843247110337346e-08, "loss": 0.3637, "step": 3874 }, { "epoch": 2.8803270564915757, "grad_norm": 0.31772372126579285, "learning_rate": 4.783365212821467e-08, "loss": 0.3319, "step": 3875 }, { "epoch": 2.881070366699703, "grad_norm": 0.3482321500778198, "learning_rate": 4.723854032257003e-08, "loss": 0.4091, "step": 3876 }, { "epoch": 2.8818136769078295, "grad_norm": 0.3354433476924896, "learning_rate": 4.664713613193761e-08, "loss": 0.4332, "step": 3877 }, { "epoch": 2.882556987115956, "grad_norm": 0.3021473288536072, "learning_rate": 4.60594399990405e-08, "loss": 0.3799, "step": 3878 }, { "epoch": 2.8833002973240833, "grad_norm": 0.3120580315589905, "learning_rate": 4.547545236382511e-08, "loss": 0.3665, "step": 3879 }, { "epoch": 2.88404360753221, "grad_norm": 0.3367762267589569, "learning_rate": 4.48951736634623e-08, "loss": 0.3873, "step": 3880 }, { "epoch": 2.884786917740337, "grad_norm": 0.3426544964313507, "learning_rate": 4.431860433234514e-08, "loss": 0.3961, "step": 3881 }, { "epoch": 2.885530227948464, "grad_norm": 0.3164845108985901, "learning_rate": 4.3745744802091706e-08, "loss": 0.3426, "step": 3882 }, { "epoch": 2.8862735381565905, "grad_norm": 0.3149460256099701, "learning_rate": 4.317659550154174e-08, "loss": 0.3777, "step": 3883 }, { "epoch": 2.8870168483647176, "grad_norm": 0.3018018305301666, "learning_rate": 4.2611156856758315e-08, "loss": 0.3931, "step": 3884 }, { "epoch": 2.8877601585728443, "grad_norm": 0.35976284742355347, "learning_rate": 4.2049429291025065e-08, "loss": 0.4201, "step": 3885 }, { "epoch": 2.8885034687809714, "grad_norm": 0.36039769649505615, "learning_rate": 4.1491413224850064e-08, "loss": 0.3855, "step": 3886 }, { "epoch": 2.889246778989098, "grad_norm": 0.3175292909145355, "learning_rate": 4.0937109075960826e-08, "loss": 0.3873, "step": 3887 }, { "epoch": 2.8899900891972248, "grad_norm": 0.34473103284835815, "learning_rate": 4.038651725930765e-08, "loss": 0.3827, "step": 3888 }, { "epoch": 2.890733399405352, "grad_norm": 0.34236323833465576, "learning_rate": 3.983963818706027e-08, "loss": 0.3686, "step": 3889 }, { "epoch": 2.8914767096134786, "grad_norm": 0.3348133862018585, "learning_rate": 3.9296472268610666e-08, "loss": 0.3951, "step": 3890 }, { "epoch": 2.8922200198216057, "grad_norm": 0.3377768099308014, "learning_rate": 3.8757019910570236e-08, "loss": 0.3954, "step": 3891 }, { "epoch": 2.8929633300297324, "grad_norm": 0.3096199035644531, "learning_rate": 3.822128151676985e-08, "loss": 0.3651, "step": 3892 }, { "epoch": 2.893706640237859, "grad_norm": 0.3297830820083618, "learning_rate": 3.768925748826202e-08, "loss": 0.3595, "step": 3893 }, { "epoch": 2.894449950445986, "grad_norm": 0.33860325813293457, "learning_rate": 3.7160948223316505e-08, "loss": 0.4161, "step": 3894 }, { "epoch": 2.895193260654113, "grad_norm": 0.33023664355278015, "learning_rate": 3.6636354117423613e-08, "loss": 0.3973, "step": 3895 }, { "epoch": 2.89593657086224, "grad_norm": 0.3146475851535797, "learning_rate": 3.611547556329253e-08, "loss": 0.3561, "step": 3896 }, { "epoch": 2.8966798810703667, "grad_norm": 0.31828275322914124, "learning_rate": 3.5598312950850254e-08, "loss": 0.3983, "step": 3897 }, { "epoch": 2.8974231912784933, "grad_norm": 0.30645060539245605, "learning_rate": 3.5084866667242644e-08, "loss": 0.3975, "step": 3898 }, { "epoch": 2.8981665014866205, "grad_norm": 0.32914918661117554, "learning_rate": 3.45751370968328e-08, "loss": 0.4252, "step": 3899 }, { "epoch": 2.898909811694747, "grad_norm": 0.27998411655426025, "learning_rate": 3.406912462120271e-08, "loss": 0.3195, "step": 3900 }, { "epoch": 2.8996531219028743, "grad_norm": 0.361803263425827, "learning_rate": 3.3566829619149923e-08, "loss": 0.4351, "step": 3901 }, { "epoch": 2.900396432111001, "grad_norm": 0.3502798080444336, "learning_rate": 3.3068252466691433e-08, "loss": 0.3626, "step": 3902 }, { "epoch": 2.9011397423191276, "grad_norm": 0.34311333298683167, "learning_rate": 3.257339353705924e-08, "loss": 0.3887, "step": 3903 }, { "epoch": 2.9018830525272548, "grad_norm": 0.3602387011051178, "learning_rate": 3.208225320070313e-08, "loss": 0.4154, "step": 3904 }, { "epoch": 2.9026263627353814, "grad_norm": 0.3194058835506439, "learning_rate": 3.159483182528733e-08, "loss": 0.3869, "step": 3905 }, { "epoch": 2.9033696729435086, "grad_norm": 0.33995321393013, "learning_rate": 3.1111129775694414e-08, "loss": 0.3888, "step": 3906 }, { "epoch": 2.9041129831516352, "grad_norm": 0.3534347712993622, "learning_rate": 3.0631147414020846e-08, "loss": 0.3892, "step": 3907 }, { "epoch": 2.904856293359762, "grad_norm": 0.31231117248535156, "learning_rate": 3.0154885099579755e-08, "loss": 0.3741, "step": 3908 }, { "epoch": 2.905599603567889, "grad_norm": 0.33542418479919434, "learning_rate": 2.968234318889873e-08, "loss": 0.3682, "step": 3909 }, { "epoch": 2.9063429137760157, "grad_norm": 0.3046053349971771, "learning_rate": 2.921352203572092e-08, "loss": 0.3686, "step": 3910 }, { "epoch": 2.907086223984143, "grad_norm": 0.3303387463092804, "learning_rate": 2.874842199100336e-08, "loss": 0.3813, "step": 3911 }, { "epoch": 2.9078295341922695, "grad_norm": 0.296303927898407, "learning_rate": 2.8287043402917546e-08, "loss": 0.3397, "step": 3912 }, { "epoch": 2.908572844400396, "grad_norm": 0.36864811182022095, "learning_rate": 2.7829386616850527e-08, "loss": 0.453, "step": 3913 }, { "epoch": 2.9093161546085233, "grad_norm": 0.32720261812210083, "learning_rate": 2.737545197540048e-08, "loss": 0.3862, "step": 3914 }, { "epoch": 2.91005946481665, "grad_norm": 0.3118422031402588, "learning_rate": 2.6925239818382244e-08, "loss": 0.3542, "step": 3915 }, { "epoch": 2.910802775024777, "grad_norm": 0.34275442361831665, "learning_rate": 2.6478750482822334e-08, "loss": 0.3889, "step": 3916 }, { "epoch": 2.911546085232904, "grad_norm": 0.3177582621574402, "learning_rate": 2.603598430296006e-08, "loss": 0.3866, "step": 3917 }, { "epoch": 2.9122893954410305, "grad_norm": 0.3256147801876068, "learning_rate": 2.5596941610248614e-08, "loss": 0.3933, "step": 3918 }, { "epoch": 2.9130327056491576, "grad_norm": 0.3116803765296936, "learning_rate": 2.516162273335343e-08, "loss": 0.3788, "step": 3919 }, { "epoch": 2.9137760158572843, "grad_norm": 0.33887848258018494, "learning_rate": 2.473002799815216e-08, "loss": 0.4112, "step": 3920 }, { "epoch": 2.9145193260654114, "grad_norm": 0.32306045293807983, "learning_rate": 2.4302157727735254e-08, "loss": 0.36, "step": 3921 }, { "epoch": 2.915262636273538, "grad_norm": 0.31518062949180603, "learning_rate": 2.387801224240316e-08, "loss": 0.3593, "step": 3922 }, { "epoch": 2.916005946481665, "grad_norm": 0.322780579328537, "learning_rate": 2.345759185967078e-08, "loss": 0.3675, "step": 3923 }, { "epoch": 2.916749256689792, "grad_norm": 0.31857386231422424, "learning_rate": 2.304089689426192e-08, "loss": 0.3914, "step": 3924 }, { "epoch": 2.9174925668979186, "grad_norm": 0.33132946491241455, "learning_rate": 2.2627927658112614e-08, "loss": 0.3727, "step": 3925 }, { "epoch": 2.9182358771060457, "grad_norm": 0.354278028011322, "learning_rate": 2.2218684460370567e-08, "loss": 0.3794, "step": 3926 }, { "epoch": 2.9189791873141724, "grad_norm": 0.3158569633960724, "learning_rate": 2.1813167607392383e-08, "loss": 0.4066, "step": 3927 }, { "epoch": 2.919722497522299, "grad_norm": 0.3452337384223938, "learning_rate": 2.14113774027469e-08, "loss": 0.3827, "step": 3928 }, { "epoch": 2.920465807730426, "grad_norm": 0.3006749153137207, "learning_rate": 2.1013314147212416e-08, "loss": 0.3686, "step": 3929 }, { "epoch": 2.921209117938553, "grad_norm": 0.3352106213569641, "learning_rate": 2.0618978138776668e-08, "loss": 0.4401, "step": 3930 }, { "epoch": 2.92195242814668, "grad_norm": 0.31450480222702026, "learning_rate": 2.0228369672638524e-08, "loss": 0.3575, "step": 3931 }, { "epoch": 2.9226957383548067, "grad_norm": 0.33034759759902954, "learning_rate": 1.9841489041205197e-08, "loss": 0.4018, "step": 3932 }, { "epoch": 2.9234390485629334, "grad_norm": 0.3233030140399933, "learning_rate": 1.9458336534094457e-08, "loss": 0.3643, "step": 3933 }, { "epoch": 2.9241823587710605, "grad_norm": 0.33880332112312317, "learning_rate": 1.907891243813187e-08, "loss": 0.3734, "step": 3934 }, { "epoch": 2.924925668979187, "grad_norm": 0.35557013750076294, "learning_rate": 1.8703217037353007e-08, "loss": 0.4006, "step": 3935 }, { "epoch": 2.9256689791873143, "grad_norm": 0.3385699987411499, "learning_rate": 1.833125061300123e-08, "loss": 0.3332, "step": 3936 }, { "epoch": 2.926412289395441, "grad_norm": 0.3399967551231384, "learning_rate": 1.7963013443529908e-08, "loss": 0.3703, "step": 3937 }, { "epoch": 2.9271555996035676, "grad_norm": 0.3091278076171875, "learning_rate": 1.7598505804599097e-08, "loss": 0.371, "step": 3938 }, { "epoch": 2.9278989098116948, "grad_norm": 0.30064159631729126, "learning_rate": 1.723772796907719e-08, "loss": 0.368, "step": 3939 }, { "epoch": 2.9286422200198214, "grad_norm": 0.31195610761642456, "learning_rate": 1.688068020704148e-08, "loss": 0.3666, "step": 3940 }, { "epoch": 2.9293855302279486, "grad_norm": 0.32089805603027344, "learning_rate": 1.652736278577649e-08, "loss": 0.384, "step": 3941 }, { "epoch": 2.9301288404360752, "grad_norm": 0.35328733921051025, "learning_rate": 1.6177775969773436e-08, "loss": 0.3965, "step": 3942 }, { "epoch": 2.930872150644202, "grad_norm": 0.33329328894615173, "learning_rate": 1.5831920020731863e-08, "loss": 0.3741, "step": 3943 }, { "epoch": 2.931615460852329, "grad_norm": 0.31525665521621704, "learning_rate": 1.5489795197558e-08, "loss": 0.3182, "step": 3944 }, { "epoch": 2.932358771060456, "grad_norm": 0.360193133354187, "learning_rate": 1.5151401756364205e-08, "loss": 0.4201, "step": 3945 }, { "epoch": 2.933102081268583, "grad_norm": 0.335430771112442, "learning_rate": 1.4816739950471726e-08, "loss": 0.348, "step": 3946 }, { "epoch": 2.9338453914767095, "grad_norm": 0.3451646566390991, "learning_rate": 1.4485810030405722e-08, "loss": 0.3758, "step": 3947 }, { "epoch": 2.934588701684836, "grad_norm": 0.34506693482398987, "learning_rate": 1.4158612243898584e-08, "loss": 0.3988, "step": 3948 }, { "epoch": 2.9353320118929633, "grad_norm": 0.30873459577560425, "learning_rate": 1.383514683589049e-08, "loss": 0.3508, "step": 3949 }, { "epoch": 2.9360753221010905, "grad_norm": 0.32461604475975037, "learning_rate": 1.3515414048524966e-08, "loss": 0.4051, "step": 3950 }, { "epoch": 2.936818632309217, "grad_norm": 0.3178607225418091, "learning_rate": 1.3199414121153332e-08, "loss": 0.386, "step": 3951 }, { "epoch": 2.937561942517344, "grad_norm": 0.3267558217048645, "learning_rate": 1.288714729033136e-08, "loss": 0.4239, "step": 3952 }, { "epoch": 2.9383052527254705, "grad_norm": 0.3246278464794159, "learning_rate": 1.2578613789820393e-08, "loss": 0.3711, "step": 3953 }, { "epoch": 2.9390485629335976, "grad_norm": 0.3564843237400055, "learning_rate": 1.2273813850587347e-08, "loss": 0.3785, "step": 3954 }, { "epoch": 2.9397918731417247, "grad_norm": 0.331054151058197, "learning_rate": 1.1972747700804143e-08, "loss": 0.3731, "step": 3955 }, { "epoch": 2.9405351833498514, "grad_norm": 0.34053197503089905, "learning_rate": 1.167541556584828e-08, "loss": 0.431, "step": 3956 }, { "epoch": 2.941278493557978, "grad_norm": 0.33771219849586487, "learning_rate": 1.1381817668299488e-08, "loss": 0.3977, "step": 3957 }, { "epoch": 2.942021803766105, "grad_norm": 0.32670357823371887, "learning_rate": 1.1091954227945845e-08, "loss": 0.3896, "step": 3958 }, { "epoch": 2.942765113974232, "grad_norm": 0.32699909806251526, "learning_rate": 1.0805825461776553e-08, "loss": 0.342, "step": 3959 }, { "epoch": 2.943508424182359, "grad_norm": 0.354258269071579, "learning_rate": 1.0523431583986388e-08, "loss": 0.4154, "step": 3960 }, { "epoch": 2.9442517343904857, "grad_norm": 0.32147496938705444, "learning_rate": 1.0244772805975135e-08, "loss": 0.3914, "step": 3961 }, { "epoch": 2.9449950445986124, "grad_norm": 0.38417384028434753, "learning_rate": 9.969849336344262e-09, "loss": 0.4147, "step": 3962 }, { "epoch": 2.9457383548067395, "grad_norm": 0.32292479276657104, "learning_rate": 9.69866138090081e-09, "loss": 0.3437, "step": 3963 }, { "epoch": 2.946481665014866, "grad_norm": 0.32637450098991394, "learning_rate": 9.431209142654606e-09, "loss": 0.3608, "step": 3964 }, { "epoch": 2.9472249752229933, "grad_norm": 0.3294259011745453, "learning_rate": 9.167492821819946e-09, "loss": 0.3869, "step": 3965 }, { "epoch": 2.94796828543112, "grad_norm": 0.37152087688446045, "learning_rate": 8.907512615812796e-09, "loss": 0.3534, "step": 3966 }, { "epoch": 2.9487115956392467, "grad_norm": 0.3418882489204407, "learning_rate": 8.651268719253592e-09, "loss": 0.3365, "step": 3967 }, { "epoch": 2.949454905847374, "grad_norm": 0.3542250096797943, "learning_rate": 8.398761323964999e-09, "loss": 0.3827, "step": 3968 }, { "epoch": 2.9501982160555005, "grad_norm": 0.3431815207004547, "learning_rate": 8.149990618973591e-09, "loss": 0.3677, "step": 3969 }, { "epoch": 2.9509415262636276, "grad_norm": 0.35551390051841736, "learning_rate": 7.904956790507622e-09, "loss": 0.3905, "step": 3970 }, { "epoch": 2.9516848364717543, "grad_norm": 0.36599722504615784, "learning_rate": 7.663660021998697e-09, "loss": 0.4344, "step": 3971 }, { "epoch": 2.952428146679881, "grad_norm": 0.2974739074707031, "learning_rate": 7.426100494079547e-09, "loss": 0.3308, "step": 3972 }, { "epoch": 2.953171456888008, "grad_norm": 0.32999420166015625, "learning_rate": 7.192278384586804e-09, "loss": 0.3747, "step": 3973 }, { "epoch": 2.9539147670961348, "grad_norm": 0.36444300413131714, "learning_rate": 6.962193868558787e-09, "loss": 0.4292, "step": 3974 }, { "epoch": 2.954658077304262, "grad_norm": 0.3331241011619568, "learning_rate": 6.735847118235494e-09, "loss": 0.3734, "step": 3975 }, { "epoch": 2.9554013875123886, "grad_norm": 0.34773963689804077, "learning_rate": 6.513238303059166e-09, "loss": 0.3916, "step": 3976 }, { "epoch": 2.9561446977205152, "grad_norm": 0.3057137429714203, "learning_rate": 6.294367589673722e-09, "loss": 0.3785, "step": 3977 }, { "epoch": 2.9568880079286424, "grad_norm": 0.32134121656417847, "learning_rate": 6.079235141924767e-09, "loss": 0.3897, "step": 3978 }, { "epoch": 2.957631318136769, "grad_norm": 0.3451423645019531, "learning_rate": 5.867841120859586e-09, "loss": 0.3944, "step": 3979 }, { "epoch": 2.958374628344896, "grad_norm": 0.3177144527435303, "learning_rate": 5.6601856847265934e-09, "loss": 0.3505, "step": 3980 }, { "epoch": 2.959117938553023, "grad_norm": 0.3522380590438843, "learning_rate": 5.456268988975888e-09, "loss": 0.4014, "step": 3981 }, { "epoch": 2.9598612487611495, "grad_norm": 0.3332817554473877, "learning_rate": 5.256091186258694e-09, "loss": 0.3543, "step": 3982 }, { "epoch": 2.9606045589692767, "grad_norm": 0.3365298807621002, "learning_rate": 5.059652426427364e-09, "loss": 0.3569, "step": 3983 }, { "epoch": 2.9613478691774033, "grad_norm": 0.32865962386131287, "learning_rate": 4.866952856534268e-09, "loss": 0.4046, "step": 3984 }, { "epoch": 2.9620911793855305, "grad_norm": 0.3409794270992279, "learning_rate": 4.677992620834015e-09, "loss": 0.416, "step": 3985 }, { "epoch": 2.962834489593657, "grad_norm": 0.3397228717803955, "learning_rate": 4.492771860781786e-09, "loss": 0.3615, "step": 3986 }, { "epoch": 2.963577799801784, "grad_norm": 0.33594265580177307, "learning_rate": 4.311290715032224e-09, "loss": 0.4149, "step": 3987 }, { "epoch": 2.964321110009911, "grad_norm": 0.3282493054866791, "learning_rate": 4.133549319441655e-09, "loss": 0.3797, "step": 3988 }, { "epoch": 2.9650644202180376, "grad_norm": 0.28026282787323, "learning_rate": 3.959547807066421e-09, "loss": 0.3303, "step": 3989 }, { "epoch": 2.9658077304261647, "grad_norm": 0.3257806599140167, "learning_rate": 3.7892863081634425e-09, "loss": 0.39, "step": 3990 }, { "epoch": 2.9665510406342914, "grad_norm": 0.3304937183856964, "learning_rate": 3.62276495018965e-09, "loss": 0.3985, "step": 3991 }, { "epoch": 2.967294350842418, "grad_norm": 0.325308233499527, "learning_rate": 3.4599838578025514e-09, "loss": 0.3552, "step": 3992 }, { "epoch": 2.9680376610505452, "grad_norm": 0.316763311624527, "learning_rate": 3.300943152858005e-09, "loss": 0.3132, "step": 3993 }, { "epoch": 2.968780971258672, "grad_norm": 0.3349706828594208, "learning_rate": 3.1456429544146625e-09, "loss": 0.3692, "step": 3994 }, { "epoch": 2.969524281466799, "grad_norm": 0.29476526379585266, "learning_rate": 2.9940833787289735e-09, "loss": 0.353, "step": 3995 }, { "epoch": 2.9702675916749257, "grad_norm": 0.34985947608947754, "learning_rate": 2.846264539257959e-09, "loss": 0.3691, "step": 3996 }, { "epoch": 2.9710109018830524, "grad_norm": 0.3540409803390503, "learning_rate": 2.702186546657548e-09, "loss": 0.3812, "step": 3997 }, { "epoch": 2.9717542120911795, "grad_norm": 0.3309096097946167, "learning_rate": 2.561849508784242e-09, "loss": 0.3787, "step": 3998 }, { "epoch": 2.972497522299306, "grad_norm": 0.36047130823135376, "learning_rate": 2.4252535306940052e-09, "loss": 0.4312, "step": 3999 }, { "epoch": 2.9732408325074333, "grad_norm": 0.3568691909313202, "learning_rate": 2.292398714641708e-09, "loss": 0.3877, "step": 4000 }, { "epoch": 2.97398414271556, "grad_norm": 0.3315553665161133, "learning_rate": 2.163285160081685e-09, "loss": 0.3397, "step": 4001 }, { "epoch": 2.9747274529236867, "grad_norm": 0.3460598886013031, "learning_rate": 2.037912963668287e-09, "loss": 0.3989, "step": 4002 }, { "epoch": 2.975470763131814, "grad_norm": 0.3225550949573517, "learning_rate": 1.9162822192542176e-09, "loss": 0.3415, "step": 4003 }, { "epoch": 2.9762140733399405, "grad_norm": 0.3312007784843445, "learning_rate": 1.798393017891642e-09, "loss": 0.4227, "step": 4004 }, { "epoch": 2.9769573835480676, "grad_norm": 0.29484501481056213, "learning_rate": 1.684245447831634e-09, "loss": 0.4047, "step": 4005 }, { "epoch": 2.9777006937561943, "grad_norm": 0.3633817732334137, "learning_rate": 1.5738395945252837e-09, "loss": 0.4083, "step": 4006 }, { "epoch": 2.978444003964321, "grad_norm": 0.33251953125, "learning_rate": 1.4671755406214793e-09, "loss": 0.3827, "step": 4007 }, { "epoch": 2.979187314172448, "grad_norm": 0.3510497808456421, "learning_rate": 1.364253365968571e-09, "loss": 0.4171, "step": 4008 }, { "epoch": 2.9799306243805748, "grad_norm": 0.3101026713848114, "learning_rate": 1.2650731476138156e-09, "loss": 0.3836, "step": 4009 }, { "epoch": 2.980673934588702, "grad_norm": 0.37845084071159363, "learning_rate": 1.1696349598022683e-09, "loss": 0.4262, "step": 4010 }, { "epoch": 2.9814172447968286, "grad_norm": 0.3199447691440582, "learning_rate": 1.077938873979556e-09, "loss": 0.3863, "step": 4011 }, { "epoch": 2.9821605550049552, "grad_norm": 0.30187898874282837, "learning_rate": 9.89984958788548e-10, "loss": 0.346, "step": 4012 }, { "epoch": 2.9829038652130824, "grad_norm": 0.33985385298728943, "learning_rate": 9.057732800710206e-10, "loss": 0.409, "step": 4013 }, { "epoch": 2.983647175421209, "grad_norm": 0.33320972323417664, "learning_rate": 8.253039008676578e-10, "loss": 0.359, "step": 4014 }, { "epoch": 2.984390485629336, "grad_norm": 0.3586447238922119, "learning_rate": 7.485768814169403e-10, "loss": 0.3958, "step": 4015 }, { "epoch": 2.985133795837463, "grad_norm": 0.30936911702156067, "learning_rate": 6.755922791573666e-10, "loss": 0.3369, "step": 4016 }, { "epoch": 2.9858771060455895, "grad_norm": 0.3223113715648651, "learning_rate": 6.063501487235668e-10, "loss": 0.369, "step": 4017 }, { "epoch": 2.9866204162537167, "grad_norm": 0.3352705240249634, "learning_rate": 5.408505419512988e-10, "loss": 0.3916, "step": 4018 }, { "epoch": 2.9873637264618433, "grad_norm": 0.3659493923187256, "learning_rate": 4.790935078724523e-10, "loss": 0.3956, "step": 4019 }, { "epoch": 2.9881070366699705, "grad_norm": 0.33965685963630676, "learning_rate": 4.2107909271837945e-10, "loss": 0.3742, "step": 4020 }, { "epoch": 2.988850346878097, "grad_norm": 0.3507288992404938, "learning_rate": 3.6680733991767416e-10, "loss": 0.437, "step": 4021 }, { "epoch": 2.989593657086224, "grad_norm": 0.27467167377471924, "learning_rate": 3.1627829009894807e-10, "loss": 0.3609, "step": 4022 }, { "epoch": 2.990336967294351, "grad_norm": 0.35095301270484924, "learning_rate": 2.694919810874996e-10, "loss": 0.3891, "step": 4023 }, { "epoch": 2.9910802775024776, "grad_norm": 0.3439078629016876, "learning_rate": 2.264484479075346e-10, "loss": 0.3838, "step": 4024 }, { "epoch": 2.9918235877106047, "grad_norm": 0.3508610427379608, "learning_rate": 1.8714772278105587e-10, "loss": 0.3834, "step": 4025 }, { "epoch": 2.9925668979187314, "grad_norm": 0.3232470750808716, "learning_rate": 1.5158983512897351e-10, "loss": 0.381, "step": 4026 }, { "epoch": 2.993310208126858, "grad_norm": 0.3615477979183197, "learning_rate": 1.197748115688846e-10, "loss": 0.3557, "step": 4027 }, { "epoch": 2.9940535183349852, "grad_norm": 0.36571580171585083, "learning_rate": 9.170267591729343e-11, "loss": 0.3844, "step": 4028 }, { "epoch": 2.994796828543112, "grad_norm": 0.3420450687408447, "learning_rate": 6.737344919016674e-11, "loss": 0.3786, "step": 4029 }, { "epoch": 2.995540138751239, "grad_norm": 0.3430324196815491, "learning_rate": 4.678714959904796e-11, "loss": 0.3685, "step": 4030 }, { "epoch": 2.9962834489593657, "grad_norm": 0.33336225152015686, "learning_rate": 2.9943792555497955e-11, "loss": 0.3766, "step": 4031 }, { "epoch": 2.9970267591674924, "grad_norm": 0.338205486536026, "learning_rate": 1.6843390667764525e-11, "loss": 0.3847, "step": 4032 }, { "epoch": 2.9977700693756195, "grad_norm": 0.32111793756484985, "learning_rate": 7.485953743002761e-12, "loss": 0.3659, "step": 4033 }, { "epoch": 2.998513379583746, "grad_norm": 0.3077859878540039, "learning_rate": 1.871488786164832e-12, "loss": 0.339, "step": 4034 }, { "epoch": 2.9992566897918733, "grad_norm": 0.3281209468841553, "learning_rate": 0.0, "loss": 0.392, "step": 4035 }, { "epoch": 2.9992566897918733, "step": 4035, "total_flos": 4682984608595968.0, "train_loss": 0.4309883663350026, "train_runtime": 135360.2527, "train_samples_per_second": 2.862, "train_steps_per_second": 0.03 } ], "logging_steps": 1.0, "max_steps": 4035, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4682984608595968.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }