{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2063, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004847309743092584, "grad_norm": 7.171422004699707, "learning_rate": 2.4154589371980677e-07, "loss": 3.4607, "step": 1 }, { "epoch": 0.0009694619486185168, "grad_norm": 6.8233819007873535, "learning_rate": 4.830917874396135e-07, "loss": 3.2371, "step": 2 }, { "epoch": 0.001454192922927775, "grad_norm": 7.979361057281494, "learning_rate": 7.246376811594203e-07, "loss": 3.3669, "step": 3 }, { "epoch": 0.0019389238972370335, "grad_norm": 6.270462512969971, "learning_rate": 9.66183574879227e-07, "loss": 3.3027, "step": 4 }, { "epoch": 0.0024236548715462916, "grad_norm": 6.778450012207031, "learning_rate": 1.2077294685990338e-06, "loss": 3.2712, "step": 5 }, { "epoch": 0.00290838584585555, "grad_norm": 6.3815741539001465, "learning_rate": 1.4492753623188406e-06, "loss": 3.3691, "step": 6 }, { "epoch": 0.0033931168201648087, "grad_norm": 6.661952018737793, "learning_rate": 1.6908212560386474e-06, "loss": 3.3269, "step": 7 }, { "epoch": 0.003877847794474067, "grad_norm": 6.819033145904541, "learning_rate": 1.932367149758454e-06, "loss": 3.2467, "step": 8 }, { "epoch": 0.004362578768783325, "grad_norm": 5.535458564758301, "learning_rate": 2.173913043478261e-06, "loss": 3.1249, "step": 9 }, { "epoch": 0.004847309743092583, "grad_norm": 6.2597880363464355, "learning_rate": 2.4154589371980677e-06, "loss": 3.4761, "step": 10 }, { "epoch": 0.005332040717401842, "grad_norm": 6.067697525024414, "learning_rate": 2.6570048309178746e-06, "loss": 3.3821, "step": 11 }, { "epoch": 0.0058167716917111, "grad_norm": 5.567847728729248, "learning_rate": 2.898550724637681e-06, "loss": 3.0755, "step": 12 }, { "epoch": 0.006301502666020358, "grad_norm": 4.956480026245117, "learning_rate": 3.140096618357488e-06, "loss": 3.0035, "step": 13 }, { "epoch": 0.0067862336403296175, "grad_norm": 6.243023872375488, "learning_rate": 3.3816425120772947e-06, "loss": 3.1336, "step": 14 }, { "epoch": 0.007270964614638876, "grad_norm": 5.114922046661377, "learning_rate": 3.6231884057971017e-06, "loss": 2.9698, "step": 15 }, { "epoch": 0.007755695588948134, "grad_norm": 5.099619388580322, "learning_rate": 3.864734299516908e-06, "loss": 3.022, "step": 16 }, { "epoch": 0.008240426563257392, "grad_norm": 5.203329563140869, "learning_rate": 4.106280193236716e-06, "loss": 2.8866, "step": 17 }, { "epoch": 0.00872515753756665, "grad_norm": 4.828646659851074, "learning_rate": 4.347826086956522e-06, "loss": 2.7138, "step": 18 }, { "epoch": 0.009209888511875909, "grad_norm": 5.072122573852539, "learning_rate": 4.589371980676329e-06, "loss": 2.8717, "step": 19 }, { "epoch": 0.009694619486185167, "grad_norm": 4.990163326263428, "learning_rate": 4.830917874396135e-06, "loss": 2.8924, "step": 20 }, { "epoch": 0.010179350460494426, "grad_norm": 4.578018665313721, "learning_rate": 5.072463768115943e-06, "loss": 2.5511, "step": 21 }, { "epoch": 0.010664081434803683, "grad_norm": 4.484613418579102, "learning_rate": 5.314009661835749e-06, "loss": 2.6689, "step": 22 }, { "epoch": 0.011148812409112942, "grad_norm": 4.256691932678223, "learning_rate": 5.555555555555556e-06, "loss": 2.5432, "step": 23 }, { "epoch": 0.0116335433834222, "grad_norm": 4.374969005584717, "learning_rate": 5.797101449275362e-06, "loss": 2.6499, "step": 24 }, { "epoch": 0.012118274357731459, "grad_norm": 4.293645858764648, "learning_rate": 6.038647342995169e-06, "loss": 2.2692, "step": 25 }, { "epoch": 0.012603005332040717, "grad_norm": 3.59814190864563, "learning_rate": 6.280193236714976e-06, "loss": 2.3972, "step": 26 }, { "epoch": 0.013087736306349976, "grad_norm": 4.462075710296631, "learning_rate": 6.521739130434783e-06, "loss": 2.3534, "step": 27 }, { "epoch": 0.013572467280659235, "grad_norm": 3.837050437927246, "learning_rate": 6.7632850241545894e-06, "loss": 2.1863, "step": 28 }, { "epoch": 0.014057198254968492, "grad_norm": 3.4918296337127686, "learning_rate": 7.004830917874397e-06, "loss": 2.0585, "step": 29 }, { "epoch": 0.014541929229277752, "grad_norm": 3.0016872882843018, "learning_rate": 7.246376811594203e-06, "loss": 2.0722, "step": 30 }, { "epoch": 0.015026660203587009, "grad_norm": 3.585313320159912, "learning_rate": 7.48792270531401e-06, "loss": 2.2528, "step": 31 }, { "epoch": 0.015511391177896268, "grad_norm": 3.0016026496887207, "learning_rate": 7.729468599033817e-06, "loss": 1.9671, "step": 32 }, { "epoch": 0.015996122152205527, "grad_norm": 2.7799417972564697, "learning_rate": 7.971014492753623e-06, "loss": 2.1397, "step": 33 }, { "epoch": 0.016480853126514785, "grad_norm": 2.6177585124969482, "learning_rate": 8.212560386473431e-06, "loss": 2.0716, "step": 34 }, { "epoch": 0.016965584100824042, "grad_norm": 2.6502163410186768, "learning_rate": 8.454106280193238e-06, "loss": 2.0137, "step": 35 }, { "epoch": 0.0174503150751333, "grad_norm": 2.5228984355926514, "learning_rate": 8.695652173913044e-06, "loss": 1.7984, "step": 36 }, { "epoch": 0.01793504604944256, "grad_norm": 2.638493537902832, "learning_rate": 8.93719806763285e-06, "loss": 1.8365, "step": 37 }, { "epoch": 0.018419777023751818, "grad_norm": 2.337646007537842, "learning_rate": 9.178743961352658e-06, "loss": 1.8024, "step": 38 }, { "epoch": 0.018904507998061076, "grad_norm": 2.2700746059417725, "learning_rate": 9.420289855072464e-06, "loss": 1.8987, "step": 39 }, { "epoch": 0.019389238972370333, "grad_norm": 2.020162343978882, "learning_rate": 9.66183574879227e-06, "loss": 1.6793, "step": 40 }, { "epoch": 0.019873969946679594, "grad_norm": 2.015110492706299, "learning_rate": 9.903381642512077e-06, "loss": 1.8938, "step": 41 }, { "epoch": 0.02035870092098885, "grad_norm": 2.2413482666015625, "learning_rate": 1.0144927536231885e-05, "loss": 2.1364, "step": 42 }, { "epoch": 0.02084343189529811, "grad_norm": 1.931469202041626, "learning_rate": 1.0386473429951692e-05, "loss": 1.9931, "step": 43 }, { "epoch": 0.021328162869607366, "grad_norm": 1.8747423887252808, "learning_rate": 1.0628019323671499e-05, "loss": 1.6649, "step": 44 }, { "epoch": 0.021812893843916627, "grad_norm": 1.9672399759292603, "learning_rate": 1.0869565217391305e-05, "loss": 1.8262, "step": 45 }, { "epoch": 0.022297624818225885, "grad_norm": 1.9558823108673096, "learning_rate": 1.1111111111111112e-05, "loss": 1.8059, "step": 46 }, { "epoch": 0.022782355792535142, "grad_norm": 1.8525190353393555, "learning_rate": 1.1352657004830918e-05, "loss": 1.721, "step": 47 }, { "epoch": 0.0232670867668444, "grad_norm": 1.8605695962905884, "learning_rate": 1.1594202898550725e-05, "loss": 1.9044, "step": 48 }, { "epoch": 0.02375181774115366, "grad_norm": 2.524609088897705, "learning_rate": 1.1835748792270531e-05, "loss": 1.6106, "step": 49 }, { "epoch": 0.024236548715462918, "grad_norm": 1.8198078870773315, "learning_rate": 1.2077294685990338e-05, "loss": 1.822, "step": 50 }, { "epoch": 0.024721279689772176, "grad_norm": 1.7705700397491455, "learning_rate": 1.2318840579710146e-05, "loss": 1.6892, "step": 51 }, { "epoch": 0.025206010664081433, "grad_norm": 1.9049899578094482, "learning_rate": 1.2560386473429953e-05, "loss": 1.7649, "step": 52 }, { "epoch": 0.025690741638390694, "grad_norm": 1.835711121559143, "learning_rate": 1.2801932367149761e-05, "loss": 1.7394, "step": 53 }, { "epoch": 0.02617547261269995, "grad_norm": 1.6949882507324219, "learning_rate": 1.3043478260869566e-05, "loss": 1.6299, "step": 54 }, { "epoch": 0.02666020358700921, "grad_norm": 1.7638367414474487, "learning_rate": 1.3285024154589374e-05, "loss": 1.5448, "step": 55 }, { "epoch": 0.02714493456131847, "grad_norm": 2.0102837085723877, "learning_rate": 1.3526570048309179e-05, "loss": 1.9325, "step": 56 }, { "epoch": 0.027629665535627727, "grad_norm": 1.7918657064437866, "learning_rate": 1.3768115942028985e-05, "loss": 1.6411, "step": 57 }, { "epoch": 0.028114396509936985, "grad_norm": 2.3990306854248047, "learning_rate": 1.4009661835748794e-05, "loss": 1.8308, "step": 58 }, { "epoch": 0.028599127484246242, "grad_norm": 2.5733494758605957, "learning_rate": 1.4251207729468599e-05, "loss": 2.1079, "step": 59 }, { "epoch": 0.029083858458555503, "grad_norm": 1.8608803749084473, "learning_rate": 1.4492753623188407e-05, "loss": 1.9755, "step": 60 }, { "epoch": 0.02956858943286476, "grad_norm": 1.7092028856277466, "learning_rate": 1.4734299516908212e-05, "loss": 1.6123, "step": 61 }, { "epoch": 0.030053320407174018, "grad_norm": 1.6902247667312622, "learning_rate": 1.497584541062802e-05, "loss": 1.6213, "step": 62 }, { "epoch": 0.030538051381483276, "grad_norm": 1.5993880033493042, "learning_rate": 1.5217391304347828e-05, "loss": 1.5819, "step": 63 }, { "epoch": 0.031022782355792537, "grad_norm": 1.6327762603759766, "learning_rate": 1.5458937198067633e-05, "loss": 1.6991, "step": 64 }, { "epoch": 0.031507513330101794, "grad_norm": 1.6658787727355957, "learning_rate": 1.570048309178744e-05, "loss": 1.9614, "step": 65 }, { "epoch": 0.031992244304411055, "grad_norm": 1.5906341075897217, "learning_rate": 1.5942028985507246e-05, "loss": 1.5806, "step": 66 }, { "epoch": 0.03247697527872031, "grad_norm": 1.7006059885025024, "learning_rate": 1.6183574879227054e-05, "loss": 2.1204, "step": 67 }, { "epoch": 0.03296170625302957, "grad_norm": 1.8104106187820435, "learning_rate": 1.6425120772946863e-05, "loss": 1.7377, "step": 68 }, { "epoch": 0.033446437227338824, "grad_norm": 1.7204387187957764, "learning_rate": 1.6666666666666667e-05, "loss": 1.6536, "step": 69 }, { "epoch": 0.033931168201648085, "grad_norm": 1.77187979221344, "learning_rate": 1.6908212560386476e-05, "loss": 1.7974, "step": 70 }, { "epoch": 0.034415899175957346, "grad_norm": 1.7312054634094238, "learning_rate": 1.714975845410628e-05, "loss": 1.6036, "step": 71 }, { "epoch": 0.0349006301502666, "grad_norm": 1.7339930534362793, "learning_rate": 1.739130434782609e-05, "loss": 1.532, "step": 72 }, { "epoch": 0.03538536112457586, "grad_norm": 1.6795563697814941, "learning_rate": 1.7632850241545894e-05, "loss": 1.7439, "step": 73 }, { "epoch": 0.03587009209888512, "grad_norm": 1.6724114418029785, "learning_rate": 1.78743961352657e-05, "loss": 1.7637, "step": 74 }, { "epoch": 0.036354823073194376, "grad_norm": 1.5222914218902588, "learning_rate": 1.8115942028985507e-05, "loss": 1.4971, "step": 75 }, { "epoch": 0.036839554047503636, "grad_norm": 1.6842753887176514, "learning_rate": 1.8357487922705315e-05, "loss": 1.525, "step": 76 }, { "epoch": 0.03732428502181289, "grad_norm": 1.6687383651733398, "learning_rate": 1.859903381642512e-05, "loss": 1.9496, "step": 77 }, { "epoch": 0.03780901599612215, "grad_norm": 1.8582350015640259, "learning_rate": 1.8840579710144928e-05, "loss": 1.733, "step": 78 }, { "epoch": 0.03829374697043141, "grad_norm": 1.554795265197754, "learning_rate": 1.9082125603864733e-05, "loss": 1.492, "step": 79 }, { "epoch": 0.038778477944740666, "grad_norm": 1.7551424503326416, "learning_rate": 1.932367149758454e-05, "loss": 1.898, "step": 80 }, { "epoch": 0.03926320891904993, "grad_norm": 1.9943029880523682, "learning_rate": 1.956521739130435e-05, "loss": 1.6975, "step": 81 }, { "epoch": 0.03974793989335919, "grad_norm": 1.6358810663223267, "learning_rate": 1.9806763285024154e-05, "loss": 1.4857, "step": 82 }, { "epoch": 0.04023267086766844, "grad_norm": 1.7312710285186768, "learning_rate": 2.0048309178743963e-05, "loss": 1.5569, "step": 83 }, { "epoch": 0.0407174018419777, "grad_norm": 1.6241320371627808, "learning_rate": 2.028985507246377e-05, "loss": 1.734, "step": 84 }, { "epoch": 0.041202132816286964, "grad_norm": 1.6983885765075684, "learning_rate": 2.0531400966183576e-05, "loss": 1.786, "step": 85 }, { "epoch": 0.04168686379059622, "grad_norm": 1.615333080291748, "learning_rate": 2.0772946859903384e-05, "loss": 1.8196, "step": 86 }, { "epoch": 0.04217159476490548, "grad_norm": 1.6279438734054565, "learning_rate": 2.101449275362319e-05, "loss": 1.5445, "step": 87 }, { "epoch": 0.04265632573921473, "grad_norm": 1.582491397857666, "learning_rate": 2.1256038647342997e-05, "loss": 1.456, "step": 88 }, { "epoch": 0.043141056713523994, "grad_norm": 1.701452612876892, "learning_rate": 2.1497584541062805e-05, "loss": 1.8605, "step": 89 }, { "epoch": 0.043625787687833255, "grad_norm": 1.845126986503601, "learning_rate": 2.173913043478261e-05, "loss": 1.5401, "step": 90 }, { "epoch": 0.04411051866214251, "grad_norm": 1.6799591779708862, "learning_rate": 2.198067632850242e-05, "loss": 1.7081, "step": 91 }, { "epoch": 0.04459524963645177, "grad_norm": 1.7880574464797974, "learning_rate": 2.2222222222222223e-05, "loss": 1.5429, "step": 92 }, { "epoch": 0.04507998061076103, "grad_norm": 1.625308632850647, "learning_rate": 2.246376811594203e-05, "loss": 1.5608, "step": 93 }, { "epoch": 0.045564711585070285, "grad_norm": 1.8478410243988037, "learning_rate": 2.2705314009661836e-05, "loss": 2.0223, "step": 94 }, { "epoch": 0.046049442559379546, "grad_norm": 1.5162346363067627, "learning_rate": 2.294685990338164e-05, "loss": 1.2781, "step": 95 }, { "epoch": 0.0465341735336888, "grad_norm": 1.854674220085144, "learning_rate": 2.318840579710145e-05, "loss": 1.6944, "step": 96 }, { "epoch": 0.04701890450799806, "grad_norm": 1.7069923877716064, "learning_rate": 2.3429951690821258e-05, "loss": 1.6425, "step": 97 }, { "epoch": 0.04750363548230732, "grad_norm": 1.5543208122253418, "learning_rate": 2.3671497584541063e-05, "loss": 1.4902, "step": 98 }, { "epoch": 0.047988366456616575, "grad_norm": 1.5168403387069702, "learning_rate": 2.391304347826087e-05, "loss": 1.3613, "step": 99 }, { "epoch": 0.048473097430925836, "grad_norm": 1.7112879753112793, "learning_rate": 2.4154589371980676e-05, "loss": 1.9106, "step": 100 }, { "epoch": 0.0489578284052351, "grad_norm": 1.8067775964736938, "learning_rate": 2.4396135265700484e-05, "loss": 1.8777, "step": 101 }, { "epoch": 0.04944255937954435, "grad_norm": 1.528732419013977, "learning_rate": 2.4637681159420292e-05, "loss": 1.4165, "step": 102 }, { "epoch": 0.04992729035385361, "grad_norm": 1.5768786668777466, "learning_rate": 2.4879227053140097e-05, "loss": 1.5526, "step": 103 }, { "epoch": 0.050412021328162866, "grad_norm": 1.665514349937439, "learning_rate": 2.5120772946859905e-05, "loss": 1.5298, "step": 104 }, { "epoch": 0.05089675230247213, "grad_norm": 1.59950852394104, "learning_rate": 2.5362318840579714e-05, "loss": 1.6167, "step": 105 }, { "epoch": 0.05138148327678139, "grad_norm": 1.6841107606887817, "learning_rate": 2.5603864734299522e-05, "loss": 1.6686, "step": 106 }, { "epoch": 0.05186621425109064, "grad_norm": 1.6083794832229614, "learning_rate": 2.5845410628019323e-05, "loss": 1.3828, "step": 107 }, { "epoch": 0.0523509452253999, "grad_norm": 1.6372400522232056, "learning_rate": 2.608695652173913e-05, "loss": 1.7482, "step": 108 }, { "epoch": 0.052835676199709164, "grad_norm": 1.7453114986419678, "learning_rate": 2.632850241545894e-05, "loss": 1.769, "step": 109 }, { "epoch": 0.05332040717401842, "grad_norm": 1.6545413732528687, "learning_rate": 2.6570048309178748e-05, "loss": 2.123, "step": 110 }, { "epoch": 0.05380513814832768, "grad_norm": 1.8229496479034424, "learning_rate": 2.6811594202898553e-05, "loss": 1.9621, "step": 111 }, { "epoch": 0.05428986912263694, "grad_norm": 1.640529990196228, "learning_rate": 2.7053140096618358e-05, "loss": 1.8864, "step": 112 }, { "epoch": 0.054774600096946194, "grad_norm": 1.4900970458984375, "learning_rate": 2.7294685990338166e-05, "loss": 1.9581, "step": 113 }, { "epoch": 0.055259331071255455, "grad_norm": 1.60316002368927, "learning_rate": 2.753623188405797e-05, "loss": 1.4996, "step": 114 }, { "epoch": 0.05574406204556471, "grad_norm": 1.8271881341934204, "learning_rate": 2.777777777777778e-05, "loss": 1.8359, "step": 115 }, { "epoch": 0.05622879301987397, "grad_norm": 1.6188838481903076, "learning_rate": 2.8019323671497587e-05, "loss": 1.778, "step": 116 }, { "epoch": 0.05671352399418323, "grad_norm": 1.672446608543396, "learning_rate": 2.826086956521739e-05, "loss": 1.9118, "step": 117 }, { "epoch": 0.057198254968492485, "grad_norm": 1.725005030632019, "learning_rate": 2.8502415458937197e-05, "loss": 1.4782, "step": 118 }, { "epoch": 0.057682985942801746, "grad_norm": 1.6945290565490723, "learning_rate": 2.8743961352657005e-05, "loss": 1.4347, "step": 119 }, { "epoch": 0.058167716917111006, "grad_norm": 1.6939204931259155, "learning_rate": 2.8985507246376814e-05, "loss": 1.6457, "step": 120 }, { "epoch": 0.05865244789142026, "grad_norm": 1.7131168842315674, "learning_rate": 2.9227053140096622e-05, "loss": 1.8858, "step": 121 }, { "epoch": 0.05913717886572952, "grad_norm": 1.8440873622894287, "learning_rate": 2.9468599033816423e-05, "loss": 2.0418, "step": 122 }, { "epoch": 0.059621909840038775, "grad_norm": 1.6105564832687378, "learning_rate": 2.971014492753623e-05, "loss": 1.5901, "step": 123 }, { "epoch": 0.060106640814348036, "grad_norm": 1.7111328840255737, "learning_rate": 2.995169082125604e-05, "loss": 1.6965, "step": 124 }, { "epoch": 0.0605913717886573, "grad_norm": 1.6334936618804932, "learning_rate": 3.0193236714975848e-05, "loss": 1.7441, "step": 125 }, { "epoch": 0.06107610276296655, "grad_norm": 1.6838350296020508, "learning_rate": 3.0434782608695656e-05, "loss": 1.7894, "step": 126 }, { "epoch": 0.06156083373727581, "grad_norm": 1.6963026523590088, "learning_rate": 3.067632850241546e-05, "loss": 1.8447, "step": 127 }, { "epoch": 0.06204556471158507, "grad_norm": 1.6151043176651, "learning_rate": 3.0917874396135266e-05, "loss": 1.6308, "step": 128 }, { "epoch": 0.06253029568589433, "grad_norm": 1.802123785018921, "learning_rate": 3.1159420289855074e-05, "loss": 1.5931, "step": 129 }, { "epoch": 0.06301502666020359, "grad_norm": 9.797577857971191, "learning_rate": 3.140096618357488e-05, "loss": 1.4618, "step": 130 }, { "epoch": 0.06349975763451285, "grad_norm": 1.555799126625061, "learning_rate": 3.164251207729469e-05, "loss": 1.4462, "step": 131 }, { "epoch": 0.06398448860882211, "grad_norm": 1.631108283996582, "learning_rate": 3.188405797101449e-05, "loss": 1.6234, "step": 132 }, { "epoch": 0.06446921958313136, "grad_norm": 1.6092970371246338, "learning_rate": 3.21256038647343e-05, "loss": 1.6422, "step": 133 }, { "epoch": 0.06495395055744062, "grad_norm": 1.667833685874939, "learning_rate": 3.236714975845411e-05, "loss": 1.6484, "step": 134 }, { "epoch": 0.06543868153174988, "grad_norm": 1.7068734169006348, "learning_rate": 3.260869565217392e-05, "loss": 1.4214, "step": 135 }, { "epoch": 0.06592341250605914, "grad_norm": 1.5251973867416382, "learning_rate": 3.2850241545893725e-05, "loss": 1.5508, "step": 136 }, { "epoch": 0.0664081434803684, "grad_norm": 1.6476942300796509, "learning_rate": 3.3091787439613533e-05, "loss": 1.49, "step": 137 }, { "epoch": 0.06689287445467765, "grad_norm": 1.7182284593582153, "learning_rate": 3.3333333333333335e-05, "loss": 1.6699, "step": 138 }, { "epoch": 0.06737760542898691, "grad_norm": 1.5735374689102173, "learning_rate": 3.357487922705314e-05, "loss": 1.6516, "step": 139 }, { "epoch": 0.06786233640329617, "grad_norm": 1.6058720350265503, "learning_rate": 3.381642512077295e-05, "loss": 1.6413, "step": 140 }, { "epoch": 0.06834706737760543, "grad_norm": 1.596596360206604, "learning_rate": 3.405797101449276e-05, "loss": 1.5226, "step": 141 }, { "epoch": 0.06883179835191469, "grad_norm": 1.7365012168884277, "learning_rate": 3.429951690821256e-05, "loss": 1.9471, "step": 142 }, { "epoch": 0.06931652932622395, "grad_norm": 1.566796064376831, "learning_rate": 3.454106280193237e-05, "loss": 1.6039, "step": 143 }, { "epoch": 0.0698012603005332, "grad_norm": 2.105440616607666, "learning_rate": 3.478260869565218e-05, "loss": 1.8893, "step": 144 }, { "epoch": 0.07028599127484246, "grad_norm": 1.701600193977356, "learning_rate": 3.502415458937198e-05, "loss": 1.8973, "step": 145 }, { "epoch": 0.07077072224915172, "grad_norm": 1.6586337089538574, "learning_rate": 3.526570048309179e-05, "loss": 1.6766, "step": 146 }, { "epoch": 0.07125545322346098, "grad_norm": 1.6110674142837524, "learning_rate": 3.5507246376811596e-05, "loss": 1.5639, "step": 147 }, { "epoch": 0.07174018419777024, "grad_norm": 1.6405996084213257, "learning_rate": 3.57487922705314e-05, "loss": 1.7666, "step": 148 }, { "epoch": 0.07222491517207949, "grad_norm": 1.6947883367538452, "learning_rate": 3.5990338164251205e-05, "loss": 1.578, "step": 149 }, { "epoch": 0.07270964614638875, "grad_norm": 1.6765739917755127, "learning_rate": 3.6231884057971014e-05, "loss": 1.7495, "step": 150 }, { "epoch": 0.07319437712069801, "grad_norm": 1.8097703456878662, "learning_rate": 3.647342995169082e-05, "loss": 1.3894, "step": 151 }, { "epoch": 0.07367910809500727, "grad_norm": 1.657542109489441, "learning_rate": 3.671497584541063e-05, "loss": 1.615, "step": 152 }, { "epoch": 0.07416383906931653, "grad_norm": 1.6826646327972412, "learning_rate": 3.695652173913043e-05, "loss": 1.5399, "step": 153 }, { "epoch": 0.07464857004362578, "grad_norm": 1.6431878805160522, "learning_rate": 3.719806763285024e-05, "loss": 1.8077, "step": 154 }, { "epoch": 0.07513330101793504, "grad_norm": 1.617641568183899, "learning_rate": 3.743961352657005e-05, "loss": 1.7934, "step": 155 }, { "epoch": 0.0756180319922443, "grad_norm": 1.6802802085876465, "learning_rate": 3.7681159420289856e-05, "loss": 2.0497, "step": 156 }, { "epoch": 0.07610276296655356, "grad_norm": 1.5995146036148071, "learning_rate": 3.7922705314009665e-05, "loss": 1.5435, "step": 157 }, { "epoch": 0.07658749394086282, "grad_norm": 1.6491643190383911, "learning_rate": 3.8164251207729466e-05, "loss": 1.6003, "step": 158 }, { "epoch": 0.07707222491517209, "grad_norm": 1.671040654182434, "learning_rate": 3.8405797101449274e-05, "loss": 1.6768, "step": 159 }, { "epoch": 0.07755695588948133, "grad_norm": 1.4470667839050293, "learning_rate": 3.864734299516908e-05, "loss": 1.4752, "step": 160 }, { "epoch": 0.0780416868637906, "grad_norm": 1.4912703037261963, "learning_rate": 3.888888888888889e-05, "loss": 1.5621, "step": 161 }, { "epoch": 0.07852641783809985, "grad_norm": 1.6416089534759521, "learning_rate": 3.91304347826087e-05, "loss": 1.7115, "step": 162 }, { "epoch": 0.07901114881240912, "grad_norm": 1.738970160484314, "learning_rate": 3.937198067632851e-05, "loss": 1.5945, "step": 163 }, { "epoch": 0.07949587978671838, "grad_norm": 1.50935959815979, "learning_rate": 3.961352657004831e-05, "loss": 1.4379, "step": 164 }, { "epoch": 0.07998061076102762, "grad_norm": 1.8530389070510864, "learning_rate": 3.985507246376812e-05, "loss": 1.6197, "step": 165 }, { "epoch": 0.08046534173533688, "grad_norm": 1.5483351945877075, "learning_rate": 4.0096618357487925e-05, "loss": 1.4075, "step": 166 }, { "epoch": 0.08095007270964615, "grad_norm": 1.7282538414001465, "learning_rate": 4.0338164251207733e-05, "loss": 1.7884, "step": 167 }, { "epoch": 0.0814348036839554, "grad_norm": 1.605089545249939, "learning_rate": 4.057971014492754e-05, "loss": 1.4268, "step": 168 }, { "epoch": 0.08191953465826467, "grad_norm": 1.6566203832626343, "learning_rate": 4.082125603864734e-05, "loss": 1.5855, "step": 169 }, { "epoch": 0.08240426563257393, "grad_norm": 1.7147961854934692, "learning_rate": 4.106280193236715e-05, "loss": 1.9024, "step": 170 }, { "epoch": 0.08288899660688318, "grad_norm": 1.638307809829712, "learning_rate": 4.130434782608696e-05, "loss": 1.5341, "step": 171 }, { "epoch": 0.08337372758119244, "grad_norm": 1.503071665763855, "learning_rate": 4.154589371980677e-05, "loss": 1.3162, "step": 172 }, { "epoch": 0.0838584585555017, "grad_norm": 1.6790330410003662, "learning_rate": 4.1787439613526576e-05, "loss": 1.5488, "step": 173 }, { "epoch": 0.08434318952981096, "grad_norm": 1.6894205808639526, "learning_rate": 4.202898550724638e-05, "loss": 1.4366, "step": 174 }, { "epoch": 0.08482792050412022, "grad_norm": 1.550458550453186, "learning_rate": 4.2270531400966186e-05, "loss": 1.6258, "step": 175 }, { "epoch": 0.08531265147842947, "grad_norm": 1.4660718441009521, "learning_rate": 4.2512077294685994e-05, "loss": 1.3366, "step": 176 }, { "epoch": 0.08579738245273873, "grad_norm": 1.527509331703186, "learning_rate": 4.27536231884058e-05, "loss": 1.5319, "step": 177 }, { "epoch": 0.08628211342704799, "grad_norm": 5.795914173126221, "learning_rate": 4.299516908212561e-05, "loss": 1.3028, "step": 178 }, { "epoch": 0.08676684440135725, "grad_norm": 1.6990413665771484, "learning_rate": 4.323671497584541e-05, "loss": 1.8149, "step": 179 }, { "epoch": 0.08725157537566651, "grad_norm": 1.704795479774475, "learning_rate": 4.347826086956522e-05, "loss": 1.671, "step": 180 }, { "epoch": 0.08773630634997576, "grad_norm": 1.5868449211120605, "learning_rate": 4.371980676328503e-05, "loss": 1.5234, "step": 181 }, { "epoch": 0.08822103732428502, "grad_norm": 1.7695292234420776, "learning_rate": 4.396135265700484e-05, "loss": 1.6046, "step": 182 }, { "epoch": 0.08870576829859428, "grad_norm": 1.6047744750976562, "learning_rate": 4.4202898550724645e-05, "loss": 1.7261, "step": 183 }, { "epoch": 0.08919049927290354, "grad_norm": 1.7648770809173584, "learning_rate": 4.4444444444444447e-05, "loss": 1.6856, "step": 184 }, { "epoch": 0.0896752302472128, "grad_norm": 1.8613518476486206, "learning_rate": 4.4685990338164255e-05, "loss": 1.8129, "step": 185 }, { "epoch": 0.09015996122152206, "grad_norm": 1.7032883167266846, "learning_rate": 4.492753623188406e-05, "loss": 1.6468, "step": 186 }, { "epoch": 0.09064469219583131, "grad_norm": 1.4716066122055054, "learning_rate": 4.5169082125603865e-05, "loss": 1.2393, "step": 187 }, { "epoch": 0.09112942317014057, "grad_norm": 1.582453966140747, "learning_rate": 4.541062801932367e-05, "loss": 1.7276, "step": 188 }, { "epoch": 0.09161415414444983, "grad_norm": 1.5959687232971191, "learning_rate": 4.565217391304348e-05, "loss": 1.4626, "step": 189 }, { "epoch": 0.09209888511875909, "grad_norm": 2.014113426208496, "learning_rate": 4.589371980676328e-05, "loss": 2.0453, "step": 190 }, { "epoch": 0.09258361609306835, "grad_norm": 1.6704699993133545, "learning_rate": 4.613526570048309e-05, "loss": 1.7625, "step": 191 }, { "epoch": 0.0930683470673776, "grad_norm": 1.8854517936706543, "learning_rate": 4.63768115942029e-05, "loss": 1.7175, "step": 192 }, { "epoch": 0.09355307804168686, "grad_norm": 1.588793158531189, "learning_rate": 4.661835748792271e-05, "loss": 1.5578, "step": 193 }, { "epoch": 0.09403780901599612, "grad_norm": 1.5601580142974854, "learning_rate": 4.6859903381642516e-05, "loss": 1.6403, "step": 194 }, { "epoch": 0.09452253999030538, "grad_norm": 1.6801401376724243, "learning_rate": 4.710144927536232e-05, "loss": 1.4691, "step": 195 }, { "epoch": 0.09500727096461464, "grad_norm": 2.123878002166748, "learning_rate": 4.7342995169082125e-05, "loss": 1.5602, "step": 196 }, { "epoch": 0.0954920019389239, "grad_norm": 1.666060447692871, "learning_rate": 4.7584541062801933e-05, "loss": 1.4702, "step": 197 }, { "epoch": 0.09597673291323315, "grad_norm": 1.7319121360778809, "learning_rate": 4.782608695652174e-05, "loss": 1.8495, "step": 198 }, { "epoch": 0.09646146388754241, "grad_norm": 1.5935922861099243, "learning_rate": 4.806763285024155e-05, "loss": 1.5142, "step": 199 }, { "epoch": 0.09694619486185167, "grad_norm": 1.8752068281173706, "learning_rate": 4.830917874396135e-05, "loss": 1.9948, "step": 200 }, { "epoch": 0.09743092583616093, "grad_norm": 1.8214166164398193, "learning_rate": 4.855072463768116e-05, "loss": 1.7985, "step": 201 }, { "epoch": 0.0979156568104702, "grad_norm": 1.6320829391479492, "learning_rate": 4.879227053140097e-05, "loss": 1.6227, "step": 202 }, { "epoch": 0.09840038778477944, "grad_norm": 1.5568000078201294, "learning_rate": 4.9033816425120776e-05, "loss": 1.587, "step": 203 }, { "epoch": 0.0988851187590887, "grad_norm": 1.5281633138656616, "learning_rate": 4.9275362318840584e-05, "loss": 1.4167, "step": 204 }, { "epoch": 0.09936984973339796, "grad_norm": 1.5692189931869507, "learning_rate": 4.9516908212560386e-05, "loss": 1.5648, "step": 205 }, { "epoch": 0.09985458070770722, "grad_norm": 1.644121766090393, "learning_rate": 4.9758454106280194e-05, "loss": 1.8842, "step": 206 }, { "epoch": 0.10033931168201649, "grad_norm": 1.654074788093567, "learning_rate": 5e-05, "loss": 1.3634, "step": 207 }, { "epoch": 0.10082404265632573, "grad_norm": 1.6578999757766724, "learning_rate": 4.9999964185927293e-05, "loss": 1.4366, "step": 208 }, { "epoch": 0.101308773630635, "grad_norm": 1.6873010396957397, "learning_rate": 4.999985674381179e-05, "loss": 1.5582, "step": 209 }, { "epoch": 0.10179350460494425, "grad_norm": 1.5198674201965332, "learning_rate": 4.999967767396132e-05, "loss": 1.3704, "step": 210 }, { "epoch": 0.10227823557925352, "grad_norm": 1.6594791412353516, "learning_rate": 4.999942697688894e-05, "loss": 1.7239, "step": 211 }, { "epoch": 0.10276296655356278, "grad_norm": 1.6230357885360718, "learning_rate": 4.9999104653312926e-05, "loss": 1.5243, "step": 212 }, { "epoch": 0.10324769752787204, "grad_norm": 1.62498140335083, "learning_rate": 4.9998710704156785e-05, "loss": 1.4242, "step": 213 }, { "epoch": 0.10373242850218128, "grad_norm": 1.611889362335205, "learning_rate": 4.9998245130549226e-05, "loss": 1.7185, "step": 214 }, { "epoch": 0.10421715947649055, "grad_norm": 1.7176823616027832, "learning_rate": 4.999770793382418e-05, "loss": 1.7587, "step": 215 }, { "epoch": 0.1047018904507998, "grad_norm": 1.8749507665634155, "learning_rate": 4.999709911552077e-05, "loss": 1.5999, "step": 216 }, { "epoch": 0.10518662142510907, "grad_norm": 2.380711793899536, "learning_rate": 4.999641867738336e-05, "loss": 1.5824, "step": 217 }, { "epoch": 0.10567135239941833, "grad_norm": 1.681323528289795, "learning_rate": 4.999566662136147e-05, "loss": 1.6289, "step": 218 }, { "epoch": 0.10615608337372757, "grad_norm": 1.7086970806121826, "learning_rate": 4.999484294960984e-05, "loss": 1.5793, "step": 219 }, { "epoch": 0.10664081434803684, "grad_norm": 1.6258258819580078, "learning_rate": 4.999394766448841e-05, "loss": 1.7768, "step": 220 }, { "epoch": 0.1071255453223461, "grad_norm": 1.609623670578003, "learning_rate": 4.9992980768562256e-05, "loss": 1.5748, "step": 221 }, { "epoch": 0.10761027629665536, "grad_norm": 1.6131285429000854, "learning_rate": 4.999194226460167e-05, "loss": 1.7234, "step": 222 }, { "epoch": 0.10809500727096462, "grad_norm": 1.7092560529708862, "learning_rate": 4.99908321555821e-05, "loss": 1.8781, "step": 223 }, { "epoch": 0.10857973824527388, "grad_norm": 1.755306601524353, "learning_rate": 4.998965044468414e-05, "loss": 1.647, "step": 224 }, { "epoch": 0.10906446921958313, "grad_norm": 1.637286901473999, "learning_rate": 4.9988397135293567e-05, "loss": 1.401, "step": 225 }, { "epoch": 0.10954920019389239, "grad_norm": 1.505699634552002, "learning_rate": 4.998707223100123e-05, "loss": 1.3702, "step": 226 }, { "epoch": 0.11003393116820165, "grad_norm": 1.5001720190048218, "learning_rate": 4.9985675735603164e-05, "loss": 1.6756, "step": 227 }, { "epoch": 0.11051866214251091, "grad_norm": 1.6514838933944702, "learning_rate": 4.998420765310051e-05, "loss": 1.3885, "step": 228 }, { "epoch": 0.11100339311682017, "grad_norm": 1.6999015808105469, "learning_rate": 4.998266798769951e-05, "loss": 1.2946, "step": 229 }, { "epoch": 0.11148812409112942, "grad_norm": 1.8051220178604126, "learning_rate": 4.998105674381148e-05, "loss": 1.6211, "step": 230 }, { "epoch": 0.11197285506543868, "grad_norm": 1.7223929166793823, "learning_rate": 4.9979373926052865e-05, "loss": 1.6805, "step": 231 }, { "epoch": 0.11245758603974794, "grad_norm": 1.5139886140823364, "learning_rate": 4.997761953924512e-05, "loss": 1.4749, "step": 232 }, { "epoch": 0.1129423170140572, "grad_norm": 1.6115888357162476, "learning_rate": 4.997579358841481e-05, "loss": 1.6298, "step": 233 }, { "epoch": 0.11342704798836646, "grad_norm": 1.5873501300811768, "learning_rate": 4.99738960787935e-05, "loss": 1.5954, "step": 234 }, { "epoch": 0.11391177896267571, "grad_norm": 1.5256075859069824, "learning_rate": 4.99719270158178e-05, "loss": 1.155, "step": 235 }, { "epoch": 0.11439650993698497, "grad_norm": 1.633514165878296, "learning_rate": 4.996988640512931e-05, "loss": 1.8803, "step": 236 }, { "epoch": 0.11488124091129423, "grad_norm": 1.6052523851394653, "learning_rate": 4.996777425257465e-05, "loss": 1.6264, "step": 237 }, { "epoch": 0.11536597188560349, "grad_norm": 1.5508285760879517, "learning_rate": 4.99655905642054e-05, "loss": 1.6035, "step": 238 }, { "epoch": 0.11585070285991275, "grad_norm": 1.9220739603042603, "learning_rate": 4.996333534627809e-05, "loss": 1.7689, "step": 239 }, { "epoch": 0.11633543383422201, "grad_norm": 1.5728856325149536, "learning_rate": 4.9961008605254237e-05, "loss": 1.4674, "step": 240 }, { "epoch": 0.11682016480853126, "grad_norm": 1.6598260402679443, "learning_rate": 4.9958610347800206e-05, "loss": 1.4091, "step": 241 }, { "epoch": 0.11730489578284052, "grad_norm": 1.6171025037765503, "learning_rate": 4.995614058078733e-05, "loss": 1.5094, "step": 242 }, { "epoch": 0.11778962675714978, "grad_norm": 1.5818380117416382, "learning_rate": 4.995359931129179e-05, "loss": 1.464, "step": 243 }, { "epoch": 0.11827435773145904, "grad_norm": 1.5343612432479858, "learning_rate": 4.995098654659465e-05, "loss": 1.4005, "step": 244 }, { "epoch": 0.1187590887057683, "grad_norm": 1.4373968839645386, "learning_rate": 4.99483022941818e-05, "loss": 1.156, "step": 245 }, { "epoch": 0.11924381968007755, "grad_norm": 1.7314152717590332, "learning_rate": 4.994554656174398e-05, "loss": 1.7723, "step": 246 }, { "epoch": 0.11972855065438681, "grad_norm": 1.6805850267410278, "learning_rate": 4.99427193571767e-05, "loss": 1.6244, "step": 247 }, { "epoch": 0.12021328162869607, "grad_norm": 1.5888878107070923, "learning_rate": 4.993982068858025e-05, "loss": 1.6246, "step": 248 }, { "epoch": 0.12069801260300533, "grad_norm": 1.8114043474197388, "learning_rate": 4.9936850564259695e-05, "loss": 1.4527, "step": 249 }, { "epoch": 0.1211827435773146, "grad_norm": 1.7056317329406738, "learning_rate": 4.99338089927248e-05, "loss": 1.5897, "step": 250 }, { "epoch": 0.12166747455162386, "grad_norm": 1.6106626987457275, "learning_rate": 4.993069598269006e-05, "loss": 1.5619, "step": 251 }, { "epoch": 0.1221522055259331, "grad_norm": 1.6103230714797974, "learning_rate": 4.9927511543074635e-05, "loss": 1.3745, "step": 252 }, { "epoch": 0.12263693650024236, "grad_norm": 1.699284315109253, "learning_rate": 4.992425568300234e-05, "loss": 1.5918, "step": 253 }, { "epoch": 0.12312166747455162, "grad_norm": 1.6529169082641602, "learning_rate": 4.992092841180164e-05, "loss": 1.7702, "step": 254 }, { "epoch": 0.12360639844886089, "grad_norm": 1.7270163297653198, "learning_rate": 4.9917529739005574e-05, "loss": 1.5362, "step": 255 }, { "epoch": 0.12409112942317015, "grad_norm": 2.138240098953247, "learning_rate": 4.991405967435177e-05, "loss": 1.7812, "step": 256 }, { "epoch": 0.1245758603974794, "grad_norm": 1.7157567739486694, "learning_rate": 4.991051822778239e-05, "loss": 1.8263, "step": 257 }, { "epoch": 0.12506059137178865, "grad_norm": 1.5743305683135986, "learning_rate": 4.990690540944414e-05, "loss": 1.5631, "step": 258 }, { "epoch": 0.12554532234609792, "grad_norm": 1.4902536869049072, "learning_rate": 4.9903221229688194e-05, "loss": 1.6046, "step": 259 }, { "epoch": 0.12603005332040718, "grad_norm": 1.700891375541687, "learning_rate": 4.989946569907019e-05, "loss": 1.7547, "step": 260 }, { "epoch": 0.12651478429471644, "grad_norm": 1.683803915977478, "learning_rate": 4.989563882835019e-05, "loss": 1.3777, "step": 261 }, { "epoch": 0.1269995152690257, "grad_norm": 1.548683762550354, "learning_rate": 4.989174062849267e-05, "loss": 1.2059, "step": 262 }, { "epoch": 0.12748424624333496, "grad_norm": 1.5738409757614136, "learning_rate": 4.988777111066646e-05, "loss": 1.2089, "step": 263 }, { "epoch": 0.12796897721764422, "grad_norm": 1.5687381029129028, "learning_rate": 4.9883730286244715e-05, "loss": 1.4081, "step": 264 }, { "epoch": 0.12845370819195345, "grad_norm": 1.6630191802978516, "learning_rate": 4.987961816680492e-05, "loss": 1.472, "step": 265 }, { "epoch": 0.1289384391662627, "grad_norm": 1.692008376121521, "learning_rate": 4.987543476412881e-05, "loss": 1.8636, "step": 266 }, { "epoch": 0.12942317014057197, "grad_norm": 1.612111210823059, "learning_rate": 4.987118009020237e-05, "loss": 1.5438, "step": 267 }, { "epoch": 0.12990790111488124, "grad_norm": 2.01253342628479, "learning_rate": 4.986685415721576e-05, "loss": 1.9671, "step": 268 }, { "epoch": 0.1303926320891905, "grad_norm": 1.5312128067016602, "learning_rate": 4.986245697756333e-05, "loss": 1.3681, "step": 269 }, { "epoch": 0.13087736306349976, "grad_norm": 1.6384061574935913, "learning_rate": 4.985798856384355e-05, "loss": 1.6502, "step": 270 }, { "epoch": 0.13136209403780902, "grad_norm": 2.0369157791137695, "learning_rate": 4.985344892885899e-05, "loss": 1.5976, "step": 271 }, { "epoch": 0.13184682501211828, "grad_norm": 1.5668435096740723, "learning_rate": 4.984883808561628e-05, "loss": 1.456, "step": 272 }, { "epoch": 0.13233155598642754, "grad_norm": 3.0037758350372314, "learning_rate": 4.9844156047326054e-05, "loss": 1.4169, "step": 273 }, { "epoch": 0.1328162869607368, "grad_norm": 1.5953983068466187, "learning_rate": 4.9839402827402947e-05, "loss": 1.5507, "step": 274 }, { "epoch": 0.13330101793504606, "grad_norm": 1.6405383348464966, "learning_rate": 4.983457843946554e-05, "loss": 1.5394, "step": 275 }, { "epoch": 0.1337857489093553, "grad_norm": 1.5499027967453003, "learning_rate": 4.98296828973363e-05, "loss": 1.5463, "step": 276 }, { "epoch": 0.13427047988366456, "grad_norm": 1.5397121906280518, "learning_rate": 4.9824716215041575e-05, "loss": 1.5493, "step": 277 }, { "epoch": 0.13475521085797382, "grad_norm": 1.5781245231628418, "learning_rate": 4.981967840681154e-05, "loss": 1.6137, "step": 278 }, { "epoch": 0.13523994183228308, "grad_norm": 1.79483962059021, "learning_rate": 4.981456948708014e-05, "loss": 1.6445, "step": 279 }, { "epoch": 0.13572467280659234, "grad_norm": 1.5061801671981812, "learning_rate": 4.980938947048508e-05, "loss": 1.4163, "step": 280 }, { "epoch": 0.1362094037809016, "grad_norm": 1.5987025499343872, "learning_rate": 4.980413837186775e-05, "loss": 1.531, "step": 281 }, { "epoch": 0.13669413475521086, "grad_norm": 1.7168915271759033, "learning_rate": 4.979881620627322e-05, "loss": 1.4682, "step": 282 }, { "epoch": 0.13717886572952012, "grad_norm": 1.4458422660827637, "learning_rate": 4.979342298895016e-05, "loss": 1.2247, "step": 283 }, { "epoch": 0.13766359670382938, "grad_norm": 1.6409187316894531, "learning_rate": 4.9787958735350816e-05, "loss": 1.7199, "step": 284 }, { "epoch": 0.13814832767813864, "grad_norm": 1.5449514389038086, "learning_rate": 4.978242346113095e-05, "loss": 1.4311, "step": 285 }, { "epoch": 0.1386330586524479, "grad_norm": 1.5891187191009521, "learning_rate": 4.977681718214984e-05, "loss": 1.8869, "step": 286 }, { "epoch": 0.13911778962675714, "grad_norm": 3.7609283924102783, "learning_rate": 4.977113991447017e-05, "loss": 1.7382, "step": 287 }, { "epoch": 0.1396025206010664, "grad_norm": 1.7186360359191895, "learning_rate": 4.976539167435803e-05, "loss": 1.699, "step": 288 }, { "epoch": 0.14008725157537566, "grad_norm": 1.4953521490097046, "learning_rate": 4.9759572478282846e-05, "loss": 1.4215, "step": 289 }, { "epoch": 0.14057198254968492, "grad_norm": 1.5115928649902344, "learning_rate": 4.975368234291734e-05, "loss": 1.2988, "step": 290 }, { "epoch": 0.14105671352399418, "grad_norm": 2.540882110595703, "learning_rate": 4.974772128513751e-05, "loss": 1.651, "step": 291 }, { "epoch": 0.14154144449830344, "grad_norm": 1.6582648754119873, "learning_rate": 4.974168932202252e-05, "loss": 1.7001, "step": 292 }, { "epoch": 0.1420261754726127, "grad_norm": 1.5049525499343872, "learning_rate": 4.973558647085472e-05, "loss": 1.7229, "step": 293 }, { "epoch": 0.14251090644692196, "grad_norm": 1.6676872968673706, "learning_rate": 4.972941274911953e-05, "loss": 1.5541, "step": 294 }, { "epoch": 0.14299563742123123, "grad_norm": 1.5429099798202515, "learning_rate": 4.972316817450544e-05, "loss": 1.813, "step": 295 }, { "epoch": 0.1434803683955405, "grad_norm": 1.6391764879226685, "learning_rate": 4.9716852764903955e-05, "loss": 1.9432, "step": 296 }, { "epoch": 0.14396509936984975, "grad_norm": 1.6778995990753174, "learning_rate": 4.9710466538409505e-05, "loss": 1.5883, "step": 297 }, { "epoch": 0.14444983034415898, "grad_norm": 1.508912205696106, "learning_rate": 4.9704009513319444e-05, "loss": 1.6739, "step": 298 }, { "epoch": 0.14493456131846824, "grad_norm": 1.5249568223953247, "learning_rate": 4.9697481708133955e-05, "loss": 1.3977, "step": 299 }, { "epoch": 0.1454192922927775, "grad_norm": 1.429032802581787, "learning_rate": 4.969088314155602e-05, "loss": 1.3599, "step": 300 }, { "epoch": 0.14590402326708676, "grad_norm": 1.425854206085205, "learning_rate": 4.968421383249137e-05, "loss": 1.4904, "step": 301 }, { "epoch": 0.14638875424139602, "grad_norm": 1.5954697132110596, "learning_rate": 4.967747380004839e-05, "loss": 1.6036, "step": 302 }, { "epoch": 0.14687348521570529, "grad_norm": 1.6121957302093506, "learning_rate": 4.967066306353816e-05, "loss": 1.5024, "step": 303 }, { "epoch": 0.14735821619001455, "grad_norm": 1.502057433128357, "learning_rate": 4.966378164247426e-05, "loss": 1.4643, "step": 304 }, { "epoch": 0.1478429471643238, "grad_norm": 1.4162192344665527, "learning_rate": 4.965682955657286e-05, "loss": 1.279, "step": 305 }, { "epoch": 0.14832767813863307, "grad_norm": 1.5923371315002441, "learning_rate": 4.964980682575253e-05, "loss": 1.4768, "step": 306 }, { "epoch": 0.14881240911294233, "grad_norm": 1.5782544612884521, "learning_rate": 4.964271347013431e-05, "loss": 1.3391, "step": 307 }, { "epoch": 0.14929714008725156, "grad_norm": 2.188934326171875, "learning_rate": 4.9635549510041516e-05, "loss": 1.6648, "step": 308 }, { "epoch": 0.14978187106156082, "grad_norm": 1.6566468477249146, "learning_rate": 4.9628314965999835e-05, "loss": 1.6523, "step": 309 }, { "epoch": 0.15026660203587008, "grad_norm": 1.46741783618927, "learning_rate": 4.9621009858737116e-05, "loss": 1.5349, "step": 310 }, { "epoch": 0.15075133301017934, "grad_norm": 1.486839771270752, "learning_rate": 4.961363420918342e-05, "loss": 1.4895, "step": 311 }, { "epoch": 0.1512360639844886, "grad_norm": 1.4840891361236572, "learning_rate": 4.960618803847092e-05, "loss": 1.5155, "step": 312 }, { "epoch": 0.15172079495879787, "grad_norm": 1.578395962715149, "learning_rate": 4.959867136793384e-05, "loss": 1.5057, "step": 313 }, { "epoch": 0.15220552593310713, "grad_norm": 1.5518931150436401, "learning_rate": 4.959108421910835e-05, "loss": 1.8778, "step": 314 }, { "epoch": 0.1526902569074164, "grad_norm": 1.491754412651062, "learning_rate": 4.958342661373262e-05, "loss": 1.5156, "step": 315 }, { "epoch": 0.15317498788172565, "grad_norm": 1.492876648902893, "learning_rate": 4.957569857374664e-05, "loss": 1.5804, "step": 316 }, { "epoch": 0.1536597188560349, "grad_norm": 1.7110258340835571, "learning_rate": 4.956790012129221e-05, "loss": 1.8366, "step": 317 }, { "epoch": 0.15414444983034417, "grad_norm": 1.5975233316421509, "learning_rate": 4.9560031278712896e-05, "loss": 1.5372, "step": 318 }, { "epoch": 0.1546291808046534, "grad_norm": 1.5784556865692139, "learning_rate": 4.95520920685539e-05, "loss": 1.7814, "step": 319 }, { "epoch": 0.15511391177896267, "grad_norm": 1.4783300161361694, "learning_rate": 4.9544082513562076e-05, "loss": 1.495, "step": 320 }, { "epoch": 0.15559864275327193, "grad_norm": 1.75221586227417, "learning_rate": 4.95360026366858e-05, "loss": 1.7768, "step": 321 }, { "epoch": 0.1560833737275812, "grad_norm": 2.064201831817627, "learning_rate": 4.952785246107494e-05, "loss": 1.983, "step": 322 }, { "epoch": 0.15656810470189045, "grad_norm": 1.7069522142410278, "learning_rate": 4.951963201008076e-05, "loss": 1.6382, "step": 323 }, { "epoch": 0.1570528356761997, "grad_norm": 1.5614511966705322, "learning_rate": 4.951134130725591e-05, "loss": 1.6066, "step": 324 }, { "epoch": 0.15753756665050897, "grad_norm": 1.510345697402954, "learning_rate": 4.950298037635428e-05, "loss": 1.6226, "step": 325 }, { "epoch": 0.15802229762481823, "grad_norm": 1.7443574666976929, "learning_rate": 4.949454924133098e-05, "loss": 1.8012, "step": 326 }, { "epoch": 0.1585070285991275, "grad_norm": 1.6052006483078003, "learning_rate": 4.948604792634229e-05, "loss": 1.7394, "step": 327 }, { "epoch": 0.15899175957343675, "grad_norm": 1.6979362964630127, "learning_rate": 4.947747645574555e-05, "loss": 1.4477, "step": 328 }, { "epoch": 0.159476490547746, "grad_norm": 2.022580146789551, "learning_rate": 4.9468834854099095e-05, "loss": 1.5245, "step": 329 }, { "epoch": 0.15996122152205525, "grad_norm": 1.485262155532837, "learning_rate": 4.94601231461622e-05, "loss": 1.4005, "step": 330 }, { "epoch": 0.1604459524963645, "grad_norm": 1.5557737350463867, "learning_rate": 4.9451341356895e-05, "loss": 1.5077, "step": 331 }, { "epoch": 0.16093068347067377, "grad_norm": 1.7877156734466553, "learning_rate": 4.9442489511458426e-05, "loss": 1.62, "step": 332 }, { "epoch": 0.16141541444498303, "grad_norm": 1.6162307262420654, "learning_rate": 4.943356763521414e-05, "loss": 1.7224, "step": 333 }, { "epoch": 0.1619001454192923, "grad_norm": 1.500104308128357, "learning_rate": 4.942457575372443e-05, "loss": 1.4062, "step": 334 }, { "epoch": 0.16238487639360155, "grad_norm": 1.463844656944275, "learning_rate": 4.941551389275217e-05, "loss": 1.6271, "step": 335 }, { "epoch": 0.1628696073679108, "grad_norm": 1.591209888458252, "learning_rate": 4.940638207826074e-05, "loss": 1.5389, "step": 336 }, { "epoch": 0.16335433834222007, "grad_norm": 1.4040523767471313, "learning_rate": 4.9397180336413915e-05, "loss": 1.355, "step": 337 }, { "epoch": 0.16383906931652933, "grad_norm": 1.4741119146347046, "learning_rate": 4.938790869357587e-05, "loss": 1.5721, "step": 338 }, { "epoch": 0.1643238002908386, "grad_norm": 1.6420493125915527, "learning_rate": 4.937856717631102e-05, "loss": 1.7241, "step": 339 }, { "epoch": 0.16480853126514786, "grad_norm": 1.6836682558059692, "learning_rate": 4.936915581138398e-05, "loss": 1.5049, "step": 340 }, { "epoch": 0.1652932622394571, "grad_norm": 1.529339075088501, "learning_rate": 4.935967462575949e-05, "loss": 1.4003, "step": 341 }, { "epoch": 0.16577799321376635, "grad_norm": 1.4345910549163818, "learning_rate": 4.9350123646602356e-05, "loss": 1.5476, "step": 342 }, { "epoch": 0.1662627241880756, "grad_norm": 1.5849047899246216, "learning_rate": 4.934050290127733e-05, "loss": 1.6484, "step": 343 }, { "epoch": 0.16674745516238487, "grad_norm": 1.5803264379501343, "learning_rate": 4.933081241734905e-05, "loss": 1.5366, "step": 344 }, { "epoch": 0.16723218613669413, "grad_norm": 1.4626617431640625, "learning_rate": 4.9321052222581976e-05, "loss": 1.5589, "step": 345 }, { "epoch": 0.1677169171110034, "grad_norm": 1.4863779544830322, "learning_rate": 4.93112223449403e-05, "loss": 1.3602, "step": 346 }, { "epoch": 0.16820164808531265, "grad_norm": 3.0919888019561768, "learning_rate": 4.930132281258785e-05, "loss": 1.602, "step": 347 }, { "epoch": 0.16868637905962192, "grad_norm": 1.61482834815979, "learning_rate": 4.929135365388804e-05, "loss": 1.594, "step": 348 }, { "epoch": 0.16917111003393118, "grad_norm": 1.4651432037353516, "learning_rate": 4.928131489740375e-05, "loss": 1.353, "step": 349 }, { "epoch": 0.16965584100824044, "grad_norm": 1.6565533876419067, "learning_rate": 4.9271206571897286e-05, "loss": 1.7263, "step": 350 }, { "epoch": 0.1701405719825497, "grad_norm": 1.62041175365448, "learning_rate": 4.926102870633029e-05, "loss": 1.7091, "step": 351 }, { "epoch": 0.17062530295685893, "grad_norm": 1.470937728881836, "learning_rate": 4.9250781329863606e-05, "loss": 1.4829, "step": 352 }, { "epoch": 0.1711100339311682, "grad_norm": 1.6846436262130737, "learning_rate": 4.924046447185726e-05, "loss": 1.6661, "step": 353 }, { "epoch": 0.17159476490547745, "grad_norm": 1.5719223022460938, "learning_rate": 4.923007816187035e-05, "loss": 1.3444, "step": 354 }, { "epoch": 0.17207949587978671, "grad_norm": 1.6110163927078247, "learning_rate": 4.921962242966097e-05, "loss": 1.789, "step": 355 }, { "epoch": 0.17256422685409598, "grad_norm": 1.6574710607528687, "learning_rate": 4.9209097305186094e-05, "loss": 1.4593, "step": 356 }, { "epoch": 0.17304895782840524, "grad_norm": 1.5048030614852905, "learning_rate": 4.9198502818601547e-05, "loss": 1.3008, "step": 357 }, { "epoch": 0.1735336888027145, "grad_norm": 1.6137561798095703, "learning_rate": 4.918783900026184e-05, "loss": 1.4641, "step": 358 }, { "epoch": 0.17401841977702376, "grad_norm": 1.6105793714523315, "learning_rate": 4.9177105880720173e-05, "loss": 1.6433, "step": 359 }, { "epoch": 0.17450315075133302, "grad_norm": 1.8704934120178223, "learning_rate": 4.916630349072828e-05, "loss": 1.9321, "step": 360 }, { "epoch": 0.17498788172564228, "grad_norm": 1.8688757419586182, "learning_rate": 4.915543186123636e-05, "loss": 1.6639, "step": 361 }, { "epoch": 0.1754726126999515, "grad_norm": 1.411590576171875, "learning_rate": 4.9144491023393016e-05, "loss": 1.4361, "step": 362 }, { "epoch": 0.17595734367426077, "grad_norm": 1.503797173500061, "learning_rate": 4.913348100854511e-05, "loss": 1.4521, "step": 363 }, { "epoch": 0.17644207464857004, "grad_norm": 2.0284149646759033, "learning_rate": 4.912240184823772e-05, "loss": 1.6762, "step": 364 }, { "epoch": 0.1769268056228793, "grad_norm": 1.6562719345092773, "learning_rate": 4.911125357421405e-05, "loss": 1.5597, "step": 365 }, { "epoch": 0.17741153659718856, "grad_norm": 1.6344953775405884, "learning_rate": 4.9100036218415285e-05, "loss": 1.6212, "step": 366 }, { "epoch": 0.17789626757149782, "grad_norm": 1.5039825439453125, "learning_rate": 4.908874981298057e-05, "loss": 1.4205, "step": 367 }, { "epoch": 0.17838099854580708, "grad_norm": 1.5786499977111816, "learning_rate": 4.907739439024689e-05, "loss": 1.6106, "step": 368 }, { "epoch": 0.17886572952011634, "grad_norm": 1.7766309976577759, "learning_rate": 4.9065969982748946e-05, "loss": 1.5542, "step": 369 }, { "epoch": 0.1793504604944256, "grad_norm": 1.6327425241470337, "learning_rate": 4.9054476623219104e-05, "loss": 1.5334, "step": 370 }, { "epoch": 0.17983519146873486, "grad_norm": 1.6132947206497192, "learning_rate": 4.904291434458729e-05, "loss": 1.5541, "step": 371 }, { "epoch": 0.18031992244304412, "grad_norm": 1.458174467086792, "learning_rate": 4.9031283179980874e-05, "loss": 1.3287, "step": 372 }, { "epoch": 0.18080465341735336, "grad_norm": 1.5462307929992676, "learning_rate": 4.901958316272462e-05, "loss": 1.6874, "step": 373 }, { "epoch": 0.18128938439166262, "grad_norm": 1.3828489780426025, "learning_rate": 4.9007814326340544e-05, "loss": 1.4119, "step": 374 }, { "epoch": 0.18177411536597188, "grad_norm": 1.6115537881851196, "learning_rate": 4.899597670454785e-05, "loss": 1.6936, "step": 375 }, { "epoch": 0.18225884634028114, "grad_norm": 1.5310516357421875, "learning_rate": 4.89840703312628e-05, "loss": 1.4026, "step": 376 }, { "epoch": 0.1827435773145904, "grad_norm": 1.604029655456543, "learning_rate": 4.897209524059866e-05, "loss": 1.5988, "step": 377 }, { "epoch": 0.18322830828889966, "grad_norm": 1.6310497522354126, "learning_rate": 4.896005146686558e-05, "loss": 1.5445, "step": 378 }, { "epoch": 0.18371303926320892, "grad_norm": 1.655869483947754, "learning_rate": 4.8947939044570467e-05, "loss": 1.5755, "step": 379 }, { "epoch": 0.18419777023751818, "grad_norm": 1.5540688037872314, "learning_rate": 4.893575800841695e-05, "loss": 1.5128, "step": 380 }, { "epoch": 0.18468250121182744, "grad_norm": 1.5631048679351807, "learning_rate": 4.892350839330522e-05, "loss": 1.645, "step": 381 }, { "epoch": 0.1851672321861367, "grad_norm": 1.522678017616272, "learning_rate": 4.891119023433198e-05, "loss": 1.5037, "step": 382 }, { "epoch": 0.18565196316044597, "grad_norm": 1.7026262283325195, "learning_rate": 4.8898803566790296e-05, "loss": 1.6548, "step": 383 }, { "epoch": 0.1861366941347552, "grad_norm": 2.272061824798584, "learning_rate": 4.888634842616953e-05, "loss": 1.6816, "step": 384 }, { "epoch": 0.18662142510906446, "grad_norm": 1.7039604187011719, "learning_rate": 4.887382484815522e-05, "loss": 1.8058, "step": 385 }, { "epoch": 0.18710615608337372, "grad_norm": 1.421434760093689, "learning_rate": 4.8861232868628994e-05, "loss": 1.2899, "step": 386 }, { "epoch": 0.18759088705768298, "grad_norm": 1.4786161184310913, "learning_rate": 4.884857252366847e-05, "loss": 1.5044, "step": 387 }, { "epoch": 0.18807561803199224, "grad_norm": 1.6115511655807495, "learning_rate": 4.8835843849547126e-05, "loss": 1.4588, "step": 388 }, { "epoch": 0.1885603490063015, "grad_norm": 1.4734134674072266, "learning_rate": 4.88230468827342e-05, "loss": 1.5804, "step": 389 }, { "epoch": 0.18904507998061076, "grad_norm": 1.677364468574524, "learning_rate": 4.8810181659894635e-05, "loss": 1.5134, "step": 390 }, { "epoch": 0.18952981095492002, "grad_norm": 1.5314624309539795, "learning_rate": 4.879724821788889e-05, "loss": 1.7543, "step": 391 }, { "epoch": 0.19001454192922929, "grad_norm": 1.8517158031463623, "learning_rate": 4.878424659377292e-05, "loss": 1.641, "step": 392 }, { "epoch": 0.19049927290353855, "grad_norm": 1.5239332914352417, "learning_rate": 4.8771176824798006e-05, "loss": 1.2542, "step": 393 }, { "epoch": 0.1909840038778478, "grad_norm": 1.6505666971206665, "learning_rate": 4.875803894841069e-05, "loss": 1.5467, "step": 394 }, { "epoch": 0.19146873485215704, "grad_norm": 1.5365902185440063, "learning_rate": 4.8744833002252625e-05, "loss": 1.3391, "step": 395 }, { "epoch": 0.1919534658264663, "grad_norm": 1.8492168188095093, "learning_rate": 4.8731559024160524e-05, "loss": 1.6368, "step": 396 }, { "epoch": 0.19243819680077556, "grad_norm": 1.471468210220337, "learning_rate": 4.8718217052165985e-05, "loss": 1.2187, "step": 397 }, { "epoch": 0.19292292777508482, "grad_norm": 1.6151539087295532, "learning_rate": 4.870480712449546e-05, "loss": 1.6523, "step": 398 }, { "epoch": 0.19340765874939408, "grad_norm": 1.5233840942382812, "learning_rate": 4.869132927957007e-05, "loss": 1.6125, "step": 399 }, { "epoch": 0.19389238972370335, "grad_norm": 1.4746246337890625, "learning_rate": 4.8677783556005515e-05, "loss": 1.5412, "step": 400 }, { "epoch": 0.1943771206980126, "grad_norm": 1.7926899194717407, "learning_rate": 4.8664169992612035e-05, "loss": 1.7692, "step": 401 }, { "epoch": 0.19486185167232187, "grad_norm": 1.5407027006149292, "learning_rate": 4.865048862839417e-05, "loss": 1.3119, "step": 402 }, { "epoch": 0.19534658264663113, "grad_norm": 1.5474193096160889, "learning_rate": 4.8636739502550775e-05, "loss": 1.4238, "step": 403 }, { "epoch": 0.1958313136209404, "grad_norm": 1.5824707746505737, "learning_rate": 4.862292265447481e-05, "loss": 1.5708, "step": 404 }, { "epoch": 0.19631604459524965, "grad_norm": 1.5270860195159912, "learning_rate": 4.860903812375329e-05, "loss": 1.5826, "step": 405 }, { "epoch": 0.19680077556955888, "grad_norm": 1.4273207187652588, "learning_rate": 4.859508595016713e-05, "loss": 1.3348, "step": 406 }, { "epoch": 0.19728550654386814, "grad_norm": 1.4353327751159668, "learning_rate": 4.8581066173691074e-05, "loss": 1.2723, "step": 407 }, { "epoch": 0.1977702375181774, "grad_norm": 1.475465178489685, "learning_rate": 4.856697883449355e-05, "loss": 1.5006, "step": 408 }, { "epoch": 0.19825496849248667, "grad_norm": 1.4205036163330078, "learning_rate": 4.8552823972936545e-05, "loss": 1.2869, "step": 409 }, { "epoch": 0.19873969946679593, "grad_norm": 1.5155439376831055, "learning_rate": 4.853860162957552e-05, "loss": 1.4603, "step": 410 }, { "epoch": 0.1992244304411052, "grad_norm": 1.5754328966140747, "learning_rate": 4.8524311845159286e-05, "loss": 1.4744, "step": 411 }, { "epoch": 0.19970916141541445, "grad_norm": 1.5085046291351318, "learning_rate": 4.850995466062988e-05, "loss": 1.4843, "step": 412 }, { "epoch": 0.2001938923897237, "grad_norm": 1.3275054693222046, "learning_rate": 4.849553011712241e-05, "loss": 1.4272, "step": 413 }, { "epoch": 0.20067862336403297, "grad_norm": 1.424777626991272, "learning_rate": 4.848103825596504e-05, "loss": 1.4182, "step": 414 }, { "epoch": 0.20116335433834223, "grad_norm": 1.498131513595581, "learning_rate": 4.8466479118678766e-05, "loss": 1.4618, "step": 415 }, { "epoch": 0.20164808531265146, "grad_norm": 1.4873474836349487, "learning_rate": 4.845185274697734e-05, "loss": 1.2547, "step": 416 }, { "epoch": 0.20213281628696073, "grad_norm": 1.4218641519546509, "learning_rate": 4.843715918276717e-05, "loss": 1.2078, "step": 417 }, { "epoch": 0.20261754726127, "grad_norm": 1.585609793663025, "learning_rate": 4.842239846814716e-05, "loss": 1.3015, "step": 418 }, { "epoch": 0.20310227823557925, "grad_norm": 1.5259923934936523, "learning_rate": 4.840757064540862e-05, "loss": 1.8969, "step": 419 }, { "epoch": 0.2035870092098885, "grad_norm": 1.5763602256774902, "learning_rate": 4.8392675757035114e-05, "loss": 1.6627, "step": 420 }, { "epoch": 0.20407174018419777, "grad_norm": 1.4582678079605103, "learning_rate": 4.837771384570238e-05, "loss": 1.4835, "step": 421 }, { "epoch": 0.20455647115850703, "grad_norm": 1.6100239753723145, "learning_rate": 4.8362684954278174e-05, "loss": 1.3861, "step": 422 }, { "epoch": 0.2050412021328163, "grad_norm": 1.6313788890838623, "learning_rate": 4.834758912582217e-05, "loss": 1.4019, "step": 423 }, { "epoch": 0.20552593310712555, "grad_norm": 1.7470930814743042, "learning_rate": 4.8332426403585805e-05, "loss": 1.7705, "step": 424 }, { "epoch": 0.2060106640814348, "grad_norm": 1.4268667697906494, "learning_rate": 4.831719683101219e-05, "loss": 1.4434, "step": 425 }, { "epoch": 0.20649539505574407, "grad_norm": 1.5203051567077637, "learning_rate": 4.830190045173596e-05, "loss": 1.7992, "step": 426 }, { "epoch": 0.2069801260300533, "grad_norm": 4.344082355499268, "learning_rate": 4.828653730958318e-05, "loss": 1.4597, "step": 427 }, { "epoch": 0.20746485700436257, "grad_norm": 1.5200762748718262, "learning_rate": 4.827110744857117e-05, "loss": 1.4586, "step": 428 }, { "epoch": 0.20794958797867183, "grad_norm": 1.4414129257202148, "learning_rate": 4.825561091290844e-05, "loss": 1.2286, "step": 429 }, { "epoch": 0.2084343189529811, "grad_norm": 1.9879165887832642, "learning_rate": 4.82400477469945e-05, "loss": 1.6122, "step": 430 }, { "epoch": 0.20891904992729035, "grad_norm": 1.4587631225585938, "learning_rate": 4.822441799541979e-05, "loss": 1.5816, "step": 431 }, { "epoch": 0.2094037809015996, "grad_norm": 1.6219898462295532, "learning_rate": 4.82087217029655e-05, "loss": 1.4554, "step": 432 }, { "epoch": 0.20988851187590887, "grad_norm": 1.6090373992919922, "learning_rate": 4.819295891460349e-05, "loss": 1.8978, "step": 433 }, { "epoch": 0.21037324285021813, "grad_norm": 1.6244341135025024, "learning_rate": 4.817712967549614e-05, "loss": 1.5749, "step": 434 }, { "epoch": 0.2108579738245274, "grad_norm": 1.5403733253479004, "learning_rate": 4.8161234030996204e-05, "loss": 1.5755, "step": 435 }, { "epoch": 0.21134270479883666, "grad_norm": 1.6046067476272583, "learning_rate": 4.81452720266467e-05, "loss": 1.8461, "step": 436 }, { "epoch": 0.21182743577314592, "grad_norm": 1.4461714029312134, "learning_rate": 4.8129243708180785e-05, "loss": 1.4177, "step": 437 }, { "epoch": 0.21231216674745515, "grad_norm": 1.6043570041656494, "learning_rate": 4.81131491215216e-05, "loss": 1.5736, "step": 438 }, { "epoch": 0.2127968977217644, "grad_norm": 1.4942891597747803, "learning_rate": 4.8096988312782174e-05, "loss": 1.3317, "step": 439 }, { "epoch": 0.21328162869607367, "grad_norm": 1.5307039022445679, "learning_rate": 4.808076132826524e-05, "loss": 1.493, "step": 440 }, { "epoch": 0.21376635967038293, "grad_norm": 1.3940438032150269, "learning_rate": 4.806446821446317e-05, "loss": 1.4157, "step": 441 }, { "epoch": 0.2142510906446922, "grad_norm": 1.4989084005355835, "learning_rate": 4.8048109018057776e-05, "loss": 1.6034, "step": 442 }, { "epoch": 0.21473582161900145, "grad_norm": 1.5145782232284546, "learning_rate": 4.80316837859202e-05, "loss": 1.6784, "step": 443 }, { "epoch": 0.21522055259331072, "grad_norm": 1.4529794454574585, "learning_rate": 4.801519256511082e-05, "loss": 1.4246, "step": 444 }, { "epoch": 0.21570528356761998, "grad_norm": 1.5570049285888672, "learning_rate": 4.799863540287905e-05, "loss": 1.582, "step": 445 }, { "epoch": 0.21619001454192924, "grad_norm": 1.42445707321167, "learning_rate": 4.798201234666324e-05, "loss": 1.3466, "step": 446 }, { "epoch": 0.2166747455162385, "grad_norm": 1.5558465719223022, "learning_rate": 4.796532344409055e-05, "loss": 1.5421, "step": 447 }, { "epoch": 0.21715947649054776, "grad_norm": 4.552513599395752, "learning_rate": 4.794856874297676e-05, "loss": 2.2261, "step": 448 }, { "epoch": 0.217644207464857, "grad_norm": 1.374784231185913, "learning_rate": 4.793174829132623e-05, "loss": 1.5355, "step": 449 }, { "epoch": 0.21812893843916625, "grad_norm": 1.9197838306427002, "learning_rate": 4.791486213733164e-05, "loss": 1.7625, "step": 450 }, { "epoch": 0.21861366941347551, "grad_norm": 1.4595524072647095, "learning_rate": 4.789791032937397e-05, "loss": 1.2965, "step": 451 }, { "epoch": 0.21909840038778478, "grad_norm": 1.4804373979568481, "learning_rate": 4.7880892916022265e-05, "loss": 1.3907, "step": 452 }, { "epoch": 0.21958313136209404, "grad_norm": 1.4374562501907349, "learning_rate": 4.786380994603356e-05, "loss": 1.4291, "step": 453 }, { "epoch": 0.2200678623364033, "grad_norm": 1.697147250175476, "learning_rate": 4.7846661468352716e-05, "loss": 1.8611, "step": 454 }, { "epoch": 0.22055259331071256, "grad_norm": 1.5372896194458008, "learning_rate": 4.782944753211228e-05, "loss": 1.5396, "step": 455 }, { "epoch": 0.22103732428502182, "grad_norm": 1.520835518836975, "learning_rate": 4.781216818663234e-05, "loss": 1.5921, "step": 456 }, { "epoch": 0.22152205525933108, "grad_norm": 1.7574188709259033, "learning_rate": 4.7794823481420406e-05, "loss": 1.644, "step": 457 }, { "epoch": 0.22200678623364034, "grad_norm": 1.384774923324585, "learning_rate": 4.7777413466171227e-05, "loss": 1.542, "step": 458 }, { "epoch": 0.2224915172079496, "grad_norm": 1.520736575126648, "learning_rate": 4.7759938190766694e-05, "loss": 1.674, "step": 459 }, { "epoch": 0.22297624818225883, "grad_norm": 1.4717646837234497, "learning_rate": 4.7742397705275665e-05, "loss": 1.3583, "step": 460 }, { "epoch": 0.2234609791565681, "grad_norm": 1.4752681255340576, "learning_rate": 4.772479205995385e-05, "loss": 1.4931, "step": 461 }, { "epoch": 0.22394571013087736, "grad_norm": 1.5523077249526978, "learning_rate": 4.7707121305243623e-05, "loss": 1.4731, "step": 462 }, { "epoch": 0.22443044110518662, "grad_norm": 1.4732838869094849, "learning_rate": 4.768938549177393e-05, "loss": 1.4059, "step": 463 }, { "epoch": 0.22491517207949588, "grad_norm": 1.555053472518921, "learning_rate": 4.7671584670360105e-05, "loss": 1.3137, "step": 464 }, { "epoch": 0.22539990305380514, "grad_norm": 1.4933258295059204, "learning_rate": 4.765371889200373e-05, "loss": 1.4378, "step": 465 }, { "epoch": 0.2258846340281144, "grad_norm": 1.409632921218872, "learning_rate": 4.763578820789253e-05, "loss": 1.4902, "step": 466 }, { "epoch": 0.22636936500242366, "grad_norm": 1.4202258586883545, "learning_rate": 4.761779266940015e-05, "loss": 1.3832, "step": 467 }, { "epoch": 0.22685409597673292, "grad_norm": 1.3944295644760132, "learning_rate": 4.759973232808609e-05, "loss": 1.5828, "step": 468 }, { "epoch": 0.22733882695104218, "grad_norm": 1.5394325256347656, "learning_rate": 4.758160723569548e-05, "loss": 1.207, "step": 469 }, { "epoch": 0.22782355792535142, "grad_norm": 1.7582964897155762, "learning_rate": 4.756341744415901e-05, "loss": 1.7935, "step": 470 }, { "epoch": 0.22830828889966068, "grad_norm": 1.4797354936599731, "learning_rate": 4.754516300559271e-05, "loss": 1.5384, "step": 471 }, { "epoch": 0.22879301987396994, "grad_norm": 1.5122283697128296, "learning_rate": 4.752684397229784e-05, "loss": 1.8131, "step": 472 }, { "epoch": 0.2292777508482792, "grad_norm": 1.4401196241378784, "learning_rate": 4.750846039676075e-05, "loss": 1.5188, "step": 473 }, { "epoch": 0.22976248182258846, "grad_norm": 1.4823417663574219, "learning_rate": 4.7490012331652675e-05, "loss": 1.2987, "step": 474 }, { "epoch": 0.23024721279689772, "grad_norm": 1.5985757112503052, "learning_rate": 4.7471499829829666e-05, "loss": 1.7577, "step": 475 }, { "epoch": 0.23073194377120698, "grad_norm": 1.5306726694107056, "learning_rate": 4.7452922944332355e-05, "loss": 1.4455, "step": 476 }, { "epoch": 0.23121667474551624, "grad_norm": 1.585697889328003, "learning_rate": 4.7434281728385867e-05, "loss": 1.4526, "step": 477 }, { "epoch": 0.2317014057198255, "grad_norm": 1.5287288427352905, "learning_rate": 4.741557623539962e-05, "loss": 1.8082, "step": 478 }, { "epoch": 0.23218613669413476, "grad_norm": 1.835727334022522, "learning_rate": 4.73968065189672e-05, "loss": 1.5274, "step": 479 }, { "epoch": 0.23267086766844403, "grad_norm": 1.5724575519561768, "learning_rate": 4.7377972632866226e-05, "loss": 1.8953, "step": 480 }, { "epoch": 0.23315559864275326, "grad_norm": 1.598510503768921, "learning_rate": 4.7359074631058134e-05, "loss": 1.7241, "step": 481 }, { "epoch": 0.23364032961706252, "grad_norm": 1.5090588331222534, "learning_rate": 4.7340112567688085e-05, "loss": 1.3641, "step": 482 }, { "epoch": 0.23412506059137178, "grad_norm": 1.5519758462905884, "learning_rate": 4.732108649708478e-05, "loss": 1.5011, "step": 483 }, { "epoch": 0.23460979156568104, "grad_norm": 1.5860713720321655, "learning_rate": 4.7301996473760304e-05, "loss": 1.5786, "step": 484 }, { "epoch": 0.2350945225399903, "grad_norm": 1.534163236618042, "learning_rate": 4.728284255240996e-05, "loss": 1.4667, "step": 485 }, { "epoch": 0.23557925351429956, "grad_norm": 1.3986639976501465, "learning_rate": 4.726362478791217e-05, "loss": 1.4603, "step": 486 }, { "epoch": 0.23606398448860882, "grad_norm": 1.4852182865142822, "learning_rate": 4.724434323532821e-05, "loss": 1.6303, "step": 487 }, { "epoch": 0.23654871546291809, "grad_norm": 1.4321929216384888, "learning_rate": 4.7224997949902186e-05, "loss": 1.3958, "step": 488 }, { "epoch": 0.23703344643722735, "grad_norm": 1.8100051879882812, "learning_rate": 4.720558898706077e-05, "loss": 1.4998, "step": 489 }, { "epoch": 0.2375181774115366, "grad_norm": 1.4117873907089233, "learning_rate": 4.7186116402413064e-05, "loss": 1.3552, "step": 490 }, { "epoch": 0.23800290838584587, "grad_norm": 1.5080662965774536, "learning_rate": 4.716658025175049e-05, "loss": 1.5167, "step": 491 }, { "epoch": 0.2384876393601551, "grad_norm": 1.5356587171554565, "learning_rate": 4.714698059104658e-05, "loss": 1.3864, "step": 492 }, { "epoch": 0.23897237033446436, "grad_norm": 1.686954140663147, "learning_rate": 4.712731747645682e-05, "loss": 1.6785, "step": 493 }, { "epoch": 0.23945710130877362, "grad_norm": 1.4700133800506592, "learning_rate": 4.7107590964318505e-05, "loss": 1.2093, "step": 494 }, { "epoch": 0.23994183228308288, "grad_norm": 1.4735498428344727, "learning_rate": 4.708780111115057e-05, "loss": 1.6332, "step": 495 }, { "epoch": 0.24042656325739215, "grad_norm": 1.572034239768982, "learning_rate": 4.706794797365346e-05, "loss": 1.5175, "step": 496 }, { "epoch": 0.2409112942317014, "grad_norm": 1.4558137655258179, "learning_rate": 4.7048031608708876e-05, "loss": 1.2924, "step": 497 }, { "epoch": 0.24139602520601067, "grad_norm": 1.6685311794281006, "learning_rate": 4.702805207337974e-05, "loss": 1.0251, "step": 498 }, { "epoch": 0.24188075618031993, "grad_norm": 1.5901682376861572, "learning_rate": 4.7008009424909917e-05, "loss": 2.1593, "step": 499 }, { "epoch": 0.2423654871546292, "grad_norm": 1.4795153141021729, "learning_rate": 4.698790372072411e-05, "loss": 1.7071, "step": 500 }, { "epoch": 0.24285021812893845, "grad_norm": 1.9734545946121216, "learning_rate": 4.696773501842771e-05, "loss": 1.4482, "step": 501 }, { "epoch": 0.2433349491032477, "grad_norm": 1.4583733081817627, "learning_rate": 4.694750337580659e-05, "loss": 1.5411, "step": 502 }, { "epoch": 0.24381968007755694, "grad_norm": 1.5093872547149658, "learning_rate": 4.6927208850826925e-05, "loss": 1.414, "step": 503 }, { "epoch": 0.2443044110518662, "grad_norm": 1.376407504081726, "learning_rate": 4.6906851501635106e-05, "loss": 1.2979, "step": 504 }, { "epoch": 0.24478914202617547, "grad_norm": 1.5103840827941895, "learning_rate": 4.688643138655748e-05, "loss": 1.502, "step": 505 }, { "epoch": 0.24527387300048473, "grad_norm": 1.4392768144607544, "learning_rate": 4.686594856410027e-05, "loss": 1.4235, "step": 506 }, { "epoch": 0.245758603974794, "grad_norm": 1.5379340648651123, "learning_rate": 4.684540309294932e-05, "loss": 1.5077, "step": 507 }, { "epoch": 0.24624333494910325, "grad_norm": 1.5546420812606812, "learning_rate": 4.682479503197001e-05, "loss": 1.5388, "step": 508 }, { "epoch": 0.2467280659234125, "grad_norm": 1.6801774501800537, "learning_rate": 4.6804124440207e-05, "loss": 1.7391, "step": 509 }, { "epoch": 0.24721279689772177, "grad_norm": 1.4841091632843018, "learning_rate": 4.678339137688416e-05, "loss": 1.72, "step": 510 }, { "epoch": 0.24769752787203103, "grad_norm": 1.4708201885223389, "learning_rate": 4.67625959014043e-05, "loss": 1.5798, "step": 511 }, { "epoch": 0.2481822588463403, "grad_norm": 1.5190343856811523, "learning_rate": 4.67417380733491e-05, "loss": 1.6462, "step": 512 }, { "epoch": 0.24866698982064955, "grad_norm": 1.479689121246338, "learning_rate": 4.6720817952478854e-05, "loss": 1.5912, "step": 513 }, { "epoch": 0.2491517207949588, "grad_norm": 1.4845134019851685, "learning_rate": 4.6699835598732325e-05, "loss": 1.4543, "step": 514 }, { "epoch": 0.24963645176926805, "grad_norm": 1.5402343273162842, "learning_rate": 4.667879107222662e-05, "loss": 1.4791, "step": 515 }, { "epoch": 0.2501211827435773, "grad_norm": 1.627976417541504, "learning_rate": 4.6657684433256934e-05, "loss": 1.4067, "step": 516 }, { "epoch": 0.2506059137178866, "grad_norm": 1.530739665031433, "learning_rate": 4.6636515742296464e-05, "loss": 1.9333, "step": 517 }, { "epoch": 0.25109064469219583, "grad_norm": 1.4607384204864502, "learning_rate": 4.661528505999615e-05, "loss": 1.537, "step": 518 }, { "epoch": 0.25157537566650506, "grad_norm": 1.5360444784164429, "learning_rate": 4.6593992447184586e-05, "loss": 1.3789, "step": 519 }, { "epoch": 0.25206010664081435, "grad_norm": 1.4390000104904175, "learning_rate": 4.6572637964867776e-05, "loss": 1.4374, "step": 520 }, { "epoch": 0.2525448376151236, "grad_norm": 1.4811269044876099, "learning_rate": 4.6551221674229003e-05, "loss": 1.561, "step": 521 }, { "epoch": 0.2530295685894329, "grad_norm": 1.4047800302505493, "learning_rate": 4.652974363662864e-05, "loss": 1.2907, "step": 522 }, { "epoch": 0.2535142995637421, "grad_norm": 1.6080119609832764, "learning_rate": 4.650820391360396e-05, "loss": 1.4566, "step": 523 }, { "epoch": 0.2539990305380514, "grad_norm": 1.5174490213394165, "learning_rate": 4.6486602566868975e-05, "loss": 1.4285, "step": 524 }, { "epoch": 0.25448376151236063, "grad_norm": 1.5314956903457642, "learning_rate": 4.6464939658314274e-05, "loss": 1.3154, "step": 525 }, { "epoch": 0.2549684924866699, "grad_norm": 1.4693747758865356, "learning_rate": 4.6443215250006806e-05, "loss": 1.4333, "step": 526 }, { "epoch": 0.25545322346097915, "grad_norm": 1.5617380142211914, "learning_rate": 4.642142940418973e-05, "loss": 1.4411, "step": 527 }, { "epoch": 0.25593795443528844, "grad_norm": 1.4139310121536255, "learning_rate": 4.6399582183282256e-05, "loss": 1.6047, "step": 528 }, { "epoch": 0.2564226854095977, "grad_norm": 1.6654372215270996, "learning_rate": 4.6377673649879396e-05, "loss": 1.7693, "step": 529 }, { "epoch": 0.2569074163839069, "grad_norm": 1.595267415046692, "learning_rate": 4.635570386675186e-05, "loss": 1.458, "step": 530 }, { "epoch": 0.2573921473582162, "grad_norm": 1.4914833307266235, "learning_rate": 4.633367289684586e-05, "loss": 1.4662, "step": 531 }, { "epoch": 0.2578768783325254, "grad_norm": 1.764076828956604, "learning_rate": 4.631158080328287e-05, "loss": 2.023, "step": 532 }, { "epoch": 0.2583616093068347, "grad_norm": 1.4962358474731445, "learning_rate": 4.628942764935954e-05, "loss": 1.1698, "step": 533 }, { "epoch": 0.25884634028114395, "grad_norm": 1.628941535949707, "learning_rate": 4.626721349854742e-05, "loss": 1.6603, "step": 534 }, { "epoch": 0.25933107125545324, "grad_norm": 1.473333477973938, "learning_rate": 4.6244938414492875e-05, "loss": 1.4744, "step": 535 }, { "epoch": 0.25981580222976247, "grad_norm": 1.5255992412567139, "learning_rate": 4.62226024610168e-05, "loss": 1.6574, "step": 536 }, { "epoch": 0.26030053320407176, "grad_norm": 1.4754759073257446, "learning_rate": 4.6200205702114526e-05, "loss": 1.3515, "step": 537 }, { "epoch": 0.260785264178381, "grad_norm": 1.539143681526184, "learning_rate": 4.617774820195557e-05, "loss": 1.361, "step": 538 }, { "epoch": 0.2612699951526903, "grad_norm": 1.4834741353988647, "learning_rate": 4.615523002488352e-05, "loss": 1.4823, "step": 539 }, { "epoch": 0.2617547261269995, "grad_norm": 1.541680932044983, "learning_rate": 4.6132651235415764e-05, "loss": 1.6218, "step": 540 }, { "epoch": 0.26223945710130875, "grad_norm": 1.802275538444519, "learning_rate": 4.6110011898243374e-05, "loss": 1.2527, "step": 541 }, { "epoch": 0.26272418807561804, "grad_norm": 1.4547120332717896, "learning_rate": 4.608731207823093e-05, "loss": 1.2674, "step": 542 }, { "epoch": 0.26320891904992727, "grad_norm": 1.4830158948898315, "learning_rate": 4.606455184041622e-05, "loss": 1.3814, "step": 543 }, { "epoch": 0.26369365002423656, "grad_norm": 1.5940104722976685, "learning_rate": 4.6041731250010246e-05, "loss": 1.644, "step": 544 }, { "epoch": 0.2641783809985458, "grad_norm": 1.4886668920516968, "learning_rate": 4.601885037239683e-05, "loss": 1.3262, "step": 545 }, { "epoch": 0.2646631119728551, "grad_norm": 1.414720058441162, "learning_rate": 4.5995909273132587e-05, "loss": 1.3218, "step": 546 }, { "epoch": 0.2651478429471643, "grad_norm": 1.3138704299926758, "learning_rate": 4.597290801794664e-05, "loss": 1.161, "step": 547 }, { "epoch": 0.2656325739214736, "grad_norm": 1.401702880859375, "learning_rate": 4.594984667274048e-05, "loss": 1.6609, "step": 548 }, { "epoch": 0.26611730489578284, "grad_norm": 1.4239600896835327, "learning_rate": 4.592672530358777e-05, "loss": 1.4492, "step": 549 }, { "epoch": 0.2666020358700921, "grad_norm": 1.3764070272445679, "learning_rate": 4.5903543976734145e-05, "loss": 1.1719, "step": 550 }, { "epoch": 0.26708676684440136, "grad_norm": 1.6861745119094849, "learning_rate": 4.5880302758597e-05, "loss": 1.4993, "step": 551 }, { "epoch": 0.2675714978187106, "grad_norm": 1.3549953699111938, "learning_rate": 4.585700171576538e-05, "loss": 1.2464, "step": 552 }, { "epoch": 0.2680562287930199, "grad_norm": 1.4663500785827637, "learning_rate": 4.583364091499968e-05, "loss": 1.7526, "step": 553 }, { "epoch": 0.2685409597673291, "grad_norm": 1.8229775428771973, "learning_rate": 4.581022042323155e-05, "loss": 1.2188, "step": 554 }, { "epoch": 0.2690256907416384, "grad_norm": 1.5518858432769775, "learning_rate": 4.5786740307563636e-05, "loss": 1.4877, "step": 555 }, { "epoch": 0.26951042171594763, "grad_norm": 1.3994567394256592, "learning_rate": 4.576320063526942e-05, "loss": 1.1313, "step": 556 }, { "epoch": 0.2699951526902569, "grad_norm": 1.5038384199142456, "learning_rate": 4.573960147379304e-05, "loss": 1.4847, "step": 557 }, { "epoch": 0.27047988366456616, "grad_norm": 1.4688293933868408, "learning_rate": 4.5715942890749045e-05, "loss": 1.3024, "step": 558 }, { "epoch": 0.27096461463887545, "grad_norm": 1.4721691608428955, "learning_rate": 4.5692224953922266e-05, "loss": 1.7076, "step": 559 }, { "epoch": 0.2714493456131847, "grad_norm": 1.452632188796997, "learning_rate": 4.566844773126757e-05, "loss": 1.4929, "step": 560 }, { "epoch": 0.27193407658749397, "grad_norm": 1.4588583707809448, "learning_rate": 4.564461129090969e-05, "loss": 1.2069, "step": 561 }, { "epoch": 0.2724188075618032, "grad_norm": 1.426776647567749, "learning_rate": 4.562071570114304e-05, "loss": 1.2884, "step": 562 }, { "epoch": 0.27290353853611243, "grad_norm": 1.4680659770965576, "learning_rate": 4.5596761030431465e-05, "loss": 1.8103, "step": 563 }, { "epoch": 0.2733882695104217, "grad_norm": 1.7482470273971558, "learning_rate": 4.557274734740813e-05, "loss": 1.1736, "step": 564 }, { "epoch": 0.27387300048473096, "grad_norm": 1.5107123851776123, "learning_rate": 4.554867472087525e-05, "loss": 1.6481, "step": 565 }, { "epoch": 0.27435773145904024, "grad_norm": 1.5711710453033447, "learning_rate": 4.552454321980394e-05, "loss": 1.2276, "step": 566 }, { "epoch": 0.2748424624333495, "grad_norm": 1.6438961029052734, "learning_rate": 4.5500352913333974e-05, "loss": 1.4729, "step": 567 }, { "epoch": 0.27532719340765877, "grad_norm": 1.5576547384262085, "learning_rate": 4.547610387077363e-05, "loss": 1.6697, "step": 568 }, { "epoch": 0.275811924381968, "grad_norm": 1.587242841720581, "learning_rate": 4.5451796161599466e-05, "loss": 1.727, "step": 569 }, { "epoch": 0.2762966553562773, "grad_norm": 1.4595431089401245, "learning_rate": 4.5427429855456125e-05, "loss": 1.5131, "step": 570 }, { "epoch": 0.2767813863305865, "grad_norm": 1.8416346311569214, "learning_rate": 4.5403005022156145e-05, "loss": 1.241, "step": 571 }, { "epoch": 0.2772661173048958, "grad_norm": 1.4765833616256714, "learning_rate": 4.5378521731679735e-05, "loss": 1.268, "step": 572 }, { "epoch": 0.27775084827920504, "grad_norm": 1.4958488941192627, "learning_rate": 4.535398005417461e-05, "loss": 1.2896, "step": 573 }, { "epoch": 0.2782355792535143, "grad_norm": 1.6208523511886597, "learning_rate": 4.5329380059955776e-05, "loss": 1.5778, "step": 574 }, { "epoch": 0.27872031022782356, "grad_norm": 1.4747179746627808, "learning_rate": 4.530472181950528e-05, "loss": 1.3638, "step": 575 }, { "epoch": 0.2792050412021328, "grad_norm": 1.5383862257003784, "learning_rate": 4.528000540347212e-05, "loss": 1.2982, "step": 576 }, { "epoch": 0.2796897721764421, "grad_norm": 1.5441182851791382, "learning_rate": 4.52552308826719e-05, "loss": 1.4339, "step": 577 }, { "epoch": 0.2801745031507513, "grad_norm": 1.4504390954971313, "learning_rate": 4.523039832808677e-05, "loss": 1.6174, "step": 578 }, { "epoch": 0.2806592341250606, "grad_norm": 1.409399151802063, "learning_rate": 4.520550781086511e-05, "loss": 1.1168, "step": 579 }, { "epoch": 0.28114396509936984, "grad_norm": 1.5246732234954834, "learning_rate": 4.5180559402321385e-05, "loss": 1.5579, "step": 580 }, { "epoch": 0.28162869607367913, "grad_norm": 1.6616579294204712, "learning_rate": 4.515555317393593e-05, "loss": 1.8633, "step": 581 }, { "epoch": 0.28211342704798836, "grad_norm": 1.4933806657791138, "learning_rate": 4.5130489197354734e-05, "loss": 1.5078, "step": 582 }, { "epoch": 0.28259815802229765, "grad_norm": 1.469255805015564, "learning_rate": 4.510536754438923e-05, "loss": 1.4949, "step": 583 }, { "epoch": 0.2830828889966069, "grad_norm": 1.32322096824646, "learning_rate": 4.508018828701612e-05, "loss": 1.1069, "step": 584 }, { "epoch": 0.2835676199709161, "grad_norm": 1.438407063484192, "learning_rate": 4.5054951497377165e-05, "loss": 1.587, "step": 585 }, { "epoch": 0.2840523509452254, "grad_norm": 1.5011217594146729, "learning_rate": 4.502965724777891e-05, "loss": 1.7561, "step": 586 }, { "epoch": 0.28453708191953464, "grad_norm": 1.436948537826538, "learning_rate": 4.500430561069259e-05, "loss": 1.3473, "step": 587 }, { "epoch": 0.28502181289384393, "grad_norm": 1.5629382133483887, "learning_rate": 4.497889665875382e-05, "loss": 1.8745, "step": 588 }, { "epoch": 0.28550654386815316, "grad_norm": 1.4830938577651978, "learning_rate": 4.495343046476245e-05, "loss": 1.5765, "step": 589 }, { "epoch": 0.28599127484246245, "grad_norm": 1.4268252849578857, "learning_rate": 4.492790710168233e-05, "loss": 1.3325, "step": 590 }, { "epoch": 0.2864760058167717, "grad_norm": 1.4147472381591797, "learning_rate": 4.4902326642641095e-05, "loss": 1.3003, "step": 591 }, { "epoch": 0.286960736791081, "grad_norm": 1.4721652269363403, "learning_rate": 4.487668916093e-05, "loss": 1.3235, "step": 592 }, { "epoch": 0.2874454677653902, "grad_norm": 1.5141065120697021, "learning_rate": 4.4850994730003634e-05, "loss": 1.2092, "step": 593 }, { "epoch": 0.2879301987396995, "grad_norm": 1.4203892946243286, "learning_rate": 4.482524342347978e-05, "loss": 1.8299, "step": 594 }, { "epoch": 0.2884149297140087, "grad_norm": 2.1075599193573, "learning_rate": 4.479943531513918e-05, "loss": 1.9049, "step": 595 }, { "epoch": 0.28889966068831796, "grad_norm": 1.3928402662277222, "learning_rate": 4.477357047892531e-05, "loss": 1.4002, "step": 596 }, { "epoch": 0.28938439166262725, "grad_norm": 1.4497021436691284, "learning_rate": 4.474764898894418e-05, "loss": 1.3542, "step": 597 }, { "epoch": 0.2898691226369365, "grad_norm": 1.6207205057144165, "learning_rate": 4.472167091946411e-05, "loss": 1.8706, "step": 598 }, { "epoch": 0.29035385361124577, "grad_norm": 1.4122729301452637, "learning_rate": 4.469563634491554e-05, "loss": 1.3707, "step": 599 }, { "epoch": 0.290838584585555, "grad_norm": 1.7664668560028076, "learning_rate": 4.4669545339890814e-05, "loss": 1.2924, "step": 600 }, { "epoch": 0.2913233155598643, "grad_norm": 1.4191179275512695, "learning_rate": 4.464339797914393e-05, "loss": 1.4986, "step": 601 }, { "epoch": 0.2918080465341735, "grad_norm": 1.3508961200714111, "learning_rate": 4.4617194337590376e-05, "loss": 1.3907, "step": 602 }, { "epoch": 0.2922927775084828, "grad_norm": 1.5529752969741821, "learning_rate": 4.459093449030688e-05, "loss": 1.3911, "step": 603 }, { "epoch": 0.29277750848279205, "grad_norm": 1.4372109174728394, "learning_rate": 4.4564618512531206e-05, "loss": 1.3964, "step": 604 }, { "epoch": 0.2932622394571013, "grad_norm": 1.628930687904358, "learning_rate": 4.4538246479661936e-05, "loss": 1.8534, "step": 605 }, { "epoch": 0.29374697043141057, "grad_norm": 1.4418190717697144, "learning_rate": 4.451181846725827e-05, "loss": 1.4604, "step": 606 }, { "epoch": 0.2942317014057198, "grad_norm": 1.503642201423645, "learning_rate": 4.448533455103979e-05, "loss": 1.6738, "step": 607 }, { "epoch": 0.2947164323800291, "grad_norm": 1.5811353921890259, "learning_rate": 4.445879480688625e-05, "loss": 1.2007, "step": 608 }, { "epoch": 0.2952011633543383, "grad_norm": 1.4607658386230469, "learning_rate": 4.443219931083734e-05, "loss": 1.3633, "step": 609 }, { "epoch": 0.2956858943286476, "grad_norm": 1.4199997186660767, "learning_rate": 4.440554813909252e-05, "loss": 1.6503, "step": 610 }, { "epoch": 0.29617062530295685, "grad_norm": 1.5280699729919434, "learning_rate": 4.437884136801074e-05, "loss": 1.4311, "step": 611 }, { "epoch": 0.29665535627726614, "grad_norm": 1.4121835231781006, "learning_rate": 4.435207907411026e-05, "loss": 1.3614, "step": 612 }, { "epoch": 0.29714008725157537, "grad_norm": 1.6203902959823608, "learning_rate": 4.4325261334068426e-05, "loss": 1.6557, "step": 613 }, { "epoch": 0.29762481822588466, "grad_norm": 1.4249413013458252, "learning_rate": 4.4298388224721435e-05, "loss": 1.4862, "step": 614 }, { "epoch": 0.2981095492001939, "grad_norm": 1.6460726261138916, "learning_rate": 4.427145982306412e-05, "loss": 1.7338, "step": 615 }, { "epoch": 0.2985942801745031, "grad_norm": 1.5498179197311401, "learning_rate": 4.4244476206249745e-05, "loss": 1.1775, "step": 616 }, { "epoch": 0.2990790111488124, "grad_norm": 1.4984955787658691, "learning_rate": 4.421743745158977e-05, "loss": 1.5588, "step": 617 }, { "epoch": 0.29956374212312165, "grad_norm": 1.8243629932403564, "learning_rate": 4.419034363655362e-05, "loss": 1.6428, "step": 618 }, { "epoch": 0.30004847309743093, "grad_norm": 1.5599781274795532, "learning_rate": 4.4163194838768495e-05, "loss": 1.7171, "step": 619 }, { "epoch": 0.30053320407174017, "grad_norm": 1.6111764907836914, "learning_rate": 4.4135991136019106e-05, "loss": 1.8379, "step": 620 }, { "epoch": 0.30101793504604946, "grad_norm": 1.54121994972229, "learning_rate": 4.4108732606247495e-05, "loss": 1.3949, "step": 621 }, { "epoch": 0.3015026660203587, "grad_norm": 1.552783489227295, "learning_rate": 4.408141932755277e-05, "loss": 1.3488, "step": 622 }, { "epoch": 0.301987396994668, "grad_norm": 1.6675286293029785, "learning_rate": 4.4054051378190915e-05, "loss": 1.2843, "step": 623 }, { "epoch": 0.3024721279689772, "grad_norm": 1.6017731428146362, "learning_rate": 4.402662883657454e-05, "loss": 1.6162, "step": 624 }, { "epoch": 0.3029568589432865, "grad_norm": 1.6486562490463257, "learning_rate": 4.3999151781272694e-05, "loss": 1.723, "step": 625 }, { "epoch": 0.30344158991759573, "grad_norm": 1.474776268005371, "learning_rate": 4.397162029101058e-05, "loss": 1.7979, "step": 626 }, { "epoch": 0.30392632089190497, "grad_norm": 1.4462790489196777, "learning_rate": 4.3944034444669405e-05, "loss": 1.3842, "step": 627 }, { "epoch": 0.30441105186621426, "grad_norm": 1.8951647281646729, "learning_rate": 4.391639432128606e-05, "loss": 1.4682, "step": 628 }, { "epoch": 0.3048957828405235, "grad_norm": 1.4978829622268677, "learning_rate": 4.3888700000052996e-05, "loss": 1.4582, "step": 629 }, { "epoch": 0.3053805138148328, "grad_norm": 1.5500818490982056, "learning_rate": 4.386095156031792e-05, "loss": 1.6484, "step": 630 }, { "epoch": 0.305865244789142, "grad_norm": 1.9269793033599854, "learning_rate": 4.3833149081583604e-05, "loss": 1.5815, "step": 631 }, { "epoch": 0.3063499757634513, "grad_norm": 2.429781436920166, "learning_rate": 4.3805292643507644e-05, "loss": 1.935, "step": 632 }, { "epoch": 0.30683470673776053, "grad_norm": 1.5993130207061768, "learning_rate": 4.377738232590225e-05, "loss": 1.3461, "step": 633 }, { "epoch": 0.3073194377120698, "grad_norm": 1.588075041770935, "learning_rate": 4.374941820873399e-05, "loss": 1.617, "step": 634 }, { "epoch": 0.30780416868637905, "grad_norm": 1.6482924222946167, "learning_rate": 4.372140037212357e-05, "loss": 1.4718, "step": 635 }, { "epoch": 0.30828889966068834, "grad_norm": 1.5200806856155396, "learning_rate": 4.369332889634563e-05, "loss": 1.5187, "step": 636 }, { "epoch": 0.3087736306349976, "grad_norm": 1.4226235151290894, "learning_rate": 4.366520386182846e-05, "loss": 1.2748, "step": 637 }, { "epoch": 0.3092583616093068, "grad_norm": 1.404143214225769, "learning_rate": 4.363702534915385e-05, "loss": 1.3319, "step": 638 }, { "epoch": 0.3097430925836161, "grad_norm": 1.511928677558899, "learning_rate": 4.360879343905676e-05, "loss": 1.8477, "step": 639 }, { "epoch": 0.31022782355792533, "grad_norm": 1.4452472925186157, "learning_rate": 4.358050821242517e-05, "loss": 1.222, "step": 640 }, { "epoch": 0.3107125545322346, "grad_norm": 1.5438624620437622, "learning_rate": 4.3552169750299835e-05, "loss": 1.5466, "step": 641 }, { "epoch": 0.31119728550654385, "grad_norm": 4.010799884796143, "learning_rate": 4.352377813387398e-05, "loss": 2.1084, "step": 642 }, { "epoch": 0.31168201648085314, "grad_norm": 1.4538034200668335, "learning_rate": 4.349533344449318e-05, "loss": 1.5272, "step": 643 }, { "epoch": 0.3121667474551624, "grad_norm": 1.4150643348693848, "learning_rate": 4.346683576365505e-05, "loss": 1.5093, "step": 644 }, { "epoch": 0.31265147842947166, "grad_norm": 1.3800829648971558, "learning_rate": 4.3438285173009006e-05, "loss": 1.6719, "step": 645 }, { "epoch": 0.3131362094037809, "grad_norm": 1.3637222051620483, "learning_rate": 4.340968175435611e-05, "loss": 1.2698, "step": 646 }, { "epoch": 0.3136209403780902, "grad_norm": 1.470177173614502, "learning_rate": 4.338102558964876e-05, "loss": 1.3927, "step": 647 }, { "epoch": 0.3141056713523994, "grad_norm": 2.070209264755249, "learning_rate": 4.335231676099044e-05, "loss": 1.3481, "step": 648 }, { "epoch": 0.31459040232670865, "grad_norm": 1.404061198234558, "learning_rate": 4.332355535063559e-05, "loss": 1.514, "step": 649 }, { "epoch": 0.31507513330101794, "grad_norm": 3.9239437580108643, "learning_rate": 4.329474144098924e-05, "loss": 1.341, "step": 650 }, { "epoch": 0.3155598642753272, "grad_norm": 2.510849952697754, "learning_rate": 4.32658751146069e-05, "loss": 1.4483, "step": 651 }, { "epoch": 0.31604459524963646, "grad_norm": 1.472462773323059, "learning_rate": 4.323695645419419e-05, "loss": 1.7946, "step": 652 }, { "epoch": 0.3165293262239457, "grad_norm": 1.3601329326629639, "learning_rate": 4.320798554260674e-05, "loss": 1.1157, "step": 653 }, { "epoch": 0.317014057198255, "grad_norm": 1.462933897972107, "learning_rate": 4.3178962462849835e-05, "loss": 1.3288, "step": 654 }, { "epoch": 0.3174987881725642, "grad_norm": 1.6446117162704468, "learning_rate": 4.3149887298078276e-05, "loss": 1.9576, "step": 655 }, { "epoch": 0.3179835191468735, "grad_norm": 1.4152432680130005, "learning_rate": 4.312076013159604e-05, "loss": 1.2452, "step": 656 }, { "epoch": 0.31846825012118274, "grad_norm": 2.2185819149017334, "learning_rate": 4.309158104685614e-05, "loss": 1.8676, "step": 657 }, { "epoch": 0.318952981095492, "grad_norm": 1.3394452333450317, "learning_rate": 4.3062350127460325e-05, "loss": 1.2346, "step": 658 }, { "epoch": 0.31943771206980126, "grad_norm": 1.6615955829620361, "learning_rate": 4.303306745715885e-05, "loss": 1.5486, "step": 659 }, { "epoch": 0.3199224430441105, "grad_norm": 1.4415209293365479, "learning_rate": 4.3003733119850256e-05, "loss": 1.5477, "step": 660 }, { "epoch": 0.3204071740184198, "grad_norm": 1.4911268949508667, "learning_rate": 4.29743471995811e-05, "loss": 1.3075, "step": 661 }, { "epoch": 0.320891904992729, "grad_norm": 1.4862326383590698, "learning_rate": 4.2944909780545754e-05, "loss": 1.5706, "step": 662 }, { "epoch": 0.3213766359670383, "grad_norm": 1.5849559307098389, "learning_rate": 4.291542094708612e-05, "loss": 1.4566, "step": 663 }, { "epoch": 0.32186136694134754, "grad_norm": 1.786057949066162, "learning_rate": 4.288588078369141e-05, "loss": 1.7439, "step": 664 }, { "epoch": 0.3223460979156568, "grad_norm": 1.5245951414108276, "learning_rate": 4.2856289374997927e-05, "loss": 1.524, "step": 665 }, { "epoch": 0.32283082888996606, "grad_norm": 1.8819663524627686, "learning_rate": 4.282664680578876e-05, "loss": 1.4255, "step": 666 }, { "epoch": 0.32331555986427535, "grad_norm": 1.5141843557357788, "learning_rate": 4.2796953160993616e-05, "loss": 1.7939, "step": 667 }, { "epoch": 0.3238002908385846, "grad_norm": 1.4404217004776, "learning_rate": 4.276720852568851e-05, "loss": 1.7917, "step": 668 }, { "epoch": 0.32428502181289387, "grad_norm": 1.4955337047576904, "learning_rate": 4.273741298509557e-05, "loss": 1.7536, "step": 669 }, { "epoch": 0.3247697527872031, "grad_norm": 1.5283381938934326, "learning_rate": 4.2707566624582774e-05, "loss": 1.3797, "step": 670 }, { "epoch": 0.32525448376151234, "grad_norm": 1.550523281097412, "learning_rate": 4.267766952966369e-05, "loss": 1.6906, "step": 671 }, { "epoch": 0.3257392147358216, "grad_norm": 1.4687964916229248, "learning_rate": 4.264772178599726e-05, "loss": 1.4744, "step": 672 }, { "epoch": 0.32622394571013086, "grad_norm": 1.485487937927246, "learning_rate": 4.261772347938754e-05, "loss": 1.675, "step": 673 }, { "epoch": 0.32670867668444015, "grad_norm": 1.8627375364303589, "learning_rate": 4.258767469578345e-05, "loss": 1.5248, "step": 674 }, { "epoch": 0.3271934076587494, "grad_norm": 1.4943833351135254, "learning_rate": 4.255757552127855e-05, "loss": 1.3291, "step": 675 }, { "epoch": 0.32767813863305867, "grad_norm": 1.4833825826644897, "learning_rate": 4.252742604211073e-05, "loss": 1.3374, "step": 676 }, { "epoch": 0.3281628696073679, "grad_norm": 1.7539558410644531, "learning_rate": 4.2497226344662065e-05, "loss": 1.9051, "step": 677 }, { "epoch": 0.3286476005816772, "grad_norm": 1.5023777484893799, "learning_rate": 4.2466976515458484e-05, "loss": 1.416, "step": 678 }, { "epoch": 0.3291323315559864, "grad_norm": 1.383313775062561, "learning_rate": 4.243667664116956e-05, "loss": 1.6164, "step": 679 }, { "epoch": 0.3296170625302957, "grad_norm": 1.8263195753097534, "learning_rate": 4.2406326808608225e-05, "loss": 1.6341, "step": 680 }, { "epoch": 0.33010179350460495, "grad_norm": 2.7321722507476807, "learning_rate": 4.237592710473059e-05, "loss": 1.2566, "step": 681 }, { "epoch": 0.3305865244789142, "grad_norm": 1.4957518577575684, "learning_rate": 4.234547761663562e-05, "loss": 1.5391, "step": 682 }, { "epoch": 0.33107125545322347, "grad_norm": 1.3792263269424438, "learning_rate": 4.2314978431564923e-05, "loss": 1.2249, "step": 683 }, { "epoch": 0.3315559864275327, "grad_norm": 1.4482512474060059, "learning_rate": 4.228442963690252e-05, "loss": 1.3106, "step": 684 }, { "epoch": 0.332040717401842, "grad_norm": 1.5233250856399536, "learning_rate": 4.2253831320174534e-05, "loss": 1.4784, "step": 685 }, { "epoch": 0.3325254483761512, "grad_norm": 1.4404493570327759, "learning_rate": 4.2223183569049005e-05, "loss": 1.3078, "step": 686 }, { "epoch": 0.3330101793504605, "grad_norm": 1.585357904434204, "learning_rate": 4.2192486471335585e-05, "loss": 1.6884, "step": 687 }, { "epoch": 0.33349491032476974, "grad_norm": 1.4535739421844482, "learning_rate": 4.216174011498533e-05, "loss": 1.8015, "step": 688 }, { "epoch": 0.33397964129907903, "grad_norm": 1.4156732559204102, "learning_rate": 4.2130944588090415e-05, "loss": 1.5549, "step": 689 }, { "epoch": 0.33446437227338827, "grad_norm": 1.4804260730743408, "learning_rate": 4.2100099978883896e-05, "loss": 1.4739, "step": 690 }, { "epoch": 0.33494910324769755, "grad_norm": 1.4360147714614868, "learning_rate": 4.206920637573946e-05, "loss": 1.3966, "step": 691 }, { "epoch": 0.3354338342220068, "grad_norm": 1.5461288690567017, "learning_rate": 4.203826386717118e-05, "loss": 1.4189, "step": 692 }, { "epoch": 0.335918565196316, "grad_norm": 1.321059226989746, "learning_rate": 4.200727254183322e-05, "loss": 1.1196, "step": 693 }, { "epoch": 0.3364032961706253, "grad_norm": 1.325295329093933, "learning_rate": 4.1976232488519626e-05, "loss": 1.2674, "step": 694 }, { "epoch": 0.33688802714493454, "grad_norm": 1.6588038206100464, "learning_rate": 4.1945143796164076e-05, "loss": 1.7719, "step": 695 }, { "epoch": 0.33737275811924383, "grad_norm": 1.321679711341858, "learning_rate": 4.191400655383956e-05, "loss": 1.238, "step": 696 }, { "epoch": 0.33785748909355307, "grad_norm": 1.4317057132720947, "learning_rate": 4.188282085075821e-05, "loss": 1.5644, "step": 697 }, { "epoch": 0.33834222006786235, "grad_norm": 1.4975212812423706, "learning_rate": 4.185158677627099e-05, "loss": 1.5107, "step": 698 }, { "epoch": 0.3388269510421716, "grad_norm": 1.4152145385742188, "learning_rate": 4.182030441986744e-05, "loss": 1.5376, "step": 699 }, { "epoch": 0.3393116820164809, "grad_norm": 1.418017864227295, "learning_rate": 4.178897387117546e-05, "loss": 1.3491, "step": 700 }, { "epoch": 0.3397964129907901, "grad_norm": 1.5809285640716553, "learning_rate": 4.175759521996101e-05, "loss": 1.487, "step": 701 }, { "epoch": 0.3402811439650994, "grad_norm": 1.5660691261291504, "learning_rate": 4.172616855612787e-05, "loss": 1.6579, "step": 702 }, { "epoch": 0.34076587493940863, "grad_norm": 1.5755140781402588, "learning_rate": 4.169469396971739e-05, "loss": 1.2482, "step": 703 }, { "epoch": 0.34125060591371786, "grad_norm": 1.782956838607788, "learning_rate": 4.166317155090822e-05, "loss": 1.5582, "step": 704 }, { "epoch": 0.34173533688802715, "grad_norm": 1.4416836500167847, "learning_rate": 4.1631601390016055e-05, "loss": 1.653, "step": 705 }, { "epoch": 0.3422200678623364, "grad_norm": 1.537095069885254, "learning_rate": 4.159998357749338e-05, "loss": 1.4798, "step": 706 }, { "epoch": 0.3427047988366457, "grad_norm": 1.4685359001159668, "learning_rate": 4.1568318203929195e-05, "loss": 1.3947, "step": 707 }, { "epoch": 0.3431895298109549, "grad_norm": 1.462533712387085, "learning_rate": 4.1536605360048795e-05, "loss": 1.4598, "step": 708 }, { "epoch": 0.3436742607852642, "grad_norm": 1.5037108659744263, "learning_rate": 4.150484513671346e-05, "loss": 1.5225, "step": 709 }, { "epoch": 0.34415899175957343, "grad_norm": 1.4552758932113647, "learning_rate": 4.147303762492022e-05, "loss": 1.433, "step": 710 }, { "epoch": 0.3446437227338827, "grad_norm": 1.6252245903015137, "learning_rate": 4.144118291580161e-05, "loss": 1.6058, "step": 711 }, { "epoch": 0.34512845370819195, "grad_norm": 1.6239243745803833, "learning_rate": 4.140928110062538e-05, "loss": 1.3355, "step": 712 }, { "epoch": 0.3456131846825012, "grad_norm": 1.3930360078811646, "learning_rate": 4.137733227079423e-05, "loss": 1.4171, "step": 713 }, { "epoch": 0.3460979156568105, "grad_norm": 1.9757750034332275, "learning_rate": 4.134533651784559e-05, "loss": 1.3524, "step": 714 }, { "epoch": 0.3465826466311197, "grad_norm": 1.4614371061325073, "learning_rate": 4.131329393345131e-05, "loss": 1.7791, "step": 715 }, { "epoch": 0.347067377605429, "grad_norm": 1.3664659261703491, "learning_rate": 4.1281204609417435e-05, "loss": 1.2993, "step": 716 }, { "epoch": 0.34755210857973823, "grad_norm": 1.4222404956817627, "learning_rate": 4.1249068637683906e-05, "loss": 1.3012, "step": 717 }, { "epoch": 0.3480368395540475, "grad_norm": 1.4118279218673706, "learning_rate": 4.1216886110324324e-05, "loss": 1.3078, "step": 718 }, { "epoch": 0.34852157052835675, "grad_norm": 1.4675920009613037, "learning_rate": 4.118465711954569e-05, "loss": 1.4492, "step": 719 }, { "epoch": 0.34900630150266604, "grad_norm": 1.490893006324768, "learning_rate": 4.115238175768812e-05, "loss": 1.4637, "step": 720 }, { "epoch": 0.34949103247697527, "grad_norm": 1.3924874067306519, "learning_rate": 4.1120060117224566e-05, "loss": 1.3949, "step": 721 }, { "epoch": 0.34997576345128456, "grad_norm": 1.4966955184936523, "learning_rate": 4.108769229076061e-05, "loss": 1.5634, "step": 722 }, { "epoch": 0.3504604944255938, "grad_norm": 1.4051393270492554, "learning_rate": 4.105527837103414e-05, "loss": 1.2924, "step": 723 }, { "epoch": 0.350945225399903, "grad_norm": 1.4678040742874146, "learning_rate": 4.102281845091512e-05, "loss": 1.6048, "step": 724 }, { "epoch": 0.3514299563742123, "grad_norm": 1.4021576642990112, "learning_rate": 4.0990312623405305e-05, "loss": 1.3513, "step": 725 }, { "epoch": 0.35191468734852155, "grad_norm": 1.5062463283538818, "learning_rate": 4.095776098163798e-05, "loss": 1.489, "step": 726 }, { "epoch": 0.35239941832283084, "grad_norm": 1.4043464660644531, "learning_rate": 4.0925163618877695e-05, "loss": 1.497, "step": 727 }, { "epoch": 0.35288414929714007, "grad_norm": 1.5384563207626343, "learning_rate": 4.0892520628519985e-05, "loss": 1.8048, "step": 728 }, { "epoch": 0.35336888027144936, "grad_norm": 1.4933316707611084, "learning_rate": 4.085983210409114e-05, "loss": 1.2942, "step": 729 }, { "epoch": 0.3538536112457586, "grad_norm": 1.570822834968567, "learning_rate": 4.082709813924789e-05, "loss": 1.6173, "step": 730 }, { "epoch": 0.3543383422200679, "grad_norm": 1.5000869035720825, "learning_rate": 4.079431882777715e-05, "loss": 1.3644, "step": 731 }, { "epoch": 0.3548230731943771, "grad_norm": 1.4136881828308105, "learning_rate": 4.0761494263595796e-05, "loss": 1.7214, "step": 732 }, { "epoch": 0.3553078041686864, "grad_norm": 1.5652875900268555, "learning_rate": 4.072862454075031e-05, "loss": 1.6255, "step": 733 }, { "epoch": 0.35579253514299564, "grad_norm": 1.6801156997680664, "learning_rate": 4.06957097534166e-05, "loss": 1.7948, "step": 734 }, { "epoch": 0.35627726611730487, "grad_norm": 1.3107026815414429, "learning_rate": 4.0662749995899666e-05, "loss": 1.3926, "step": 735 }, { "epoch": 0.35676199709161416, "grad_norm": 1.503405213356018, "learning_rate": 4.062974536263336e-05, "loss": 1.5015, "step": 736 }, { "epoch": 0.3572467280659234, "grad_norm": 1.5230603218078613, "learning_rate": 4.0596695948180116e-05, "loss": 1.3572, "step": 737 }, { "epoch": 0.3577314590402327, "grad_norm": 1.4852657318115234, "learning_rate": 4.056360184723065e-05, "loss": 1.5162, "step": 738 }, { "epoch": 0.3582161900145419, "grad_norm": 1.5229320526123047, "learning_rate": 4.0530463154603747e-05, "loss": 1.5939, "step": 739 }, { "epoch": 0.3587009209888512, "grad_norm": 1.5715887546539307, "learning_rate": 4.049727996524591e-05, "loss": 1.4526, "step": 740 }, { "epoch": 0.35918565196316043, "grad_norm": 1.3569464683532715, "learning_rate": 4.046405237423116e-05, "loss": 1.3617, "step": 741 }, { "epoch": 0.3596703829374697, "grad_norm": 1.477789282798767, "learning_rate": 4.043078047676072e-05, "loss": 1.4437, "step": 742 }, { "epoch": 0.36015511391177896, "grad_norm": 1.3856958150863647, "learning_rate": 4.039746436816277e-05, "loss": 1.3219, "step": 743 }, { "epoch": 0.36063984488608825, "grad_norm": 1.4407496452331543, "learning_rate": 4.036410414389215e-05, "loss": 1.2868, "step": 744 }, { "epoch": 0.3611245758603975, "grad_norm": 1.449094533920288, "learning_rate": 4.03306998995301e-05, "loss": 1.3048, "step": 745 }, { "epoch": 0.3616093068347067, "grad_norm": 1.4655970335006714, "learning_rate": 4.029725173078398e-05, "loss": 1.4586, "step": 746 }, { "epoch": 0.362094037809016, "grad_norm": 1.4294497966766357, "learning_rate": 4.0263759733487015e-05, "loss": 1.417, "step": 747 }, { "epoch": 0.36257876878332523, "grad_norm": 1.5399290323257446, "learning_rate": 4.023022400359797e-05, "loss": 1.6722, "step": 748 }, { "epoch": 0.3630634997576345, "grad_norm": 1.4106359481811523, "learning_rate": 4.019664463720094e-05, "loss": 1.5666, "step": 749 }, { "epoch": 0.36354823073194376, "grad_norm": 1.5670846700668335, "learning_rate": 4.0163021730505045e-05, "loss": 1.6455, "step": 750 }, { "epoch": 0.36403296170625304, "grad_norm": 1.5435312986373901, "learning_rate": 4.012935537984414e-05, "loss": 1.7314, "step": 751 }, { "epoch": 0.3645176926805623, "grad_norm": 1.43980872631073, "learning_rate": 4.009564568167653e-05, "loss": 1.1749, "step": 752 }, { "epoch": 0.36500242365487157, "grad_norm": 1.5904608964920044, "learning_rate": 4.006189273258477e-05, "loss": 1.4091, "step": 753 }, { "epoch": 0.3654871546291808, "grad_norm": 1.569663643836975, "learning_rate": 4.00280966292753e-05, "loss": 1.447, "step": 754 }, { "epoch": 0.3659718856034901, "grad_norm": 1.4279499053955078, "learning_rate": 3.99942574685782e-05, "loss": 1.4849, "step": 755 }, { "epoch": 0.3664566165777993, "grad_norm": 1.4076343774795532, "learning_rate": 3.9960375347446934e-05, "loss": 1.3214, "step": 756 }, { "epoch": 0.36694134755210855, "grad_norm": 1.4956542253494263, "learning_rate": 3.9926450362958024e-05, "loss": 1.4805, "step": 757 }, { "epoch": 0.36742607852641784, "grad_norm": 1.563628911972046, "learning_rate": 3.9892482612310836e-05, "loss": 1.8152, "step": 758 }, { "epoch": 0.3679108095007271, "grad_norm": 1.4647592306137085, "learning_rate": 3.985847219282725e-05, "loss": 1.4635, "step": 759 }, { "epoch": 0.36839554047503636, "grad_norm": 1.4081557989120483, "learning_rate": 3.982441920195138e-05, "loss": 1.5819, "step": 760 }, { "epoch": 0.3688802714493456, "grad_norm": 1.5153309106826782, "learning_rate": 3.9790323737249346e-05, "loss": 1.5674, "step": 761 }, { "epoch": 0.3693650024236549, "grad_norm": 1.4655232429504395, "learning_rate": 3.975618589640894e-05, "loss": 1.4397, "step": 762 }, { "epoch": 0.3698497333979641, "grad_norm": 1.60316801071167, "learning_rate": 3.9722005777239354e-05, "loss": 1.5732, "step": 763 }, { "epoch": 0.3703344643722734, "grad_norm": 1.594388723373413, "learning_rate": 3.9687783477670966e-05, "loss": 1.6924, "step": 764 }, { "epoch": 0.37081919534658264, "grad_norm": 1.587856411933899, "learning_rate": 3.9653519095754934e-05, "loss": 1.7377, "step": 765 }, { "epoch": 0.37130392632089193, "grad_norm": 1.4031965732574463, "learning_rate": 3.961921272966305e-05, "loss": 1.5464, "step": 766 }, { "epoch": 0.37178865729520116, "grad_norm": 1.3980727195739746, "learning_rate": 3.958486447768736e-05, "loss": 1.3223, "step": 767 }, { "epoch": 0.3722733882695104, "grad_norm": 1.411486268043518, "learning_rate": 3.95504744382399e-05, "loss": 1.5066, "step": 768 }, { "epoch": 0.3727581192438197, "grad_norm": 1.5289703607559204, "learning_rate": 3.9516042709852506e-05, "loss": 1.4962, "step": 769 }, { "epoch": 0.3732428502181289, "grad_norm": 1.3834885358810425, "learning_rate": 3.948156939117639e-05, "loss": 1.3076, "step": 770 }, { "epoch": 0.3737275811924382, "grad_norm": 1.5126419067382812, "learning_rate": 3.944705458098194e-05, "loss": 1.203, "step": 771 }, { "epoch": 0.37421231216674744, "grad_norm": 1.4430164098739624, "learning_rate": 3.9412498378158446e-05, "loss": 1.2171, "step": 772 }, { "epoch": 0.37469704314105673, "grad_norm": 1.4813838005065918, "learning_rate": 3.9377900881713764e-05, "loss": 1.4277, "step": 773 }, { "epoch": 0.37518177411536596, "grad_norm": 1.431311011314392, "learning_rate": 3.9343262190774076e-05, "loss": 1.5013, "step": 774 }, { "epoch": 0.37566650508967525, "grad_norm": 1.891334891319275, "learning_rate": 3.93085824045836e-05, "loss": 2.4066, "step": 775 }, { "epoch": 0.3761512360639845, "grad_norm": 1.4745622873306274, "learning_rate": 3.927386162250427e-05, "loss": 1.3788, "step": 776 }, { "epoch": 0.3766359670382938, "grad_norm": 1.437791109085083, "learning_rate": 3.923909994401551e-05, "loss": 1.565, "step": 777 }, { "epoch": 0.377120698012603, "grad_norm": 1.3869900703430176, "learning_rate": 3.92042974687139e-05, "loss": 1.5087, "step": 778 }, { "epoch": 0.37760542898691224, "grad_norm": 1.3274116516113281, "learning_rate": 3.916945429631289e-05, "loss": 1.5359, "step": 779 }, { "epoch": 0.37809015996122153, "grad_norm": 1.4933420419692993, "learning_rate": 3.9134570526642594e-05, "loss": 1.5366, "step": 780 }, { "epoch": 0.37857489093553076, "grad_norm": 1.4309301376342773, "learning_rate": 3.9099646259649364e-05, "loss": 1.3898, "step": 781 }, { "epoch": 0.37905962190984005, "grad_norm": 1.476951241493225, "learning_rate": 3.9064681595395634e-05, "loss": 1.4312, "step": 782 }, { "epoch": 0.3795443528841493, "grad_norm": 1.872909426689148, "learning_rate": 3.902967663405956e-05, "loss": 1.4043, "step": 783 }, { "epoch": 0.38002908385845857, "grad_norm": 1.5244536399841309, "learning_rate": 3.8994631475934775e-05, "loss": 1.382, "step": 784 }, { "epoch": 0.3805138148327678, "grad_norm": 1.5926803350448608, "learning_rate": 3.895954622143004e-05, "loss": 1.3973, "step": 785 }, { "epoch": 0.3809985458070771, "grad_norm": 1.583483338356018, "learning_rate": 3.8924420971069055e-05, "loss": 1.6504, "step": 786 }, { "epoch": 0.3814832767813863, "grad_norm": 1.3475099802017212, "learning_rate": 3.888925582549006e-05, "loss": 1.4933, "step": 787 }, { "epoch": 0.3819680077556956, "grad_norm": 1.3581137657165527, "learning_rate": 3.885405088544563e-05, "loss": 1.2155, "step": 788 }, { "epoch": 0.38245273873000485, "grad_norm": 1.462459921836853, "learning_rate": 3.8818806251802334e-05, "loss": 1.446, "step": 789 }, { "epoch": 0.3829374697043141, "grad_norm": 1.4395034313201904, "learning_rate": 3.878352202554051e-05, "loss": 1.3661, "step": 790 }, { "epoch": 0.38342220067862337, "grad_norm": 1.3227758407592773, "learning_rate": 3.8748198307753874e-05, "loss": 1.2516, "step": 791 }, { "epoch": 0.3839069316529326, "grad_norm": 1.5062460899353027, "learning_rate": 3.871283519964935e-05, "loss": 1.7564, "step": 792 }, { "epoch": 0.3843916626272419, "grad_norm": 1.4209660291671753, "learning_rate": 3.867743280254666e-05, "loss": 1.5597, "step": 793 }, { "epoch": 0.3848763936015511, "grad_norm": 1.5597984790802002, "learning_rate": 3.8641991217878154e-05, "loss": 1.4486, "step": 794 }, { "epoch": 0.3853611245758604, "grad_norm": 1.390090823173523, "learning_rate": 3.8606510547188425e-05, "loss": 1.361, "step": 795 }, { "epoch": 0.38584585555016965, "grad_norm": 1.511312484741211, "learning_rate": 3.857099089213405e-05, "loss": 1.4659, "step": 796 }, { "epoch": 0.38633058652447894, "grad_norm": 1.4220378398895264, "learning_rate": 3.8535432354483313e-05, "loss": 1.8297, "step": 797 }, { "epoch": 0.38681531749878817, "grad_norm": 1.4622609615325928, "learning_rate": 3.849983503611591e-05, "loss": 1.6799, "step": 798 }, { "epoch": 0.38730004847309746, "grad_norm": 1.4184774160385132, "learning_rate": 3.8464199039022605e-05, "loss": 1.3475, "step": 799 }, { "epoch": 0.3877847794474067, "grad_norm": 1.6148196458816528, "learning_rate": 3.842852446530505e-05, "loss": 1.5014, "step": 800 }, { "epoch": 0.3882695104217159, "grad_norm": 1.4410502910614014, "learning_rate": 3.839281141717538e-05, "loss": 1.6122, "step": 801 }, { "epoch": 0.3887542413960252, "grad_norm": 1.4458303451538086, "learning_rate": 3.835705999695595e-05, "loss": 1.3593, "step": 802 }, { "epoch": 0.38923897237033445, "grad_norm": 1.4514504671096802, "learning_rate": 3.832127030707909e-05, "loss": 1.2312, "step": 803 }, { "epoch": 0.38972370334464373, "grad_norm": 1.5186152458190918, "learning_rate": 3.828544245008677e-05, "loss": 1.5824, "step": 804 }, { "epoch": 0.39020843431895297, "grad_norm": 1.2924879789352417, "learning_rate": 3.82495765286303e-05, "loss": 1.3355, "step": 805 }, { "epoch": 0.39069316529326226, "grad_norm": 1.4316754341125488, "learning_rate": 3.821367264547006e-05, "loss": 1.4083, "step": 806 }, { "epoch": 0.3911778962675715, "grad_norm": 1.5610601902008057, "learning_rate": 3.817773090347519e-05, "loss": 1.662, "step": 807 }, { "epoch": 0.3916626272418808, "grad_norm": 1.4195690155029297, "learning_rate": 3.8141751405623317e-05, "loss": 1.4629, "step": 808 }, { "epoch": 0.39214735821619, "grad_norm": 1.4146546125411987, "learning_rate": 3.8105734255000214e-05, "loss": 1.452, "step": 809 }, { "epoch": 0.3926320891904993, "grad_norm": 1.45216965675354, "learning_rate": 3.806967955479955e-05, "loss": 1.2171, "step": 810 }, { "epoch": 0.39311682016480853, "grad_norm": 1.468102216720581, "learning_rate": 3.803358740832257e-05, "loss": 1.5747, "step": 811 }, { "epoch": 0.39360155113911777, "grad_norm": 1.314497947692871, "learning_rate": 3.7997457918977845e-05, "loss": 1.351, "step": 812 }, { "epoch": 0.39408628211342706, "grad_norm": 1.4871678352355957, "learning_rate": 3.796129119028087e-05, "loss": 1.2889, "step": 813 }, { "epoch": 0.3945710130877363, "grad_norm": 1.3904635906219482, "learning_rate": 3.79250873258539e-05, "loss": 1.3541, "step": 814 }, { "epoch": 0.3950557440620456, "grad_norm": 1.4073387384414673, "learning_rate": 3.7888846429425546e-05, "loss": 1.1039, "step": 815 }, { "epoch": 0.3955404750363548, "grad_norm": 1.6151351928710938, "learning_rate": 3.785256860483054e-05, "loss": 1.6067, "step": 816 }, { "epoch": 0.3960252060106641, "grad_norm": 1.456746220588684, "learning_rate": 3.781625395600943e-05, "loss": 1.4839, "step": 817 }, { "epoch": 0.39650993698497333, "grad_norm": 1.4020472764968872, "learning_rate": 3.7779902587008225e-05, "loss": 1.2439, "step": 818 }, { "epoch": 0.3969946679592826, "grad_norm": 1.5153536796569824, "learning_rate": 3.774351460197819e-05, "loss": 1.5316, "step": 819 }, { "epoch": 0.39747939893359185, "grad_norm": 1.8110442161560059, "learning_rate": 3.770709010517549e-05, "loss": 1.4559, "step": 820 }, { "epoch": 0.3979641299079011, "grad_norm": 1.497971773147583, "learning_rate": 3.767062920096086e-05, "loss": 1.587, "step": 821 }, { "epoch": 0.3984488608822104, "grad_norm": 1.3729140758514404, "learning_rate": 3.763413199379941e-05, "loss": 1.27, "step": 822 }, { "epoch": 0.3989335918565196, "grad_norm": 1.5104501247406006, "learning_rate": 3.7597598588260196e-05, "loss": 1.315, "step": 823 }, { "epoch": 0.3994183228308289, "grad_norm": 1.459794521331787, "learning_rate": 3.7561029089016055e-05, "loss": 1.4664, "step": 824 }, { "epoch": 0.39990305380513813, "grad_norm": 1.403060793876648, "learning_rate": 3.7524423600843186e-05, "loss": 1.6218, "step": 825 }, { "epoch": 0.4003877847794474, "grad_norm": 1.4324053525924683, "learning_rate": 3.7487782228620916e-05, "loss": 1.274, "step": 826 }, { "epoch": 0.40087251575375665, "grad_norm": 1.4029552936553955, "learning_rate": 3.7451105077331396e-05, "loss": 1.464, "step": 827 }, { "epoch": 0.40135724672806594, "grad_norm": 1.549617052078247, "learning_rate": 3.741439225205927e-05, "loss": 1.4068, "step": 828 }, { "epoch": 0.4018419777023752, "grad_norm": 1.4655847549438477, "learning_rate": 3.7377643857991416e-05, "loss": 1.5115, "step": 829 }, { "epoch": 0.40232670867668446, "grad_norm": 1.5959397554397583, "learning_rate": 3.7340860000416595e-05, "loss": 1.5153, "step": 830 }, { "epoch": 0.4028114396509937, "grad_norm": 1.6088857650756836, "learning_rate": 3.730404078472518e-05, "loss": 1.7593, "step": 831 }, { "epoch": 0.40329617062530293, "grad_norm": 1.4206966161727905, "learning_rate": 3.726718631640888e-05, "loss": 1.4746, "step": 832 }, { "epoch": 0.4037809015996122, "grad_norm": 1.4388668537139893, "learning_rate": 3.723029670106036e-05, "loss": 1.1222, "step": 833 }, { "epoch": 0.40426563257392145, "grad_norm": 1.5215039253234863, "learning_rate": 3.719337204437302e-05, "loss": 1.3856, "step": 834 }, { "epoch": 0.40475036354823074, "grad_norm": 1.470598578453064, "learning_rate": 3.7156412452140646e-05, "loss": 1.2346, "step": 835 }, { "epoch": 0.40523509452254, "grad_norm": 1.4196856021881104, "learning_rate": 3.711941803025712e-05, "loss": 1.2546, "step": 836 }, { "epoch": 0.40571982549684926, "grad_norm": 1.5476642847061157, "learning_rate": 3.708238888471611e-05, "loss": 1.7273, "step": 837 }, { "epoch": 0.4062045564711585, "grad_norm": 1.4293063879013062, "learning_rate": 3.704532512161079e-05, "loss": 1.5743, "step": 838 }, { "epoch": 0.4066892874454678, "grad_norm": 1.4300776720046997, "learning_rate": 3.700822684713349e-05, "loss": 1.3128, "step": 839 }, { "epoch": 0.407174018419777, "grad_norm": 1.404038667678833, "learning_rate": 3.697109416757544e-05, "loss": 1.1517, "step": 840 }, { "epoch": 0.4076587493940863, "grad_norm": 1.432827115058899, "learning_rate": 3.6933927189326435e-05, "loss": 1.3499, "step": 841 }, { "epoch": 0.40814348036839554, "grad_norm": 1.4601900577545166, "learning_rate": 3.689672601887455e-05, "loss": 1.4287, "step": 842 }, { "epoch": 0.4086282113427048, "grad_norm": 1.4678421020507812, "learning_rate": 3.685949076280583e-05, "loss": 1.5663, "step": 843 }, { "epoch": 0.40911294231701406, "grad_norm": 1.538097620010376, "learning_rate": 3.6822221527803934e-05, "loss": 1.6855, "step": 844 }, { "epoch": 0.4095976732913233, "grad_norm": 1.4171286821365356, "learning_rate": 3.678491842064995e-05, "loss": 1.6271, "step": 845 }, { "epoch": 0.4100824042656326, "grad_norm": 1.5117058753967285, "learning_rate": 3.674758154822194e-05, "loss": 1.356, "step": 846 }, { "epoch": 0.4105671352399418, "grad_norm": 1.442489743232727, "learning_rate": 3.671021101749476e-05, "loss": 1.3713, "step": 847 }, { "epoch": 0.4110518662142511, "grad_norm": 1.4731215238571167, "learning_rate": 3.667280693553967e-05, "loss": 1.4432, "step": 848 }, { "epoch": 0.41153659718856034, "grad_norm": 1.464311122894287, "learning_rate": 3.663536940952409e-05, "loss": 1.5066, "step": 849 }, { "epoch": 0.4120213281628696, "grad_norm": 1.3426711559295654, "learning_rate": 3.659789854671122e-05, "loss": 1.1671, "step": 850 }, { "epoch": 0.41250605913717886, "grad_norm": 1.665104866027832, "learning_rate": 3.6560394454459814e-05, "loss": 1.9996, "step": 851 }, { "epoch": 0.41299079011148815, "grad_norm": 1.2623895406723022, "learning_rate": 3.652285724022379e-05, "loss": 1.1562, "step": 852 }, { "epoch": 0.4134755210857974, "grad_norm": 1.361686110496521, "learning_rate": 3.648528701155203e-05, "loss": 1.278, "step": 853 }, { "epoch": 0.4139602520601066, "grad_norm": 1.4689781665802002, "learning_rate": 3.644768387608793e-05, "loss": 1.417, "step": 854 }, { "epoch": 0.4144449830344159, "grad_norm": 1.5472491979599, "learning_rate": 3.6410047941569224e-05, "loss": 1.4856, "step": 855 }, { "epoch": 0.41492971400872514, "grad_norm": 1.4804879426956177, "learning_rate": 3.637237931582759e-05, "loss": 1.7454, "step": 856 }, { "epoch": 0.4154144449830344, "grad_norm": 1.4140907526016235, "learning_rate": 3.633467810678839e-05, "loss": 1.4682, "step": 857 }, { "epoch": 0.41589917595734366, "grad_norm": 1.4889259338378906, "learning_rate": 3.629694442247032e-05, "loss": 1.594, "step": 858 }, { "epoch": 0.41638390693165295, "grad_norm": 1.4880090951919556, "learning_rate": 3.6259178370985144e-05, "loss": 1.4029, "step": 859 }, { "epoch": 0.4168686379059622, "grad_norm": 1.4780687093734741, "learning_rate": 3.6221380060537333e-05, "loss": 1.4561, "step": 860 }, { "epoch": 0.41735336888027147, "grad_norm": 1.4494913816452026, "learning_rate": 3.6183549599423815e-05, "loss": 1.4076, "step": 861 }, { "epoch": 0.4178380998545807, "grad_norm": 1.416123390197754, "learning_rate": 3.6145687096033634e-05, "loss": 1.3328, "step": 862 }, { "epoch": 0.41832283082889, "grad_norm": 1.4615715742111206, "learning_rate": 3.6107792658847595e-05, "loss": 1.3547, "step": 863 }, { "epoch": 0.4188075618031992, "grad_norm": 1.3456425666809082, "learning_rate": 3.606986639643805e-05, "loss": 1.6143, "step": 864 }, { "epoch": 0.41929229277750846, "grad_norm": 1.4085299968719482, "learning_rate": 3.603190841746851e-05, "loss": 1.4381, "step": 865 }, { "epoch": 0.41977702375181775, "grad_norm": 1.3223602771759033, "learning_rate": 3.599391883069335e-05, "loss": 1.1998, "step": 866 }, { "epoch": 0.420261754726127, "grad_norm": 1.3724223375320435, "learning_rate": 3.595589774495753e-05, "loss": 1.3599, "step": 867 }, { "epoch": 0.42074648570043627, "grad_norm": 1.5481332540512085, "learning_rate": 3.591784526919624e-05, "loss": 1.6104, "step": 868 }, { "epoch": 0.4212312166747455, "grad_norm": 1.4483201503753662, "learning_rate": 3.58797615124346e-05, "loss": 1.3523, "step": 869 }, { "epoch": 0.4217159476490548, "grad_norm": 2.079836845397949, "learning_rate": 3.584164658378738e-05, "loss": 1.6446, "step": 870 }, { "epoch": 0.422200678623364, "grad_norm": 1.3948813676834106, "learning_rate": 3.580350059245864e-05, "loss": 1.3267, "step": 871 }, { "epoch": 0.4226854095976733, "grad_norm": 1.3875503540039062, "learning_rate": 3.576532364774145e-05, "loss": 1.2546, "step": 872 }, { "epoch": 0.42317014057198254, "grad_norm": 1.590307354927063, "learning_rate": 3.572711585901755e-05, "loss": 1.733, "step": 873 }, { "epoch": 0.42365487154629183, "grad_norm": 1.4814637899398804, "learning_rate": 3.568887733575706e-05, "loss": 1.5166, "step": 874 }, { "epoch": 0.42413960252060107, "grad_norm": 1.3917155265808105, "learning_rate": 3.565060818751816e-05, "loss": 1.2831, "step": 875 }, { "epoch": 0.4246243334949103, "grad_norm": 1.4022222757339478, "learning_rate": 3.561230852394679e-05, "loss": 1.2607, "step": 876 }, { "epoch": 0.4251090644692196, "grad_norm": 1.503482460975647, "learning_rate": 3.55739784547763e-05, "loss": 1.5367, "step": 877 }, { "epoch": 0.4255937954435288, "grad_norm": 1.544471025466919, "learning_rate": 3.553561808982715e-05, "loss": 1.5024, "step": 878 }, { "epoch": 0.4260785264178381, "grad_norm": 1.434191346168518, "learning_rate": 3.5497227539006614e-05, "loss": 1.327, "step": 879 }, { "epoch": 0.42656325739214734, "grad_norm": 1.3471134901046753, "learning_rate": 3.545880691230846e-05, "loss": 1.263, "step": 880 }, { "epoch": 0.42704798836645663, "grad_norm": 1.3368321657180786, "learning_rate": 3.542035631981261e-05, "loss": 1.3815, "step": 881 }, { "epoch": 0.42753271934076587, "grad_norm": 1.9717825651168823, "learning_rate": 3.538187587168486e-05, "loss": 1.6113, "step": 882 }, { "epoch": 0.42801745031507515, "grad_norm": 1.412209153175354, "learning_rate": 3.534336567817651e-05, "loss": 1.2464, "step": 883 }, { "epoch": 0.4285021812893844, "grad_norm": 1.6004551649093628, "learning_rate": 3.530482584962414e-05, "loss": 1.7197, "step": 884 }, { "epoch": 0.4289869122636937, "grad_norm": 1.4312201738357544, "learning_rate": 3.5266256496449186e-05, "loss": 1.4607, "step": 885 }, { "epoch": 0.4294716432380029, "grad_norm": 1.418025016784668, "learning_rate": 3.5227657729157705e-05, "loss": 1.441, "step": 886 }, { "epoch": 0.42995637421231214, "grad_norm": 1.4603976011276245, "learning_rate": 3.5189029658340025e-05, "loss": 1.472, "step": 887 }, { "epoch": 0.43044110518662143, "grad_norm": 1.4871946573257446, "learning_rate": 3.5150372394670426e-05, "loss": 1.252, "step": 888 }, { "epoch": 0.43092583616093066, "grad_norm": 2.187009811401367, "learning_rate": 3.5111686048906835e-05, "loss": 1.3653, "step": 889 }, { "epoch": 0.43141056713523995, "grad_norm": 1.4901810884475708, "learning_rate": 3.5072970731890486e-05, "loss": 1.7895, "step": 890 }, { "epoch": 0.4318952981095492, "grad_norm": 1.3609486818313599, "learning_rate": 3.5034226554545656e-05, "loss": 1.2407, "step": 891 }, { "epoch": 0.4323800290838585, "grad_norm": 1.5160627365112305, "learning_rate": 3.499545362787927e-05, "loss": 1.4348, "step": 892 }, { "epoch": 0.4328647600581677, "grad_norm": 1.4251633882522583, "learning_rate": 3.495665206298065e-05, "loss": 1.4585, "step": 893 }, { "epoch": 0.433349491032477, "grad_norm": 1.4607558250427246, "learning_rate": 3.491782197102115e-05, "loss": 1.3888, "step": 894 }, { "epoch": 0.43383422200678623, "grad_norm": 1.49302077293396, "learning_rate": 3.487896346325389e-05, "loss": 1.604, "step": 895 }, { "epoch": 0.4343189529810955, "grad_norm": 1.5757123231887817, "learning_rate": 3.484007665101336e-05, "loss": 1.5465, "step": 896 }, { "epoch": 0.43480368395540475, "grad_norm": 1.4108710289001465, "learning_rate": 3.480116164571519e-05, "loss": 1.4156, "step": 897 }, { "epoch": 0.435288414929714, "grad_norm": 1.2154583930969238, "learning_rate": 3.476221855885576e-05, "loss": 1.4186, "step": 898 }, { "epoch": 0.4357731459040233, "grad_norm": 1.456335425376892, "learning_rate": 3.47232475020119e-05, "loss": 1.5292, "step": 899 }, { "epoch": 0.4362578768783325, "grad_norm": 1.4850857257843018, "learning_rate": 3.468424858684061e-05, "loss": 1.1836, "step": 900 }, { "epoch": 0.4367426078526418, "grad_norm": 1.4295960664749146, "learning_rate": 3.4645221925078674e-05, "loss": 1.3047, "step": 901 }, { "epoch": 0.43722733882695103, "grad_norm": 1.5569299459457397, "learning_rate": 3.4606167628542395e-05, "loss": 1.3896, "step": 902 }, { "epoch": 0.4377120698012603, "grad_norm": 1.4047545194625854, "learning_rate": 3.456708580912725e-05, "loss": 1.5364, "step": 903 }, { "epoch": 0.43819680077556955, "grad_norm": 2.0098063945770264, "learning_rate": 3.452797657880756e-05, "loss": 1.7044, "step": 904 }, { "epoch": 0.43868153174987884, "grad_norm": 1.4840755462646484, "learning_rate": 3.4488840049636195e-05, "loss": 1.4837, "step": 905 }, { "epoch": 0.4391662627241881, "grad_norm": 1.7989381551742554, "learning_rate": 3.444967633374424e-05, "loss": 1.6995, "step": 906 }, { "epoch": 0.43965099369849736, "grad_norm": 1.4424799680709839, "learning_rate": 3.441048554334066e-05, "loss": 1.397, "step": 907 }, { "epoch": 0.4401357246728066, "grad_norm": 1.501582145690918, "learning_rate": 3.4371267790712e-05, "loss": 1.5452, "step": 908 }, { "epoch": 0.4406204556471158, "grad_norm": 1.461760401725769, "learning_rate": 3.433202318822207e-05, "loss": 1.581, "step": 909 }, { "epoch": 0.4411051866214251, "grad_norm": 1.3767198324203491, "learning_rate": 3.429275184831158e-05, "loss": 1.4953, "step": 910 }, { "epoch": 0.44158991759573435, "grad_norm": 1.3784865140914917, "learning_rate": 3.425345388349786e-05, "loss": 1.4367, "step": 911 }, { "epoch": 0.44207464857004364, "grad_norm": 1.4725292921066284, "learning_rate": 3.421412940637453e-05, "loss": 1.3251, "step": 912 }, { "epoch": 0.44255937954435287, "grad_norm": 1.5235271453857422, "learning_rate": 3.417477852961116e-05, "loss": 1.7435, "step": 913 }, { "epoch": 0.44304411051866216, "grad_norm": 1.5075349807739258, "learning_rate": 3.413540136595296e-05, "loss": 1.3295, "step": 914 }, { "epoch": 0.4435288414929714, "grad_norm": 1.5114574432373047, "learning_rate": 3.409599802822047e-05, "loss": 1.6123, "step": 915 }, { "epoch": 0.4440135724672807, "grad_norm": 1.4881386756896973, "learning_rate": 3.40565686293092e-05, "loss": 1.5483, "step": 916 }, { "epoch": 0.4444983034415899, "grad_norm": 1.7126438617706299, "learning_rate": 3.401711328218934e-05, "loss": 1.7761, "step": 917 }, { "epoch": 0.4449830344158992, "grad_norm": 1.6697680950164795, "learning_rate": 3.397763209990542e-05, "loss": 1.5235, "step": 918 }, { "epoch": 0.44546776539020844, "grad_norm": 1.4794918298721313, "learning_rate": 3.3938125195576e-05, "loss": 1.468, "step": 919 }, { "epoch": 0.44595249636451767, "grad_norm": 1.6914706230163574, "learning_rate": 3.3898592682393336e-05, "loss": 1.5126, "step": 920 }, { "epoch": 0.44643722733882696, "grad_norm": 1.4983173608779907, "learning_rate": 3.3859034673623045e-05, "loss": 1.1858, "step": 921 }, { "epoch": 0.4469219583131362, "grad_norm": 1.5133384466171265, "learning_rate": 3.3819451282603805e-05, "loss": 1.3387, "step": 922 }, { "epoch": 0.4474066892874455, "grad_norm": 1.416934609413147, "learning_rate": 3.377984262274701e-05, "loss": 1.3361, "step": 923 }, { "epoch": 0.4478914202617547, "grad_norm": 1.4019713401794434, "learning_rate": 3.3740208807536446e-05, "loss": 1.5387, "step": 924 }, { "epoch": 0.448376151236064, "grad_norm": 1.4528837203979492, "learning_rate": 3.3700549950527994e-05, "loss": 1.332, "step": 925 }, { "epoch": 0.44886088221037324, "grad_norm": 1.435003638267517, "learning_rate": 3.366086616534925e-05, "loss": 1.2677, "step": 926 }, { "epoch": 0.4493456131846825, "grad_norm": 1.4747051000595093, "learning_rate": 3.362115756569926e-05, "loss": 1.2977, "step": 927 }, { "epoch": 0.44983034415899176, "grad_norm": 1.4179006814956665, "learning_rate": 3.358142426534817e-05, "loss": 1.3164, "step": 928 }, { "epoch": 0.450315075133301, "grad_norm": 1.5167887210845947, "learning_rate": 3.354166637813687e-05, "loss": 1.8859, "step": 929 }, { "epoch": 0.4507998061076103, "grad_norm": 1.4719829559326172, "learning_rate": 3.350188401797672e-05, "loss": 1.4828, "step": 930 }, { "epoch": 0.4512845370819195, "grad_norm": 1.4412150382995605, "learning_rate": 3.346207729884918e-05, "loss": 1.4545, "step": 931 }, { "epoch": 0.4517692680562288, "grad_norm": 1.426696538925171, "learning_rate": 3.34222463348055e-05, "loss": 1.4738, "step": 932 }, { "epoch": 0.45225399903053803, "grad_norm": 1.7029757499694824, "learning_rate": 3.338239123996642e-05, "loss": 1.7937, "step": 933 }, { "epoch": 0.4527387300048473, "grad_norm": 1.4222897291183472, "learning_rate": 3.3342512128521794e-05, "loss": 1.4876, "step": 934 }, { "epoch": 0.45322346097915656, "grad_norm": 1.3874694108963013, "learning_rate": 3.33026091147303e-05, "loss": 1.2369, "step": 935 }, { "epoch": 0.45370819195346584, "grad_norm": 1.348215103149414, "learning_rate": 3.3262682312919084e-05, "loss": 1.3298, "step": 936 }, { "epoch": 0.4541929229277751, "grad_norm": 1.54569673538208, "learning_rate": 3.322273183748346e-05, "loss": 1.4331, "step": 937 }, { "epoch": 0.45467765390208437, "grad_norm": 1.403524398803711, "learning_rate": 3.318275780288656e-05, "loss": 1.1923, "step": 938 }, { "epoch": 0.4551623848763936, "grad_norm": 1.4105076789855957, "learning_rate": 3.3142760323659036e-05, "loss": 1.3577, "step": 939 }, { "epoch": 0.45564711585070283, "grad_norm": 1.3613835573196411, "learning_rate": 3.310273951439869e-05, "loss": 1.3631, "step": 940 }, { "epoch": 0.4561318468250121, "grad_norm": 1.5017019510269165, "learning_rate": 3.3062695489770175e-05, "loss": 1.4085, "step": 941 }, { "epoch": 0.45661657779932135, "grad_norm": 1.409903883934021, "learning_rate": 3.302262836450466e-05, "loss": 1.3829, "step": 942 }, { "epoch": 0.45710130877363064, "grad_norm": 1.4461843967437744, "learning_rate": 3.29825382533995e-05, "loss": 1.4971, "step": 943 }, { "epoch": 0.4575860397479399, "grad_norm": 1.486220359802246, "learning_rate": 3.2942425271317914e-05, "loss": 1.4184, "step": 944 }, { "epoch": 0.45807077072224917, "grad_norm": 1.5135023593902588, "learning_rate": 3.2902289533188634e-05, "loss": 1.4369, "step": 945 }, { "epoch": 0.4585555016965584, "grad_norm": 1.5079814195632935, "learning_rate": 3.28621311540056e-05, "loss": 1.5486, "step": 946 }, { "epoch": 0.4590402326708677, "grad_norm": 1.4888428449630737, "learning_rate": 3.282195024882764e-05, "loss": 1.2889, "step": 947 }, { "epoch": 0.4595249636451769, "grad_norm": 1.3688241243362427, "learning_rate": 3.2781746932778076e-05, "loss": 1.2063, "step": 948 }, { "epoch": 0.4600096946194862, "grad_norm": 1.4846850633621216, "learning_rate": 3.274152132104447e-05, "loss": 1.5821, "step": 949 }, { "epoch": 0.46049442559379544, "grad_norm": 1.5739784240722656, "learning_rate": 3.270127352887828e-05, "loss": 1.4057, "step": 950 }, { "epoch": 0.4609791565681047, "grad_norm": 1.5579380989074707, "learning_rate": 3.266100367159448e-05, "loss": 1.5043, "step": 951 }, { "epoch": 0.46146388754241396, "grad_norm": 1.4212524890899658, "learning_rate": 3.2620711864571274e-05, "loss": 1.3979, "step": 952 }, { "epoch": 0.4619486185167232, "grad_norm": 1.4628902673721313, "learning_rate": 3.258039822324977e-05, "loss": 1.377, "step": 953 }, { "epoch": 0.4624333494910325, "grad_norm": 1.5454577207565308, "learning_rate": 3.254006286313362e-05, "loss": 1.4185, "step": 954 }, { "epoch": 0.4629180804653417, "grad_norm": 1.8712515830993652, "learning_rate": 3.24997058997887e-05, "loss": 1.6019, "step": 955 }, { "epoch": 0.463402811439651, "grad_norm": 1.3647255897521973, "learning_rate": 3.245932744884278e-05, "loss": 1.1018, "step": 956 }, { "epoch": 0.46388754241396024, "grad_norm": 1.5049718618392944, "learning_rate": 3.241892762598522e-05, "loss": 1.5208, "step": 957 }, { "epoch": 0.46437227338826953, "grad_norm": 1.5857036113739014, "learning_rate": 3.237850654696659e-05, "loss": 1.33, "step": 958 }, { "epoch": 0.46485700436257876, "grad_norm": 1.4404864311218262, "learning_rate": 3.233806432759837e-05, "loss": 1.3034, "step": 959 }, { "epoch": 0.46534173533688805, "grad_norm": 1.4808937311172485, "learning_rate": 3.22976010837526e-05, "loss": 1.5562, "step": 960 }, { "epoch": 0.4658264663111973, "grad_norm": 1.636323094367981, "learning_rate": 3.225711693136156e-05, "loss": 1.2444, "step": 961 }, { "epoch": 0.4663111972855065, "grad_norm": 1.4531266689300537, "learning_rate": 3.221661198641745e-05, "loss": 1.2632, "step": 962 }, { "epoch": 0.4667959282598158, "grad_norm": 1.3352887630462646, "learning_rate": 3.217608636497203e-05, "loss": 1.2413, "step": 963 }, { "epoch": 0.46728065923412504, "grad_norm": 1.3544855117797852, "learning_rate": 3.213554018313631e-05, "loss": 1.2552, "step": 964 }, { "epoch": 0.46776539020843433, "grad_norm": 1.5176138877868652, "learning_rate": 3.209497355708019e-05, "loss": 1.4344, "step": 965 }, { "epoch": 0.46825012118274356, "grad_norm": 1.3940963745117188, "learning_rate": 3.205438660303216e-05, "loss": 1.418, "step": 966 }, { "epoch": 0.46873485215705285, "grad_norm": 1.6681914329528809, "learning_rate": 3.201377943727896e-05, "loss": 1.5961, "step": 967 }, { "epoch": 0.4692195831313621, "grad_norm": 1.3219623565673828, "learning_rate": 3.1973152176165224e-05, "loss": 1.1749, "step": 968 }, { "epoch": 0.46970431410567137, "grad_norm": 1.4592280387878418, "learning_rate": 3.1932504936093167e-05, "loss": 1.6801, "step": 969 }, { "epoch": 0.4701890450799806, "grad_norm": 1.471772313117981, "learning_rate": 3.189183783352224e-05, "loss": 1.4001, "step": 970 }, { "epoch": 0.4706737760542899, "grad_norm": 1.5347093343734741, "learning_rate": 3.1851150984968814e-05, "loss": 1.4215, "step": 971 }, { "epoch": 0.4711585070285991, "grad_norm": 1.481303334236145, "learning_rate": 3.1810444507005824e-05, "loss": 1.4193, "step": 972 }, { "epoch": 0.47164323800290836, "grad_norm": 1.6206600666046143, "learning_rate": 3.1769718516262466e-05, "loss": 1.4644, "step": 973 }, { "epoch": 0.47212796897721765, "grad_norm": 1.437705159187317, "learning_rate": 3.172897312942381e-05, "loss": 1.4197, "step": 974 }, { "epoch": 0.4726126999515269, "grad_norm": 1.5616474151611328, "learning_rate": 3.168820846323053e-05, "loss": 1.574, "step": 975 }, { "epoch": 0.47309743092583617, "grad_norm": 1.4508112668991089, "learning_rate": 3.16474246344785e-05, "loss": 1.5167, "step": 976 }, { "epoch": 0.4735821619001454, "grad_norm": 1.4133033752441406, "learning_rate": 3.1606621760018554e-05, "loss": 1.3912, "step": 977 }, { "epoch": 0.4740668928744547, "grad_norm": 1.5243926048278809, "learning_rate": 3.156579995675603e-05, "loss": 1.1586, "step": 978 }, { "epoch": 0.4745516238487639, "grad_norm": 1.3419501781463623, "learning_rate": 3.152495934165055e-05, "loss": 1.434, "step": 979 }, { "epoch": 0.4750363548230732, "grad_norm": 1.5045238733291626, "learning_rate": 3.148410003171561e-05, "loss": 1.2983, "step": 980 }, { "epoch": 0.47552108579738245, "grad_norm": 1.3928799629211426, "learning_rate": 3.1443222144018265e-05, "loss": 1.7629, "step": 981 }, { "epoch": 0.47600581677169174, "grad_norm": 1.5783344507217407, "learning_rate": 3.1402325795678814e-05, "loss": 1.4108, "step": 982 }, { "epoch": 0.47649054774600097, "grad_norm": 1.3899141550064087, "learning_rate": 3.1361411103870455e-05, "loss": 1.3194, "step": 983 }, { "epoch": 0.4769752787203102, "grad_norm": 1.438714623451233, "learning_rate": 3.1320478185818896e-05, "loss": 1.6322, "step": 984 }, { "epoch": 0.4774600096946195, "grad_norm": 1.579592227935791, "learning_rate": 3.127952715880212e-05, "loss": 1.7614, "step": 985 }, { "epoch": 0.4779447406689287, "grad_norm": 1.5455819368362427, "learning_rate": 3.1238558140149964e-05, "loss": 1.1737, "step": 986 }, { "epoch": 0.478429471643238, "grad_norm": 1.4268600940704346, "learning_rate": 3.119757124724384e-05, "loss": 1.3511, "step": 987 }, { "epoch": 0.47891420261754725, "grad_norm": 1.4414108991622925, "learning_rate": 3.115656659751632e-05, "loss": 1.6027, "step": 988 }, { "epoch": 0.47939893359185654, "grad_norm": 1.382475733757019, "learning_rate": 3.11155443084509e-05, "loss": 1.3433, "step": 989 }, { "epoch": 0.47988366456616577, "grad_norm": 1.3869885206222534, "learning_rate": 3.10745044975816e-05, "loss": 1.5374, "step": 990 }, { "epoch": 0.48036839554047506, "grad_norm": 1.4492113590240479, "learning_rate": 3.1033447282492646e-05, "loss": 1.4005, "step": 991 }, { "epoch": 0.4808531265147843, "grad_norm": 1.4593931436538696, "learning_rate": 3.0992372780818113e-05, "loss": 1.4781, "step": 992 }, { "epoch": 0.4813378574890936, "grad_norm": 1.2960480451583862, "learning_rate": 3.0951281110241634e-05, "loss": 1.1956, "step": 993 }, { "epoch": 0.4818225884634028, "grad_norm": 3.1581783294677734, "learning_rate": 3.0910172388496e-05, "loss": 1.2586, "step": 994 }, { "epoch": 0.48230731943771205, "grad_norm": 1.422606348991394, "learning_rate": 3.086904673336287e-05, "loss": 1.3854, "step": 995 }, { "epoch": 0.48279205041202133, "grad_norm": 1.3689360618591309, "learning_rate": 3.082790426267243e-05, "loss": 0.9986, "step": 996 }, { "epoch": 0.48327678138633057, "grad_norm": 1.3445098400115967, "learning_rate": 3.0786745094303035e-05, "loss": 1.5066, "step": 997 }, { "epoch": 0.48376151236063986, "grad_norm": 1.4405405521392822, "learning_rate": 3.0745569346180876e-05, "loss": 1.4553, "step": 998 }, { "epoch": 0.4842462433349491, "grad_norm": 1.8509117364883423, "learning_rate": 3.070437713627965e-05, "loss": 1.5544, "step": 999 }, { "epoch": 0.4847309743092584, "grad_norm": 1.540792465209961, "learning_rate": 3.066316858262023e-05, "loss": 1.8954, "step": 1000 }, { "epoch": 0.4852157052835676, "grad_norm": 1.587246060371399, "learning_rate": 3.0621943803270295e-05, "loss": 1.3981, "step": 1001 }, { "epoch": 0.4857004362578769, "grad_norm": 1.6329646110534668, "learning_rate": 3.058070291634403e-05, "loss": 1.795, "step": 1002 }, { "epoch": 0.48618516723218613, "grad_norm": 1.5801475048065186, "learning_rate": 3.053944604000177e-05, "loss": 1.7153, "step": 1003 }, { "epoch": 0.4866698982064954, "grad_norm": 1.7902394533157349, "learning_rate": 3.0498173292449643e-05, "loss": 1.5319, "step": 1004 }, { "epoch": 0.48715462918080465, "grad_norm": 1.4669686555862427, "learning_rate": 3.0456884791939278e-05, "loss": 1.5914, "step": 1005 }, { "epoch": 0.4876393601551139, "grad_norm": 4.603597640991211, "learning_rate": 3.041558065676742e-05, "loss": 1.7194, "step": 1006 }, { "epoch": 0.4881240911294232, "grad_norm": 1.4835340976715088, "learning_rate": 3.0374261005275607e-05, "loss": 1.673, "step": 1007 }, { "epoch": 0.4886088221037324, "grad_norm": 1.3735729455947876, "learning_rate": 3.0332925955849844e-05, "loss": 1.3659, "step": 1008 }, { "epoch": 0.4890935530780417, "grad_norm": 1.5638624429702759, "learning_rate": 3.0291575626920243e-05, "loss": 1.4514, "step": 1009 }, { "epoch": 0.48957828405235093, "grad_norm": 1.4114654064178467, "learning_rate": 3.025021013696071e-05, "loss": 1.4997, "step": 1010 }, { "epoch": 0.4900630150266602, "grad_norm": 1.3967055082321167, "learning_rate": 3.0208829604488563e-05, "loss": 1.4588, "step": 1011 }, { "epoch": 0.49054774600096945, "grad_norm": 1.719627022743225, "learning_rate": 3.0167434148064254e-05, "loss": 1.6016, "step": 1012 }, { "epoch": 0.49103247697527874, "grad_norm": 1.463747262954712, "learning_rate": 3.0126023886290955e-05, "loss": 1.687, "step": 1013 }, { "epoch": 0.491517207949588, "grad_norm": 1.4238579273223877, "learning_rate": 3.008459893781429e-05, "loss": 1.3301, "step": 1014 }, { "epoch": 0.49200193892389726, "grad_norm": 1.401583194732666, "learning_rate": 3.004315942132194e-05, "loss": 1.6042, "step": 1015 }, { "epoch": 0.4924866698982065, "grad_norm": 1.4220138788223267, "learning_rate": 3.0001705455543326e-05, "loss": 1.3374, "step": 1016 }, { "epoch": 0.49297140087251573, "grad_norm": 1.4294856786727905, "learning_rate": 2.99602371592493e-05, "loss": 1.4055, "step": 1017 }, { "epoch": 0.493456131846825, "grad_norm": 1.5217885971069336, "learning_rate": 2.9918754651251723e-05, "loss": 1.4845, "step": 1018 }, { "epoch": 0.49394086282113425, "grad_norm": 1.4662128686904907, "learning_rate": 2.9877258050403212e-05, "loss": 1.2869, "step": 1019 }, { "epoch": 0.49442559379544354, "grad_norm": 1.4692763090133667, "learning_rate": 2.9835747475596743e-05, "loss": 1.3966, "step": 1020 }, { "epoch": 0.4949103247697528, "grad_norm": 1.358508586883545, "learning_rate": 2.979422304576534e-05, "loss": 1.3493, "step": 1021 }, { "epoch": 0.49539505574406206, "grad_norm": 1.510632038116455, "learning_rate": 2.9752684879881725e-05, "loss": 1.6699, "step": 1022 }, { "epoch": 0.4958797867183713, "grad_norm": 1.507602334022522, "learning_rate": 2.9711133096957962e-05, "loss": 1.6718, "step": 1023 }, { "epoch": 0.4963645176926806, "grad_norm": 1.3191126585006714, "learning_rate": 2.966956781604513e-05, "loss": 1.2832, "step": 1024 }, { "epoch": 0.4968492486669898, "grad_norm": 1.4080357551574707, "learning_rate": 2.9627989156233006e-05, "loss": 1.4744, "step": 1025 }, { "epoch": 0.4973339796412991, "grad_norm": 1.4639397859573364, "learning_rate": 2.9586397236649666e-05, "loss": 1.5554, "step": 1026 }, { "epoch": 0.49781871061560834, "grad_norm": 1.427006721496582, "learning_rate": 2.9544792176461205e-05, "loss": 1.8437, "step": 1027 }, { "epoch": 0.4983034415899176, "grad_norm": 1.4412407875061035, "learning_rate": 2.9503174094871344e-05, "loss": 1.5907, "step": 1028 }, { "epoch": 0.49878817256422686, "grad_norm": 1.5524944067001343, "learning_rate": 2.9461543111121128e-05, "loss": 1.5992, "step": 1029 }, { "epoch": 0.4992729035385361, "grad_norm": 1.46015202999115, "learning_rate": 2.941989934448856e-05, "loss": 1.3932, "step": 1030 }, { "epoch": 0.4997576345128454, "grad_norm": 1.3838374614715576, "learning_rate": 2.9378242914288272e-05, "loss": 1.2414, "step": 1031 }, { "epoch": 0.5002423654871546, "grad_norm": 1.40300714969635, "learning_rate": 2.9336573939871186e-05, "loss": 1.2298, "step": 1032 }, { "epoch": 0.5007270964614638, "grad_norm": 1.3450630903244019, "learning_rate": 2.9294892540624147e-05, "loss": 1.6257, "step": 1033 }, { "epoch": 0.5012118274357732, "grad_norm": 1.4591161012649536, "learning_rate": 2.9253198835969607e-05, "loss": 1.5271, "step": 1034 }, { "epoch": 0.5016965584100824, "grad_norm": 1.4087638854980469, "learning_rate": 2.9211492945365288e-05, "loss": 1.2425, "step": 1035 }, { "epoch": 0.5021812893843917, "grad_norm": 1.501910924911499, "learning_rate": 2.9169774988303805e-05, "loss": 1.5748, "step": 1036 }, { "epoch": 0.5026660203587009, "grad_norm": 1.5783244371414185, "learning_rate": 2.9128045084312344e-05, "loss": 1.3556, "step": 1037 }, { "epoch": 0.5031507513330101, "grad_norm": 1.54836905002594, "learning_rate": 2.908630335295235e-05, "loss": 1.6043, "step": 1038 }, { "epoch": 0.5036354823073195, "grad_norm": 1.2901034355163574, "learning_rate": 2.9044549913819124e-05, "loss": 1.5115, "step": 1039 }, { "epoch": 0.5041202132816287, "grad_norm": 1.3442738056182861, "learning_rate": 2.9002784886541517e-05, "loss": 1.4688, "step": 1040 }, { "epoch": 0.5046049442559379, "grad_norm": 1.411210298538208, "learning_rate": 2.8961008390781603e-05, "loss": 1.5606, "step": 1041 }, { "epoch": 0.5050896752302472, "grad_norm": 1.4159587621688843, "learning_rate": 2.8919220546234282e-05, "loss": 1.498, "step": 1042 }, { "epoch": 0.5055744062045565, "grad_norm": 1.3815311193466187, "learning_rate": 2.8877421472626996e-05, "loss": 1.2571, "step": 1043 }, { "epoch": 0.5060591371788657, "grad_norm": 1.5053796768188477, "learning_rate": 2.8835611289719345e-05, "loss": 1.336, "step": 1044 }, { "epoch": 0.506543868153175, "grad_norm": 1.4755936861038208, "learning_rate": 2.8793790117302765e-05, "loss": 1.2908, "step": 1045 }, { "epoch": 0.5070285991274842, "grad_norm": 1.34824800491333, "learning_rate": 2.8751958075200185e-05, "loss": 1.5134, "step": 1046 }, { "epoch": 0.5075133301017936, "grad_norm": 1.3312855958938599, "learning_rate": 2.8710115283265655e-05, "loss": 1.3463, "step": 1047 }, { "epoch": 0.5079980610761028, "grad_norm": 1.3728924989700317, "learning_rate": 2.8668261861384045e-05, "loss": 1.7027, "step": 1048 }, { "epoch": 0.508482792050412, "grad_norm": 1.781646728515625, "learning_rate": 2.8626397929470672e-05, "loss": 1.2995, "step": 1049 }, { "epoch": 0.5089675230247213, "grad_norm": 1.4653760194778442, "learning_rate": 2.8584523607470976e-05, "loss": 1.3966, "step": 1050 }, { "epoch": 0.5094522539990305, "grad_norm": 1.5125970840454102, "learning_rate": 2.854263901536015e-05, "loss": 1.4322, "step": 1051 }, { "epoch": 0.5099369849733398, "grad_norm": 1.3065565824508667, "learning_rate": 2.8500744273142833e-05, "loss": 1.4568, "step": 1052 }, { "epoch": 0.5104217159476491, "grad_norm": 1.6204395294189453, "learning_rate": 2.845883950085271e-05, "loss": 1.5351, "step": 1053 }, { "epoch": 0.5109064469219583, "grad_norm": 1.3245327472686768, "learning_rate": 2.8416924818552238e-05, "loss": 1.4935, "step": 1054 }, { "epoch": 0.5113911778962675, "grad_norm": 1.4986622333526611, "learning_rate": 2.8375000346332255e-05, "loss": 1.3269, "step": 1055 }, { "epoch": 0.5118759088705769, "grad_norm": 1.5058215856552124, "learning_rate": 2.8333066204311654e-05, "loss": 1.7093, "step": 1056 }, { "epoch": 0.5123606398448861, "grad_norm": 1.4276849031448364, "learning_rate": 2.829112251263702e-05, "loss": 1.5234, "step": 1057 }, { "epoch": 0.5128453708191953, "grad_norm": 1.4878147840499878, "learning_rate": 2.824916939148231e-05, "loss": 1.1303, "step": 1058 }, { "epoch": 0.5133301017935046, "grad_norm": 1.4708224534988403, "learning_rate": 2.8207206961048494e-05, "loss": 1.5127, "step": 1059 }, { "epoch": 0.5138148327678138, "grad_norm": 1.4829494953155518, "learning_rate": 2.8165235341563212e-05, "loss": 1.1638, "step": 1060 }, { "epoch": 0.5142995637421232, "grad_norm": 1.3504929542541504, "learning_rate": 2.8123254653280445e-05, "loss": 1.5974, "step": 1061 }, { "epoch": 0.5147842947164324, "grad_norm": 1.5481303930282593, "learning_rate": 2.8081265016480137e-05, "loss": 1.4454, "step": 1062 }, { "epoch": 0.5152690256907416, "grad_norm": 1.4048713445663452, "learning_rate": 2.8039266551467873e-05, "loss": 1.5425, "step": 1063 }, { "epoch": 0.5157537566650509, "grad_norm": 1.3949064016342163, "learning_rate": 2.7997259378574564e-05, "loss": 1.2964, "step": 1064 }, { "epoch": 0.5162384876393602, "grad_norm": 1.4568665027618408, "learning_rate": 2.7955243618156023e-05, "loss": 1.5504, "step": 1065 }, { "epoch": 0.5167232186136694, "grad_norm": 1.3728277683258057, "learning_rate": 2.7913219390592704e-05, "loss": 1.19, "step": 1066 }, { "epoch": 0.5172079495879787, "grad_norm": 1.5279256105422974, "learning_rate": 2.787118681628929e-05, "loss": 1.737, "step": 1067 }, { "epoch": 0.5176926805622879, "grad_norm": 1.4620921611785889, "learning_rate": 2.7829146015674406e-05, "loss": 1.461, "step": 1068 }, { "epoch": 0.5181774115365972, "grad_norm": 1.5227320194244385, "learning_rate": 2.778709710920024e-05, "loss": 1.3925, "step": 1069 }, { "epoch": 0.5186621425109065, "grad_norm": 1.4971587657928467, "learning_rate": 2.7745040217342195e-05, "loss": 1.4625, "step": 1070 }, { "epoch": 0.5191468734852157, "grad_norm": 1.531641960144043, "learning_rate": 2.7702975460598547e-05, "loss": 1.3207, "step": 1071 }, { "epoch": 0.5196316044595249, "grad_norm": 1.3242888450622559, "learning_rate": 2.766090295949013e-05, "loss": 1.5634, "step": 1072 }, { "epoch": 0.5201163354338342, "grad_norm": 1.3874385356903076, "learning_rate": 2.7618822834559947e-05, "loss": 1.35, "step": 1073 }, { "epoch": 0.5206010664081435, "grad_norm": 1.45573091506958, "learning_rate": 2.757673520637285e-05, "loss": 1.6007, "step": 1074 }, { "epoch": 0.5210857973824528, "grad_norm": 1.45209538936615, "learning_rate": 2.75346401955152e-05, "loss": 1.3736, "step": 1075 }, { "epoch": 0.521570528356762, "grad_norm": 1.4319308996200562, "learning_rate": 2.749253792259448e-05, "loss": 1.6778, "step": 1076 }, { "epoch": 0.5220552593310712, "grad_norm": 1.388581395149231, "learning_rate": 2.7450428508239024e-05, "loss": 1.6616, "step": 1077 }, { "epoch": 0.5225399903053806, "grad_norm": 1.3756077289581299, "learning_rate": 2.7408312073097574e-05, "loss": 1.1387, "step": 1078 }, { "epoch": 0.5230247212796898, "grad_norm": 1.4797683954238892, "learning_rate": 2.7366188737839026e-05, "loss": 1.5972, "step": 1079 }, { "epoch": 0.523509452253999, "grad_norm": 1.4945778846740723, "learning_rate": 2.7324058623152056e-05, "loss": 1.5199, "step": 1080 }, { "epoch": 0.5239941832283083, "grad_norm": 1.3682905435562134, "learning_rate": 2.7281921849744714e-05, "loss": 1.3174, "step": 1081 }, { "epoch": 0.5244789142026175, "grad_norm": 1.3681645393371582, "learning_rate": 2.7239778538344163e-05, "loss": 1.3481, "step": 1082 }, { "epoch": 0.5249636451769268, "grad_norm": 1.43455171585083, "learning_rate": 2.7197628809696306e-05, "loss": 1.4334, "step": 1083 }, { "epoch": 0.5254483761512361, "grad_norm": 1.50799560546875, "learning_rate": 2.715547278456541e-05, "loss": 1.5683, "step": 1084 }, { "epoch": 0.5259331071255453, "grad_norm": 1.4469860792160034, "learning_rate": 2.7113310583733797e-05, "loss": 1.4747, "step": 1085 }, { "epoch": 0.5264178380998545, "grad_norm": 1.420809030532837, "learning_rate": 2.7071142328001465e-05, "loss": 1.6305, "step": 1086 }, { "epoch": 0.5269025690741639, "grad_norm": 1.3464183807373047, "learning_rate": 2.7028968138185782e-05, "loss": 1.3846, "step": 1087 }, { "epoch": 0.5273873000484731, "grad_norm": 1.4121391773223877, "learning_rate": 2.6986788135121106e-05, "loss": 1.4262, "step": 1088 }, { "epoch": 0.5278720310227824, "grad_norm": 1.4404566287994385, "learning_rate": 2.6944602439658457e-05, "loss": 1.7716, "step": 1089 }, { "epoch": 0.5283567619970916, "grad_norm": 1.4350225925445557, "learning_rate": 2.6902411172665147e-05, "loss": 1.3612, "step": 1090 }, { "epoch": 0.5288414929714008, "grad_norm": 1.46249520778656, "learning_rate": 2.686021445502448e-05, "loss": 1.699, "step": 1091 }, { "epoch": 0.5293262239457102, "grad_norm": 1.4543901681900024, "learning_rate": 2.681801240763535e-05, "loss": 1.6293, "step": 1092 }, { "epoch": 0.5298109549200194, "grad_norm": 1.5145845413208008, "learning_rate": 2.6775805151411936e-05, "loss": 1.4227, "step": 1093 }, { "epoch": 0.5302956858943286, "grad_norm": 1.4717572927474976, "learning_rate": 2.6733592807283344e-05, "loss": 1.4983, "step": 1094 }, { "epoch": 0.5307804168686379, "grad_norm": 1.4818400144577026, "learning_rate": 2.6691375496193234e-05, "loss": 1.3531, "step": 1095 }, { "epoch": 0.5312651478429472, "grad_norm": 1.3545438051223755, "learning_rate": 2.6649153339099524e-05, "loss": 1.0584, "step": 1096 }, { "epoch": 0.5317498788172564, "grad_norm": 1.457953691482544, "learning_rate": 2.6606926456974013e-05, "loss": 1.4133, "step": 1097 }, { "epoch": 0.5322346097915657, "grad_norm": 1.343666434288025, "learning_rate": 2.656469497080202e-05, "loss": 1.3662, "step": 1098 }, { "epoch": 0.5327193407658749, "grad_norm": 1.5306185483932495, "learning_rate": 2.6522459001582078e-05, "loss": 1.6159, "step": 1099 }, { "epoch": 0.5332040717401842, "grad_norm": 1.4223085641860962, "learning_rate": 2.648021867032554e-05, "loss": 1.3616, "step": 1100 }, { "epoch": 0.5336888027144935, "grad_norm": 1.4389088153839111, "learning_rate": 2.643797409805628e-05, "loss": 1.2718, "step": 1101 }, { "epoch": 0.5341735336888027, "grad_norm": 1.5863919258117676, "learning_rate": 2.6395725405810307e-05, "loss": 1.5832, "step": 1102 }, { "epoch": 0.534658264663112, "grad_norm": 1.531956672668457, "learning_rate": 2.635347271463544e-05, "loss": 1.3952, "step": 1103 }, { "epoch": 0.5351429956374212, "grad_norm": 1.63759446144104, "learning_rate": 2.631121614559096e-05, "loss": 1.3839, "step": 1104 }, { "epoch": 0.5356277266117305, "grad_norm": 1.6401275396347046, "learning_rate": 2.6268955819747247e-05, "loss": 1.6776, "step": 1105 }, { "epoch": 0.5361124575860398, "grad_norm": 1.4038505554199219, "learning_rate": 2.6226691858185454e-05, "loss": 1.4152, "step": 1106 }, { "epoch": 0.536597188560349, "grad_norm": 1.4665429592132568, "learning_rate": 2.6184424381997146e-05, "loss": 1.5916, "step": 1107 }, { "epoch": 0.5370819195346582, "grad_norm": 1.3302826881408691, "learning_rate": 2.6142153512283968e-05, "loss": 1.5929, "step": 1108 }, { "epoch": 0.5375666505089676, "grad_norm": 1.4651814699172974, "learning_rate": 2.609987937015728e-05, "loss": 1.5793, "step": 1109 }, { "epoch": 0.5380513814832768, "grad_norm": 1.3761016130447388, "learning_rate": 2.605760207673781e-05, "loss": 1.4432, "step": 1110 }, { "epoch": 0.538536112457586, "grad_norm": 1.5528533458709717, "learning_rate": 2.601532175315532e-05, "loss": 1.6668, "step": 1111 }, { "epoch": 0.5390208434318953, "grad_norm": 1.3150146007537842, "learning_rate": 2.5973038520548266e-05, "loss": 1.3136, "step": 1112 }, { "epoch": 0.5395055744062045, "grad_norm": 1.4814192056655884, "learning_rate": 2.5930752500063425e-05, "loss": 1.721, "step": 1113 }, { "epoch": 0.5399903053805138, "grad_norm": 1.5142635107040405, "learning_rate": 2.5888463812855578e-05, "loss": 1.5931, "step": 1114 }, { "epoch": 0.5404750363548231, "grad_norm": 1.3756675720214844, "learning_rate": 2.5846172580087112e-05, "loss": 1.0801, "step": 1115 }, { "epoch": 0.5409597673291323, "grad_norm": 1.4426296949386597, "learning_rate": 2.5803878922927755e-05, "loss": 1.4838, "step": 1116 }, { "epoch": 0.5414444983034415, "grad_norm": 1.2579292058944702, "learning_rate": 2.576158296255413e-05, "loss": 1.2591, "step": 1117 }, { "epoch": 0.5419292292777509, "grad_norm": 1.4666320085525513, "learning_rate": 2.5719284820149503e-05, "loss": 1.5415, "step": 1118 }, { "epoch": 0.5424139602520601, "grad_norm": 1.5992366075515747, "learning_rate": 2.5676984616903367e-05, "loss": 1.327, "step": 1119 }, { "epoch": 0.5428986912263694, "grad_norm": 1.4156771898269653, "learning_rate": 2.5634682474011128e-05, "loss": 1.2807, "step": 1120 }, { "epoch": 0.5433834222006786, "grad_norm": 1.4783258438110352, "learning_rate": 2.559237851267374e-05, "loss": 1.5313, "step": 1121 }, { "epoch": 0.5438681531749879, "grad_norm": 1.400573492050171, "learning_rate": 2.555007285409739e-05, "loss": 1.3329, "step": 1122 }, { "epoch": 0.5443528841492972, "grad_norm": 1.3685263395309448, "learning_rate": 2.550776561949311e-05, "loss": 1.4368, "step": 1123 }, { "epoch": 0.5448376151236064, "grad_norm": 1.5202267169952393, "learning_rate": 2.5465456930076435e-05, "loss": 1.5863, "step": 1124 }, { "epoch": 0.5453223460979156, "grad_norm": 1.5026854276657104, "learning_rate": 2.54231469070671e-05, "loss": 1.6135, "step": 1125 }, { "epoch": 0.5458070770722249, "grad_norm": 1.2926137447357178, "learning_rate": 2.5380835671688628e-05, "loss": 1.1667, "step": 1126 }, { "epoch": 0.5462918080465342, "grad_norm": 1.429006814956665, "learning_rate": 2.5338523345168048e-05, "loss": 1.3725, "step": 1127 }, { "epoch": 0.5467765390208434, "grad_norm": 1.605655550956726, "learning_rate": 2.52962100487355e-05, "loss": 1.9144, "step": 1128 }, { "epoch": 0.5472612699951527, "grad_norm": 1.3905277252197266, "learning_rate": 2.525389590362388e-05, "loss": 1.4073, "step": 1129 }, { "epoch": 0.5477460009694619, "grad_norm": 1.443237543106079, "learning_rate": 2.521158103106856e-05, "loss": 1.6917, "step": 1130 }, { "epoch": 0.5482307319437713, "grad_norm": 1.544218897819519, "learning_rate": 2.5169265552306963e-05, "loss": 1.4208, "step": 1131 }, { "epoch": 0.5487154629180805, "grad_norm": 1.4166316986083984, "learning_rate": 2.5126949588578264e-05, "loss": 1.3686, "step": 1132 }, { "epoch": 0.5492001938923897, "grad_norm": 1.3663363456726074, "learning_rate": 2.508463326112302e-05, "loss": 1.3377, "step": 1133 }, { "epoch": 0.549684924866699, "grad_norm": 1.3695237636566162, "learning_rate": 2.504231669118283e-05, "loss": 1.384, "step": 1134 }, { "epoch": 0.5501696558410082, "grad_norm": 1.418594479560852, "learning_rate": 2.5e-05, "loss": 1.6453, "step": 1135 }, { "epoch": 0.5506543868153175, "grad_norm": 1.4432318210601807, "learning_rate": 2.495768330881717e-05, "loss": 1.6813, "step": 1136 }, { "epoch": 0.5511391177896268, "grad_norm": 1.3886134624481201, "learning_rate": 2.4915366738876986e-05, "loss": 1.4878, "step": 1137 }, { "epoch": 0.551623848763936, "grad_norm": 1.3143367767333984, "learning_rate": 2.4873050411421738e-05, "loss": 1.1719, "step": 1138 }, { "epoch": 0.5521085797382452, "grad_norm": 1.4947563409805298, "learning_rate": 2.483073444769304e-05, "loss": 1.9726, "step": 1139 }, { "epoch": 0.5525933107125546, "grad_norm": 1.4314428567886353, "learning_rate": 2.478841896893145e-05, "loss": 1.4453, "step": 1140 }, { "epoch": 0.5530780416868638, "grad_norm": 1.5481895208358765, "learning_rate": 2.4746104096376128e-05, "loss": 1.5185, "step": 1141 }, { "epoch": 0.553562772661173, "grad_norm": 1.3574029207229614, "learning_rate": 2.470378995126451e-05, "loss": 1.2467, "step": 1142 }, { "epoch": 0.5540475036354823, "grad_norm": 1.4265308380126953, "learning_rate": 2.4661476654831958e-05, "loss": 1.3669, "step": 1143 }, { "epoch": 0.5545322346097916, "grad_norm": 1.789808988571167, "learning_rate": 2.4619164328311374e-05, "loss": 1.4957, "step": 1144 }, { "epoch": 0.5550169655841009, "grad_norm": 1.401563048362732, "learning_rate": 2.4576853092932907e-05, "loss": 1.7368, "step": 1145 }, { "epoch": 0.5555016965584101, "grad_norm": 1.3560460805892944, "learning_rate": 2.4534543069923567e-05, "loss": 1.3568, "step": 1146 }, { "epoch": 0.5559864275327193, "grad_norm": 1.4376869201660156, "learning_rate": 2.4492234380506894e-05, "loss": 1.4885, "step": 1147 }, { "epoch": 0.5564711585070286, "grad_norm": 1.4365642070770264, "learning_rate": 2.4449927145902606e-05, "loss": 1.2875, "step": 1148 }, { "epoch": 0.5569558894813379, "grad_norm": 1.4295824766159058, "learning_rate": 2.4407621487326255e-05, "loss": 1.4832, "step": 1149 }, { "epoch": 0.5574406204556471, "grad_norm": 1.3713922500610352, "learning_rate": 2.4365317525988885e-05, "loss": 1.1781, "step": 1150 }, { "epoch": 0.5579253514299564, "grad_norm": 1.5723742246627808, "learning_rate": 2.4323015383096643e-05, "loss": 1.4846, "step": 1151 }, { "epoch": 0.5584100824042656, "grad_norm": 1.2833575010299683, "learning_rate": 2.4280715179850506e-05, "loss": 1.1875, "step": 1152 }, { "epoch": 0.5588948133785749, "grad_norm": 1.355686068534851, "learning_rate": 2.4238417037445875e-05, "loss": 1.1902, "step": 1153 }, { "epoch": 0.5593795443528842, "grad_norm": 1.415571689605713, "learning_rate": 2.419612107707225e-05, "loss": 1.5119, "step": 1154 }, { "epoch": 0.5598642753271934, "grad_norm": 1.4047833681106567, "learning_rate": 2.415382741991289e-05, "loss": 1.3724, "step": 1155 }, { "epoch": 0.5603490063015026, "grad_norm": 1.8079580068588257, "learning_rate": 2.4111536187144425e-05, "loss": 1.6128, "step": 1156 }, { "epoch": 0.5608337372758119, "grad_norm": 1.410330057144165, "learning_rate": 2.406924749993657e-05, "loss": 1.3897, "step": 1157 }, { "epoch": 0.5613184682501212, "grad_norm": 1.3198529481887817, "learning_rate": 2.4026961479451733e-05, "loss": 1.0365, "step": 1158 }, { "epoch": 0.5618031992244304, "grad_norm": 1.3807947635650635, "learning_rate": 2.3984678246844677e-05, "loss": 1.3613, "step": 1159 }, { "epoch": 0.5622879301987397, "grad_norm": 3.3659908771514893, "learning_rate": 2.3942397923262204e-05, "loss": 1.4527, "step": 1160 }, { "epoch": 0.5627726611730489, "grad_norm": 1.3476786613464355, "learning_rate": 2.3900120629842732e-05, "loss": 1.1218, "step": 1161 }, { "epoch": 0.5632573921473583, "grad_norm": 1.6585768461227417, "learning_rate": 2.3857846487716038e-05, "loss": 2.0664, "step": 1162 }, { "epoch": 0.5637421231216675, "grad_norm": 1.4534820318222046, "learning_rate": 2.3815575618002856e-05, "loss": 1.466, "step": 1163 }, { "epoch": 0.5642268540959767, "grad_norm": 1.3563114404678345, "learning_rate": 2.3773308141814552e-05, "loss": 1.3486, "step": 1164 }, { "epoch": 0.564711585070286, "grad_norm": 1.4399093389511108, "learning_rate": 2.3731044180252756e-05, "loss": 1.3011, "step": 1165 }, { "epoch": 0.5651963160445953, "grad_norm": 1.3951159715652466, "learning_rate": 2.3688783854409045e-05, "loss": 1.159, "step": 1166 }, { "epoch": 0.5656810470189045, "grad_norm": 1.4155405759811401, "learning_rate": 2.3646527285364565e-05, "loss": 1.3806, "step": 1167 }, { "epoch": 0.5661657779932138, "grad_norm": 1.3713849782943726, "learning_rate": 2.3604274594189695e-05, "loss": 1.4719, "step": 1168 }, { "epoch": 0.566650508967523, "grad_norm": 1.5539053678512573, "learning_rate": 2.3562025901943726e-05, "loss": 1.5451, "step": 1169 }, { "epoch": 0.5671352399418322, "grad_norm": 1.4450373649597168, "learning_rate": 2.351978132967447e-05, "loss": 1.3855, "step": 1170 }, { "epoch": 0.5676199709161416, "grad_norm": 1.4404420852661133, "learning_rate": 2.347754099841793e-05, "loss": 1.7035, "step": 1171 }, { "epoch": 0.5681047018904508, "grad_norm": 1.4115735292434692, "learning_rate": 2.3435305029197984e-05, "loss": 1.4108, "step": 1172 }, { "epoch": 0.56858943286476, "grad_norm": 1.4475492238998413, "learning_rate": 2.3393073543025996e-05, "loss": 1.2907, "step": 1173 }, { "epoch": 0.5690741638390693, "grad_norm": 1.3643134832382202, "learning_rate": 2.335084666090048e-05, "loss": 1.3279, "step": 1174 }, { "epoch": 0.5695588948133786, "grad_norm": 1.480979323387146, "learning_rate": 2.3308624503806772e-05, "loss": 1.6189, "step": 1175 }, { "epoch": 0.5700436257876879, "grad_norm": 1.3969759941101074, "learning_rate": 2.3266407192716666e-05, "loss": 1.472, "step": 1176 }, { "epoch": 0.5705283567619971, "grad_norm": 1.454965591430664, "learning_rate": 2.3224194848588066e-05, "loss": 1.0431, "step": 1177 }, { "epoch": 0.5710130877363063, "grad_norm": 1.3337841033935547, "learning_rate": 2.3181987592364655e-05, "loss": 1.2462, "step": 1178 }, { "epoch": 0.5714978187106156, "grad_norm": 1.3526654243469238, "learning_rate": 2.3139785544975527e-05, "loss": 1.3556, "step": 1179 }, { "epoch": 0.5719825496849249, "grad_norm": 1.464104413986206, "learning_rate": 2.309758882733486e-05, "loss": 1.594, "step": 1180 }, { "epoch": 0.5724672806592341, "grad_norm": 1.5070477724075317, "learning_rate": 2.305539756034155e-05, "loss": 1.2799, "step": 1181 }, { "epoch": 0.5729520116335434, "grad_norm": 1.747273325920105, "learning_rate": 2.30132118648789e-05, "loss": 1.4735, "step": 1182 }, { "epoch": 0.5734367426078526, "grad_norm": 2.815830945968628, "learning_rate": 2.2971031861814223e-05, "loss": 1.3718, "step": 1183 }, { "epoch": 0.573921473582162, "grad_norm": 1.4527679681777954, "learning_rate": 2.2928857671998538e-05, "loss": 1.4325, "step": 1184 }, { "epoch": 0.5744062045564712, "grad_norm": 1.5000076293945312, "learning_rate": 2.288668941626621e-05, "loss": 1.5461, "step": 1185 }, { "epoch": 0.5748909355307804, "grad_norm": 1.3602912425994873, "learning_rate": 2.2844527215434592e-05, "loss": 1.403, "step": 1186 }, { "epoch": 0.5753756665050896, "grad_norm": 1.4031354188919067, "learning_rate": 2.2802371190303696e-05, "loss": 1.2408, "step": 1187 }, { "epoch": 0.575860397479399, "grad_norm": 1.385236382484436, "learning_rate": 2.2760221461655833e-05, "loss": 1.4098, "step": 1188 }, { "epoch": 0.5763451284537082, "grad_norm": 1.3685550689697266, "learning_rate": 2.27180781502553e-05, "loss": 1.2458, "step": 1189 }, { "epoch": 0.5768298594280175, "grad_norm": 1.4121043682098389, "learning_rate": 2.267594137684796e-05, "loss": 1.6139, "step": 1190 }, { "epoch": 0.5773145904023267, "grad_norm": 1.40248703956604, "learning_rate": 2.2633811262160977e-05, "loss": 1.6284, "step": 1191 }, { "epoch": 0.5777993213766359, "grad_norm": 1.5229443311691284, "learning_rate": 2.2591687926902432e-05, "loss": 1.619, "step": 1192 }, { "epoch": 0.5782840523509453, "grad_norm": 1.4838701486587524, "learning_rate": 2.2549571491760986e-05, "loss": 1.4843, "step": 1193 }, { "epoch": 0.5787687833252545, "grad_norm": 1.4755678176879883, "learning_rate": 2.2507462077405523e-05, "loss": 1.4363, "step": 1194 }, { "epoch": 0.5792535142995637, "grad_norm": 1.4962295293807983, "learning_rate": 2.2465359804484806e-05, "loss": 1.3032, "step": 1195 }, { "epoch": 0.579738245273873, "grad_norm": 1.3037718534469604, "learning_rate": 2.2423264793627148e-05, "loss": 1.3224, "step": 1196 }, { "epoch": 0.5802229762481823, "grad_norm": 1.3326340913772583, "learning_rate": 2.2381177165440055e-05, "loss": 1.5501, "step": 1197 }, { "epoch": 0.5807077072224915, "grad_norm": 1.484641432762146, "learning_rate": 2.2339097040509882e-05, "loss": 1.5076, "step": 1198 }, { "epoch": 0.5811924381968008, "grad_norm": 1.4163663387298584, "learning_rate": 2.2297024539401463e-05, "loss": 1.482, "step": 1199 }, { "epoch": 0.58167716917111, "grad_norm": 1.3082435131072998, "learning_rate": 2.225495978265782e-05, "loss": 1.2917, "step": 1200 }, { "epoch": 0.5821619001454192, "grad_norm": 1.4738914966583252, "learning_rate": 2.2212902890799767e-05, "loss": 1.6231, "step": 1201 }, { "epoch": 0.5826466311197286, "grad_norm": 1.4668748378753662, "learning_rate": 2.2170853984325597e-05, "loss": 1.2994, "step": 1202 }, { "epoch": 0.5831313620940378, "grad_norm": 1.4222160577774048, "learning_rate": 2.2128813183710716e-05, "loss": 1.4073, "step": 1203 }, { "epoch": 0.583616093068347, "grad_norm": 1.6111295223236084, "learning_rate": 2.2086780609407305e-05, "loss": 1.1423, "step": 1204 }, { "epoch": 0.5841008240426563, "grad_norm": 1.3994250297546387, "learning_rate": 2.2044756381843983e-05, "loss": 1.6045, "step": 1205 }, { "epoch": 0.5845855550169656, "grad_norm": 1.3738783597946167, "learning_rate": 2.2002740621425442e-05, "loss": 1.2844, "step": 1206 }, { "epoch": 0.5850702859912749, "grad_norm": 1.3349037170410156, "learning_rate": 2.1960733448532126e-05, "loss": 1.3563, "step": 1207 }, { "epoch": 0.5855550169655841, "grad_norm": 1.464147925376892, "learning_rate": 2.1918734983519873e-05, "loss": 1.5385, "step": 1208 }, { "epoch": 0.5860397479398933, "grad_norm": 1.2928396463394165, "learning_rate": 2.1876745346719567e-05, "loss": 1.1307, "step": 1209 }, { "epoch": 0.5865244789142026, "grad_norm": 1.4599294662475586, "learning_rate": 2.1834764658436797e-05, "loss": 1.3856, "step": 1210 }, { "epoch": 0.5870092098885119, "grad_norm": 1.4603444337844849, "learning_rate": 2.1792793038951515e-05, "loss": 1.4164, "step": 1211 }, { "epoch": 0.5874939408628211, "grad_norm": 1.4382938146591187, "learning_rate": 2.1750830608517696e-05, "loss": 1.661, "step": 1212 }, { "epoch": 0.5879786718371304, "grad_norm": 1.448847770690918, "learning_rate": 2.1708877487362987e-05, "loss": 1.498, "step": 1213 }, { "epoch": 0.5884634028114396, "grad_norm": 1.4043089151382446, "learning_rate": 2.1666933795688352e-05, "loss": 1.4716, "step": 1214 }, { "epoch": 0.588948133785749, "grad_norm": 1.4823040962219238, "learning_rate": 2.1624999653667747e-05, "loss": 1.4944, "step": 1215 }, { "epoch": 0.5894328647600582, "grad_norm": 1.5355263948440552, "learning_rate": 2.1583075181447764e-05, "loss": 1.5809, "step": 1216 }, { "epoch": 0.5899175957343674, "grad_norm": 1.4335788488388062, "learning_rate": 2.1541160499147297e-05, "loss": 1.4292, "step": 1217 }, { "epoch": 0.5904023267086767, "grad_norm": 1.435947299003601, "learning_rate": 2.1499255726857183e-05, "loss": 1.5298, "step": 1218 }, { "epoch": 0.590887057682986, "grad_norm": 1.5709877014160156, "learning_rate": 2.1457360984639853e-05, "loss": 1.304, "step": 1219 }, { "epoch": 0.5913717886572952, "grad_norm": 1.3494329452514648, "learning_rate": 2.141547639252903e-05, "loss": 1.4771, "step": 1220 }, { "epoch": 0.5918565196316045, "grad_norm": 1.6605241298675537, "learning_rate": 2.137360207052933e-05, "loss": 1.8418, "step": 1221 }, { "epoch": 0.5923412506059137, "grad_norm": 1.2385108470916748, "learning_rate": 2.1331738138615958e-05, "loss": 1.3536, "step": 1222 }, { "epoch": 0.5928259815802229, "grad_norm": 1.4509235620498657, "learning_rate": 2.1289884716734347e-05, "loss": 1.4231, "step": 1223 }, { "epoch": 0.5933107125545323, "grad_norm": 1.551924705505371, "learning_rate": 2.124804192479982e-05, "loss": 1.3416, "step": 1224 }, { "epoch": 0.5937954435288415, "grad_norm": 1.4182485342025757, "learning_rate": 2.1206209882697234e-05, "loss": 1.7183, "step": 1225 }, { "epoch": 0.5942801745031507, "grad_norm": 2.135904550552368, "learning_rate": 2.1164388710280654e-05, "loss": 1.2838, "step": 1226 }, { "epoch": 0.59476490547746, "grad_norm": 1.4602768421173096, "learning_rate": 2.1122578527373016e-05, "loss": 1.3902, "step": 1227 }, { "epoch": 0.5952496364517693, "grad_norm": 1.485456943511963, "learning_rate": 2.1080779453765727e-05, "loss": 1.4738, "step": 1228 }, { "epoch": 0.5957343674260785, "grad_norm": 1.4163349866867065, "learning_rate": 2.1038991609218407e-05, "loss": 1.2601, "step": 1229 }, { "epoch": 0.5962190984003878, "grad_norm": 1.474471926689148, "learning_rate": 2.099721511345849e-05, "loss": 1.633, "step": 1230 }, { "epoch": 0.596703829374697, "grad_norm": 1.3297244310379028, "learning_rate": 2.0955450086180882e-05, "loss": 1.1103, "step": 1231 }, { "epoch": 0.5971885603490062, "grad_norm": 1.4120020866394043, "learning_rate": 2.091369664704766e-05, "loss": 1.469, "step": 1232 }, { "epoch": 0.5976732913233156, "grad_norm": 1.4585497379302979, "learning_rate": 2.0871954915687658e-05, "loss": 1.4362, "step": 1233 }, { "epoch": 0.5981580222976248, "grad_norm": 1.4092267751693726, "learning_rate": 2.08302250116962e-05, "loss": 1.3138, "step": 1234 }, { "epoch": 0.5986427532719341, "grad_norm": 1.3591123819351196, "learning_rate": 2.0788507054634714e-05, "loss": 1.3283, "step": 1235 }, { "epoch": 0.5991274842462433, "grad_norm": 1.3640435934066772, "learning_rate": 2.074680116403039e-05, "loss": 1.4952, "step": 1236 }, { "epoch": 0.5996122152205526, "grad_norm": 1.4771053791046143, "learning_rate": 2.070510745937586e-05, "loss": 1.6309, "step": 1237 }, { "epoch": 0.6000969461948619, "grad_norm": 1.4691507816314697, "learning_rate": 2.066342606012882e-05, "loss": 1.348, "step": 1238 }, { "epoch": 0.6005816771691711, "grad_norm": 1.3989927768707275, "learning_rate": 2.0621757085711734e-05, "loss": 1.4557, "step": 1239 }, { "epoch": 0.6010664081434803, "grad_norm": 1.4642021656036377, "learning_rate": 2.058010065551145e-05, "loss": 1.5403, "step": 1240 }, { "epoch": 0.6015511391177897, "grad_norm": 1.473580241203308, "learning_rate": 2.0538456888878878e-05, "loss": 1.3097, "step": 1241 }, { "epoch": 0.6020358700920989, "grad_norm": 1.3591980934143066, "learning_rate": 2.0496825905128665e-05, "loss": 1.4441, "step": 1242 }, { "epoch": 0.6025206010664081, "grad_norm": 1.3612762689590454, "learning_rate": 2.04552078235388e-05, "loss": 1.2061, "step": 1243 }, { "epoch": 0.6030053320407174, "grad_norm": 1.4053421020507812, "learning_rate": 2.0413602763350337e-05, "loss": 1.3619, "step": 1244 }, { "epoch": 0.6034900630150266, "grad_norm": 1.2905552387237549, "learning_rate": 2.0372010843766996e-05, "loss": 1.1999, "step": 1245 }, { "epoch": 0.603974793989336, "grad_norm": 1.4193109273910522, "learning_rate": 2.0330432183954867e-05, "loss": 1.512, "step": 1246 }, { "epoch": 0.6044595249636452, "grad_norm": 1.3693450689315796, "learning_rate": 2.0288866903042054e-05, "loss": 1.3532, "step": 1247 }, { "epoch": 0.6049442559379544, "grad_norm": 1.4601976871490479, "learning_rate": 2.0247315120118284e-05, "loss": 1.4071, "step": 1248 }, { "epoch": 0.6054289869122637, "grad_norm": 1.480519413948059, "learning_rate": 2.0205776954234663e-05, "loss": 1.5318, "step": 1249 }, { "epoch": 0.605913717886573, "grad_norm": 1.3617873191833496, "learning_rate": 2.0164252524403263e-05, "loss": 1.3624, "step": 1250 }, { "epoch": 0.6063984488608822, "grad_norm": 1.3958122730255127, "learning_rate": 2.0122741949596797e-05, "loss": 1.0867, "step": 1251 }, { "epoch": 0.6068831798351915, "grad_norm": 1.5380889177322388, "learning_rate": 2.0081245348748286e-05, "loss": 1.7665, "step": 1252 }, { "epoch": 0.6073679108095007, "grad_norm": 1.4936906099319458, "learning_rate": 2.0039762840750707e-05, "loss": 1.1883, "step": 1253 }, { "epoch": 0.6078526417838099, "grad_norm": 1.3765724897384644, "learning_rate": 1.999829454445667e-05, "loss": 1.2834, "step": 1254 }, { "epoch": 0.6083373727581193, "grad_norm": 1.4031294584274292, "learning_rate": 1.995684057867806e-05, "loss": 1.2124, "step": 1255 }, { "epoch": 0.6088221037324285, "grad_norm": 1.514238715171814, "learning_rate": 1.991540106218572e-05, "loss": 1.6542, "step": 1256 }, { "epoch": 0.6093068347067377, "grad_norm": 1.4218300580978394, "learning_rate": 1.9873976113709048e-05, "loss": 1.6589, "step": 1257 }, { "epoch": 0.609791565681047, "grad_norm": 1.5005664825439453, "learning_rate": 1.983256585193575e-05, "loss": 1.445, "step": 1258 }, { "epoch": 0.6102762966553563, "grad_norm": 1.571577548980713, "learning_rate": 1.979117039551144e-05, "loss": 1.7245, "step": 1259 }, { "epoch": 0.6107610276296656, "grad_norm": 1.4720330238342285, "learning_rate": 1.9749789863039297e-05, "loss": 1.4244, "step": 1260 }, { "epoch": 0.6112457586039748, "grad_norm": 1.5714093446731567, "learning_rate": 1.970842437307976e-05, "loss": 1.6333, "step": 1261 }, { "epoch": 0.611730489578284, "grad_norm": 1.4255337715148926, "learning_rate": 1.9667074044150165e-05, "loss": 1.5165, "step": 1262 }, { "epoch": 0.6122152205525934, "grad_norm": 1.590695858001709, "learning_rate": 1.96257389947244e-05, "loss": 1.33, "step": 1263 }, { "epoch": 0.6126999515269026, "grad_norm": 1.4347730875015259, "learning_rate": 1.9584419343232584e-05, "loss": 1.3907, "step": 1264 }, { "epoch": 0.6131846825012118, "grad_norm": 1.46970796585083, "learning_rate": 1.954311520806072e-05, "loss": 1.3961, "step": 1265 }, { "epoch": 0.6136694134755211, "grad_norm": 1.3484935760498047, "learning_rate": 1.9501826707550366e-05, "loss": 1.4027, "step": 1266 }, { "epoch": 0.6141541444498303, "grad_norm": 1.3133972883224487, "learning_rate": 1.9460553959998244e-05, "loss": 1.3246, "step": 1267 }, { "epoch": 0.6146388754241396, "grad_norm": 1.3001095056533813, "learning_rate": 1.9419297083655976e-05, "loss": 1.3312, "step": 1268 }, { "epoch": 0.6151236063984489, "grad_norm": 1.4388257265090942, "learning_rate": 1.937805619672971e-05, "loss": 1.4682, "step": 1269 }, { "epoch": 0.6156083373727581, "grad_norm": 1.4479858875274658, "learning_rate": 1.9336831417379777e-05, "loss": 1.4709, "step": 1270 }, { "epoch": 0.6160930683470673, "grad_norm": 1.2688299417495728, "learning_rate": 1.9295622863720356e-05, "loss": 0.9973, "step": 1271 }, { "epoch": 0.6165777993213767, "grad_norm": 1.4602196216583252, "learning_rate": 1.9254430653819127e-05, "loss": 1.5403, "step": 1272 }, { "epoch": 0.6170625302956859, "grad_norm": 1.4173598289489746, "learning_rate": 1.9213254905696964e-05, "loss": 1.3238, "step": 1273 }, { "epoch": 0.6175472612699952, "grad_norm": 1.3910014629364014, "learning_rate": 1.9172095737327566e-05, "loss": 1.347, "step": 1274 }, { "epoch": 0.6180319922443044, "grad_norm": 1.4783943891525269, "learning_rate": 1.9130953266637127e-05, "loss": 1.6262, "step": 1275 }, { "epoch": 0.6185167232186136, "grad_norm": 1.5285893678665161, "learning_rate": 1.9089827611504013e-05, "loss": 1.624, "step": 1276 }, { "epoch": 0.619001454192923, "grad_norm": 1.4484494924545288, "learning_rate": 1.9048718889758375e-05, "loss": 1.3973, "step": 1277 }, { "epoch": 0.6194861851672322, "grad_norm": 1.3599704504013062, "learning_rate": 1.900762721918189e-05, "loss": 1.3223, "step": 1278 }, { "epoch": 0.6199709161415414, "grad_norm": 1.4407846927642822, "learning_rate": 1.8966552717507364e-05, "loss": 1.2713, "step": 1279 }, { "epoch": 0.6204556471158507, "grad_norm": 1.4449760913848877, "learning_rate": 1.8925495502418406e-05, "loss": 1.6689, "step": 1280 }, { "epoch": 0.62094037809016, "grad_norm": 1.5764880180358887, "learning_rate": 1.8884455691549105e-05, "loss": 1.5437, "step": 1281 }, { "epoch": 0.6214251090644692, "grad_norm": 1.5581961870193481, "learning_rate": 1.8843433402483683e-05, "loss": 1.4253, "step": 1282 }, { "epoch": 0.6219098400387785, "grad_norm": 1.384125828742981, "learning_rate": 1.8802428752756172e-05, "loss": 1.4686, "step": 1283 }, { "epoch": 0.6223945710130877, "grad_norm": 1.4908874034881592, "learning_rate": 1.876144185985003e-05, "loss": 1.3817, "step": 1284 }, { "epoch": 0.622879301987397, "grad_norm": 1.441921591758728, "learning_rate": 1.8720472841197884e-05, "loss": 1.1528, "step": 1285 }, { "epoch": 0.6233640329617063, "grad_norm": 5.794642925262451, "learning_rate": 1.867952181418111e-05, "loss": 1.5045, "step": 1286 }, { "epoch": 0.6238487639360155, "grad_norm": 1.430221676826477, "learning_rate": 1.8638588896129557e-05, "loss": 1.3444, "step": 1287 }, { "epoch": 0.6243334949103247, "grad_norm": 1.3817723989486694, "learning_rate": 1.8597674204321185e-05, "loss": 1.4123, "step": 1288 }, { "epoch": 0.624818225884634, "grad_norm": 1.4198659658432007, "learning_rate": 1.8556777855981737e-05, "loss": 1.76, "step": 1289 }, { "epoch": 0.6253029568589433, "grad_norm": 1.387041449546814, "learning_rate": 1.85158999682844e-05, "loss": 1.4046, "step": 1290 }, { "epoch": 0.6257876878332526, "grad_norm": 1.4337729215621948, "learning_rate": 1.8475040658349454e-05, "loss": 1.6913, "step": 1291 }, { "epoch": 0.6262724188075618, "grad_norm": 1.2845624685287476, "learning_rate": 1.843420004324397e-05, "loss": 1.3359, "step": 1292 }, { "epoch": 0.626757149781871, "grad_norm": 1.3890070915222168, "learning_rate": 1.839337823998145e-05, "loss": 1.3585, "step": 1293 }, { "epoch": 0.6272418807561804, "grad_norm": 1.6244319677352905, "learning_rate": 1.8352575365521503e-05, "loss": 1.302, "step": 1294 }, { "epoch": 0.6277266117304896, "grad_norm": 1.373167634010315, "learning_rate": 1.8311791536769483e-05, "loss": 1.3026, "step": 1295 }, { "epoch": 0.6282113427047988, "grad_norm": 1.3994495868682861, "learning_rate": 1.8271026870576197e-05, "loss": 1.273, "step": 1296 }, { "epoch": 0.6286960736791081, "grad_norm": 1.373883843421936, "learning_rate": 1.8230281483737537e-05, "loss": 1.5305, "step": 1297 }, { "epoch": 0.6291808046534173, "grad_norm": 1.4813038110733032, "learning_rate": 1.818955549299418e-05, "loss": 1.244, "step": 1298 }, { "epoch": 0.6296655356277266, "grad_norm": 1.3685561418533325, "learning_rate": 1.8148849015031195e-05, "loss": 1.4916, "step": 1299 }, { "epoch": 0.6301502666020359, "grad_norm": 1.4443938732147217, "learning_rate": 1.8108162166477766e-05, "loss": 1.2453, "step": 1300 }, { "epoch": 0.6306349975763451, "grad_norm": 1.4075669050216675, "learning_rate": 1.806749506390684e-05, "loss": 1.5644, "step": 1301 }, { "epoch": 0.6311197285506543, "grad_norm": 1.4213589429855347, "learning_rate": 1.802684782383478e-05, "loss": 1.3007, "step": 1302 }, { "epoch": 0.6316044595249637, "grad_norm": 1.3473411798477173, "learning_rate": 1.798622056272104e-05, "loss": 1.1438, "step": 1303 }, { "epoch": 0.6320891904992729, "grad_norm": 1.3807307481765747, "learning_rate": 1.7945613396967837e-05, "loss": 1.3414, "step": 1304 }, { "epoch": 0.6325739214735822, "grad_norm": 1.6303166151046753, "learning_rate": 1.790502644291982e-05, "loss": 1.7151, "step": 1305 }, { "epoch": 0.6330586524478914, "grad_norm": 1.4952675104141235, "learning_rate": 1.78644598168637e-05, "loss": 1.66, "step": 1306 }, { "epoch": 0.6335433834222006, "grad_norm": 1.3126227855682373, "learning_rate": 1.7823913635027973e-05, "loss": 1.1817, "step": 1307 }, { "epoch": 0.63402811439651, "grad_norm": 1.4245887994766235, "learning_rate": 1.7783388013582553e-05, "loss": 1.4009, "step": 1308 }, { "epoch": 0.6345128453708192, "grad_norm": 1.420230507850647, "learning_rate": 1.7742883068638447e-05, "loss": 1.408, "step": 1309 }, { "epoch": 0.6349975763451284, "grad_norm": 1.3932969570159912, "learning_rate": 1.770239891624741e-05, "loss": 1.5593, "step": 1310 }, { "epoch": 0.6354823073194377, "grad_norm": 1.392712116241455, "learning_rate": 1.7661935672401632e-05, "loss": 1.2642, "step": 1311 }, { "epoch": 0.635967038293747, "grad_norm": 1.4296149015426636, "learning_rate": 1.7621493453033405e-05, "loss": 1.4418, "step": 1312 }, { "epoch": 0.6364517692680562, "grad_norm": 1.367656946182251, "learning_rate": 1.7581072374014777e-05, "loss": 1.3111, "step": 1313 }, { "epoch": 0.6369365002423655, "grad_norm": 1.353005290031433, "learning_rate": 1.7540672551157227e-05, "loss": 1.1314, "step": 1314 }, { "epoch": 0.6374212312166747, "grad_norm": 1.514885663986206, "learning_rate": 1.7500294100211315e-05, "loss": 1.265, "step": 1315 }, { "epoch": 0.637905962190984, "grad_norm": 1.365065336227417, "learning_rate": 1.7459937136866392e-05, "loss": 1.3412, "step": 1316 }, { "epoch": 0.6383906931652933, "grad_norm": 1.6574945449829102, "learning_rate": 1.7419601776750237e-05, "loss": 1.5467, "step": 1317 }, { "epoch": 0.6388754241396025, "grad_norm": 1.4217700958251953, "learning_rate": 1.737928813542873e-05, "loss": 1.5412, "step": 1318 }, { "epoch": 0.6393601551139118, "grad_norm": 1.3700860738754272, "learning_rate": 1.7338996328405526e-05, "loss": 1.3197, "step": 1319 }, { "epoch": 0.639844886088221, "grad_norm": 1.380444049835205, "learning_rate": 1.7298726471121723e-05, "loss": 1.6721, "step": 1320 }, { "epoch": 0.6403296170625303, "grad_norm": 1.3262513875961304, "learning_rate": 1.725847867895553e-05, "loss": 1.274, "step": 1321 }, { "epoch": 0.6408143480368396, "grad_norm": 1.5101852416992188, "learning_rate": 1.7218253067221933e-05, "loss": 1.4451, "step": 1322 }, { "epoch": 0.6412990790111488, "grad_norm": 1.5045435428619385, "learning_rate": 1.7178049751172366e-05, "loss": 1.5808, "step": 1323 }, { "epoch": 0.641783809985458, "grad_norm": 1.5001122951507568, "learning_rate": 1.7137868845994397e-05, "loss": 1.3329, "step": 1324 }, { "epoch": 0.6422685409597674, "grad_norm": 1.3572721481323242, "learning_rate": 1.709771046681137e-05, "loss": 1.0717, "step": 1325 }, { "epoch": 0.6427532719340766, "grad_norm": 1.542609453201294, "learning_rate": 1.7057574728682095e-05, "loss": 1.3739, "step": 1326 }, { "epoch": 0.6432380029083858, "grad_norm": 1.4278963804244995, "learning_rate": 1.7017461746600506e-05, "loss": 1.37, "step": 1327 }, { "epoch": 0.6437227338826951, "grad_norm": 1.5175418853759766, "learning_rate": 1.6977371635495347e-05, "loss": 1.9564, "step": 1328 }, { "epoch": 0.6442074648570043, "grad_norm": 1.5065232515335083, "learning_rate": 1.6937304510229834e-05, "loss": 1.6646, "step": 1329 }, { "epoch": 0.6446921958313137, "grad_norm": 1.5196707248687744, "learning_rate": 1.6897260485601318e-05, "loss": 1.4758, "step": 1330 }, { "epoch": 0.6451769268056229, "grad_norm": 1.4596264362335205, "learning_rate": 1.685723967634097e-05, "loss": 1.3208, "step": 1331 }, { "epoch": 0.6456616577799321, "grad_norm": 1.43583345413208, "learning_rate": 1.681724219711344e-05, "loss": 1.5051, "step": 1332 }, { "epoch": 0.6461463887542414, "grad_norm": 1.985482931137085, "learning_rate": 1.6777268162516548e-05, "loss": 1.5165, "step": 1333 }, { "epoch": 0.6466311197285507, "grad_norm": 1.4382468461990356, "learning_rate": 1.6737317687080922e-05, "loss": 1.889, "step": 1334 }, { "epoch": 0.6471158507028599, "grad_norm": 1.4122154712677002, "learning_rate": 1.6697390885269705e-05, "loss": 1.3209, "step": 1335 }, { "epoch": 0.6476005816771692, "grad_norm": 1.277051568031311, "learning_rate": 1.6657487871478212e-05, "loss": 1.1666, "step": 1336 }, { "epoch": 0.6480853126514784, "grad_norm": 1.3974394798278809, "learning_rate": 1.661760876003358e-05, "loss": 1.6071, "step": 1337 }, { "epoch": 0.6485700436257877, "grad_norm": 1.503825068473816, "learning_rate": 1.65777536651945e-05, "loss": 1.1412, "step": 1338 }, { "epoch": 0.649054774600097, "grad_norm": 1.4791207313537598, "learning_rate": 1.6537922701150828e-05, "loss": 1.2004, "step": 1339 }, { "epoch": 0.6495395055744062, "grad_norm": 1.4980095624923706, "learning_rate": 1.6498115982023285e-05, "loss": 1.5021, "step": 1340 }, { "epoch": 0.6500242365487154, "grad_norm": 1.372834324836731, "learning_rate": 1.645833362186313e-05, "loss": 1.318, "step": 1341 }, { "epoch": 0.6505089675230247, "grad_norm": 1.4026696681976318, "learning_rate": 1.6418575734651832e-05, "loss": 1.4896, "step": 1342 }, { "epoch": 0.650993698497334, "grad_norm": 1.454119324684143, "learning_rate": 1.6378842434300746e-05, "loss": 1.3353, "step": 1343 }, { "epoch": 0.6514784294716433, "grad_norm": 1.4449537992477417, "learning_rate": 1.633913383465076e-05, "loss": 1.7903, "step": 1344 }, { "epoch": 0.6519631604459525, "grad_norm": 1.4829782247543335, "learning_rate": 1.6299450049472022e-05, "loss": 1.4147, "step": 1345 }, { "epoch": 0.6524478914202617, "grad_norm": 1.4531413316726685, "learning_rate": 1.6259791192463557e-05, "loss": 1.0013, "step": 1346 }, { "epoch": 0.6529326223945711, "grad_norm": 1.5042320489883423, "learning_rate": 1.6220157377252994e-05, "loss": 1.7003, "step": 1347 }, { "epoch": 0.6534173533688803, "grad_norm": 1.499081015586853, "learning_rate": 1.6180548717396198e-05, "loss": 1.4914, "step": 1348 }, { "epoch": 0.6539020843431895, "grad_norm": 1.2727124691009521, "learning_rate": 1.6140965326376954e-05, "loss": 1.0353, "step": 1349 }, { "epoch": 0.6543868153174988, "grad_norm": 1.4354982376098633, "learning_rate": 1.6101407317606666e-05, "loss": 1.2793, "step": 1350 }, { "epoch": 0.654871546291808, "grad_norm": 1.458020806312561, "learning_rate": 1.6061874804424e-05, "loss": 1.502, "step": 1351 }, { "epoch": 0.6553562772661173, "grad_norm": 1.4475706815719604, "learning_rate": 1.602236790009458e-05, "loss": 1.6151, "step": 1352 }, { "epoch": 0.6558410082404266, "grad_norm": 1.4461567401885986, "learning_rate": 1.5982886717810676e-05, "loss": 1.6717, "step": 1353 }, { "epoch": 0.6563257392147358, "grad_norm": 1.4085007905960083, "learning_rate": 1.5943431370690815e-05, "loss": 1.4512, "step": 1354 }, { "epoch": 0.656810470189045, "grad_norm": 1.3767906427383423, "learning_rate": 1.590400197177954e-05, "loss": 1.2589, "step": 1355 }, { "epoch": 0.6572952011633544, "grad_norm": 1.5227607488632202, "learning_rate": 1.5864598634047046e-05, "loss": 1.4397, "step": 1356 }, { "epoch": 0.6577799321376636, "grad_norm": 1.4575637578964233, "learning_rate": 1.5825221470388847e-05, "loss": 1.3085, "step": 1357 }, { "epoch": 0.6582646631119728, "grad_norm": 1.2845321893692017, "learning_rate": 1.5785870593625472e-05, "loss": 1.3451, "step": 1358 }, { "epoch": 0.6587493940862821, "grad_norm": 1.3598212003707886, "learning_rate": 1.574654611650214e-05, "loss": 1.4667, "step": 1359 }, { "epoch": 0.6592341250605914, "grad_norm": 1.384535551071167, "learning_rate": 1.5707248151688424e-05, "loss": 1.3087, "step": 1360 }, { "epoch": 0.6597188560349007, "grad_norm": 1.3146976232528687, "learning_rate": 1.5667976811777932e-05, "loss": 1.4136, "step": 1361 }, { "epoch": 0.6602035870092099, "grad_norm": 1.5279932022094727, "learning_rate": 1.5628732209287993e-05, "loss": 1.3477, "step": 1362 }, { "epoch": 0.6606883179835191, "grad_norm": 1.3585039377212524, "learning_rate": 1.558951445665935e-05, "loss": 1.3886, "step": 1363 }, { "epoch": 0.6611730489578284, "grad_norm": 1.3408361673355103, "learning_rate": 1.555032366625577e-05, "loss": 1.677, "step": 1364 }, { "epoch": 0.6616577799321377, "grad_norm": 1.3672118186950684, "learning_rate": 1.5511159950363814e-05, "loss": 1.3577, "step": 1365 }, { "epoch": 0.6621425109064469, "grad_norm": 1.3929879665374756, "learning_rate": 1.5472023421192445e-05, "loss": 1.3506, "step": 1366 }, { "epoch": 0.6626272418807562, "grad_norm": 1.4646409749984741, "learning_rate": 1.5432914190872757e-05, "loss": 1.6497, "step": 1367 }, { "epoch": 0.6631119728550654, "grad_norm": 1.426300287246704, "learning_rate": 1.539383237145761e-05, "loss": 1.3643, "step": 1368 }, { "epoch": 0.6635967038293747, "grad_norm": 1.4488145112991333, "learning_rate": 1.5354778074921332e-05, "loss": 1.2896, "step": 1369 }, { "epoch": 0.664081434803684, "grad_norm": 1.503942847251892, "learning_rate": 1.5315751413159394e-05, "loss": 1.5616, "step": 1370 }, { "epoch": 0.6645661657779932, "grad_norm": 1.4608261585235596, "learning_rate": 1.52767524979881e-05, "loss": 1.3562, "step": 1371 }, { "epoch": 0.6650508967523024, "grad_norm": 1.4247395992279053, "learning_rate": 1.5237781441144256e-05, "loss": 1.4961, "step": 1372 }, { "epoch": 0.6655356277266117, "grad_norm": 1.3943641185760498, "learning_rate": 1.5198838354284817e-05, "loss": 1.5362, "step": 1373 }, { "epoch": 0.666020358700921, "grad_norm": 1.42288339138031, "learning_rate": 1.515992334898664e-05, "loss": 1.2119, "step": 1374 }, { "epoch": 0.6665050896752303, "grad_norm": 1.3145508766174316, "learning_rate": 1.512103653674612e-05, "loss": 1.6042, "step": 1375 }, { "epoch": 0.6669898206495395, "grad_norm": 1.4619483947753906, "learning_rate": 1.5082178028978853e-05, "loss": 1.4172, "step": 1376 }, { "epoch": 0.6674745516238487, "grad_norm": 1.375780463218689, "learning_rate": 1.5043347937019358e-05, "loss": 1.4588, "step": 1377 }, { "epoch": 0.6679592825981581, "grad_norm": 1.339187502861023, "learning_rate": 1.5004546372120736e-05, "loss": 1.4457, "step": 1378 }, { "epoch": 0.6684440135724673, "grad_norm": 1.310111165046692, "learning_rate": 1.4965773445454349e-05, "loss": 1.1335, "step": 1379 }, { "epoch": 0.6689287445467765, "grad_norm": 1.339221477508545, "learning_rate": 1.492702926810951e-05, "loss": 1.2751, "step": 1380 }, { "epoch": 0.6694134755210858, "grad_norm": 6.000275611877441, "learning_rate": 1.4888313951093169e-05, "loss": 1.1046, "step": 1381 }, { "epoch": 0.6698982064953951, "grad_norm": 1.5101951360702515, "learning_rate": 1.4849627605329583e-05, "loss": 1.496, "step": 1382 }, { "epoch": 0.6703829374697043, "grad_norm": 1.4664740562438965, "learning_rate": 1.481097034165998e-05, "loss": 1.5279, "step": 1383 }, { "epoch": 0.6708676684440136, "grad_norm": 1.461575984954834, "learning_rate": 1.4772342270842299e-05, "loss": 1.3002, "step": 1384 }, { "epoch": 0.6713523994183228, "grad_norm": 1.4403537511825562, "learning_rate": 1.4733743503550818e-05, "loss": 1.7751, "step": 1385 }, { "epoch": 0.671837130392632, "grad_norm": 1.3830691576004028, "learning_rate": 1.4695174150375865e-05, "loss": 1.2238, "step": 1386 }, { "epoch": 0.6723218613669414, "grad_norm": 1.3348417282104492, "learning_rate": 1.4656634321823493e-05, "loss": 1.3973, "step": 1387 }, { "epoch": 0.6728065923412506, "grad_norm": 1.3746894598007202, "learning_rate": 1.461812412831515e-05, "loss": 1.114, "step": 1388 }, { "epoch": 0.6732913233155599, "grad_norm": 1.3570497035980225, "learning_rate": 1.457964368018739e-05, "loss": 1.4, "step": 1389 }, { "epoch": 0.6737760542898691, "grad_norm": 1.3715555667877197, "learning_rate": 1.4541193087691535e-05, "loss": 1.2383, "step": 1390 }, { "epoch": 0.6742607852641784, "grad_norm": 1.3852076530456543, "learning_rate": 1.4502772460993385e-05, "loss": 1.2747, "step": 1391 }, { "epoch": 0.6747455162384877, "grad_norm": 1.203576683998108, "learning_rate": 1.4464381910172858e-05, "loss": 0.9592, "step": 1392 }, { "epoch": 0.6752302472127969, "grad_norm": 1.6935615539550781, "learning_rate": 1.4426021545223712e-05, "loss": 1.7171, "step": 1393 }, { "epoch": 0.6757149781871061, "grad_norm": 1.4629912376403809, "learning_rate": 1.438769147605322e-05, "loss": 1.4803, "step": 1394 }, { "epoch": 0.6761997091614154, "grad_norm": 1.3970519304275513, "learning_rate": 1.434939181248184e-05, "loss": 1.5205, "step": 1395 }, { "epoch": 0.6766844401357247, "grad_norm": 1.4061120748519897, "learning_rate": 1.4311122664242954e-05, "loss": 1.6476, "step": 1396 }, { "epoch": 0.6771691711100339, "grad_norm": 1.41211998462677, "learning_rate": 1.4272884140982462e-05, "loss": 1.456, "step": 1397 }, { "epoch": 0.6776539020843432, "grad_norm": 1.3315012454986572, "learning_rate": 1.423467635225856e-05, "loss": 1.4234, "step": 1398 }, { "epoch": 0.6781386330586524, "grad_norm": 1.5095267295837402, "learning_rate": 1.4196499407541359e-05, "loss": 1.4583, "step": 1399 }, { "epoch": 0.6786233640329618, "grad_norm": 1.3793443441390991, "learning_rate": 1.4158353416212622e-05, "loss": 1.2173, "step": 1400 }, { "epoch": 0.679108095007271, "grad_norm": 1.3824712038040161, "learning_rate": 1.4120238487565402e-05, "loss": 1.3028, "step": 1401 }, { "epoch": 0.6795928259815802, "grad_norm": 1.375754952430725, "learning_rate": 1.4082154730803774e-05, "loss": 1.2412, "step": 1402 }, { "epoch": 0.6800775569558895, "grad_norm": 1.4535313844680786, "learning_rate": 1.4044102255042475e-05, "loss": 1.5615, "step": 1403 }, { "epoch": 0.6805622879301988, "grad_norm": 1.379002332687378, "learning_rate": 1.4006081169306656e-05, "loss": 1.5593, "step": 1404 }, { "epoch": 0.681047018904508, "grad_norm": 1.3821449279785156, "learning_rate": 1.3968091582531495e-05, "loss": 1.2261, "step": 1405 }, { "epoch": 0.6815317498788173, "grad_norm": 1.4238389730453491, "learning_rate": 1.3930133603561957e-05, "loss": 1.2891, "step": 1406 }, { "epoch": 0.6820164808531265, "grad_norm": 1.3251632452011108, "learning_rate": 1.3892207341152416e-05, "loss": 1.5867, "step": 1407 }, { "epoch": 0.6825012118274357, "grad_norm": 3.028153896331787, "learning_rate": 1.3854312903966377e-05, "loss": 1.4604, "step": 1408 }, { "epoch": 0.6829859428017451, "grad_norm": 1.363827109336853, "learning_rate": 1.381645040057619e-05, "loss": 1.1675, "step": 1409 }, { "epoch": 0.6834706737760543, "grad_norm": 1.4435791969299316, "learning_rate": 1.3778619939462667e-05, "loss": 1.6013, "step": 1410 }, { "epoch": 0.6839554047503635, "grad_norm": 1.4359902143478394, "learning_rate": 1.3740821629014874e-05, "loss": 1.6007, "step": 1411 }, { "epoch": 0.6844401357246728, "grad_norm": 1.3930972814559937, "learning_rate": 1.3703055577529686e-05, "loss": 1.479, "step": 1412 }, { "epoch": 0.6849248666989821, "grad_norm": 1.5467122793197632, "learning_rate": 1.3665321893211618e-05, "loss": 1.3401, "step": 1413 }, { "epoch": 0.6854095976732913, "grad_norm": 1.3341065645217896, "learning_rate": 1.3627620684172407e-05, "loss": 1.4378, "step": 1414 }, { "epoch": 0.6858943286476006, "grad_norm": 1.5038310289382935, "learning_rate": 1.3589952058430778e-05, "loss": 1.3485, "step": 1415 }, { "epoch": 0.6863790596219098, "grad_norm": 1.3664665222167969, "learning_rate": 1.3552316123912063e-05, "loss": 1.4734, "step": 1416 }, { "epoch": 0.686863790596219, "grad_norm": 1.4052238464355469, "learning_rate": 1.3514712988447972e-05, "loss": 1.3877, "step": 1417 }, { "epoch": 0.6873485215705284, "grad_norm": 1.478548526763916, "learning_rate": 1.3477142759776207e-05, "loss": 1.6317, "step": 1418 }, { "epoch": 0.6878332525448376, "grad_norm": 1.338326096534729, "learning_rate": 1.343960554554019e-05, "loss": 1.3528, "step": 1419 }, { "epoch": 0.6883179835191469, "grad_norm": 1.3133000135421753, "learning_rate": 1.3402101453288785e-05, "loss": 1.3344, "step": 1420 }, { "epoch": 0.6888027144934561, "grad_norm": 1.4202029705047607, "learning_rate": 1.3364630590475923e-05, "loss": 1.3458, "step": 1421 }, { "epoch": 0.6892874454677654, "grad_norm": 1.3237619400024414, "learning_rate": 1.3327193064460342e-05, "loss": 1.2891, "step": 1422 }, { "epoch": 0.6897721764420747, "grad_norm": 1.4759186506271362, "learning_rate": 1.328978898250525e-05, "loss": 1.7053, "step": 1423 }, { "epoch": 0.6902569074163839, "grad_norm": 1.4690028429031372, "learning_rate": 1.325241845177807e-05, "loss": 1.5294, "step": 1424 }, { "epoch": 0.6907416383906931, "grad_norm": 1.433915376663208, "learning_rate": 1.3215081579350058e-05, "loss": 1.2603, "step": 1425 }, { "epoch": 0.6912263693650024, "grad_norm": 1.3608239889144897, "learning_rate": 1.3177778472196068e-05, "loss": 1.1412, "step": 1426 }, { "epoch": 0.6917111003393117, "grad_norm": 1.4877392053604126, "learning_rate": 1.3140509237194176e-05, "loss": 1.8585, "step": 1427 }, { "epoch": 0.692195831313621, "grad_norm": 1.3863829374313354, "learning_rate": 1.3103273981125447e-05, "loss": 1.6264, "step": 1428 }, { "epoch": 0.6926805622879302, "grad_norm": 1.4459201097488403, "learning_rate": 1.3066072810673557e-05, "loss": 1.4349, "step": 1429 }, { "epoch": 0.6931652932622394, "grad_norm": 1.4497578144073486, "learning_rate": 1.302890583242457e-05, "loss": 1.2253, "step": 1430 }, { "epoch": 0.6936500242365488, "grad_norm": 1.4433369636535645, "learning_rate": 1.2991773152866515e-05, "loss": 1.1806, "step": 1431 }, { "epoch": 0.694134755210858, "grad_norm": 1.3377454280853271, "learning_rate": 1.2954674878389223e-05, "loss": 1.2198, "step": 1432 }, { "epoch": 0.6946194861851672, "grad_norm": 1.4302222728729248, "learning_rate": 1.2917611115283901e-05, "loss": 1.4984, "step": 1433 }, { "epoch": 0.6951042171594765, "grad_norm": 1.5178338289260864, "learning_rate": 1.2880581969742886e-05, "loss": 1.3589, "step": 1434 }, { "epoch": 0.6955889481337858, "grad_norm": 1.3676424026489258, "learning_rate": 1.2843587547859361e-05, "loss": 1.18, "step": 1435 }, { "epoch": 0.696073679108095, "grad_norm": 1.4502981901168823, "learning_rate": 1.2806627955626982e-05, "loss": 1.3871, "step": 1436 }, { "epoch": 0.6965584100824043, "grad_norm": 1.550480604171753, "learning_rate": 1.2769703298939646e-05, "loss": 1.2154, "step": 1437 }, { "epoch": 0.6970431410567135, "grad_norm": 1.3464187383651733, "learning_rate": 1.2732813683591121e-05, "loss": 1.3843, "step": 1438 }, { "epoch": 0.6975278720310227, "grad_norm": 1.508142352104187, "learning_rate": 1.2695959215274816e-05, "loss": 1.3424, "step": 1439 }, { "epoch": 0.6980126030053321, "grad_norm": 1.3689768314361572, "learning_rate": 1.2659139999583414e-05, "loss": 1.1724, "step": 1440 }, { "epoch": 0.6984973339796413, "grad_norm": 1.5775456428527832, "learning_rate": 1.2622356142008593e-05, "loss": 1.2543, "step": 1441 }, { "epoch": 0.6989820649539505, "grad_norm": 1.415062427520752, "learning_rate": 1.2585607747940729e-05, "loss": 1.4339, "step": 1442 }, { "epoch": 0.6994667959282598, "grad_norm": 1.458363652229309, "learning_rate": 1.2548894922668612e-05, "loss": 1.499, "step": 1443 }, { "epoch": 0.6999515269025691, "grad_norm": 1.39089834690094, "learning_rate": 1.2512217771379087e-05, "loss": 1.274, "step": 1444 }, { "epoch": 0.7004362578768784, "grad_norm": 1.365566611289978, "learning_rate": 1.2475576399156825e-05, "loss": 1.2721, "step": 1445 }, { "epoch": 0.7009209888511876, "grad_norm": 1.3721204996109009, "learning_rate": 1.2438970910983957e-05, "loss": 1.4607, "step": 1446 }, { "epoch": 0.7014057198254968, "grad_norm": 1.3698453903198242, "learning_rate": 1.2402401411739806e-05, "loss": 1.5212, "step": 1447 }, { "epoch": 0.701890450799806, "grad_norm": 2.067800521850586, "learning_rate": 1.2365868006200603e-05, "loss": 1.5141, "step": 1448 }, { "epoch": 0.7023751817741154, "grad_norm": 1.5352057218551636, "learning_rate": 1.232937079903914e-05, "loss": 1.5916, "step": 1449 }, { "epoch": 0.7028599127484246, "grad_norm": 1.3993531465530396, "learning_rate": 1.2292909894824528e-05, "loss": 1.3013, "step": 1450 }, { "epoch": 0.7033446437227339, "grad_norm": 1.529046654701233, "learning_rate": 1.2256485398021808e-05, "loss": 1.1088, "step": 1451 }, { "epoch": 0.7038293746970431, "grad_norm": 1.5416908264160156, "learning_rate": 1.222009741299178e-05, "loss": 1.5297, "step": 1452 }, { "epoch": 0.7043141056713524, "grad_norm": 1.4917817115783691, "learning_rate": 1.2183746043990577e-05, "loss": 1.2723, "step": 1453 }, { "epoch": 0.7047988366456617, "grad_norm": 1.3837028741836548, "learning_rate": 1.2147431395169459e-05, "loss": 1.2728, "step": 1454 }, { "epoch": 0.7052835676199709, "grad_norm": 1.434131383895874, "learning_rate": 1.2111153570574454e-05, "loss": 1.3624, "step": 1455 }, { "epoch": 0.7057682985942801, "grad_norm": 1.430051326751709, "learning_rate": 1.2074912674146107e-05, "loss": 1.3779, "step": 1456 }, { "epoch": 0.7062530295685895, "grad_norm": 1.4436354637145996, "learning_rate": 1.2038708809719137e-05, "loss": 1.3387, "step": 1457 }, { "epoch": 0.7067377605428987, "grad_norm": 1.3345868587493896, "learning_rate": 1.2002542081022165e-05, "loss": 1.2027, "step": 1458 }, { "epoch": 0.707222491517208, "grad_norm": 1.6215115785598755, "learning_rate": 1.196641259167743e-05, "loss": 1.6066, "step": 1459 }, { "epoch": 0.7077072224915172, "grad_norm": 1.4360243082046509, "learning_rate": 1.1930320445200463e-05, "loss": 1.1322, "step": 1460 }, { "epoch": 0.7081919534658264, "grad_norm": 1.3410156965255737, "learning_rate": 1.1894265744999802e-05, "loss": 1.1842, "step": 1461 }, { "epoch": 0.7086766844401358, "grad_norm": 1.6034032106399536, "learning_rate": 1.185824859437669e-05, "loss": 1.7069, "step": 1462 }, { "epoch": 0.709161415414445, "grad_norm": 1.647525668144226, "learning_rate": 1.1822269096524812e-05, "loss": 1.6953, "step": 1463 }, { "epoch": 0.7096461463887542, "grad_norm": 1.4244221448898315, "learning_rate": 1.1786327354529941e-05, "loss": 1.4405, "step": 1464 }, { "epoch": 0.7101308773630635, "grad_norm": 1.4078463315963745, "learning_rate": 1.1750423471369703e-05, "loss": 1.2905, "step": 1465 }, { "epoch": 0.7106156083373728, "grad_norm": 1.5614463090896606, "learning_rate": 1.1714557549913229e-05, "loss": 1.8514, "step": 1466 }, { "epoch": 0.711100339311682, "grad_norm": 1.4428095817565918, "learning_rate": 1.1678729692920911e-05, "loss": 1.2046, "step": 1467 }, { "epoch": 0.7115850702859913, "grad_norm": 1.3544590473175049, "learning_rate": 1.164294000304406e-05, "loss": 1.2275, "step": 1468 }, { "epoch": 0.7120698012603005, "grad_norm": 1.4303261041641235, "learning_rate": 1.1607188582824635e-05, "loss": 1.3526, "step": 1469 }, { "epoch": 0.7125545322346097, "grad_norm": 1.548387050628662, "learning_rate": 1.1571475534694951e-05, "loss": 1.1652, "step": 1470 }, { "epoch": 0.7130392632089191, "grad_norm": 1.4661262035369873, "learning_rate": 1.1535800960977397e-05, "loss": 1.7172, "step": 1471 }, { "epoch": 0.7135239941832283, "grad_norm": 1.3347164392471313, "learning_rate": 1.1500164963884107e-05, "loss": 1.397, "step": 1472 }, { "epoch": 0.7140087251575375, "grad_norm": 1.4761581420898438, "learning_rate": 1.146456764551669e-05, "loss": 1.7281, "step": 1473 }, { "epoch": 0.7144934561318468, "grad_norm": 1.443963646888733, "learning_rate": 1.142900910786596e-05, "loss": 1.5084, "step": 1474 }, { "epoch": 0.7149781871061561, "grad_norm": 1.5460129976272583, "learning_rate": 1.139348945281158e-05, "loss": 1.4503, "step": 1475 }, { "epoch": 0.7154629180804654, "grad_norm": 1.3878977298736572, "learning_rate": 1.1358008782121848e-05, "loss": 1.4604, "step": 1476 }, { "epoch": 0.7159476490547746, "grad_norm": 1.199691653251648, "learning_rate": 1.1322567197453338e-05, "loss": 1.0213, "step": 1477 }, { "epoch": 0.7164323800290838, "grad_norm": 1.3305445909500122, "learning_rate": 1.128716480035066e-05, "loss": 1.3534, "step": 1478 }, { "epoch": 0.7169171110033932, "grad_norm": 1.2963858842849731, "learning_rate": 1.125180169224613e-05, "loss": 1.1662, "step": 1479 }, { "epoch": 0.7174018419777024, "grad_norm": 1.676682472229004, "learning_rate": 1.1216477974459505e-05, "loss": 1.652, "step": 1480 }, { "epoch": 0.7178865729520116, "grad_norm": 1.3726409673690796, "learning_rate": 1.1181193748197667e-05, "loss": 1.2268, "step": 1481 }, { "epoch": 0.7183713039263209, "grad_norm": 1.3873904943466187, "learning_rate": 1.114594911455438e-05, "loss": 1.2706, "step": 1482 }, { "epoch": 0.7188560349006301, "grad_norm": 1.3303292989730835, "learning_rate": 1.1110744174509952e-05, "loss": 1.256, "step": 1483 }, { "epoch": 0.7193407658749394, "grad_norm": 1.3690974712371826, "learning_rate": 1.107557902893095e-05, "loss": 1.1057, "step": 1484 }, { "epoch": 0.7198254968492487, "grad_norm": 1.5268480777740479, "learning_rate": 1.1040453778569961e-05, "loss": 1.3311, "step": 1485 }, { "epoch": 0.7203102278235579, "grad_norm": 1.5212996006011963, "learning_rate": 1.100536852406523e-05, "loss": 1.3117, "step": 1486 }, { "epoch": 0.7207949587978671, "grad_norm": 1.370766282081604, "learning_rate": 1.0970323365940444e-05, "loss": 1.4976, "step": 1487 }, { "epoch": 0.7212796897721765, "grad_norm": 1.3523683547973633, "learning_rate": 1.0935318404604375e-05, "loss": 1.365, "step": 1488 }, { "epoch": 0.7217644207464857, "grad_norm": 1.9818627834320068, "learning_rate": 1.090035374035065e-05, "loss": 1.1948, "step": 1489 }, { "epoch": 0.722249151720795, "grad_norm": 1.465571641921997, "learning_rate": 1.0865429473357414e-05, "loss": 1.6397, "step": 1490 }, { "epoch": 0.7227338826951042, "grad_norm": 1.360374093055725, "learning_rate": 1.0830545703687109e-05, "loss": 1.3702, "step": 1491 }, { "epoch": 0.7232186136694134, "grad_norm": 1.2991136312484741, "learning_rate": 1.0795702531286106e-05, "loss": 1.2215, "step": 1492 }, { "epoch": 0.7237033446437228, "grad_norm": 1.4738467931747437, "learning_rate": 1.0760900055984496e-05, "loss": 1.7089, "step": 1493 }, { "epoch": 0.724188075618032, "grad_norm": 1.5092021226882935, "learning_rate": 1.0726138377495728e-05, "loss": 1.5763, "step": 1494 }, { "epoch": 0.7246728065923412, "grad_norm": 1.505959391593933, "learning_rate": 1.0691417595416407e-05, "loss": 1.4419, "step": 1495 }, { "epoch": 0.7251575375666505, "grad_norm": 1.417377233505249, "learning_rate": 1.0656737809225928e-05, "loss": 1.264, "step": 1496 }, { "epoch": 0.7256422685409598, "grad_norm": 1.3735849857330322, "learning_rate": 1.0622099118286239e-05, "loss": 1.3909, "step": 1497 }, { "epoch": 0.726126999515269, "grad_norm": 1.5356085300445557, "learning_rate": 1.0587501621841558e-05, "loss": 1.4821, "step": 1498 }, { "epoch": 0.7266117304895783, "grad_norm": 1.3162624835968018, "learning_rate": 1.0552945419018065e-05, "loss": 1.2017, "step": 1499 }, { "epoch": 0.7270964614638875, "grad_norm": 1.4844067096710205, "learning_rate": 1.0518430608823621e-05, "loss": 1.5073, "step": 1500 }, { "epoch": 0.7275811924381969, "grad_norm": 1.652967929840088, "learning_rate": 1.0483957290147494e-05, "loss": 1.5602, "step": 1501 }, { "epoch": 0.7280659234125061, "grad_norm": 1.4265861511230469, "learning_rate": 1.0449525561760098e-05, "loss": 1.5582, "step": 1502 }, { "epoch": 0.7285506543868153, "grad_norm": 1.723618984222412, "learning_rate": 1.041513552231265e-05, "loss": 1.869, "step": 1503 }, { "epoch": 0.7290353853611246, "grad_norm": 1.3224735260009766, "learning_rate": 1.0380787270336955e-05, "loss": 1.3943, "step": 1504 }, { "epoch": 0.7295201163354338, "grad_norm": 1.6832599639892578, "learning_rate": 1.034648090424506e-05, "loss": 1.4809, "step": 1505 }, { "epoch": 0.7300048473097431, "grad_norm": 1.4091854095458984, "learning_rate": 1.0312216522329038e-05, "loss": 1.426, "step": 1506 }, { "epoch": 0.7304895782840524, "grad_norm": 1.501326560974121, "learning_rate": 1.0277994222760645e-05, "loss": 1.6879, "step": 1507 }, { "epoch": 0.7309743092583616, "grad_norm": 1.2879735231399536, "learning_rate": 1.0243814103591074e-05, "loss": 1.4322, "step": 1508 }, { "epoch": 0.7314590402326708, "grad_norm": 1.6264989376068115, "learning_rate": 1.0209676262750658e-05, "loss": 1.2868, "step": 1509 }, { "epoch": 0.7319437712069802, "grad_norm": 1.4507060050964355, "learning_rate": 1.0175580798048625e-05, "loss": 1.5371, "step": 1510 }, { "epoch": 0.7324285021812894, "grad_norm": 1.3660101890563965, "learning_rate": 1.0141527807172766e-05, "loss": 1.4603, "step": 1511 }, { "epoch": 0.7329132331555986, "grad_norm": 1.5483052730560303, "learning_rate": 1.0107517387689166e-05, "loss": 1.2558, "step": 1512 }, { "epoch": 0.7333979641299079, "grad_norm": 1.5407285690307617, "learning_rate": 1.0073549637041985e-05, "loss": 1.3025, "step": 1513 }, { "epoch": 0.7338826951042171, "grad_norm": 1.3726648092269897, "learning_rate": 1.0039624652553073e-05, "loss": 1.5513, "step": 1514 }, { "epoch": 0.7343674260785265, "grad_norm": 1.3971195220947266, "learning_rate": 1.0005742531421805e-05, "loss": 1.2535, "step": 1515 }, { "epoch": 0.7348521570528357, "grad_norm": 1.4116908311843872, "learning_rate": 9.9719033707247e-06, "loss": 1.1999, "step": 1516 }, { "epoch": 0.7353368880271449, "grad_norm": 1.2860015630722046, "learning_rate": 9.938107267415238e-06, "loss": 1.4093, "step": 1517 }, { "epoch": 0.7358216190014542, "grad_norm": 1.4017292261123657, "learning_rate": 9.904354318323474e-06, "loss": 1.3041, "step": 1518 }, { "epoch": 0.7363063499757635, "grad_norm": 1.479777216911316, "learning_rate": 9.870644620155877e-06, "loss": 1.4719, "step": 1519 }, { "epoch": 0.7367910809500727, "grad_norm": 1.596462368965149, "learning_rate": 9.836978269494956e-06, "loss": 1.4582, "step": 1520 }, { "epoch": 0.737275811924382, "grad_norm": 1.3571925163269043, "learning_rate": 9.80335536279906e-06, "loss": 1.4515, "step": 1521 }, { "epoch": 0.7377605428986912, "grad_norm": 1.3262391090393066, "learning_rate": 9.76977599640204e-06, "loss": 1.5619, "step": 1522 }, { "epoch": 0.7382452738730004, "grad_norm": 1.3788524866104126, "learning_rate": 9.736240266512992e-06, "loss": 1.2029, "step": 1523 }, { "epoch": 0.7387300048473098, "grad_norm": 1.3858743906021118, "learning_rate": 9.702748269216021e-06, "loss": 1.466, "step": 1524 }, { "epoch": 0.739214735821619, "grad_norm": 1.5450936555862427, "learning_rate": 9.669300100469902e-06, "loss": 1.4527, "step": 1525 }, { "epoch": 0.7396994667959282, "grad_norm": 1.3684203624725342, "learning_rate": 9.635895856107855e-06, "loss": 1.2464, "step": 1526 }, { "epoch": 0.7401841977702375, "grad_norm": 1.3800790309906006, "learning_rate": 9.60253563183724e-06, "loss": 1.2337, "step": 1527 }, { "epoch": 0.7406689287445468, "grad_norm": 1.3781445026397705, "learning_rate": 9.569219523239292e-06, "loss": 1.4606, "step": 1528 }, { "epoch": 0.741153659718856, "grad_norm": 1.3597383499145508, "learning_rate": 9.535947625768851e-06, "loss": 1.2775, "step": 1529 }, { "epoch": 0.7416383906931653, "grad_norm": 1.5373189449310303, "learning_rate": 9.5027200347541e-06, "loss": 1.4209, "step": 1530 }, { "epoch": 0.7421231216674745, "grad_norm": 1.3820744752883911, "learning_rate": 9.46953684539626e-06, "loss": 1.482, "step": 1531 }, { "epoch": 0.7426078526417839, "grad_norm": 1.2832772731781006, "learning_rate": 9.436398152769349e-06, "loss": 0.9302, "step": 1532 }, { "epoch": 0.7430925836160931, "grad_norm": 1.4201370477676392, "learning_rate": 9.403304051819883e-06, "loss": 1.6234, "step": 1533 }, { "epoch": 0.7435773145904023, "grad_norm": 1.4880331754684448, "learning_rate": 9.370254637366638e-06, "loss": 1.3413, "step": 1534 }, { "epoch": 0.7440620455647116, "grad_norm": 1.386278510093689, "learning_rate": 9.337250004100337e-06, "loss": 1.5433, "step": 1535 }, { "epoch": 0.7445467765390208, "grad_norm": 1.4576743841171265, "learning_rate": 9.304290246583398e-06, "loss": 1.686, "step": 1536 }, { "epoch": 0.7450315075133301, "grad_norm": 1.4495551586151123, "learning_rate": 9.271375459249698e-06, "loss": 1.4784, "step": 1537 }, { "epoch": 0.7455162384876394, "grad_norm": 1.3655641078948975, "learning_rate": 9.238505736404212e-06, "loss": 1.4092, "step": 1538 }, { "epoch": 0.7460009694619486, "grad_norm": 1.4185867309570312, "learning_rate": 9.205681172222854e-06, "loss": 1.4356, "step": 1539 }, { "epoch": 0.7464857004362578, "grad_norm": 1.4596229791641235, "learning_rate": 9.172901860752117e-06, "loss": 1.3854, "step": 1540 }, { "epoch": 0.7469704314105672, "grad_norm": 1.3605375289916992, "learning_rate": 9.140167895908867e-06, "loss": 1.283, "step": 1541 }, { "epoch": 0.7474551623848764, "grad_norm": 1.421411156654358, "learning_rate": 9.107479371480016e-06, "loss": 1.5658, "step": 1542 }, { "epoch": 0.7479398933591856, "grad_norm": 1.4605516195297241, "learning_rate": 9.074836381122312e-06, "loss": 1.5596, "step": 1543 }, { "epoch": 0.7484246243334949, "grad_norm": 1.7730802297592163, "learning_rate": 9.04223901836202e-06, "loss": 1.5225, "step": 1544 }, { "epoch": 0.7489093553078041, "grad_norm": 1.4480133056640625, "learning_rate": 9.009687376594694e-06, "loss": 1.0663, "step": 1545 }, { "epoch": 0.7493940862821135, "grad_norm": 1.3374335765838623, "learning_rate": 8.977181549084884e-06, "loss": 1.0286, "step": 1546 }, { "epoch": 0.7498788172564227, "grad_norm": 1.3053975105285645, "learning_rate": 8.944721628965868e-06, "loss": 1.3729, "step": 1547 }, { "epoch": 0.7503635482307319, "grad_norm": 1.3824571371078491, "learning_rate": 8.912307709239394e-06, "loss": 1.3485, "step": 1548 }, { "epoch": 0.7508482792050412, "grad_norm": 1.3053250312805176, "learning_rate": 8.879939882775443e-06, "loss": 1.314, "step": 1549 }, { "epoch": 0.7513330101793505, "grad_norm": 1.4634974002838135, "learning_rate": 8.847618242311895e-06, "loss": 1.3767, "step": 1550 }, { "epoch": 0.7518177411536597, "grad_norm": 1.7195087671279907, "learning_rate": 8.815342880454311e-06, "loss": 1.7894, "step": 1551 }, { "epoch": 0.752302472127969, "grad_norm": 1.4599210023880005, "learning_rate": 8.783113889675679e-06, "loss": 1.4189, "step": 1552 }, { "epoch": 0.7527872031022782, "grad_norm": 1.5068151950836182, "learning_rate": 8.750931362316094e-06, "loss": 1.4675, "step": 1553 }, { "epoch": 0.7532719340765875, "grad_norm": 1.356566071510315, "learning_rate": 8.718795390582569e-06, "loss": 1.0757, "step": 1554 }, { "epoch": 0.7537566650508968, "grad_norm": 1.467347264289856, "learning_rate": 8.686706066548686e-06, "loss": 1.4609, "step": 1555 }, { "epoch": 0.754241396025206, "grad_norm": 1.4406360387802124, "learning_rate": 8.654663482154419e-06, "loss": 1.4178, "step": 1556 }, { "epoch": 0.7547261269995152, "grad_norm": 1.337432861328125, "learning_rate": 8.622667729205771e-06, "loss": 1.1877, "step": 1557 }, { "epoch": 0.7552108579738245, "grad_norm": 1.2744303941726685, "learning_rate": 8.590718899374628e-06, "loss": 1.3257, "step": 1558 }, { "epoch": 0.7556955889481338, "grad_norm": 1.3614236116409302, "learning_rate": 8.558817084198387e-06, "loss": 1.2649, "step": 1559 }, { "epoch": 0.7561803199224431, "grad_norm": 1.5040825605392456, "learning_rate": 8.52696237507978e-06, "loss": 1.4065, "step": 1560 }, { "epoch": 0.7566650508967523, "grad_norm": 1.427902102470398, "learning_rate": 8.495154863286548e-06, "loss": 1.6126, "step": 1561 }, { "epoch": 0.7571497818710615, "grad_norm": 1.4786641597747803, "learning_rate": 8.463394639951206e-06, "loss": 1.6583, "step": 1562 }, { "epoch": 0.7576345128453709, "grad_norm": 1.4081834554672241, "learning_rate": 8.431681796070809e-06, "loss": 1.3911, "step": 1563 }, { "epoch": 0.7581192438196801, "grad_norm": 1.4340317249298096, "learning_rate": 8.400016422506624e-06, "loss": 1.5016, "step": 1564 }, { "epoch": 0.7586039747939893, "grad_norm": 1.3345164060592651, "learning_rate": 8.368398609983945e-06, "loss": 1.0097, "step": 1565 }, { "epoch": 0.7590887057682986, "grad_norm": 1.5775506496429443, "learning_rate": 8.336828449091786e-06, "loss": 1.7549, "step": 1566 }, { "epoch": 0.7595734367426078, "grad_norm": 1.4792371988296509, "learning_rate": 8.305306030282617e-06, "loss": 1.3905, "step": 1567 }, { "epoch": 0.7600581677169171, "grad_norm": 1.5295783281326294, "learning_rate": 8.273831443872132e-06, "loss": 1.6053, "step": 1568 }, { "epoch": 0.7605428986912264, "grad_norm": 1.3806251287460327, "learning_rate": 8.242404780038996e-06, "loss": 1.2319, "step": 1569 }, { "epoch": 0.7610276296655356, "grad_norm": 1.5314702987670898, "learning_rate": 8.211026128824539e-06, "loss": 1.4693, "step": 1570 }, { "epoch": 0.7615123606398448, "grad_norm": 1.3532733917236328, "learning_rate": 8.179695580132563e-06, "loss": 1.4102, "step": 1571 }, { "epoch": 0.7619970916141542, "grad_norm": 1.3659793138504028, "learning_rate": 8.14841322372901e-06, "loss": 1.041, "step": 1572 }, { "epoch": 0.7624818225884634, "grad_norm": 1.439633846282959, "learning_rate": 8.117179149241788e-06, "loss": 1.4129, "step": 1573 }, { "epoch": 0.7629665535627727, "grad_norm": 1.3845553398132324, "learning_rate": 8.085993446160442e-06, "loss": 1.2368, "step": 1574 }, { "epoch": 0.7634512845370819, "grad_norm": 1.5613845586776733, "learning_rate": 8.054856203835934e-06, "loss": 1.5857, "step": 1575 }, { "epoch": 0.7639360155113912, "grad_norm": 1.3105788230895996, "learning_rate": 8.023767511480378e-06, "loss": 1.2001, "step": 1576 }, { "epoch": 0.7644207464857005, "grad_norm": 1.5210412740707397, "learning_rate": 7.992727458166788e-06, "loss": 1.6572, "step": 1577 }, { "epoch": 0.7649054774600097, "grad_norm": 1.3836086988449097, "learning_rate": 7.96173613282883e-06, "loss": 1.3445, "step": 1578 }, { "epoch": 0.7653902084343189, "grad_norm": 1.2600528001785278, "learning_rate": 7.93079362426054e-06, "loss": 1.184, "step": 1579 }, { "epoch": 0.7658749394086282, "grad_norm": 1.443926453590393, "learning_rate": 7.89990002111611e-06, "loss": 1.3783, "step": 1580 }, { "epoch": 0.7663596703829375, "grad_norm": 1.4408482313156128, "learning_rate": 7.86905541190959e-06, "loss": 1.4508, "step": 1581 }, { "epoch": 0.7668444013572467, "grad_norm": 1.4853960275650024, "learning_rate": 7.838259885014676e-06, "loss": 1.5522, "step": 1582 }, { "epoch": 0.767329132331556, "grad_norm": 1.9142934083938599, "learning_rate": 7.807513528664414e-06, "loss": 1.2533, "step": 1583 }, { "epoch": 0.7678138633058652, "grad_norm": 1.4642645120620728, "learning_rate": 7.776816430950997e-06, "loss": 1.2509, "step": 1584 }, { "epoch": 0.7682985942801746, "grad_norm": 1.3161667585372925, "learning_rate": 7.746168679825468e-06, "loss": 1.1, "step": 1585 }, { "epoch": 0.7687833252544838, "grad_norm": 1.469278335571289, "learning_rate": 7.715570363097487e-06, "loss": 1.3898, "step": 1586 }, { "epoch": 0.769268056228793, "grad_norm": 1.4469612836837769, "learning_rate": 7.685021568435074e-06, "loss": 1.358, "step": 1587 }, { "epoch": 0.7697527872031023, "grad_norm": 1.5007667541503906, "learning_rate": 7.654522383364387e-06, "loss": 1.5311, "step": 1588 }, { "epoch": 0.7702375181774115, "grad_norm": 1.4023587703704834, "learning_rate": 7.624072895269418e-06, "loss": 1.3025, "step": 1589 }, { "epoch": 0.7707222491517208, "grad_norm": 1.348415493965149, "learning_rate": 7.593673191391776e-06, "loss": 1.2927, "step": 1590 }, { "epoch": 0.7712069801260301, "grad_norm": 1.487410068511963, "learning_rate": 7.563323358830448e-06, "loss": 1.8384, "step": 1591 }, { "epoch": 0.7716917111003393, "grad_norm": 1.5158098936080933, "learning_rate": 7.533023484541513e-06, "loss": 1.3989, "step": 1592 }, { "epoch": 0.7721764420746485, "grad_norm": 1.2837767601013184, "learning_rate": 7.502773655337936e-06, "loss": 1.0266, "step": 1593 }, { "epoch": 0.7726611730489579, "grad_norm": 1.5535281896591187, "learning_rate": 7.472573957889267e-06, "loss": 1.5511, "step": 1594 }, { "epoch": 0.7731459040232671, "grad_norm": 1.699720025062561, "learning_rate": 7.4424244787214656e-06, "loss": 1.3947, "step": 1595 }, { "epoch": 0.7736306349975763, "grad_norm": 1.3902910947799683, "learning_rate": 7.4123253042165495e-06, "loss": 1.5077, "step": 1596 }, { "epoch": 0.7741153659718856, "grad_norm": 1.4567160606384277, "learning_rate": 7.382276520612463e-06, "loss": 1.377, "step": 1597 }, { "epoch": 0.7746000969461949, "grad_norm": 1.6392182111740112, "learning_rate": 7.352278214002739e-06, "loss": 1.7202, "step": 1598 }, { "epoch": 0.7750848279205041, "grad_norm": 1.4235200881958008, "learning_rate": 7.3223304703363135e-06, "loss": 1.3698, "step": 1599 }, { "epoch": 0.7755695588948134, "grad_norm": 1.3485275506973267, "learning_rate": 7.292433375417232e-06, "loss": 1.1991, "step": 1600 }, { "epoch": 0.7760542898691226, "grad_norm": 1.5387061834335327, "learning_rate": 7.262587014904429e-06, "loss": 1.414, "step": 1601 }, { "epoch": 0.7765390208434318, "grad_norm": 1.4910553693771362, "learning_rate": 7.232791474311493e-06, "loss": 1.3939, "step": 1602 }, { "epoch": 0.7770237518177412, "grad_norm": 1.3060449361801147, "learning_rate": 7.203046839006383e-06, "loss": 1.3913, "step": 1603 }, { "epoch": 0.7775084827920504, "grad_norm": 1.4318301677703857, "learning_rate": 7.173353194211247e-06, "loss": 1.3158, "step": 1604 }, { "epoch": 0.7779932137663597, "grad_norm": 1.4487580060958862, "learning_rate": 7.143710625002078e-06, "loss": 1.4206, "step": 1605 }, { "epoch": 0.7784779447406689, "grad_norm": 2.377192258834839, "learning_rate": 7.114119216308593e-06, "loss": 1.3553, "step": 1606 }, { "epoch": 0.7789626757149782, "grad_norm": 1.4438358545303345, "learning_rate": 7.084579052913884e-06, "loss": 1.6249, "step": 1607 }, { "epoch": 0.7794474066892875, "grad_norm": 1.6939022541046143, "learning_rate": 7.0550902194542525e-06, "loss": 1.5022, "step": 1608 }, { "epoch": 0.7799321376635967, "grad_norm": 1.4086045026779175, "learning_rate": 7.0256528004188995e-06, "loss": 1.486, "step": 1609 }, { "epoch": 0.7804168686379059, "grad_norm": 1.3695285320281982, "learning_rate": 6.996266880149749e-06, "loss": 1.2197, "step": 1610 }, { "epoch": 0.7809015996122152, "grad_norm": 1.3833764791488647, "learning_rate": 6.966932542841156e-06, "loss": 1.2276, "step": 1611 }, { "epoch": 0.7813863305865245, "grad_norm": 1.2639784812927246, "learning_rate": 6.937649872539675e-06, "loss": 1.1809, "step": 1612 }, { "epoch": 0.7818710615608337, "grad_norm": 1.495296597480774, "learning_rate": 6.908418953143861e-06, "loss": 1.3947, "step": 1613 }, { "epoch": 0.782355792535143, "grad_norm": 1.2453677654266357, "learning_rate": 6.879239868403964e-06, "loss": 1.3538, "step": 1614 }, { "epoch": 0.7828405235094522, "grad_norm": 1.3305635452270508, "learning_rate": 6.8501127019217346e-06, "loss": 1.3287, "step": 1615 }, { "epoch": 0.7833252544837616, "grad_norm": 1.4396018981933594, "learning_rate": 6.8210375371501625e-06, "loss": 1.4193, "step": 1616 }, { "epoch": 0.7838099854580708, "grad_norm": 1.4076224565505981, "learning_rate": 6.7920144573932695e-06, "loss": 1.3454, "step": 1617 }, { "epoch": 0.78429471643238, "grad_norm": 1.4297701120376587, "learning_rate": 6.7630435458058114e-06, "loss": 1.4004, "step": 1618 }, { "epoch": 0.7847794474066893, "grad_norm": 1.3946882486343384, "learning_rate": 6.734124885393111e-06, "loss": 1.5085, "step": 1619 }, { "epoch": 0.7852641783809986, "grad_norm": 1.538183331489563, "learning_rate": 6.705258559010755e-06, "loss": 1.7725, "step": 1620 }, { "epoch": 0.7857489093553078, "grad_norm": 1.3094892501831055, "learning_rate": 6.676444649364416e-06, "loss": 1.3554, "step": 1621 }, { "epoch": 0.7862336403296171, "grad_norm": 1.6407068967819214, "learning_rate": 6.647683239009556e-06, "loss": 1.5119, "step": 1622 }, { "epoch": 0.7867183713039263, "grad_norm": 1.463111400604248, "learning_rate": 6.618974410351247e-06, "loss": 1.4116, "step": 1623 }, { "epoch": 0.7872031022782355, "grad_norm": 1.4133925437927246, "learning_rate": 6.590318245643887e-06, "loss": 1.3629, "step": 1624 }, { "epoch": 0.7876878332525449, "grad_norm": 1.5195677280426025, "learning_rate": 6.561714826990998e-06, "loss": 1.8831, "step": 1625 }, { "epoch": 0.7881725642268541, "grad_norm": 1.3426353931427002, "learning_rate": 6.533164236344966e-06, "loss": 1.1815, "step": 1626 }, { "epoch": 0.7886572952011633, "grad_norm": 1.381893277168274, "learning_rate": 6.504666555506825e-06, "loss": 1.4206, "step": 1627 }, { "epoch": 0.7891420261754726, "grad_norm": 1.40556800365448, "learning_rate": 6.476221866126029e-06, "loss": 1.2124, "step": 1628 }, { "epoch": 0.7896267571497819, "grad_norm": 1.4516103267669678, "learning_rate": 6.447830249700174e-06, "loss": 1.2965, "step": 1629 }, { "epoch": 0.7901114881240912, "grad_norm": 1.3030951023101807, "learning_rate": 6.41949178757483e-06, "loss": 1.2891, "step": 1630 }, { "epoch": 0.7905962190984004, "grad_norm": 1.4862583875656128, "learning_rate": 6.3912065609432415e-06, "loss": 1.4867, "step": 1631 }, { "epoch": 0.7910809500727096, "grad_norm": 1.3782804012298584, "learning_rate": 6.362974650846157e-06, "loss": 1.4044, "step": 1632 }, { "epoch": 0.7915656810470189, "grad_norm": 1.3330339193344116, "learning_rate": 6.334796138171542e-06, "loss": 1.4485, "step": 1633 }, { "epoch": 0.7920504120213282, "grad_norm": 1.527451753616333, "learning_rate": 6.306671103654382e-06, "loss": 1.5054, "step": 1634 }, { "epoch": 0.7925351429956374, "grad_norm": 1.3172121047973633, "learning_rate": 6.278599627876433e-06, "loss": 1.3527, "step": 1635 }, { "epoch": 0.7930198739699467, "grad_norm": 1.3731809854507446, "learning_rate": 6.250581791266019e-06, "loss": 1.4858, "step": 1636 }, { "epoch": 0.7935046049442559, "grad_norm": 1.3459367752075195, "learning_rate": 6.22261767409775e-06, "loss": 1.3209, "step": 1637 }, { "epoch": 0.7939893359185652, "grad_norm": 2.2093305587768555, "learning_rate": 6.1947073564923576e-06, "loss": 1.2707, "step": 1638 }, { "epoch": 0.7944740668928745, "grad_norm": 1.42824387550354, "learning_rate": 6.166850918416406e-06, "loss": 1.3117, "step": 1639 }, { "epoch": 0.7949587978671837, "grad_norm": 1.436257243156433, "learning_rate": 6.139048439682085e-06, "loss": 1.4706, "step": 1640 }, { "epoch": 0.7954435288414929, "grad_norm": 1.4861083030700684, "learning_rate": 6.111299999947009e-06, "loss": 1.4673, "step": 1641 }, { "epoch": 0.7959282598158022, "grad_norm": 1.4589564800262451, "learning_rate": 6.083605678713939e-06, "loss": 1.4155, "step": 1642 }, { "epoch": 0.7964129907901115, "grad_norm": 1.4518787860870361, "learning_rate": 6.055965555330606e-06, "loss": 1.3371, "step": 1643 }, { "epoch": 0.7968977217644208, "grad_norm": 1.3608945608139038, "learning_rate": 6.028379708989418e-06, "loss": 1.2777, "step": 1644 }, { "epoch": 0.79738245273873, "grad_norm": 1.3667936325073242, "learning_rate": 6.000848218727312e-06, "loss": 1.2991, "step": 1645 }, { "epoch": 0.7978671837130392, "grad_norm": 1.4242197275161743, "learning_rate": 5.973371163425456e-06, "loss": 1.4603, "step": 1646 }, { "epoch": 0.7983519146873486, "grad_norm": 1.4276293516159058, "learning_rate": 5.945948621809091e-06, "loss": 1.214, "step": 1647 }, { "epoch": 0.7988366456616578, "grad_norm": 1.4498988389968872, "learning_rate": 5.91858067244723e-06, "loss": 1.3601, "step": 1648 }, { "epoch": 0.799321376635967, "grad_norm": 1.4795246124267578, "learning_rate": 5.891267393752509e-06, "loss": 1.5095, "step": 1649 }, { "epoch": 0.7998061076102763, "grad_norm": 1.371968388557434, "learning_rate": 5.864008863980897e-06, "loss": 1.562, "step": 1650 }, { "epoch": 0.8002908385845856, "grad_norm": 1.39975106716156, "learning_rate": 5.836805161231507e-06, "loss": 1.4923, "step": 1651 }, { "epoch": 0.8007755695588948, "grad_norm": 1.4395267963409424, "learning_rate": 5.809656363446381e-06, "loss": 1.2459, "step": 1652 }, { "epoch": 0.8012603005332041, "grad_norm": 1.383536696434021, "learning_rate": 5.782562548410236e-06, "loss": 1.1381, "step": 1653 }, { "epoch": 0.8017450315075133, "grad_norm": 1.2943404912948608, "learning_rate": 5.7555237937502616e-06, "loss": 1.2698, "step": 1654 }, { "epoch": 0.8022297624818225, "grad_norm": 1.4058960676193237, "learning_rate": 5.7285401769358845e-06, "loss": 1.6954, "step": 1655 }, { "epoch": 0.8027144934561319, "grad_norm": 1.4720020294189453, "learning_rate": 5.701611775278573e-06, "loss": 1.5013, "step": 1656 }, { "epoch": 0.8031992244304411, "grad_norm": 1.3335487842559814, "learning_rate": 5.674738665931575e-06, "loss": 1.239, "step": 1657 }, { "epoch": 0.8036839554047503, "grad_norm": 1.3698316812515259, "learning_rate": 5.647920925889744e-06, "loss": 1.0645, "step": 1658 }, { "epoch": 0.8041686863790596, "grad_norm": 1.3854635953903198, "learning_rate": 5.6211586319892625e-06, "loss": 1.3054, "step": 1659 }, { "epoch": 0.8046534173533689, "grad_norm": 1.5994597673416138, "learning_rate": 5.594451860907485e-06, "loss": 1.411, "step": 1660 }, { "epoch": 0.8051381483276782, "grad_norm": 1.4336647987365723, "learning_rate": 5.567800689162658e-06, "loss": 1.4892, "step": 1661 }, { "epoch": 0.8056228793019874, "grad_norm": 1.753812551498413, "learning_rate": 5.541205193113763e-06, "loss": 1.6698, "step": 1662 }, { "epoch": 0.8061076102762966, "grad_norm": 1.2079178094863892, "learning_rate": 5.51466544896021e-06, "loss": 1.0658, "step": 1663 }, { "epoch": 0.8065923412506059, "grad_norm": 1.3493430614471436, "learning_rate": 5.488181532741732e-06, "loss": 1.3803, "step": 1664 }, { "epoch": 0.8070770722249152, "grad_norm": 1.5290086269378662, "learning_rate": 5.46175352033807e-06, "loss": 1.5866, "step": 1665 }, { "epoch": 0.8075618031992244, "grad_norm": 1.3983334302902222, "learning_rate": 5.435381487468799e-06, "loss": 1.595, "step": 1666 }, { "epoch": 0.8080465341735337, "grad_norm": 1.5040364265441895, "learning_rate": 5.409065509693126e-06, "loss": 1.4392, "step": 1667 }, { "epoch": 0.8085312651478429, "grad_norm": 1.456047534942627, "learning_rate": 5.382805662409623e-06, "loss": 1.504, "step": 1668 }, { "epoch": 0.8090159961221522, "grad_norm": 1.338990330696106, "learning_rate": 5.356602020856072e-06, "loss": 1.5103, "step": 1669 }, { "epoch": 0.8095007270964615, "grad_norm": 1.4504512548446655, "learning_rate": 5.330454660109185e-06, "loss": 1.0707, "step": 1670 }, { "epoch": 0.8099854580707707, "grad_norm": 1.4192159175872803, "learning_rate": 5.30436365508446e-06, "loss": 1.6072, "step": 1671 }, { "epoch": 0.81047018904508, "grad_norm": 1.5492674112319946, "learning_rate": 5.278329080535896e-06, "loss": 1.6888, "step": 1672 }, { "epoch": 0.8109549200193893, "grad_norm": 1.4452522993087769, "learning_rate": 5.252351011055831e-06, "loss": 1.4031, "step": 1673 }, { "epoch": 0.8114396509936985, "grad_norm": 1.4986207485198975, "learning_rate": 5.226429521074691e-06, "loss": 1.5358, "step": 1674 }, { "epoch": 0.8119243819680078, "grad_norm": 1.3144514560699463, "learning_rate": 5.20056468486082e-06, "loss": 1.5166, "step": 1675 }, { "epoch": 0.812409112942317, "grad_norm": 1.328102946281433, "learning_rate": 5.174756576520218e-06, "loss": 0.9864, "step": 1676 }, { "epoch": 0.8128938439166262, "grad_norm": 1.3630974292755127, "learning_rate": 5.149005269996374e-06, "loss": 1.5685, "step": 1677 }, { "epoch": 0.8133785748909356, "grad_norm": 1.7009714841842651, "learning_rate": 5.123310839070011e-06, "loss": 1.2827, "step": 1678 }, { "epoch": 0.8138633058652448, "grad_norm": 1.3941096067428589, "learning_rate": 5.097673357358907e-06, "loss": 1.3748, "step": 1679 }, { "epoch": 0.814348036839554, "grad_norm": 1.5804961919784546, "learning_rate": 5.072092898317679e-06, "loss": 1.9325, "step": 1680 }, { "epoch": 0.8148327678138633, "grad_norm": 1.3486220836639404, "learning_rate": 5.04656953523755e-06, "loss": 1.0934, "step": 1681 }, { "epoch": 0.8153174987881726, "grad_norm": 1.475723385810852, "learning_rate": 5.021103341246186e-06, "loss": 1.1359, "step": 1682 }, { "epoch": 0.8158022297624818, "grad_norm": 1.3328871726989746, "learning_rate": 4.995694389307412e-06, "loss": 1.1081, "step": 1683 }, { "epoch": 0.8162869607367911, "grad_norm": 1.5600857734680176, "learning_rate": 4.9703427522210914e-06, "loss": 1.7475, "step": 1684 }, { "epoch": 0.8167716917111003, "grad_norm": 1.3647994995117188, "learning_rate": 4.94504850262284e-06, "loss": 1.35, "step": 1685 }, { "epoch": 0.8172564226854095, "grad_norm": 1.3073344230651855, "learning_rate": 4.91981171298388e-06, "loss": 1.2369, "step": 1686 }, { "epoch": 0.8177411536597189, "grad_norm": 1.335671067237854, "learning_rate": 4.894632455610773e-06, "loss": 1.2816, "step": 1687 }, { "epoch": 0.8182258846340281, "grad_norm": 1.4057748317718506, "learning_rate": 4.8695108026452745e-06, "loss": 1.1653, "step": 1688 }, { "epoch": 0.8187106156083374, "grad_norm": 1.358802318572998, "learning_rate": 4.8444468260640755e-06, "loss": 1.5163, "step": 1689 }, { "epoch": 0.8191953465826466, "grad_norm": 1.3890354633331299, "learning_rate": 4.819440597678612e-06, "loss": 1.3894, "step": 1690 }, { "epoch": 0.8196800775569559, "grad_norm": 1.4253144264221191, "learning_rate": 4.794492189134892e-06, "loss": 1.4175, "step": 1691 }, { "epoch": 0.8201648085312652, "grad_norm": 1.2631031274795532, "learning_rate": 4.769601671913234e-06, "loss": 1.0669, "step": 1692 }, { "epoch": 0.8206495395055744, "grad_norm": 1.477522611618042, "learning_rate": 4.744769117328107e-06, "loss": 1.6204, "step": 1693 }, { "epoch": 0.8211342704798836, "grad_norm": 1.3935991525650024, "learning_rate": 4.719994596527894e-06, "loss": 1.2437, "step": 1694 }, { "epoch": 0.821619001454193, "grad_norm": 1.2869259119033813, "learning_rate": 4.695278180494725e-06, "loss": 1.383, "step": 1695 }, { "epoch": 0.8221037324285022, "grad_norm": 1.3717503547668457, "learning_rate": 4.670619940044233e-06, "loss": 1.3475, "step": 1696 }, { "epoch": 0.8225884634028114, "grad_norm": 1.3983089923858643, "learning_rate": 4.646019945825392e-06, "loss": 1.5844, "step": 1697 }, { "epoch": 0.8230731943771207, "grad_norm": 1.434476375579834, "learning_rate": 4.621478268320265e-06, "loss": 1.3396, "step": 1698 }, { "epoch": 0.8235579253514299, "grad_norm": 1.392899990081787, "learning_rate": 4.5969949778438575e-06, "loss": 1.595, "step": 1699 }, { "epoch": 0.8240426563257393, "grad_norm": 1.4684665203094482, "learning_rate": 4.5725701445438775e-06, "loss": 1.4212, "step": 1700 }, { "epoch": 0.8245273873000485, "grad_norm": 1.4998910427093506, "learning_rate": 4.548203838400539e-06, "loss": 1.7012, "step": 1701 }, { "epoch": 0.8250121182743577, "grad_norm": 1.3950318098068237, "learning_rate": 4.523896129226371e-06, "loss": 1.4352, "step": 1702 }, { "epoch": 0.825496849248667, "grad_norm": 1.3992888927459717, "learning_rate": 4.499647086666029e-06, "loss": 1.3451, "step": 1703 }, { "epoch": 0.8259815802229763, "grad_norm": 1.4925978183746338, "learning_rate": 4.475456780196066e-06, "loss": 1.5352, "step": 1704 }, { "epoch": 0.8264663111972855, "grad_norm": 1.5991015434265137, "learning_rate": 4.451325279124749e-06, "loss": 1.5298, "step": 1705 }, { "epoch": 0.8269510421715948, "grad_norm": 1.3293036222457886, "learning_rate": 4.427252652591876e-06, "loss": 1.1902, "step": 1706 }, { "epoch": 0.827435773145904, "grad_norm": 1.5411173105239868, "learning_rate": 4.40323896956854e-06, "loss": 1.3795, "step": 1707 }, { "epoch": 0.8279205041202132, "grad_norm": 1.4103530645370483, "learning_rate": 4.379284298856973e-06, "loss": 1.5143, "step": 1708 }, { "epoch": 0.8284052350945226, "grad_norm": 1.4393373727798462, "learning_rate": 4.3553887090903075e-06, "loss": 1.4624, "step": 1709 }, { "epoch": 0.8288899660688318, "grad_norm": 1.8007562160491943, "learning_rate": 4.331552268732433e-06, "loss": 1.6461, "step": 1710 }, { "epoch": 0.829374697043141, "grad_norm": 1.4097933769226074, "learning_rate": 4.307775046077739e-06, "loss": 1.1364, "step": 1711 }, { "epoch": 0.8298594280174503, "grad_norm": 1.2913793325424194, "learning_rate": 4.284057109250961e-06, "loss": 1.37, "step": 1712 }, { "epoch": 0.8303441589917596, "grad_norm": 1.320348858833313, "learning_rate": 4.2603985262069656e-06, "loss": 1.6305, "step": 1713 }, { "epoch": 0.8308288899660689, "grad_norm": 1.3655133247375488, "learning_rate": 4.236799364730582e-06, "loss": 1.4158, "step": 1714 }, { "epoch": 0.8313136209403781, "grad_norm": 1.4090162515640259, "learning_rate": 4.213259692436367e-06, "loss": 1.4813, "step": 1715 }, { "epoch": 0.8317983519146873, "grad_norm": 1.4175288677215576, "learning_rate": 4.189779576768454e-06, "loss": 1.6809, "step": 1716 }, { "epoch": 0.8322830828889967, "grad_norm": 1.5763258934020996, "learning_rate": 4.166359085000324e-06, "loss": 1.8811, "step": 1717 }, { "epoch": 0.8327678138633059, "grad_norm": 1.3878095149993896, "learning_rate": 4.142998284234622e-06, "loss": 1.35, "step": 1718 }, { "epoch": 0.8332525448376151, "grad_norm": 1.4590060710906982, "learning_rate": 4.119697241402998e-06, "loss": 1.3375, "step": 1719 }, { "epoch": 0.8337372758119244, "grad_norm": 1.3511408567428589, "learning_rate": 4.096456023265866e-06, "loss": 1.384, "step": 1720 }, { "epoch": 0.8342220067862336, "grad_norm": 1.4394463300704956, "learning_rate": 4.073274696412235e-06, "loss": 1.6411, "step": 1721 }, { "epoch": 0.8347067377605429, "grad_norm": 1.3683598041534424, "learning_rate": 4.05015332725952e-06, "loss": 1.2006, "step": 1722 }, { "epoch": 0.8351914687348522, "grad_norm": 1.4397015571594238, "learning_rate": 4.027091982053369e-06, "loss": 1.4946, "step": 1723 }, { "epoch": 0.8356761997091614, "grad_norm": 1.5546510219573975, "learning_rate": 4.004090726867416e-06, "loss": 1.5381, "step": 1724 }, { "epoch": 0.8361609306834706, "grad_norm": 1.4166737794876099, "learning_rate": 3.98114962760317e-06, "loss": 1.4553, "step": 1725 }, { "epoch": 0.83664566165778, "grad_norm": 1.4045753479003906, "learning_rate": 3.9582687499897545e-06, "loss": 1.3718, "step": 1726 }, { "epoch": 0.8371303926320892, "grad_norm": 1.3510545492172241, "learning_rate": 3.935448159583774e-06, "loss": 1.4418, "step": 1727 }, { "epoch": 0.8376151236063984, "grad_norm": 1.3021389245986938, "learning_rate": 3.912687921769082e-06, "loss": 1.183, "step": 1728 }, { "epoch": 0.8380998545807077, "grad_norm": 1.3169225454330444, "learning_rate": 3.88998810175662e-06, "loss": 1.5796, "step": 1729 }, { "epoch": 0.8385845855550169, "grad_norm": 1.3681930303573608, "learning_rate": 3.8673487645842415e-06, "loss": 1.2193, "step": 1730 }, { "epoch": 0.8390693165293263, "grad_norm": 1.4157400131225586, "learning_rate": 3.844769975116488e-06, "loss": 1.3846, "step": 1731 }, { "epoch": 0.8395540475036355, "grad_norm": 1.3580200672149658, "learning_rate": 3.8222517980444325e-06, "loss": 1.0942, "step": 1732 }, { "epoch": 0.8400387784779447, "grad_norm": 1.3693870306015015, "learning_rate": 3.7997942978854785e-06, "loss": 1.2999, "step": 1733 }, { "epoch": 0.840523509452254, "grad_norm": 1.290262222290039, "learning_rate": 3.7773975389832043e-06, "loss": 1.3288, "step": 1734 }, { "epoch": 0.8410082404265633, "grad_norm": 1.3575208187103271, "learning_rate": 3.7550615855071277e-06, "loss": 1.5665, "step": 1735 }, { "epoch": 0.8414929714008725, "grad_norm": 1.3926094770431519, "learning_rate": 3.7327865014525787e-06, "loss": 1.3471, "step": 1736 }, { "epoch": 0.8419777023751818, "grad_norm": 1.4269956350326538, "learning_rate": 3.710572350640465e-06, "loss": 1.618, "step": 1737 }, { "epoch": 0.842462433349491, "grad_norm": 1.4620566368103027, "learning_rate": 3.6884191967171327e-06, "loss": 1.4229, "step": 1738 }, { "epoch": 0.8429471643238002, "grad_norm": 1.3834177255630493, "learning_rate": 3.666327103154149e-06, "loss": 1.5917, "step": 1739 }, { "epoch": 0.8434318952981096, "grad_norm": 1.575303077697754, "learning_rate": 3.644296133248143e-06, "loss": 2.0171, "step": 1740 }, { "epoch": 0.8439166262724188, "grad_norm": 1.4172624349594116, "learning_rate": 3.6223263501206113e-06, "loss": 1.7448, "step": 1741 }, { "epoch": 0.844401357246728, "grad_norm": 1.4966297149658203, "learning_rate": 3.600417816717755e-06, "loss": 1.1987, "step": 1742 }, { "epoch": 0.8448860882210373, "grad_norm": 1.4303022623062134, "learning_rate": 3.578570595810274e-06, "loss": 1.5639, "step": 1743 }, { "epoch": 0.8453708191953466, "grad_norm": 1.5101985931396484, "learning_rate": 3.5567847499932e-06, "loss": 1.6489, "step": 1744 }, { "epoch": 0.8458555501696559, "grad_norm": 1.4596188068389893, "learning_rate": 3.535060341685731e-06, "loss": 1.5892, "step": 1745 }, { "epoch": 0.8463402811439651, "grad_norm": 1.283461332321167, "learning_rate": 3.513397433131024e-06, "loss": 1.3714, "step": 1746 }, { "epoch": 0.8468250121182743, "grad_norm": 1.439650535583496, "learning_rate": 3.491796086396043e-06, "loss": 1.2675, "step": 1747 }, { "epoch": 0.8473097430925837, "grad_norm": 1.3690756559371948, "learning_rate": 3.4702563633713577e-06, "loss": 1.255, "step": 1748 }, { "epoch": 0.8477944740668929, "grad_norm": 1.511044979095459, "learning_rate": 3.4487783257710015e-06, "loss": 1.1604, "step": 1749 }, { "epoch": 0.8482792050412021, "grad_norm": 1.4531290531158447, "learning_rate": 3.4273620351322257e-06, "loss": 1.4041, "step": 1750 }, { "epoch": 0.8487639360155114, "grad_norm": 1.4664944410324097, "learning_rate": 3.406007552815421e-06, "loss": 1.4089, "step": 1751 }, { "epoch": 0.8492486669898206, "grad_norm": 1.3850915431976318, "learning_rate": 3.3847149400038527e-06, "loss": 1.3229, "step": 1752 }, { "epoch": 0.8497333979641299, "grad_norm": 1.419942021369934, "learning_rate": 3.3634842577035447e-06, "loss": 1.423, "step": 1753 }, { "epoch": 0.8502181289384392, "grad_norm": 1.2842981815338135, "learning_rate": 3.3423155667430708e-06, "loss": 1.1818, "step": 1754 }, { "epoch": 0.8507028599127484, "grad_norm": 1.3830044269561768, "learning_rate": 3.321208927773384e-06, "loss": 1.3562, "step": 1755 }, { "epoch": 0.8511875908870576, "grad_norm": 1.5061596632003784, "learning_rate": 3.3001644012676773e-06, "loss": 1.2338, "step": 1756 }, { "epoch": 0.851672321861367, "grad_norm": 1.3219194412231445, "learning_rate": 3.279182047521151e-06, "loss": 1.3248, "step": 1757 }, { "epoch": 0.8521570528356762, "grad_norm": 1.2967970371246338, "learning_rate": 3.258261926650902e-06, "loss": 1.1634, "step": 1758 }, { "epoch": 0.8526417838099855, "grad_norm": 1.3681789636611938, "learning_rate": 3.2374040985957004e-06, "loss": 1.3534, "step": 1759 }, { "epoch": 0.8531265147842947, "grad_norm": 1.4028301239013672, "learning_rate": 3.216608623115852e-06, "loss": 1.4824, "step": 1760 }, { "epoch": 0.8536112457586039, "grad_norm": 1.3395938873291016, "learning_rate": 3.1958755597930017e-06, "loss": 1.3257, "step": 1761 }, { "epoch": 0.8540959767329133, "grad_norm": 1.5202916860580444, "learning_rate": 3.175204968029999e-06, "loss": 1.4002, "step": 1762 }, { "epoch": 0.8545807077072225, "grad_norm": 1.3687208890914917, "learning_rate": 3.1545969070506747e-06, "loss": 1.3911, "step": 1763 }, { "epoch": 0.8550654386815317, "grad_norm": 1.5068252086639404, "learning_rate": 3.1340514358997293e-06, "loss": 1.427, "step": 1764 }, { "epoch": 0.855550169655841, "grad_norm": 1.3460315465927124, "learning_rate": 3.1135686134425134e-06, "loss": 1.2252, "step": 1765 }, { "epoch": 0.8560349006301503, "grad_norm": 1.3183640241622925, "learning_rate": 3.093148498364898e-06, "loss": 1.1255, "step": 1766 }, { "epoch": 0.8565196316044595, "grad_norm": 1.3901616334915161, "learning_rate": 3.0727911491730764e-06, "loss": 1.5205, "step": 1767 }, { "epoch": 0.8570043625787688, "grad_norm": 1.479112982749939, "learning_rate": 3.0524966241934153e-06, "loss": 1.7212, "step": 1768 }, { "epoch": 0.857489093553078, "grad_norm": 1.5991971492767334, "learning_rate": 3.0322649815722915e-06, "loss": 1.5865, "step": 1769 }, { "epoch": 0.8579738245273874, "grad_norm": 1.4487693309783936, "learning_rate": 3.012096279275892e-06, "loss": 1.5127, "step": 1770 }, { "epoch": 0.8584585555016966, "grad_norm": 1.623945951461792, "learning_rate": 2.991990575090095e-06, "loss": 1.5826, "step": 1771 }, { "epoch": 0.8589432864760058, "grad_norm": 1.5717607736587524, "learning_rate": 2.9719479266202664e-06, "loss": 1.505, "step": 1772 }, { "epoch": 0.859428017450315, "grad_norm": 1.4932787418365479, "learning_rate": 2.9519683912911266e-06, "loss": 1.4719, "step": 1773 }, { "epoch": 0.8599127484246243, "grad_norm": 1.3678524494171143, "learning_rate": 2.9320520263465463e-06, "loss": 1.3981, "step": 1774 }, { "epoch": 0.8603974793989336, "grad_norm": 1.2875282764434814, "learning_rate": 2.9121988888494297e-06, "loss": 1.2622, "step": 1775 }, { "epoch": 0.8608822103732429, "grad_norm": 1.5293846130371094, "learning_rate": 2.892409035681498e-06, "loss": 1.6768, "step": 1776 }, { "epoch": 0.8613669413475521, "grad_norm": 1.5416409969329834, "learning_rate": 2.872682523543185e-06, "loss": 1.4337, "step": 1777 }, { "epoch": 0.8618516723218613, "grad_norm": 1.515479326248169, "learning_rate": 2.8530194089534225e-06, "loss": 1.3993, "step": 1778 }, { "epoch": 0.8623364032961707, "grad_norm": 1.349339246749878, "learning_rate": 2.833419748249511e-06, "loss": 1.4137, "step": 1779 }, { "epoch": 0.8628211342704799, "grad_norm": 1.4984550476074219, "learning_rate": 2.8138835975869358e-06, "loss": 1.7335, "step": 1780 }, { "epoch": 0.8633058652447891, "grad_norm": 1.4722235202789307, "learning_rate": 2.794411012939238e-06, "loss": 1.2962, "step": 1781 }, { "epoch": 0.8637905962190984, "grad_norm": 1.4962884187698364, "learning_rate": 2.7750020500978193e-06, "loss": 1.3812, "step": 1782 }, { "epoch": 0.8642753271934076, "grad_norm": 1.2720054388046265, "learning_rate": 2.7556567646717907e-06, "loss": 1.2682, "step": 1783 }, { "epoch": 0.864760058167717, "grad_norm": 1.5404802560806274, "learning_rate": 2.7363752120878437e-06, "loss": 1.5413, "step": 1784 }, { "epoch": 0.8652447891420262, "grad_norm": 1.352376103401184, "learning_rate": 2.717157447590041e-06, "loss": 1.3569, "step": 1785 }, { "epoch": 0.8657295201163354, "grad_norm": 1.4405183792114258, "learning_rate": 2.6980035262397037e-06, "loss": 1.4793, "step": 1786 }, { "epoch": 0.8662142510906446, "grad_norm": 1.5227733850479126, "learning_rate": 2.6789135029152173e-06, "loss": 1.4665, "step": 1787 }, { "epoch": 0.866698982064954, "grad_norm": 1.4717252254486084, "learning_rate": 2.659887432311917e-06, "loss": 1.2643, "step": 1788 }, { "epoch": 0.8671837130392632, "grad_norm": 1.542318344116211, "learning_rate": 2.6409253689418656e-06, "loss": 1.4383, "step": 1789 }, { "epoch": 0.8676684440135725, "grad_norm": 1.4374393224716187, "learning_rate": 2.6220273671337807e-06, "loss": 1.2678, "step": 1790 }, { "epoch": 0.8681531749878817, "grad_norm": 1.5301721096038818, "learning_rate": 2.603193481032801e-06, "loss": 1.7229, "step": 1791 }, { "epoch": 0.868637905962191, "grad_norm": 1.2812771797180176, "learning_rate": 2.584423764600391e-06, "loss": 1.6145, "step": 1792 }, { "epoch": 0.8691226369365003, "grad_norm": 1.3902934789657593, "learning_rate": 2.5657182716141452e-06, "loss": 1.3589, "step": 1793 }, { "epoch": 0.8696073679108095, "grad_norm": 1.4703373908996582, "learning_rate": 2.547077055667646e-06, "loss": 1.4929, "step": 1794 }, { "epoch": 0.8700920988851187, "grad_norm": 1.8839432001113892, "learning_rate": 2.528500170170339e-06, "loss": 1.3907, "step": 1795 }, { "epoch": 0.870576829859428, "grad_norm": 1.3980306386947632, "learning_rate": 2.5099876683473244e-06, "loss": 1.5614, "step": 1796 }, { "epoch": 0.8710615608337373, "grad_norm": 1.3951743841171265, "learning_rate": 2.4915396032392567e-06, "loss": 1.3248, "step": 1797 }, { "epoch": 0.8715462918080465, "grad_norm": 1.453779697418213, "learning_rate": 2.473156027702164e-06, "loss": 1.4249, "step": 1798 }, { "epoch": 0.8720310227823558, "grad_norm": 1.3146569728851318, "learning_rate": 2.4548369944073004e-06, "loss": 1.2215, "step": 1799 }, { "epoch": 0.872515753756665, "grad_norm": 1.4370759725570679, "learning_rate": 2.4365825558409966e-06, "loss": 1.5261, "step": 1800 }, { "epoch": 0.8730004847309744, "grad_norm": 1.3861037492752075, "learning_rate": 2.4183927643045253e-06, "loss": 1.3451, "step": 1801 }, { "epoch": 0.8734852157052836, "grad_norm": 1.2893579006195068, "learning_rate": 2.4002676719139166e-06, "loss": 1.2465, "step": 1802 }, { "epoch": 0.8739699466795928, "grad_norm": 1.4540156126022339, "learning_rate": 2.3822073305998534e-06, "loss": 1.3423, "step": 1803 }, { "epoch": 0.8744546776539021, "grad_norm": 1.4508488178253174, "learning_rate": 2.3642117921074734e-06, "loss": 1.527, "step": 1804 }, { "epoch": 0.8749394086282113, "grad_norm": 1.4460153579711914, "learning_rate": 2.3462811079962705e-06, "loss": 1.7063, "step": 1805 }, { "epoch": 0.8754241396025206, "grad_norm": 1.468252420425415, "learning_rate": 2.328415329639902e-06, "loss": 1.5137, "step": 1806 }, { "epoch": 0.8759088705768299, "grad_norm": 1.3156002759933472, "learning_rate": 2.310614508226078e-06, "loss": 1.2969, "step": 1807 }, { "epoch": 0.8763936015511391, "grad_norm": 1.447495698928833, "learning_rate": 2.292878694756384e-06, "loss": 1.3326, "step": 1808 }, { "epoch": 0.8768783325254483, "grad_norm": 1.4856112003326416, "learning_rate": 2.2752079400461564e-06, "loss": 1.3841, "step": 1809 }, { "epoch": 0.8773630634997577, "grad_norm": 1.4724615812301636, "learning_rate": 2.257602294724337e-06, "loss": 1.5061, "step": 1810 }, { "epoch": 0.8778477944740669, "grad_norm": 1.5109659433364868, "learning_rate": 2.24006180923331e-06, "loss": 1.5312, "step": 1811 }, { "epoch": 0.8783325254483761, "grad_norm": 1.8590974807739258, "learning_rate": 2.222586533828777e-06, "loss": 1.6278, "step": 1812 }, { "epoch": 0.8788172564226854, "grad_norm": 1.5733957290649414, "learning_rate": 2.2051765185795965e-06, "loss": 1.323, "step": 1813 }, { "epoch": 0.8793019873969947, "grad_norm": 1.3470954895019531, "learning_rate": 2.1878318133676607e-06, "loss": 1.1649, "step": 1814 }, { "epoch": 0.879786718371304, "grad_norm": 1.4746617078781128, "learning_rate": 2.170552467887721e-06, "loss": 1.7258, "step": 1815 }, { "epoch": 0.8802714493456132, "grad_norm": 1.3480514287948608, "learning_rate": 2.1533385316472864e-06, "loss": 1.1594, "step": 1816 }, { "epoch": 0.8807561803199224, "grad_norm": 1.4327815771102905, "learning_rate": 2.136190053966444e-06, "loss": 1.3378, "step": 1817 }, { "epoch": 0.8812409112942317, "grad_norm": 1.433623194694519, "learning_rate": 2.119107083977742e-06, "loss": 1.5589, "step": 1818 }, { "epoch": 0.881725642268541, "grad_norm": 1.3920966386795044, "learning_rate": 2.1020896706260367e-06, "loss": 1.4108, "step": 1819 }, { "epoch": 0.8822103732428502, "grad_norm": 1.4291924238204956, "learning_rate": 2.08513786266836e-06, "loss": 1.4014, "step": 1820 }, { "epoch": 0.8826951042171595, "grad_norm": 2.1731436252593994, "learning_rate": 2.068251708673777e-06, "loss": 1.4262, "step": 1821 }, { "epoch": 0.8831798351914687, "grad_norm": 1.3496067523956299, "learning_rate": 2.051431257023237e-06, "loss": 1.2465, "step": 1822 }, { "epoch": 0.883664566165778, "grad_norm": 1.3288339376449585, "learning_rate": 2.0346765559094567e-06, "loss": 1.0933, "step": 1823 }, { "epoch": 0.8841492971400873, "grad_norm": 1.2372487783432007, "learning_rate": 2.0179876533367587e-06, "loss": 1.2749, "step": 1824 }, { "epoch": 0.8846340281143965, "grad_norm": 1.4109827280044556, "learning_rate": 2.0013645971209527e-06, "loss": 1.3048, "step": 1825 }, { "epoch": 0.8851187590887057, "grad_norm": 1.4247568845748901, "learning_rate": 1.984807434889177e-06, "loss": 1.3601, "step": 1826 }, { "epoch": 0.885603490063015, "grad_norm": 1.3820182085037231, "learning_rate": 1.9683162140798045e-06, "loss": 1.288, "step": 1827 }, { "epoch": 0.8860882210373243, "grad_norm": 1.4704811573028564, "learning_rate": 1.9518909819422336e-06, "loss": 1.4445, "step": 1828 }, { "epoch": 0.8865729520116336, "grad_norm": 1.5304688215255737, "learning_rate": 1.935531785536834e-06, "loss": 1.8307, "step": 1829 }, { "epoch": 0.8870576829859428, "grad_norm": 1.4468873739242554, "learning_rate": 1.919238671734758e-06, "loss": 1.3208, "step": 1830 }, { "epoch": 0.887542413960252, "grad_norm": 1.4901260137557983, "learning_rate": 1.9030116872178316e-06, "loss": 1.5234, "step": 1831 }, { "epoch": 0.8880271449345614, "grad_norm": 1.4315540790557861, "learning_rate": 1.886850878478405e-06, "loss": 1.2287, "step": 1832 }, { "epoch": 0.8885118759088706, "grad_norm": 1.2842153310775757, "learning_rate": 1.87075629181922e-06, "loss": 1.4168, "step": 1833 }, { "epoch": 0.8889966068831798, "grad_norm": 1.3464667797088623, "learning_rate": 1.8547279733533042e-06, "loss": 1.122, "step": 1834 }, { "epoch": 0.8894813378574891, "grad_norm": 1.4574693441390991, "learning_rate": 1.8387659690038e-06, "loss": 1.496, "step": 1835 }, { "epoch": 0.8899660688317984, "grad_norm": 1.374718427658081, "learning_rate": 1.822870324503867e-06, "loss": 1.4209, "step": 1836 }, { "epoch": 0.8904507998061076, "grad_norm": 1.4177495241165161, "learning_rate": 1.8070410853965104e-06, "loss": 1.4393, "step": 1837 }, { "epoch": 0.8909355307804169, "grad_norm": 1.4920076131820679, "learning_rate": 1.7912782970345044e-06, "loss": 1.4351, "step": 1838 }, { "epoch": 0.8914202617547261, "grad_norm": 1.3668889999389648, "learning_rate": 1.7755820045802145e-06, "loss": 1.4363, "step": 1839 }, { "epoch": 0.8919049927290353, "grad_norm": 1.2889249324798584, "learning_rate": 1.7599522530055006e-06, "loss": 1.335, "step": 1840 }, { "epoch": 0.8923897237033447, "grad_norm": 1.3367114067077637, "learning_rate": 1.744389087091558e-06, "loss": 1.4699, "step": 1841 }, { "epoch": 0.8928744546776539, "grad_norm": 1.393117904663086, "learning_rate": 1.7288925514288262e-06, "loss": 1.5868, "step": 1842 }, { "epoch": 0.8933591856519632, "grad_norm": 1.229596734046936, "learning_rate": 1.7134626904168228e-06, "loss": 1.0758, "step": 1843 }, { "epoch": 0.8938439166262724, "grad_norm": 1.3445252180099487, "learning_rate": 1.6980995482640373e-06, "loss": 1.5242, "step": 1844 }, { "epoch": 0.8943286476005817, "grad_norm": 1.4988356828689575, "learning_rate": 1.682803168987815e-06, "loss": 1.5926, "step": 1845 }, { "epoch": 0.894813378574891, "grad_norm": 1.3614110946655273, "learning_rate": 1.6675735964142015e-06, "loss": 1.3875, "step": 1846 }, { "epoch": 0.8952981095492002, "grad_norm": 1.330135703086853, "learning_rate": 1.6524108741778372e-06, "loss": 1.4712, "step": 1847 }, { "epoch": 0.8957828405235094, "grad_norm": 1.4532225131988525, "learning_rate": 1.6373150457218267e-06, "loss": 1.6974, "step": 1848 }, { "epoch": 0.8962675714978187, "grad_norm": 1.4383496046066284, "learning_rate": 1.6222861542976252e-06, "loss": 1.637, "step": 1849 }, { "epoch": 0.896752302472128, "grad_norm": 1.4017609357833862, "learning_rate": 1.6073242429648916e-06, "loss": 1.5166, "step": 1850 }, { "epoch": 0.8972370334464372, "grad_norm": 1.5737212896347046, "learning_rate": 1.5924293545913876e-06, "loss": 1.4628, "step": 1851 }, { "epoch": 0.8977217644207465, "grad_norm": 1.3844492435455322, "learning_rate": 1.5776015318528403e-06, "loss": 1.2499, "step": 1852 }, { "epoch": 0.8982064953950557, "grad_norm": 1.4061801433563232, "learning_rate": 1.5628408172328301e-06, "loss": 1.5193, "step": 1853 }, { "epoch": 0.898691226369365, "grad_norm": 1.3358350992202759, "learning_rate": 1.5481472530226554e-06, "loss": 1.1172, "step": 1854 }, { "epoch": 0.8991759573436743, "grad_norm": 1.4957354068756104, "learning_rate": 1.5335208813212375e-06, "loss": 1.5576, "step": 1855 }, { "epoch": 0.8996606883179835, "grad_norm": 1.3805766105651855, "learning_rate": 1.5189617440349635e-06, "loss": 1.4324, "step": 1856 }, { "epoch": 0.9001454192922927, "grad_norm": 1.4622647762298584, "learning_rate": 1.5044698828775932e-06, "loss": 1.2463, "step": 1857 }, { "epoch": 0.900630150266602, "grad_norm": 1.4765523672103882, "learning_rate": 1.4900453393701358e-06, "loss": 1.6936, "step": 1858 }, { "epoch": 0.9011148812409113, "grad_norm": 1.4264954328536987, "learning_rate": 1.4756881548407153e-06, "loss": 1.4019, "step": 1859 }, { "epoch": 0.9015996122152206, "grad_norm": 1.4087258577346802, "learning_rate": 1.4613983704244826e-06, "loss": 1.4478, "step": 1860 }, { "epoch": 0.9020843431895298, "grad_norm": 1.7178231477737427, "learning_rate": 1.44717602706346e-06, "loss": 1.3309, "step": 1861 }, { "epoch": 0.902569074163839, "grad_norm": 1.351010799407959, "learning_rate": 1.4330211655064568e-06, "loss": 1.3857, "step": 1862 }, { "epoch": 0.9030538051381484, "grad_norm": 1.4305837154388428, "learning_rate": 1.4189338263089241e-06, "loss": 1.3635, "step": 1863 }, { "epoch": 0.9035385361124576, "grad_norm": 1.2843518257141113, "learning_rate": 1.4049140498328728e-06, "loss": 1.1113, "step": 1864 }, { "epoch": 0.9040232670867668, "grad_norm": 1.3791704177856445, "learning_rate": 1.3909618762467186e-06, "loss": 1.1953, "step": 1865 }, { "epoch": 0.9045079980610761, "grad_norm": 1.3165531158447266, "learning_rate": 1.3770773455251935e-06, "loss": 1.1486, "step": 1866 }, { "epoch": 0.9049927290353854, "grad_norm": 1.3703712224960327, "learning_rate": 1.3632604974492257e-06, "loss": 1.4523, "step": 1867 }, { "epoch": 0.9054774600096946, "grad_norm": 1.28277587890625, "learning_rate": 1.3495113716058272e-06, "loss": 1.506, "step": 1868 }, { "epoch": 0.9059621909840039, "grad_norm": 1.3652846813201904, "learning_rate": 1.335830007387967e-06, "loss": 1.3893, "step": 1869 }, { "epoch": 0.9064469219583131, "grad_norm": 1.3692935705184937, "learning_rate": 1.3222164439944811e-06, "loss": 1.644, "step": 1870 }, { "epoch": 0.9069316529326223, "grad_norm": 1.5364172458648682, "learning_rate": 1.3086707204299414e-06, "loss": 1.6484, "step": 1871 }, { "epoch": 0.9074163839069317, "grad_norm": 1.4699918031692505, "learning_rate": 1.2951928755045417e-06, "loss": 1.593, "step": 1872 }, { "epoch": 0.9079011148812409, "grad_norm": 1.437369465827942, "learning_rate": 1.281782947834015e-06, "loss": 1.5027, "step": 1873 }, { "epoch": 0.9083858458555502, "grad_norm": 1.4166172742843628, "learning_rate": 1.26844097583948e-06, "loss": 1.506, "step": 1874 }, { "epoch": 0.9088705768298594, "grad_norm": 1.340541124343872, "learning_rate": 1.2551669977473813e-06, "loss": 1.4797, "step": 1875 }, { "epoch": 0.9093553078041687, "grad_norm": 1.5571600198745728, "learning_rate": 1.241961051589316e-06, "loss": 1.5784, "step": 1876 }, { "epoch": 0.909840038778478, "grad_norm": 1.496361494064331, "learning_rate": 1.2288231752019956e-06, "loss": 1.3185, "step": 1877 }, { "epoch": 0.9103247697527872, "grad_norm": 1.3229130506515503, "learning_rate": 1.2157534062270798e-06, "loss": 1.4291, "step": 1878 }, { "epoch": 0.9108095007270964, "grad_norm": 1.5197927951812744, "learning_rate": 1.2027517821111112e-06, "loss": 1.3644, "step": 1879 }, { "epoch": 0.9112942317014057, "grad_norm": 1.3992700576782227, "learning_rate": 1.1898183401053697e-06, "loss": 1.1666, "step": 1880 }, { "epoch": 0.911778962675715, "grad_norm": 1.4355274438858032, "learning_rate": 1.1769531172658e-06, "loss": 1.487, "step": 1881 }, { "epoch": 0.9122636936500242, "grad_norm": 1.310941219329834, "learning_rate": 1.1641561504528803e-06, "loss": 1.3596, "step": 1882 }, { "epoch": 0.9127484246243335, "grad_norm": 1.495992660522461, "learning_rate": 1.1514274763315292e-06, "loss": 1.8062, "step": 1883 }, { "epoch": 0.9132331555986427, "grad_norm": 2.58510684967041, "learning_rate": 1.1387671313710075e-06, "loss": 1.4036, "step": 1884 }, { "epoch": 0.913717886572952, "grad_norm": 1.5952808856964111, "learning_rate": 1.1261751518447882e-06, "loss": 1.6931, "step": 1885 }, { "epoch": 0.9142026175472613, "grad_norm": 1.2862507104873657, "learning_rate": 1.113651573830482e-06, "loss": 1.1187, "step": 1886 }, { "epoch": 0.9146873485215705, "grad_norm": 1.285225510597229, "learning_rate": 1.1011964332097114e-06, "loss": 1.5083, "step": 1887 }, { "epoch": 0.9151720794958798, "grad_norm": 1.5734096765518188, "learning_rate": 1.0888097656680253e-06, "loss": 1.646, "step": 1888 }, { "epoch": 0.9156568104701891, "grad_norm": 1.4412444829940796, "learning_rate": 1.0764916066947794e-06, "loss": 1.5821, "step": 1889 }, { "epoch": 0.9161415414444983, "grad_norm": 1.3290297985076904, "learning_rate": 1.0642419915830537e-06, "loss": 1.1081, "step": 1890 }, { "epoch": 0.9166262724188076, "grad_norm": 1.4718834161758423, "learning_rate": 1.0520609554295346e-06, "loss": 1.6698, "step": 1891 }, { "epoch": 0.9171110033931168, "grad_norm": 1.3476266860961914, "learning_rate": 1.0399485331344273e-06, "loss": 1.2459, "step": 1892 }, { "epoch": 0.917595734367426, "grad_norm": 1.4318119287490845, "learning_rate": 1.027904759401338e-06, "loss": 1.3108, "step": 1893 }, { "epoch": 0.9180804653417354, "grad_norm": 1.476616621017456, "learning_rate": 1.0159296687372034e-06, "loss": 1.7684, "step": 1894 }, { "epoch": 0.9185651963160446, "grad_norm": 1.3496558666229248, "learning_rate": 1.0040232954521557e-06, "loss": 1.4406, "step": 1895 }, { "epoch": 0.9190499272903538, "grad_norm": 1.3763827085494995, "learning_rate": 9.92185673659457e-07, "loss": 1.2898, "step": 1896 }, { "epoch": 0.9195346582646631, "grad_norm": 1.430915117263794, "learning_rate": 9.804168372753858e-07, "loss": 1.4238, "step": 1897 }, { "epoch": 0.9200193892389724, "grad_norm": 1.3764996528625488, "learning_rate": 9.687168200191304e-07, "loss": 1.5283, "step": 1898 }, { "epoch": 0.9205041202132817, "grad_norm": 1.4979218244552612, "learning_rate": 9.570856554127205e-07, "loss": 1.853, "step": 1899 }, { "epoch": 0.9209888511875909, "grad_norm": 1.4959334135055542, "learning_rate": 9.455233767808991e-07, "loss": 1.7275, "step": 1900 }, { "epoch": 0.9214735821619001, "grad_norm": 1.2733063697814941, "learning_rate": 9.340300172510586e-07, "loss": 1.4255, "step": 1901 }, { "epoch": 0.9219583131362094, "grad_norm": 1.451502799987793, "learning_rate": 9.226056097531105e-07, "loss": 1.1018, "step": 1902 }, { "epoch": 0.9224430441105187, "grad_norm": 1.3837565183639526, "learning_rate": 9.112501870194273e-07, "loss": 1.7232, "step": 1903 }, { "epoch": 0.9229277750848279, "grad_norm": 1.4380730390548706, "learning_rate": 8.9996378158472e-07, "loss": 1.2878, "step": 1904 }, { "epoch": 0.9234125060591372, "grad_norm": 1.2848694324493408, "learning_rate": 8.887464257859579e-07, "loss": 1.2934, "step": 1905 }, { "epoch": 0.9238972370334464, "grad_norm": 1.2803257703781128, "learning_rate": 8.775981517622794e-07, "loss": 1.297, "step": 1906 }, { "epoch": 0.9243819680077557, "grad_norm": 1.3022807836532593, "learning_rate": 8.665189914548955e-07, "loss": 1.2849, "step": 1907 }, { "epoch": 0.924866698982065, "grad_norm": 1.3697919845581055, "learning_rate": 8.555089766069891e-07, "loss": 1.3761, "step": 1908 }, { "epoch": 0.9253514299563742, "grad_norm": 1.5067558288574219, "learning_rate": 8.445681387636406e-07, "loss": 1.31, "step": 1909 }, { "epoch": 0.9258361609306834, "grad_norm": 1.403795838356018, "learning_rate": 8.336965092717281e-07, "loss": 1.4403, "step": 1910 }, { "epoch": 0.9263208919049928, "grad_norm": 1.440237283706665, "learning_rate": 8.228941192798323e-07, "loss": 1.7129, "step": 1911 }, { "epoch": 0.926805622879302, "grad_norm": 1.4832885265350342, "learning_rate": 8.121609997381652e-07, "loss": 1.5841, "step": 1912 }, { "epoch": 0.9272903538536112, "grad_norm": 1.4952884912490845, "learning_rate": 8.014971813984611e-07, "loss": 1.6965, "step": 1913 }, { "epoch": 0.9277750848279205, "grad_norm": 1.588004231452942, "learning_rate": 7.909026948139081e-07, "loss": 1.5482, "step": 1914 }, { "epoch": 0.9282598158022297, "grad_norm": 1.387260913848877, "learning_rate": 7.803775703390359e-07, "loss": 1.2515, "step": 1915 }, { "epoch": 0.9287445467765391, "grad_norm": 1.292482852935791, "learning_rate": 7.699218381296531e-07, "loss": 1.3326, "step": 1916 }, { "epoch": 0.9292292777508483, "grad_norm": 1.427830696105957, "learning_rate": 7.595355281427435e-07, "loss": 1.5905, "step": 1917 }, { "epoch": 0.9297140087251575, "grad_norm": 1.3774360418319702, "learning_rate": 7.492186701364007e-07, "loss": 1.6201, "step": 1918 }, { "epoch": 0.9301987396994668, "grad_norm": 1.4959042072296143, "learning_rate": 7.389712936697129e-07, "loss": 1.4392, "step": 1919 }, { "epoch": 0.9306834706737761, "grad_norm": 1.2562229633331299, "learning_rate": 7.287934281027114e-07, "loss": 1.2257, "step": 1920 }, { "epoch": 0.9311682016480853, "grad_norm": 1.3972541093826294, "learning_rate": 7.186851025962532e-07, "loss": 1.1484, "step": 1921 }, { "epoch": 0.9316529326223946, "grad_norm": 1.3372113704681396, "learning_rate": 7.086463461119658e-07, "loss": 1.2652, "step": 1922 }, { "epoch": 0.9321376635967038, "grad_norm": 1.3388514518737793, "learning_rate": 6.9867718741215e-07, "loss": 1.5144, "step": 1923 }, { "epoch": 0.932622394571013, "grad_norm": 1.3194801807403564, "learning_rate": 6.887776550597025e-07, "loss": 1.1255, "step": 1924 }, { "epoch": 0.9331071255453224, "grad_norm": 1.4161765575408936, "learning_rate": 6.789477774180236e-07, "loss": 1.7097, "step": 1925 }, { "epoch": 0.9335918565196316, "grad_norm": 1.3846325874328613, "learning_rate": 6.691875826509514e-07, "loss": 1.445, "step": 1926 }, { "epoch": 0.9340765874939408, "grad_norm": 1.311930537223816, "learning_rate": 6.59497098722675e-07, "loss": 1.406, "step": 1927 }, { "epoch": 0.9345613184682501, "grad_norm": 1.3026055097579956, "learning_rate": 6.498763533976437e-07, "loss": 1.2652, "step": 1928 }, { "epoch": 0.9350460494425594, "grad_norm": 1.4192198514938354, "learning_rate": 6.403253742405107e-07, "loss": 1.6227, "step": 1929 }, { "epoch": 0.9355307804168687, "grad_norm": 1.3232550621032715, "learning_rate": 6.308441886160254e-07, "loss": 1.4756, "step": 1930 }, { "epoch": 0.9360155113911779, "grad_norm": 1.3774874210357666, "learning_rate": 6.214328236889861e-07, "loss": 1.3581, "step": 1931 }, { "epoch": 0.9365002423654871, "grad_norm": 1.381434679031372, "learning_rate": 6.120913064241313e-07, "loss": 1.2108, "step": 1932 }, { "epoch": 0.9369849733397965, "grad_norm": 1.3841496706008911, "learning_rate": 6.02819663586085e-07, "loss": 1.5833, "step": 1933 }, { "epoch": 0.9374697043141057, "grad_norm": 1.4273308515548706, "learning_rate": 5.936179217392673e-07, "loss": 1.3397, "step": 1934 }, { "epoch": 0.9379544352884149, "grad_norm": 1.4026015996932983, "learning_rate": 5.844861072478336e-07, "loss": 1.6483, "step": 1935 }, { "epoch": 0.9384391662627242, "grad_norm": 1.4809978008270264, "learning_rate": 5.754242462755771e-07, "loss": 1.4236, "step": 1936 }, { "epoch": 0.9389238972370334, "grad_norm": 1.4197194576263428, "learning_rate": 5.664323647858655e-07, "loss": 1.5785, "step": 1937 }, { "epoch": 0.9394086282113427, "grad_norm": 1.4222776889801025, "learning_rate": 5.575104885415794e-07, "loss": 1.5917, "step": 1938 }, { "epoch": 0.939893359185652, "grad_norm": 1.4109939336776733, "learning_rate": 5.486586431050072e-07, "loss": 1.5329, "step": 1939 }, { "epoch": 0.9403780901599612, "grad_norm": 1.3708782196044922, "learning_rate": 5.398768538378063e-07, "loss": 1.1433, "step": 1940 }, { "epoch": 0.9408628211342704, "grad_norm": 1.3859466314315796, "learning_rate": 5.311651459009054e-07, "loss": 1.4556, "step": 1941 }, { "epoch": 0.9413475521085798, "grad_norm": 1.3821712732315063, "learning_rate": 5.225235442544468e-07, "loss": 1.654, "step": 1942 }, { "epoch": 0.941832283082889, "grad_norm": 1.4499861001968384, "learning_rate": 5.139520736577058e-07, "loss": 1.7619, "step": 1943 }, { "epoch": 0.9423170140571983, "grad_norm": 1.6326532363891602, "learning_rate": 5.05450758669021e-07, "loss": 1.8829, "step": 1944 }, { "epoch": 0.9428017450315075, "grad_norm": 1.4821033477783203, "learning_rate": 4.97019623645728e-07, "loss": 1.3496, "step": 1945 }, { "epoch": 0.9432864760058167, "grad_norm": 1.308059573173523, "learning_rate": 4.886586927440956e-07, "loss": 1.3063, "step": 1946 }, { "epoch": 0.9437712069801261, "grad_norm": 1.3683174848556519, "learning_rate": 4.803679899192392e-07, "loss": 1.4177, "step": 1947 }, { "epoch": 0.9442559379544353, "grad_norm": 1.246372938156128, "learning_rate": 4.7214753892506625e-07, "loss": 1.2904, "step": 1948 }, { "epoch": 0.9447406689287445, "grad_norm": 1.3812105655670166, "learning_rate": 4.6399736331420305e-07, "loss": 1.4883, "step": 1949 }, { "epoch": 0.9452253999030538, "grad_norm": 1.4763976335525513, "learning_rate": 4.559174864379234e-07, "loss": 1.5377, "step": 1950 }, { "epoch": 0.9457101308773631, "grad_norm": 1.4200667142868042, "learning_rate": 4.4790793144610097e-07, "loss": 1.2448, "step": 1951 }, { "epoch": 0.9461948618516723, "grad_norm": 1.5260330438613892, "learning_rate": 4.399687212871123e-07, "loss": 1.33, "step": 1952 }, { "epoch": 0.9466795928259816, "grad_norm": 1.5376918315887451, "learning_rate": 4.320998787077923e-07, "loss": 1.3794, "step": 1953 }, { "epoch": 0.9471643238002908, "grad_norm": 1.408273696899414, "learning_rate": 4.243014262533679e-07, "loss": 1.1804, "step": 1954 }, { "epoch": 0.9476490547746, "grad_norm": 1.395007848739624, "learning_rate": 4.165733862673854e-07, "loss": 1.2267, "step": 1955 }, { "epoch": 0.9481337857489094, "grad_norm": 1.4566540718078613, "learning_rate": 4.0891578089164996e-07, "loss": 1.2163, "step": 1956 }, { "epoch": 0.9486185167232186, "grad_norm": 1.295602560043335, "learning_rate": 4.0132863206616965e-07, "loss": 1.1429, "step": 1957 }, { "epoch": 0.9491032476975279, "grad_norm": 1.410298228263855, "learning_rate": 3.938119615290753e-07, "loss": 1.5398, "step": 1958 }, { "epoch": 0.9495879786718371, "grad_norm": 1.3872941732406616, "learning_rate": 3.8636579081657577e-07, "loss": 1.1388, "step": 1959 }, { "epoch": 0.9500727096461464, "grad_norm": 1.5268107652664185, "learning_rate": 3.7899014126288876e-07, "loss": 1.8655, "step": 1960 }, { "epoch": 0.9505574406204557, "grad_norm": 1.472521424293518, "learning_rate": 3.716850340001715e-07, "loss": 1.7315, "step": 1961 }, { "epoch": 0.9510421715947649, "grad_norm": 1.4450392723083496, "learning_rate": 3.644504899584844e-07, "loss": 1.3071, "step": 1962 }, { "epoch": 0.9515269025690741, "grad_norm": 1.436614990234375, "learning_rate": 3.5728652986570245e-07, "loss": 1.4648, "step": 1963 }, { "epoch": 0.9520116335433835, "grad_norm": 1.3284350633621216, "learning_rate": 3.5019317424747064e-07, "loss": 1.1009, "step": 1964 }, { "epoch": 0.9524963645176927, "grad_norm": 1.3217493295669556, "learning_rate": 3.43170443427146e-07, "loss": 1.303, "step": 1965 }, { "epoch": 0.9529810954920019, "grad_norm": 1.4290056228637695, "learning_rate": 3.3621835752573884e-07, "loss": 1.4079, "step": 1966 }, { "epoch": 0.9534658264663112, "grad_norm": 1.4261441230773926, "learning_rate": 3.293369364618465e-07, "loss": 1.241, "step": 1967 }, { "epoch": 0.9539505574406204, "grad_norm": 1.4014172554016113, "learning_rate": 3.2252619995160885e-07, "loss": 1.641, "step": 1968 }, { "epoch": 0.9544352884149298, "grad_norm": 1.4024168252944946, "learning_rate": 3.1578616750863875e-07, "loss": 1.3255, "step": 1969 }, { "epoch": 0.954920019389239, "grad_norm": 1.462719440460205, "learning_rate": 3.0911685844398353e-07, "loss": 1.3231, "step": 1970 }, { "epoch": 0.9554047503635482, "grad_norm": 1.350813865661621, "learning_rate": 3.025182918660496e-07, "loss": 1.1891, "step": 1971 }, { "epoch": 0.9558894813378574, "grad_norm": 1.3725783824920654, "learning_rate": 2.9599048668055853e-07, "loss": 1.2594, "step": 1972 }, { "epoch": 0.9563742123121668, "grad_norm": 1.4037284851074219, "learning_rate": 2.8953346159049375e-07, "loss": 1.3825, "step": 1973 }, { "epoch": 0.956858943286476, "grad_norm": 2.3289217948913574, "learning_rate": 2.831472350960485e-07, "loss": 1.4553, "step": 1974 }, { "epoch": 0.9573436742607853, "grad_norm": 1.3470871448516846, "learning_rate": 2.7683182549456123e-07, "loss": 1.6123, "step": 1975 }, { "epoch": 0.9578284052350945, "grad_norm": 1.5660077333450317, "learning_rate": 2.705872508804747e-07, "loss": 1.5749, "step": 1976 }, { "epoch": 0.9583131362094037, "grad_norm": 1.4341754913330078, "learning_rate": 2.644135291452854e-07, "loss": 1.1978, "step": 1977 }, { "epoch": 0.9587978671837131, "grad_norm": 1.4906240701675415, "learning_rate": 2.5831067797747746e-07, "loss": 1.6585, "step": 1978 }, { "epoch": 0.9592825981580223, "grad_norm": 1.3474236726760864, "learning_rate": 2.5227871486249164e-07, "loss": 1.3896, "step": 1979 }, { "epoch": 0.9597673291323315, "grad_norm": 1.4273277521133423, "learning_rate": 2.463176570826592e-07, "loss": 1.5367, "step": 1980 }, { "epoch": 0.9602520601066408, "grad_norm": 1.3798199892044067, "learning_rate": 2.404275217171625e-07, "loss": 1.129, "step": 1981 }, { "epoch": 0.9607367910809501, "grad_norm": 1.284221887588501, "learning_rate": 2.3460832564197455e-07, "loss": 1.2853, "step": 1982 }, { "epoch": 0.9612215220552593, "grad_norm": 1.366292953491211, "learning_rate": 2.288600855298306e-07, "loss": 1.2893, "step": 1983 }, { "epoch": 0.9617062530295686, "grad_norm": 1.3929351568222046, "learning_rate": 2.2318281785015936e-07, "loss": 1.5092, "step": 1984 }, { "epoch": 0.9621909840038778, "grad_norm": 1.2522450685501099, "learning_rate": 2.1757653886904927e-07, "loss": 1.39, "step": 1985 }, { "epoch": 0.9626757149781872, "grad_norm": 1.3403209447860718, "learning_rate": 2.120412646491904e-07, "loss": 1.3051, "step": 1986 }, { "epoch": 0.9631604459524964, "grad_norm": 1.3888131380081177, "learning_rate": 2.0657701104984384e-07, "loss": 1.4746, "step": 1987 }, { "epoch": 0.9636451769268056, "grad_norm": 1.4653536081314087, "learning_rate": 2.0118379372678354e-07, "loss": 1.8005, "step": 1988 }, { "epoch": 0.9641299079011149, "grad_norm": 1.3622173070907593, "learning_rate": 1.9586162813225174e-07, "loss": 1.366, "step": 1989 }, { "epoch": 0.9646146388754241, "grad_norm": 1.3387054204940796, "learning_rate": 1.9061052951492575e-07, "loss": 1.4404, "step": 1990 }, { "epoch": 0.9650993698497334, "grad_norm": 1.361503005027771, "learning_rate": 1.8543051291986247e-07, "loss": 1.4713, "step": 1991 }, { "epoch": 0.9655841008240427, "grad_norm": 1.3547799587249756, "learning_rate": 1.80321593188465e-07, "loss": 1.2779, "step": 1992 }, { "epoch": 0.9660688317983519, "grad_norm": 1.6397618055343628, "learning_rate": 1.7528378495842435e-07, "loss": 1.4227, "step": 1993 }, { "epoch": 0.9665535627726611, "grad_norm": 1.4830430746078491, "learning_rate": 1.7031710266370016e-07, "loss": 1.5416, "step": 1994 }, { "epoch": 0.9670382937469705, "grad_norm": 1.4233936071395874, "learning_rate": 1.6542156053446223e-07, "loss": 1.2741, "step": 1995 }, { "epoch": 0.9675230247212797, "grad_norm": 1.3970363140106201, "learning_rate": 1.6059717259705175e-07, "loss": 1.2843, "step": 1996 }, { "epoch": 0.968007755695589, "grad_norm": 1.4266377687454224, "learning_rate": 1.5584395267394802e-07, "loss": 1.2624, "step": 1997 }, { "epoch": 0.9684924866698982, "grad_norm": 1.3139270544052124, "learning_rate": 1.5116191438372394e-07, "loss": 1.3138, "step": 1998 }, { "epoch": 0.9689772176442074, "grad_norm": 1.3843278884887695, "learning_rate": 1.4655107114101007e-07, "loss": 1.3294, "step": 1999 }, { "epoch": 0.9694619486185168, "grad_norm": 1.2902089357376099, "learning_rate": 1.4201143615645006e-07, "loss": 1.3585, "step": 2000 }, { "epoch": 0.969946679592826, "grad_norm": 1.469571590423584, "learning_rate": 1.3754302243667304e-07, "loss": 1.4797, "step": 2001 }, { "epoch": 0.9704314105671352, "grad_norm": 1.3052209615707397, "learning_rate": 1.331458427842408e-07, "loss": 1.3906, "step": 2002 }, { "epoch": 0.9709161415414445, "grad_norm": 1.5683739185333252, "learning_rate": 1.2881990979763393e-07, "loss": 1.5129, "step": 2003 }, { "epoch": 0.9714008725157538, "grad_norm": 1.4021413326263428, "learning_rate": 1.2456523587118517e-07, "loss": 1.2832, "step": 2004 }, { "epoch": 0.971885603490063, "grad_norm": 1.4269837141036987, "learning_rate": 1.2038183319507955e-07, "loss": 1.3047, "step": 2005 }, { "epoch": 0.9723703344643723, "grad_norm": 1.4196062088012695, "learning_rate": 1.1626971375528484e-07, "loss": 1.2863, "step": 2006 }, { "epoch": 0.9728550654386815, "grad_norm": 1.4747346639633179, "learning_rate": 1.1222888933354602e-07, "loss": 1.4186, "step": 2007 }, { "epoch": 0.9733397964129908, "grad_norm": 1.3612735271453857, "learning_rate": 1.0825937150732989e-07, "loss": 1.5173, "step": 2008 }, { "epoch": 0.9738245273873001, "grad_norm": 1.586757779121399, "learning_rate": 1.043611716498083e-07, "loss": 1.3961, "step": 2009 }, { "epoch": 0.9743092583616093, "grad_norm": 1.2929972410202026, "learning_rate": 1.0053430092981097e-07, "loss": 1.0006, "step": 2010 }, { "epoch": 0.9747939893359185, "grad_norm": 1.3124991655349731, "learning_rate": 9.677877031180615e-08, "loss": 1.1774, "step": 2011 }, { "epoch": 0.9752787203102278, "grad_norm": 1.4618499279022217, "learning_rate": 9.30945905558589e-08, "loss": 1.5174, "step": 2012 }, { "epoch": 0.9757634512845371, "grad_norm": 1.536896824836731, "learning_rate": 8.948177221760889e-08, "loss": 2.057, "step": 2013 }, { "epoch": 0.9762481822588464, "grad_norm": 1.474232792854309, "learning_rate": 8.594032564823717e-08, "loss": 1.3491, "step": 2014 }, { "epoch": 0.9767329132331556, "grad_norm": 1.3744844198226929, "learning_rate": 8.247026099443277e-08, "loss": 1.4189, "step": 2015 }, { "epoch": 0.9772176442074648, "grad_norm": 1.3619568347930908, "learning_rate": 7.907158819836503e-08, "loss": 1.4581, "step": 2016 }, { "epoch": 0.9777023751817742, "grad_norm": 1.3401083946228027, "learning_rate": 7.574431699766127e-08, "loss": 1.523, "step": 2017 }, { "epoch": 0.9781871061560834, "grad_norm": 1.4377765655517578, "learning_rate": 7.248845692537088e-08, "loss": 1.4517, "step": 2018 }, { "epoch": 0.9786718371303926, "grad_norm": 1.4789650440216064, "learning_rate": 6.930401730994573e-08, "loss": 1.5122, "step": 2019 }, { "epoch": 0.9791565681047019, "grad_norm": 1.4197337627410889, "learning_rate": 6.619100727520422e-08, "loss": 1.2408, "step": 2020 }, { "epoch": 0.9796412990790111, "grad_norm": 1.294545292854309, "learning_rate": 6.314943574030896e-08, "loss": 1.4684, "step": 2021 }, { "epoch": 0.9801260300533204, "grad_norm": 1.500753402709961, "learning_rate": 6.01793114197502e-08, "loss": 1.6566, "step": 2022 }, { "epoch": 0.9806107610276297, "grad_norm": 1.4268614053726196, "learning_rate": 5.728064282330137e-08, "loss": 1.4571, "step": 2023 }, { "epoch": 0.9810954920019389, "grad_norm": 1.3109159469604492, "learning_rate": 5.4453438256019115e-08, "loss": 1.17, "step": 2024 }, { "epoch": 0.9815802229762481, "grad_norm": 1.3804876804351807, "learning_rate": 5.169770581819888e-08, "loss": 1.3375, "step": 2025 }, { "epoch": 0.9820649539505575, "grad_norm": 1.758589267730713, "learning_rate": 4.901345340535824e-08, "loss": 1.6656, "step": 2026 }, { "epoch": 0.9825496849248667, "grad_norm": 1.500661015510559, "learning_rate": 4.6400688708217455e-08, "loss": 1.522, "step": 2027 }, { "epoch": 0.983034415899176, "grad_norm": 1.3010058403015137, "learning_rate": 4.385941921268011e-08, "loss": 1.075, "step": 2028 }, { "epoch": 0.9835191468734852, "grad_norm": 1.2376115322113037, "learning_rate": 4.138965219979973e-08, "loss": 1.0122, "step": 2029 }, { "epoch": 0.9840038778477945, "grad_norm": 1.4325613975524902, "learning_rate": 3.8991394745771516e-08, "loss": 1.5664, "step": 2030 }, { "epoch": 0.9844886088221038, "grad_norm": 1.3779929876327515, "learning_rate": 3.666465372190453e-08, "loss": 1.2317, "step": 2031 }, { "epoch": 0.984973339796413, "grad_norm": 1.4506980180740356, "learning_rate": 3.440943579460232e-08, "loss": 1.3087, "step": 2032 }, { "epoch": 0.9854580707707222, "grad_norm": 1.5038971900939941, "learning_rate": 3.2225747425351785e-08, "loss": 1.6422, "step": 2033 }, { "epoch": 0.9859428017450315, "grad_norm": 1.4041410684585571, "learning_rate": 3.011359487068987e-08, "loss": 1.5016, "step": 2034 }, { "epoch": 0.9864275327193408, "grad_norm": 1.475483775138855, "learning_rate": 2.807298418220361e-08, "loss": 1.5435, "step": 2035 }, { "epoch": 0.98691226369365, "grad_norm": 1.4113868474960327, "learning_rate": 2.6103921206499517e-08, "loss": 1.5201, "step": 2036 }, { "epoch": 0.9873969946679593, "grad_norm": 1.4753867387771606, "learning_rate": 2.4206411585186996e-08, "loss": 1.6761, "step": 2037 }, { "epoch": 0.9878817256422685, "grad_norm": 1.380028247833252, "learning_rate": 2.2380460754875544e-08, "loss": 1.2831, "step": 2038 }, { "epoch": 0.9883664566165778, "grad_norm": 1.2903367280960083, "learning_rate": 2.0626073947138668e-08, "loss": 1.235, "step": 2039 }, { "epoch": 0.9888511875908871, "grad_norm": 1.7924153804779053, "learning_rate": 1.8943256188516667e-08, "loss": 1.4112, "step": 2040 }, { "epoch": 0.9893359185651963, "grad_norm": 1.4166916608810425, "learning_rate": 1.7332012300494417e-08, "loss": 1.651, "step": 2041 }, { "epoch": 0.9898206495395055, "grad_norm": 1.2969902753829956, "learning_rate": 1.5792346899490275e-08, "loss": 1.2091, "step": 2042 }, { "epoch": 0.9903053805138148, "grad_norm": 1.426344633102417, "learning_rate": 1.4324264396836651e-08, "loss": 1.4629, "step": 2043 }, { "epoch": 0.9907901114881241, "grad_norm": 1.4453415870666504, "learning_rate": 1.2927768998774458e-08, "loss": 1.4487, "step": 2044 }, { "epoch": 0.9912748424624334, "grad_norm": 1.3389601707458496, "learning_rate": 1.1602864706442008e-08, "loss": 1.2715, "step": 2045 }, { "epoch": 0.9917595734367426, "grad_norm": 1.382049560546875, "learning_rate": 1.0349555315855574e-08, "loss": 1.3052, "step": 2046 }, { "epoch": 0.9922443044110518, "grad_norm": 1.3508886098861694, "learning_rate": 9.167844417901083e-09, "loss": 1.4453, "step": 2047 }, { "epoch": 0.9927290353853612, "grad_norm": 1.3657429218292236, "learning_rate": 8.057735398331324e-09, "loss": 1.4893, "step": 2048 }, { "epoch": 0.9932137663596704, "grad_norm": 1.395776629447937, "learning_rate": 7.0192314377520715e-09, "loss": 1.283, "step": 2049 }, { "epoch": 0.9936984973339796, "grad_norm": 1.4177159070968628, "learning_rate": 6.052335511599893e-09, "loss": 1.2549, "step": 2050 }, { "epoch": 0.9941832283082889, "grad_norm": 1.4473752975463867, "learning_rate": 5.1570503901587905e-09, "loss": 1.49, "step": 2051 }, { "epoch": 0.9946679592825982, "grad_norm": 1.3230009078979492, "learning_rate": 4.333378638532448e-09, "loss": 1.7049, "step": 2052 }, { "epoch": 0.9951526902569074, "grad_norm": 1.2757775783538818, "learning_rate": 3.581322616641458e-09, "loss": 1.1337, "step": 2053 }, { "epoch": 0.9956374212312167, "grad_norm": 1.4444496631622314, "learning_rate": 2.9008844792260957e-09, "loss": 1.6056, "step": 2054 }, { "epoch": 0.9961221522055259, "grad_norm": 1.566375970840454, "learning_rate": 2.292066175821339e-09, "loss": 1.3957, "step": 2055 }, { "epoch": 0.9966068831798351, "grad_norm": 1.4305633306503296, "learning_rate": 1.754869450773522e-09, "loss": 1.4048, "step": 2056 }, { "epoch": 0.9970916141541445, "grad_norm": 1.446074366569519, "learning_rate": 1.2892958432153546e-09, "loss": 1.6426, "step": 2057 }, { "epoch": 0.9975763451284537, "grad_norm": 1.418178915977478, "learning_rate": 8.953466870742499e-10, "loss": 1.2807, "step": 2058 }, { "epoch": 0.998061076102763, "grad_norm": 1.4988263845443726, "learning_rate": 5.730231110639972e-10, "loss": 1.61, "step": 2059 }, { "epoch": 0.9985458070770722, "grad_norm": 1.3278765678405762, "learning_rate": 3.2232603868476153e-10, "loss": 1.1255, "step": 2060 }, { "epoch": 0.9990305380513815, "grad_norm": 1.492288589477539, "learning_rate": 1.4325618821198207e-10, "loss": 1.5154, "step": 2061 }, { "epoch": 0.9995152690256908, "grad_norm": 1.3655840158462524, "learning_rate": 3.5814072707474054e-11, "loss": 1.4809, "step": 2062 }, { "epoch": 1.0, "grad_norm": 1.489664912223816, "learning_rate": 0.0, "loss": 1.4304, "step": 2063 } ], "logging_steps": 1, "max_steps": 2063, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8633303665737728.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }