diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14474 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2063, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004847309743092584, + "grad_norm": 7.171422004699707, + "learning_rate": 2.4154589371980677e-07, + "loss": 3.4607, + "step": 1 + }, + { + "epoch": 0.0009694619486185168, + "grad_norm": 6.8233819007873535, + "learning_rate": 4.830917874396135e-07, + "loss": 3.2371, + "step": 2 + }, + { + "epoch": 0.001454192922927775, + "grad_norm": 7.979361057281494, + "learning_rate": 7.246376811594203e-07, + "loss": 3.3669, + "step": 3 + }, + { + "epoch": 0.0019389238972370335, + "grad_norm": 6.270462512969971, + "learning_rate": 9.66183574879227e-07, + "loss": 3.3027, + "step": 4 + }, + { + "epoch": 0.0024236548715462916, + "grad_norm": 6.778450012207031, + "learning_rate": 1.2077294685990338e-06, + "loss": 3.2712, + "step": 5 + }, + { + "epoch": 0.00290838584585555, + "grad_norm": 6.3815741539001465, + "learning_rate": 1.4492753623188406e-06, + "loss": 3.3691, + "step": 6 + }, + { + "epoch": 0.0033931168201648087, + "grad_norm": 6.661952018737793, + "learning_rate": 1.6908212560386474e-06, + "loss": 3.3269, + "step": 7 + }, + { + "epoch": 0.003877847794474067, + "grad_norm": 6.819033145904541, + "learning_rate": 1.932367149758454e-06, + "loss": 3.2467, + "step": 8 + }, + { + "epoch": 0.004362578768783325, + "grad_norm": 5.535458564758301, + "learning_rate": 2.173913043478261e-06, + "loss": 3.1249, + "step": 9 + }, + { + "epoch": 0.004847309743092583, + "grad_norm": 6.2597880363464355, + "learning_rate": 2.4154589371980677e-06, + "loss": 3.4761, + "step": 10 + }, + { + "epoch": 0.005332040717401842, + "grad_norm": 6.067697525024414, + "learning_rate": 2.6570048309178746e-06, + "loss": 3.3821, + "step": 11 + }, + { + "epoch": 0.0058167716917111, + "grad_norm": 5.567847728729248, + "learning_rate": 2.898550724637681e-06, + "loss": 3.0755, + "step": 12 + }, + { + "epoch": 0.006301502666020358, + "grad_norm": 4.956480026245117, + "learning_rate": 3.140096618357488e-06, + "loss": 3.0035, + "step": 13 + }, + { + "epoch": 0.0067862336403296175, + "grad_norm": 6.243023872375488, + "learning_rate": 3.3816425120772947e-06, + "loss": 3.1336, + "step": 14 + }, + { + "epoch": 0.007270964614638876, + "grad_norm": 5.114922046661377, + "learning_rate": 3.6231884057971017e-06, + "loss": 2.9698, + "step": 15 + }, + { + "epoch": 0.007755695588948134, + "grad_norm": 5.099619388580322, + "learning_rate": 3.864734299516908e-06, + "loss": 3.022, + "step": 16 + }, + { + "epoch": 0.008240426563257392, + "grad_norm": 5.203329563140869, + "learning_rate": 4.106280193236716e-06, + "loss": 2.8866, + "step": 17 + }, + { + "epoch": 0.00872515753756665, + "grad_norm": 4.828646659851074, + "learning_rate": 4.347826086956522e-06, + "loss": 2.7138, + "step": 18 + }, + { + "epoch": 0.009209888511875909, + "grad_norm": 5.072122573852539, + "learning_rate": 4.589371980676329e-06, + "loss": 2.8717, + "step": 19 + }, + { + "epoch": 0.009694619486185167, + "grad_norm": 4.990163326263428, + "learning_rate": 4.830917874396135e-06, + "loss": 2.8924, + "step": 20 + }, + { + "epoch": 0.010179350460494426, + "grad_norm": 4.578018665313721, + "learning_rate": 5.072463768115943e-06, + "loss": 2.5511, + "step": 21 + }, + { + "epoch": 0.010664081434803683, + "grad_norm": 4.484613418579102, + "learning_rate": 5.314009661835749e-06, + "loss": 2.6689, + "step": 22 + }, + { + "epoch": 0.011148812409112942, + "grad_norm": 4.256691932678223, + "learning_rate": 5.555555555555556e-06, + "loss": 2.5432, + "step": 23 + }, + { + "epoch": 0.0116335433834222, + "grad_norm": 4.374969005584717, + "learning_rate": 5.797101449275362e-06, + "loss": 2.6499, + "step": 24 + }, + { + "epoch": 0.012118274357731459, + "grad_norm": 4.293645858764648, + "learning_rate": 6.038647342995169e-06, + "loss": 2.2692, + "step": 25 + }, + { + "epoch": 0.012603005332040717, + "grad_norm": 3.59814190864563, + "learning_rate": 6.280193236714976e-06, + "loss": 2.3972, + "step": 26 + }, + { + "epoch": 0.013087736306349976, + "grad_norm": 4.462075710296631, + "learning_rate": 6.521739130434783e-06, + "loss": 2.3534, + "step": 27 + }, + { + "epoch": 0.013572467280659235, + "grad_norm": 3.837050437927246, + "learning_rate": 6.7632850241545894e-06, + "loss": 2.1863, + "step": 28 + }, + { + "epoch": 0.014057198254968492, + "grad_norm": 3.4918296337127686, + "learning_rate": 7.004830917874397e-06, + "loss": 2.0585, + "step": 29 + }, + { + "epoch": 0.014541929229277752, + "grad_norm": 3.0016872882843018, + "learning_rate": 7.246376811594203e-06, + "loss": 2.0722, + "step": 30 + }, + { + "epoch": 0.015026660203587009, + "grad_norm": 3.585313320159912, + "learning_rate": 7.48792270531401e-06, + "loss": 2.2528, + "step": 31 + }, + { + "epoch": 0.015511391177896268, + "grad_norm": 3.0016026496887207, + "learning_rate": 7.729468599033817e-06, + "loss": 1.9671, + "step": 32 + }, + { + "epoch": 0.015996122152205527, + "grad_norm": 2.7799417972564697, + "learning_rate": 7.971014492753623e-06, + "loss": 2.1397, + "step": 33 + }, + { + "epoch": 0.016480853126514785, + "grad_norm": 2.6177585124969482, + "learning_rate": 8.212560386473431e-06, + "loss": 2.0716, + "step": 34 + }, + { + "epoch": 0.016965584100824042, + "grad_norm": 2.6502163410186768, + "learning_rate": 8.454106280193238e-06, + "loss": 2.0137, + "step": 35 + }, + { + "epoch": 0.0174503150751333, + "grad_norm": 2.5228984355926514, + "learning_rate": 8.695652173913044e-06, + "loss": 1.7984, + "step": 36 + }, + { + "epoch": 0.01793504604944256, + "grad_norm": 2.638493537902832, + "learning_rate": 8.93719806763285e-06, + "loss": 1.8365, + "step": 37 + }, + { + "epoch": 0.018419777023751818, + "grad_norm": 2.337646007537842, + "learning_rate": 9.178743961352658e-06, + "loss": 1.8024, + "step": 38 + }, + { + "epoch": 0.018904507998061076, + "grad_norm": 2.2700746059417725, + "learning_rate": 9.420289855072464e-06, + "loss": 1.8987, + "step": 39 + }, + { + "epoch": 0.019389238972370333, + "grad_norm": 2.020162343978882, + "learning_rate": 9.66183574879227e-06, + "loss": 1.6793, + "step": 40 + }, + { + "epoch": 0.019873969946679594, + "grad_norm": 2.015110492706299, + "learning_rate": 9.903381642512077e-06, + "loss": 1.8938, + "step": 41 + }, + { + "epoch": 0.02035870092098885, + "grad_norm": 2.2413482666015625, + "learning_rate": 1.0144927536231885e-05, + "loss": 2.1364, + "step": 42 + }, + { + "epoch": 0.02084343189529811, + "grad_norm": 1.931469202041626, + "learning_rate": 1.0386473429951692e-05, + "loss": 1.9931, + "step": 43 + }, + { + "epoch": 0.021328162869607366, + "grad_norm": 1.8747423887252808, + "learning_rate": 1.0628019323671499e-05, + "loss": 1.6649, + "step": 44 + }, + { + "epoch": 0.021812893843916627, + "grad_norm": 1.9672399759292603, + "learning_rate": 1.0869565217391305e-05, + "loss": 1.8262, + "step": 45 + }, + { + "epoch": 0.022297624818225885, + "grad_norm": 1.9558823108673096, + "learning_rate": 1.1111111111111112e-05, + "loss": 1.8059, + "step": 46 + }, + { + "epoch": 0.022782355792535142, + "grad_norm": 1.8525190353393555, + "learning_rate": 1.1352657004830918e-05, + "loss": 1.721, + "step": 47 + }, + { + "epoch": 0.0232670867668444, + "grad_norm": 1.8605695962905884, + "learning_rate": 1.1594202898550725e-05, + "loss": 1.9044, + "step": 48 + }, + { + "epoch": 0.02375181774115366, + "grad_norm": 2.524609088897705, + "learning_rate": 1.1835748792270531e-05, + "loss": 1.6106, + "step": 49 + }, + { + "epoch": 0.024236548715462918, + "grad_norm": 1.8198078870773315, + "learning_rate": 1.2077294685990338e-05, + "loss": 1.822, + "step": 50 + }, + { + "epoch": 0.024721279689772176, + "grad_norm": 1.7705700397491455, + "learning_rate": 1.2318840579710146e-05, + "loss": 1.6892, + "step": 51 + }, + { + "epoch": 0.025206010664081433, + "grad_norm": 1.9049899578094482, + "learning_rate": 1.2560386473429953e-05, + "loss": 1.7649, + "step": 52 + }, + { + "epoch": 0.025690741638390694, + "grad_norm": 1.835711121559143, + "learning_rate": 1.2801932367149761e-05, + "loss": 1.7394, + "step": 53 + }, + { + "epoch": 0.02617547261269995, + "grad_norm": 1.6949882507324219, + "learning_rate": 1.3043478260869566e-05, + "loss": 1.6299, + "step": 54 + }, + { + "epoch": 0.02666020358700921, + "grad_norm": 1.7638367414474487, + "learning_rate": 1.3285024154589374e-05, + "loss": 1.5448, + "step": 55 + }, + { + "epoch": 0.02714493456131847, + "grad_norm": 2.0102837085723877, + "learning_rate": 1.3526570048309179e-05, + "loss": 1.9325, + "step": 56 + }, + { + "epoch": 0.027629665535627727, + "grad_norm": 1.7918657064437866, + "learning_rate": 1.3768115942028985e-05, + "loss": 1.6411, + "step": 57 + }, + { + "epoch": 0.028114396509936985, + "grad_norm": 2.3990306854248047, + "learning_rate": 1.4009661835748794e-05, + "loss": 1.8308, + "step": 58 + }, + { + "epoch": 0.028599127484246242, + "grad_norm": 2.5733494758605957, + "learning_rate": 1.4251207729468599e-05, + "loss": 2.1079, + "step": 59 + }, + { + "epoch": 0.029083858458555503, + "grad_norm": 1.8608803749084473, + "learning_rate": 1.4492753623188407e-05, + "loss": 1.9755, + "step": 60 + }, + { + "epoch": 0.02956858943286476, + "grad_norm": 1.7092028856277466, + "learning_rate": 1.4734299516908212e-05, + "loss": 1.6123, + "step": 61 + }, + { + "epoch": 0.030053320407174018, + "grad_norm": 1.6902247667312622, + "learning_rate": 1.497584541062802e-05, + "loss": 1.6213, + "step": 62 + }, + { + "epoch": 0.030538051381483276, + "grad_norm": 1.5993880033493042, + "learning_rate": 1.5217391304347828e-05, + "loss": 1.5819, + "step": 63 + }, + { + "epoch": 0.031022782355792537, + "grad_norm": 1.6327762603759766, + "learning_rate": 1.5458937198067633e-05, + "loss": 1.6991, + "step": 64 + }, + { + "epoch": 0.031507513330101794, + "grad_norm": 1.6658787727355957, + "learning_rate": 1.570048309178744e-05, + "loss": 1.9614, + "step": 65 + }, + { + "epoch": 0.031992244304411055, + "grad_norm": 1.5906341075897217, + "learning_rate": 1.5942028985507246e-05, + "loss": 1.5806, + "step": 66 + }, + { + "epoch": 0.03247697527872031, + "grad_norm": 1.7006059885025024, + "learning_rate": 1.6183574879227054e-05, + "loss": 2.1204, + "step": 67 + }, + { + "epoch": 0.03296170625302957, + "grad_norm": 1.8104106187820435, + "learning_rate": 1.6425120772946863e-05, + "loss": 1.7377, + "step": 68 + }, + { + "epoch": 0.033446437227338824, + "grad_norm": 1.7204387187957764, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.6536, + "step": 69 + }, + { + "epoch": 0.033931168201648085, + "grad_norm": 1.77187979221344, + "learning_rate": 1.6908212560386476e-05, + "loss": 1.7974, + "step": 70 + }, + { + "epoch": 0.034415899175957346, + "grad_norm": 1.7312054634094238, + "learning_rate": 1.714975845410628e-05, + "loss": 1.6036, + "step": 71 + }, + { + "epoch": 0.0349006301502666, + "grad_norm": 1.7339930534362793, + "learning_rate": 1.739130434782609e-05, + "loss": 1.532, + "step": 72 + }, + { + "epoch": 0.03538536112457586, + "grad_norm": 1.6795563697814941, + "learning_rate": 1.7632850241545894e-05, + "loss": 1.7439, + "step": 73 + }, + { + "epoch": 0.03587009209888512, + "grad_norm": 1.6724114418029785, + "learning_rate": 1.78743961352657e-05, + "loss": 1.7637, + "step": 74 + }, + { + "epoch": 0.036354823073194376, + "grad_norm": 1.5222914218902588, + "learning_rate": 1.8115942028985507e-05, + "loss": 1.4971, + "step": 75 + }, + { + "epoch": 0.036839554047503636, + "grad_norm": 1.6842753887176514, + "learning_rate": 1.8357487922705315e-05, + "loss": 1.525, + "step": 76 + }, + { + "epoch": 0.03732428502181289, + "grad_norm": 1.6687383651733398, + "learning_rate": 1.859903381642512e-05, + "loss": 1.9496, + "step": 77 + }, + { + "epoch": 0.03780901599612215, + "grad_norm": 1.8582350015640259, + "learning_rate": 1.8840579710144928e-05, + "loss": 1.733, + "step": 78 + }, + { + "epoch": 0.03829374697043141, + "grad_norm": 1.554795265197754, + "learning_rate": 1.9082125603864733e-05, + "loss": 1.492, + "step": 79 + }, + { + "epoch": 0.038778477944740666, + "grad_norm": 1.7551424503326416, + "learning_rate": 1.932367149758454e-05, + "loss": 1.898, + "step": 80 + }, + { + "epoch": 0.03926320891904993, + "grad_norm": 1.9943029880523682, + "learning_rate": 1.956521739130435e-05, + "loss": 1.6975, + "step": 81 + }, + { + "epoch": 0.03974793989335919, + "grad_norm": 1.6358810663223267, + "learning_rate": 1.9806763285024154e-05, + "loss": 1.4857, + "step": 82 + }, + { + "epoch": 0.04023267086766844, + "grad_norm": 1.7312710285186768, + "learning_rate": 2.0048309178743963e-05, + "loss": 1.5569, + "step": 83 + }, + { + "epoch": 0.0407174018419777, + "grad_norm": 1.6241320371627808, + "learning_rate": 2.028985507246377e-05, + "loss": 1.734, + "step": 84 + }, + { + "epoch": 0.041202132816286964, + "grad_norm": 1.6983885765075684, + "learning_rate": 2.0531400966183576e-05, + "loss": 1.786, + "step": 85 + }, + { + "epoch": 0.04168686379059622, + "grad_norm": 1.615333080291748, + "learning_rate": 2.0772946859903384e-05, + "loss": 1.8196, + "step": 86 + }, + { + "epoch": 0.04217159476490548, + "grad_norm": 1.6279438734054565, + "learning_rate": 2.101449275362319e-05, + "loss": 1.5445, + "step": 87 + }, + { + "epoch": 0.04265632573921473, + "grad_norm": 1.582491397857666, + "learning_rate": 2.1256038647342997e-05, + "loss": 1.456, + "step": 88 + }, + { + "epoch": 0.043141056713523994, + "grad_norm": 1.701452612876892, + "learning_rate": 2.1497584541062805e-05, + "loss": 1.8605, + "step": 89 + }, + { + "epoch": 0.043625787687833255, + "grad_norm": 1.845126986503601, + "learning_rate": 2.173913043478261e-05, + "loss": 1.5401, + "step": 90 + }, + { + "epoch": 0.04411051866214251, + "grad_norm": 1.6799591779708862, + "learning_rate": 2.198067632850242e-05, + "loss": 1.7081, + "step": 91 + }, + { + "epoch": 0.04459524963645177, + "grad_norm": 1.7880574464797974, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.5429, + "step": 92 + }, + { + "epoch": 0.04507998061076103, + "grad_norm": 1.625308632850647, + "learning_rate": 2.246376811594203e-05, + "loss": 1.5608, + "step": 93 + }, + { + "epoch": 0.045564711585070285, + "grad_norm": 1.8478410243988037, + "learning_rate": 2.2705314009661836e-05, + "loss": 2.0223, + "step": 94 + }, + { + "epoch": 0.046049442559379546, + "grad_norm": 1.5162346363067627, + "learning_rate": 2.294685990338164e-05, + "loss": 1.2781, + "step": 95 + }, + { + "epoch": 0.0465341735336888, + "grad_norm": 1.854674220085144, + "learning_rate": 2.318840579710145e-05, + "loss": 1.6944, + "step": 96 + }, + { + "epoch": 0.04701890450799806, + "grad_norm": 1.7069923877716064, + "learning_rate": 2.3429951690821258e-05, + "loss": 1.6425, + "step": 97 + }, + { + "epoch": 0.04750363548230732, + "grad_norm": 1.5543208122253418, + "learning_rate": 2.3671497584541063e-05, + "loss": 1.4902, + "step": 98 + }, + { + "epoch": 0.047988366456616575, + "grad_norm": 1.5168403387069702, + "learning_rate": 2.391304347826087e-05, + "loss": 1.3613, + "step": 99 + }, + { + "epoch": 0.048473097430925836, + "grad_norm": 1.7112879753112793, + "learning_rate": 2.4154589371980676e-05, + "loss": 1.9106, + "step": 100 + }, + { + "epoch": 0.0489578284052351, + "grad_norm": 1.8067775964736938, + "learning_rate": 2.4396135265700484e-05, + "loss": 1.8777, + "step": 101 + }, + { + "epoch": 0.04944255937954435, + "grad_norm": 1.528732419013977, + "learning_rate": 2.4637681159420292e-05, + "loss": 1.4165, + "step": 102 + }, + { + "epoch": 0.04992729035385361, + "grad_norm": 1.5768786668777466, + "learning_rate": 2.4879227053140097e-05, + "loss": 1.5526, + "step": 103 + }, + { + "epoch": 0.050412021328162866, + "grad_norm": 1.665514349937439, + "learning_rate": 2.5120772946859905e-05, + "loss": 1.5298, + "step": 104 + }, + { + "epoch": 0.05089675230247213, + "grad_norm": 1.59950852394104, + "learning_rate": 2.5362318840579714e-05, + "loss": 1.6167, + "step": 105 + }, + { + "epoch": 0.05138148327678139, + "grad_norm": 1.6841107606887817, + "learning_rate": 2.5603864734299522e-05, + "loss": 1.6686, + "step": 106 + }, + { + "epoch": 0.05186621425109064, + "grad_norm": 1.6083794832229614, + "learning_rate": 2.5845410628019323e-05, + "loss": 1.3828, + "step": 107 + }, + { + "epoch": 0.0523509452253999, + "grad_norm": 1.6372400522232056, + "learning_rate": 2.608695652173913e-05, + "loss": 1.7482, + "step": 108 + }, + { + "epoch": 0.052835676199709164, + "grad_norm": 1.7453114986419678, + "learning_rate": 2.632850241545894e-05, + "loss": 1.769, + "step": 109 + }, + { + "epoch": 0.05332040717401842, + "grad_norm": 1.6545413732528687, + "learning_rate": 2.6570048309178748e-05, + "loss": 2.123, + "step": 110 + }, + { + "epoch": 0.05380513814832768, + "grad_norm": 1.8229496479034424, + "learning_rate": 2.6811594202898553e-05, + "loss": 1.9621, + "step": 111 + }, + { + "epoch": 0.05428986912263694, + "grad_norm": 1.640529990196228, + "learning_rate": 2.7053140096618358e-05, + "loss": 1.8864, + "step": 112 + }, + { + "epoch": 0.054774600096946194, + "grad_norm": 1.4900970458984375, + "learning_rate": 2.7294685990338166e-05, + "loss": 1.9581, + "step": 113 + }, + { + "epoch": 0.055259331071255455, + "grad_norm": 1.60316002368927, + "learning_rate": 2.753623188405797e-05, + "loss": 1.4996, + "step": 114 + }, + { + "epoch": 0.05574406204556471, + "grad_norm": 1.8271881341934204, + "learning_rate": 2.777777777777778e-05, + "loss": 1.8359, + "step": 115 + }, + { + "epoch": 0.05622879301987397, + "grad_norm": 1.6188838481903076, + "learning_rate": 2.8019323671497587e-05, + "loss": 1.778, + "step": 116 + }, + { + "epoch": 0.05671352399418323, + "grad_norm": 1.672446608543396, + "learning_rate": 2.826086956521739e-05, + "loss": 1.9118, + "step": 117 + }, + { + "epoch": 0.057198254968492485, + "grad_norm": 1.725005030632019, + "learning_rate": 2.8502415458937197e-05, + "loss": 1.4782, + "step": 118 + }, + { + "epoch": 0.057682985942801746, + "grad_norm": 1.6945290565490723, + "learning_rate": 2.8743961352657005e-05, + "loss": 1.4347, + "step": 119 + }, + { + "epoch": 0.058167716917111006, + "grad_norm": 1.6939204931259155, + "learning_rate": 2.8985507246376814e-05, + "loss": 1.6457, + "step": 120 + }, + { + "epoch": 0.05865244789142026, + "grad_norm": 1.7131168842315674, + "learning_rate": 2.9227053140096622e-05, + "loss": 1.8858, + "step": 121 + }, + { + "epoch": 0.05913717886572952, + "grad_norm": 1.8440873622894287, + "learning_rate": 2.9468599033816423e-05, + "loss": 2.0418, + "step": 122 + }, + { + "epoch": 0.059621909840038775, + "grad_norm": 1.6105564832687378, + "learning_rate": 2.971014492753623e-05, + "loss": 1.5901, + "step": 123 + }, + { + "epoch": 0.060106640814348036, + "grad_norm": 1.7111328840255737, + "learning_rate": 2.995169082125604e-05, + "loss": 1.6965, + "step": 124 + }, + { + "epoch": 0.0605913717886573, + "grad_norm": 1.6334936618804932, + "learning_rate": 3.0193236714975848e-05, + "loss": 1.7441, + "step": 125 + }, + { + "epoch": 0.06107610276296655, + "grad_norm": 1.6838350296020508, + "learning_rate": 3.0434782608695656e-05, + "loss": 1.7894, + "step": 126 + }, + { + "epoch": 0.06156083373727581, + "grad_norm": 1.6963026523590088, + "learning_rate": 3.067632850241546e-05, + "loss": 1.8447, + "step": 127 + }, + { + "epoch": 0.06204556471158507, + "grad_norm": 1.6151043176651, + "learning_rate": 3.0917874396135266e-05, + "loss": 1.6308, + "step": 128 + }, + { + "epoch": 0.06253029568589433, + "grad_norm": 1.802123785018921, + "learning_rate": 3.1159420289855074e-05, + "loss": 1.5931, + "step": 129 + }, + { + "epoch": 0.06301502666020359, + "grad_norm": 9.797577857971191, + "learning_rate": 3.140096618357488e-05, + "loss": 1.4618, + "step": 130 + }, + { + "epoch": 0.06349975763451285, + "grad_norm": 1.555799126625061, + "learning_rate": 3.164251207729469e-05, + "loss": 1.4462, + "step": 131 + }, + { + "epoch": 0.06398448860882211, + "grad_norm": 1.631108283996582, + "learning_rate": 3.188405797101449e-05, + "loss": 1.6234, + "step": 132 + }, + { + "epoch": 0.06446921958313136, + "grad_norm": 1.6092970371246338, + "learning_rate": 3.21256038647343e-05, + "loss": 1.6422, + "step": 133 + }, + { + "epoch": 0.06495395055744062, + "grad_norm": 1.667833685874939, + "learning_rate": 3.236714975845411e-05, + "loss": 1.6484, + "step": 134 + }, + { + "epoch": 0.06543868153174988, + "grad_norm": 1.7068734169006348, + "learning_rate": 3.260869565217392e-05, + "loss": 1.4214, + "step": 135 + }, + { + "epoch": 0.06592341250605914, + "grad_norm": 1.5251973867416382, + "learning_rate": 3.2850241545893725e-05, + "loss": 1.5508, + "step": 136 + }, + { + "epoch": 0.0664081434803684, + "grad_norm": 1.6476942300796509, + "learning_rate": 3.3091787439613533e-05, + "loss": 1.49, + "step": 137 + }, + { + "epoch": 0.06689287445467765, + "grad_norm": 1.7182284593582153, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.6699, + "step": 138 + }, + { + "epoch": 0.06737760542898691, + "grad_norm": 1.5735374689102173, + "learning_rate": 3.357487922705314e-05, + "loss": 1.6516, + "step": 139 + }, + { + "epoch": 0.06786233640329617, + "grad_norm": 1.6058720350265503, + "learning_rate": 3.381642512077295e-05, + "loss": 1.6413, + "step": 140 + }, + { + "epoch": 0.06834706737760543, + "grad_norm": 1.596596360206604, + "learning_rate": 3.405797101449276e-05, + "loss": 1.5226, + "step": 141 + }, + { + "epoch": 0.06883179835191469, + "grad_norm": 1.7365012168884277, + "learning_rate": 3.429951690821256e-05, + "loss": 1.9471, + "step": 142 + }, + { + "epoch": 0.06931652932622395, + "grad_norm": 1.566796064376831, + "learning_rate": 3.454106280193237e-05, + "loss": 1.6039, + "step": 143 + }, + { + "epoch": 0.0698012603005332, + "grad_norm": 2.105440616607666, + "learning_rate": 3.478260869565218e-05, + "loss": 1.8893, + "step": 144 + }, + { + "epoch": 0.07028599127484246, + "grad_norm": 1.701600193977356, + "learning_rate": 3.502415458937198e-05, + "loss": 1.8973, + "step": 145 + }, + { + "epoch": 0.07077072224915172, + "grad_norm": 1.6586337089538574, + "learning_rate": 3.526570048309179e-05, + "loss": 1.6766, + "step": 146 + }, + { + "epoch": 0.07125545322346098, + "grad_norm": 1.6110674142837524, + "learning_rate": 3.5507246376811596e-05, + "loss": 1.5639, + "step": 147 + }, + { + "epoch": 0.07174018419777024, + "grad_norm": 1.6405996084213257, + "learning_rate": 3.57487922705314e-05, + "loss": 1.7666, + "step": 148 + }, + { + "epoch": 0.07222491517207949, + "grad_norm": 1.6947883367538452, + "learning_rate": 3.5990338164251205e-05, + "loss": 1.578, + "step": 149 + }, + { + "epoch": 0.07270964614638875, + "grad_norm": 1.6765739917755127, + "learning_rate": 3.6231884057971014e-05, + "loss": 1.7495, + "step": 150 + }, + { + "epoch": 0.07319437712069801, + "grad_norm": 1.8097703456878662, + "learning_rate": 3.647342995169082e-05, + "loss": 1.3894, + "step": 151 + }, + { + "epoch": 0.07367910809500727, + "grad_norm": 1.657542109489441, + "learning_rate": 3.671497584541063e-05, + "loss": 1.615, + "step": 152 + }, + { + "epoch": 0.07416383906931653, + "grad_norm": 1.6826646327972412, + "learning_rate": 3.695652173913043e-05, + "loss": 1.5399, + "step": 153 + }, + { + "epoch": 0.07464857004362578, + "grad_norm": 1.6431878805160522, + "learning_rate": 3.719806763285024e-05, + "loss": 1.8077, + "step": 154 + }, + { + "epoch": 0.07513330101793504, + "grad_norm": 1.617641568183899, + "learning_rate": 3.743961352657005e-05, + "loss": 1.7934, + "step": 155 + }, + { + "epoch": 0.0756180319922443, + "grad_norm": 1.6802802085876465, + "learning_rate": 3.7681159420289856e-05, + "loss": 2.0497, + "step": 156 + }, + { + "epoch": 0.07610276296655356, + "grad_norm": 1.5995146036148071, + "learning_rate": 3.7922705314009665e-05, + "loss": 1.5435, + "step": 157 + }, + { + "epoch": 0.07658749394086282, + "grad_norm": 1.6491643190383911, + "learning_rate": 3.8164251207729466e-05, + "loss": 1.6003, + "step": 158 + }, + { + "epoch": 0.07707222491517209, + "grad_norm": 1.671040654182434, + "learning_rate": 3.8405797101449274e-05, + "loss": 1.6768, + "step": 159 + }, + { + "epoch": 0.07755695588948133, + "grad_norm": 1.4470667839050293, + "learning_rate": 3.864734299516908e-05, + "loss": 1.4752, + "step": 160 + }, + { + "epoch": 0.0780416868637906, + "grad_norm": 1.4912703037261963, + "learning_rate": 3.888888888888889e-05, + "loss": 1.5621, + "step": 161 + }, + { + "epoch": 0.07852641783809985, + "grad_norm": 1.6416089534759521, + "learning_rate": 3.91304347826087e-05, + "loss": 1.7115, + "step": 162 + }, + { + "epoch": 0.07901114881240912, + "grad_norm": 1.738970160484314, + "learning_rate": 3.937198067632851e-05, + "loss": 1.5945, + "step": 163 + }, + { + "epoch": 0.07949587978671838, + "grad_norm": 1.50935959815979, + "learning_rate": 3.961352657004831e-05, + "loss": 1.4379, + "step": 164 + }, + { + "epoch": 0.07998061076102762, + "grad_norm": 1.8530389070510864, + "learning_rate": 3.985507246376812e-05, + "loss": 1.6197, + "step": 165 + }, + { + "epoch": 0.08046534173533688, + "grad_norm": 1.5483351945877075, + "learning_rate": 4.0096618357487925e-05, + "loss": 1.4075, + "step": 166 + }, + { + "epoch": 0.08095007270964615, + "grad_norm": 1.7282538414001465, + "learning_rate": 4.0338164251207733e-05, + "loss": 1.7884, + "step": 167 + }, + { + "epoch": 0.0814348036839554, + "grad_norm": 1.605089545249939, + "learning_rate": 4.057971014492754e-05, + "loss": 1.4268, + "step": 168 + }, + { + "epoch": 0.08191953465826467, + "grad_norm": 1.6566203832626343, + "learning_rate": 4.082125603864734e-05, + "loss": 1.5855, + "step": 169 + }, + { + "epoch": 0.08240426563257393, + "grad_norm": 1.7147961854934692, + "learning_rate": 4.106280193236715e-05, + "loss": 1.9024, + "step": 170 + }, + { + "epoch": 0.08288899660688318, + "grad_norm": 1.638307809829712, + "learning_rate": 4.130434782608696e-05, + "loss": 1.5341, + "step": 171 + }, + { + "epoch": 0.08337372758119244, + "grad_norm": 1.503071665763855, + "learning_rate": 4.154589371980677e-05, + "loss": 1.3162, + "step": 172 + }, + { + "epoch": 0.0838584585555017, + "grad_norm": 1.6790330410003662, + "learning_rate": 4.1787439613526576e-05, + "loss": 1.5488, + "step": 173 + }, + { + "epoch": 0.08434318952981096, + "grad_norm": 1.6894205808639526, + "learning_rate": 4.202898550724638e-05, + "loss": 1.4366, + "step": 174 + }, + { + "epoch": 0.08482792050412022, + "grad_norm": 1.550458550453186, + "learning_rate": 4.2270531400966186e-05, + "loss": 1.6258, + "step": 175 + }, + { + "epoch": 0.08531265147842947, + "grad_norm": 1.4660718441009521, + "learning_rate": 4.2512077294685994e-05, + "loss": 1.3366, + "step": 176 + }, + { + "epoch": 0.08579738245273873, + "grad_norm": 1.527509331703186, + "learning_rate": 4.27536231884058e-05, + "loss": 1.5319, + "step": 177 + }, + { + "epoch": 0.08628211342704799, + "grad_norm": 5.795914173126221, + "learning_rate": 4.299516908212561e-05, + "loss": 1.3028, + "step": 178 + }, + { + "epoch": 0.08676684440135725, + "grad_norm": 1.6990413665771484, + "learning_rate": 4.323671497584541e-05, + "loss": 1.8149, + "step": 179 + }, + { + "epoch": 0.08725157537566651, + "grad_norm": 1.704795479774475, + "learning_rate": 4.347826086956522e-05, + "loss": 1.671, + "step": 180 + }, + { + "epoch": 0.08773630634997576, + "grad_norm": 1.5868449211120605, + "learning_rate": 4.371980676328503e-05, + "loss": 1.5234, + "step": 181 + }, + { + "epoch": 0.08822103732428502, + "grad_norm": 1.7695292234420776, + "learning_rate": 4.396135265700484e-05, + "loss": 1.6046, + "step": 182 + }, + { + "epoch": 0.08870576829859428, + "grad_norm": 1.6047744750976562, + "learning_rate": 4.4202898550724645e-05, + "loss": 1.7261, + "step": 183 + }, + { + "epoch": 0.08919049927290354, + "grad_norm": 1.7648770809173584, + "learning_rate": 4.4444444444444447e-05, + "loss": 1.6856, + "step": 184 + }, + { + "epoch": 0.0896752302472128, + "grad_norm": 1.8613518476486206, + "learning_rate": 4.4685990338164255e-05, + "loss": 1.8129, + "step": 185 + }, + { + "epoch": 0.09015996122152206, + "grad_norm": 1.7032883167266846, + "learning_rate": 4.492753623188406e-05, + "loss": 1.6468, + "step": 186 + }, + { + "epoch": 0.09064469219583131, + "grad_norm": 1.4716066122055054, + "learning_rate": 4.5169082125603865e-05, + "loss": 1.2393, + "step": 187 + }, + { + "epoch": 0.09112942317014057, + "grad_norm": 1.582453966140747, + "learning_rate": 4.541062801932367e-05, + "loss": 1.7276, + "step": 188 + }, + { + "epoch": 0.09161415414444983, + "grad_norm": 1.5959687232971191, + "learning_rate": 4.565217391304348e-05, + "loss": 1.4626, + "step": 189 + }, + { + "epoch": 0.09209888511875909, + "grad_norm": 2.014113426208496, + "learning_rate": 4.589371980676328e-05, + "loss": 2.0453, + "step": 190 + }, + { + "epoch": 0.09258361609306835, + "grad_norm": 1.6704699993133545, + "learning_rate": 4.613526570048309e-05, + "loss": 1.7625, + "step": 191 + }, + { + "epoch": 0.0930683470673776, + "grad_norm": 1.8854517936706543, + "learning_rate": 4.63768115942029e-05, + "loss": 1.7175, + "step": 192 + }, + { + "epoch": 0.09355307804168686, + "grad_norm": 1.588793158531189, + "learning_rate": 4.661835748792271e-05, + "loss": 1.5578, + "step": 193 + }, + { + "epoch": 0.09403780901599612, + "grad_norm": 1.5601580142974854, + "learning_rate": 4.6859903381642516e-05, + "loss": 1.6403, + "step": 194 + }, + { + "epoch": 0.09452253999030538, + "grad_norm": 1.6801401376724243, + "learning_rate": 4.710144927536232e-05, + "loss": 1.4691, + "step": 195 + }, + { + "epoch": 0.09500727096461464, + "grad_norm": 2.123878002166748, + "learning_rate": 4.7342995169082125e-05, + "loss": 1.5602, + "step": 196 + }, + { + "epoch": 0.0954920019389239, + "grad_norm": 1.666060447692871, + "learning_rate": 4.7584541062801933e-05, + "loss": 1.4702, + "step": 197 + }, + { + "epoch": 0.09597673291323315, + "grad_norm": 1.7319121360778809, + "learning_rate": 4.782608695652174e-05, + "loss": 1.8495, + "step": 198 + }, + { + "epoch": 0.09646146388754241, + "grad_norm": 1.5935922861099243, + "learning_rate": 4.806763285024155e-05, + "loss": 1.5142, + "step": 199 + }, + { + "epoch": 0.09694619486185167, + "grad_norm": 1.8752068281173706, + "learning_rate": 4.830917874396135e-05, + "loss": 1.9948, + "step": 200 + }, + { + "epoch": 0.09743092583616093, + "grad_norm": 1.8214166164398193, + "learning_rate": 4.855072463768116e-05, + "loss": 1.7985, + "step": 201 + }, + { + "epoch": 0.0979156568104702, + "grad_norm": 1.6320829391479492, + "learning_rate": 4.879227053140097e-05, + "loss": 1.6227, + "step": 202 + }, + { + "epoch": 0.09840038778477944, + "grad_norm": 1.5568000078201294, + "learning_rate": 4.9033816425120776e-05, + "loss": 1.587, + "step": 203 + }, + { + "epoch": 0.0988851187590887, + "grad_norm": 1.5281633138656616, + "learning_rate": 4.9275362318840584e-05, + "loss": 1.4167, + "step": 204 + }, + { + "epoch": 0.09936984973339796, + "grad_norm": 1.5692189931869507, + "learning_rate": 4.9516908212560386e-05, + "loss": 1.5648, + "step": 205 + }, + { + "epoch": 0.09985458070770722, + "grad_norm": 1.644121766090393, + "learning_rate": 4.9758454106280194e-05, + "loss": 1.8842, + "step": 206 + }, + { + "epoch": 0.10033931168201649, + "grad_norm": 1.654074788093567, + "learning_rate": 5e-05, + "loss": 1.3634, + "step": 207 + }, + { + "epoch": 0.10082404265632573, + "grad_norm": 1.6578999757766724, + "learning_rate": 4.9999964185927293e-05, + "loss": 1.4366, + "step": 208 + }, + { + "epoch": 0.101308773630635, + "grad_norm": 1.6873010396957397, + "learning_rate": 4.999985674381179e-05, + "loss": 1.5582, + "step": 209 + }, + { + "epoch": 0.10179350460494425, + "grad_norm": 1.5198674201965332, + "learning_rate": 4.999967767396132e-05, + "loss": 1.3704, + "step": 210 + }, + { + "epoch": 0.10227823557925352, + "grad_norm": 1.6594791412353516, + "learning_rate": 4.999942697688894e-05, + "loss": 1.7239, + "step": 211 + }, + { + "epoch": 0.10276296655356278, + "grad_norm": 1.6230357885360718, + "learning_rate": 4.9999104653312926e-05, + "loss": 1.5243, + "step": 212 + }, + { + "epoch": 0.10324769752787204, + "grad_norm": 1.62498140335083, + "learning_rate": 4.9998710704156785e-05, + "loss": 1.4242, + "step": 213 + }, + { + "epoch": 0.10373242850218128, + "grad_norm": 1.611889362335205, + "learning_rate": 4.9998245130549226e-05, + "loss": 1.7185, + "step": 214 + }, + { + "epoch": 0.10421715947649055, + "grad_norm": 1.7176823616027832, + "learning_rate": 4.999770793382418e-05, + "loss": 1.7587, + "step": 215 + }, + { + "epoch": 0.1047018904507998, + "grad_norm": 1.8749507665634155, + "learning_rate": 4.999709911552077e-05, + "loss": 1.5999, + "step": 216 + }, + { + "epoch": 0.10518662142510907, + "grad_norm": 2.380711793899536, + "learning_rate": 4.999641867738336e-05, + "loss": 1.5824, + "step": 217 + }, + { + "epoch": 0.10567135239941833, + "grad_norm": 1.681323528289795, + "learning_rate": 4.999566662136147e-05, + "loss": 1.6289, + "step": 218 + }, + { + "epoch": 0.10615608337372757, + "grad_norm": 1.7086970806121826, + "learning_rate": 4.999484294960984e-05, + "loss": 1.5793, + "step": 219 + }, + { + "epoch": 0.10664081434803684, + "grad_norm": 1.6258258819580078, + "learning_rate": 4.999394766448841e-05, + "loss": 1.7768, + "step": 220 + }, + { + "epoch": 0.1071255453223461, + "grad_norm": 1.609623670578003, + "learning_rate": 4.9992980768562256e-05, + "loss": 1.5748, + "step": 221 + }, + { + "epoch": 0.10761027629665536, + "grad_norm": 1.6131285429000854, + "learning_rate": 4.999194226460167e-05, + "loss": 1.7234, + "step": 222 + }, + { + "epoch": 0.10809500727096462, + "grad_norm": 1.7092560529708862, + "learning_rate": 4.99908321555821e-05, + "loss": 1.8781, + "step": 223 + }, + { + "epoch": 0.10857973824527388, + "grad_norm": 1.755306601524353, + "learning_rate": 4.998965044468414e-05, + "loss": 1.647, + "step": 224 + }, + { + "epoch": 0.10906446921958313, + "grad_norm": 1.637286901473999, + "learning_rate": 4.9988397135293567e-05, + "loss": 1.401, + "step": 225 + }, + { + "epoch": 0.10954920019389239, + "grad_norm": 1.505699634552002, + "learning_rate": 4.998707223100123e-05, + "loss": 1.3702, + "step": 226 + }, + { + "epoch": 0.11003393116820165, + "grad_norm": 1.5001720190048218, + "learning_rate": 4.9985675735603164e-05, + "loss": 1.6756, + "step": 227 + }, + { + "epoch": 0.11051866214251091, + "grad_norm": 1.6514838933944702, + "learning_rate": 4.998420765310051e-05, + "loss": 1.3885, + "step": 228 + }, + { + "epoch": 0.11100339311682017, + "grad_norm": 1.6999015808105469, + "learning_rate": 4.998266798769951e-05, + "loss": 1.2946, + "step": 229 + }, + { + "epoch": 0.11148812409112942, + "grad_norm": 1.8051220178604126, + "learning_rate": 4.998105674381148e-05, + "loss": 1.6211, + "step": 230 + }, + { + "epoch": 0.11197285506543868, + "grad_norm": 1.7223929166793823, + "learning_rate": 4.9979373926052865e-05, + "loss": 1.6805, + "step": 231 + }, + { + "epoch": 0.11245758603974794, + "grad_norm": 1.5139886140823364, + "learning_rate": 4.997761953924512e-05, + "loss": 1.4749, + "step": 232 + }, + { + "epoch": 0.1129423170140572, + "grad_norm": 1.6115888357162476, + "learning_rate": 4.997579358841481e-05, + "loss": 1.6298, + "step": 233 + }, + { + "epoch": 0.11342704798836646, + "grad_norm": 1.5873501300811768, + "learning_rate": 4.99738960787935e-05, + "loss": 1.5954, + "step": 234 + }, + { + "epoch": 0.11391177896267571, + "grad_norm": 1.5256075859069824, + "learning_rate": 4.99719270158178e-05, + "loss": 1.155, + "step": 235 + }, + { + "epoch": 0.11439650993698497, + "grad_norm": 1.633514165878296, + "learning_rate": 4.996988640512931e-05, + "loss": 1.8803, + "step": 236 + }, + { + "epoch": 0.11488124091129423, + "grad_norm": 1.6052523851394653, + "learning_rate": 4.996777425257465e-05, + "loss": 1.6264, + "step": 237 + }, + { + "epoch": 0.11536597188560349, + "grad_norm": 1.5508285760879517, + "learning_rate": 4.99655905642054e-05, + "loss": 1.6035, + "step": 238 + }, + { + "epoch": 0.11585070285991275, + "grad_norm": 1.9220739603042603, + "learning_rate": 4.996333534627809e-05, + "loss": 1.7689, + "step": 239 + }, + { + "epoch": 0.11633543383422201, + "grad_norm": 1.5728856325149536, + "learning_rate": 4.9961008605254237e-05, + "loss": 1.4674, + "step": 240 + }, + { + "epoch": 0.11682016480853126, + "grad_norm": 1.6598260402679443, + "learning_rate": 4.9958610347800206e-05, + "loss": 1.4091, + "step": 241 + }, + { + "epoch": 0.11730489578284052, + "grad_norm": 1.6171025037765503, + "learning_rate": 4.995614058078733e-05, + "loss": 1.5094, + "step": 242 + }, + { + "epoch": 0.11778962675714978, + "grad_norm": 1.5818380117416382, + "learning_rate": 4.995359931129179e-05, + "loss": 1.464, + "step": 243 + }, + { + "epoch": 0.11827435773145904, + "grad_norm": 1.5343612432479858, + "learning_rate": 4.995098654659465e-05, + "loss": 1.4005, + "step": 244 + }, + { + "epoch": 0.1187590887057683, + "grad_norm": 1.4373968839645386, + "learning_rate": 4.99483022941818e-05, + "loss": 1.156, + "step": 245 + }, + { + "epoch": 0.11924381968007755, + "grad_norm": 1.7314152717590332, + "learning_rate": 4.994554656174398e-05, + "loss": 1.7723, + "step": 246 + }, + { + "epoch": 0.11972855065438681, + "grad_norm": 1.6805850267410278, + "learning_rate": 4.99427193571767e-05, + "loss": 1.6244, + "step": 247 + }, + { + "epoch": 0.12021328162869607, + "grad_norm": 1.5888878107070923, + "learning_rate": 4.993982068858025e-05, + "loss": 1.6246, + "step": 248 + }, + { + "epoch": 0.12069801260300533, + "grad_norm": 1.8114043474197388, + "learning_rate": 4.9936850564259695e-05, + "loss": 1.4527, + "step": 249 + }, + { + "epoch": 0.1211827435773146, + "grad_norm": 1.7056317329406738, + "learning_rate": 4.99338089927248e-05, + "loss": 1.5897, + "step": 250 + }, + { + "epoch": 0.12166747455162386, + "grad_norm": 1.6106626987457275, + "learning_rate": 4.993069598269006e-05, + "loss": 1.5619, + "step": 251 + }, + { + "epoch": 0.1221522055259331, + "grad_norm": 1.6103230714797974, + "learning_rate": 4.9927511543074635e-05, + "loss": 1.3745, + "step": 252 + }, + { + "epoch": 0.12263693650024236, + "grad_norm": 1.699284315109253, + "learning_rate": 4.992425568300234e-05, + "loss": 1.5918, + "step": 253 + }, + { + "epoch": 0.12312166747455162, + "grad_norm": 1.6529169082641602, + "learning_rate": 4.992092841180164e-05, + "loss": 1.7702, + "step": 254 + }, + { + "epoch": 0.12360639844886089, + "grad_norm": 1.7270163297653198, + "learning_rate": 4.9917529739005574e-05, + "loss": 1.5362, + "step": 255 + }, + { + "epoch": 0.12409112942317015, + "grad_norm": 2.138240098953247, + "learning_rate": 4.991405967435177e-05, + "loss": 1.7812, + "step": 256 + }, + { + "epoch": 0.1245758603974794, + "grad_norm": 1.7157567739486694, + "learning_rate": 4.991051822778239e-05, + "loss": 1.8263, + "step": 257 + }, + { + "epoch": 0.12506059137178865, + "grad_norm": 1.5743305683135986, + "learning_rate": 4.990690540944414e-05, + "loss": 1.5631, + "step": 258 + }, + { + "epoch": 0.12554532234609792, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.9903221229688194e-05, + "loss": 1.6046, + "step": 259 + }, + { + "epoch": 0.12603005332040718, + "grad_norm": 1.700891375541687, + "learning_rate": 4.989946569907019e-05, + "loss": 1.7547, + "step": 260 + }, + { + "epoch": 0.12651478429471644, + "grad_norm": 1.683803915977478, + "learning_rate": 4.989563882835019e-05, + "loss": 1.3777, + "step": 261 + }, + { + "epoch": 0.1269995152690257, + "grad_norm": 1.548683762550354, + "learning_rate": 4.989174062849267e-05, + "loss": 1.2059, + "step": 262 + }, + { + "epoch": 0.12748424624333496, + "grad_norm": 1.5738409757614136, + "learning_rate": 4.988777111066646e-05, + "loss": 1.2089, + "step": 263 + }, + { + "epoch": 0.12796897721764422, + "grad_norm": 1.5687381029129028, + "learning_rate": 4.9883730286244715e-05, + "loss": 1.4081, + "step": 264 + }, + { + "epoch": 0.12845370819195345, + "grad_norm": 1.6630191802978516, + "learning_rate": 4.987961816680492e-05, + "loss": 1.472, + "step": 265 + }, + { + "epoch": 0.1289384391662627, + "grad_norm": 1.692008376121521, + "learning_rate": 4.987543476412881e-05, + "loss": 1.8636, + "step": 266 + }, + { + "epoch": 0.12942317014057197, + "grad_norm": 1.612111210823059, + "learning_rate": 4.987118009020237e-05, + "loss": 1.5438, + "step": 267 + }, + { + "epoch": 0.12990790111488124, + "grad_norm": 2.01253342628479, + "learning_rate": 4.986685415721576e-05, + "loss": 1.9671, + "step": 268 + }, + { + "epoch": 0.1303926320891905, + "grad_norm": 1.5312128067016602, + "learning_rate": 4.986245697756333e-05, + "loss": 1.3681, + "step": 269 + }, + { + "epoch": 0.13087736306349976, + "grad_norm": 1.6384061574935913, + "learning_rate": 4.985798856384355e-05, + "loss": 1.6502, + "step": 270 + }, + { + "epoch": 0.13136209403780902, + "grad_norm": 2.0369157791137695, + "learning_rate": 4.985344892885899e-05, + "loss": 1.5976, + "step": 271 + }, + { + "epoch": 0.13184682501211828, + "grad_norm": 1.5668435096740723, + "learning_rate": 4.984883808561628e-05, + "loss": 1.456, + "step": 272 + }, + { + "epoch": 0.13233155598642754, + "grad_norm": 3.0037758350372314, + "learning_rate": 4.9844156047326054e-05, + "loss": 1.4169, + "step": 273 + }, + { + "epoch": 0.1328162869607368, + "grad_norm": 1.5953983068466187, + "learning_rate": 4.9839402827402947e-05, + "loss": 1.5507, + "step": 274 + }, + { + "epoch": 0.13330101793504606, + "grad_norm": 1.6405383348464966, + "learning_rate": 4.983457843946554e-05, + "loss": 1.5394, + "step": 275 + }, + { + "epoch": 0.1337857489093553, + "grad_norm": 1.5499027967453003, + "learning_rate": 4.98296828973363e-05, + "loss": 1.5463, + "step": 276 + }, + { + "epoch": 0.13427047988366456, + "grad_norm": 1.5397121906280518, + "learning_rate": 4.9824716215041575e-05, + "loss": 1.5493, + "step": 277 + }, + { + "epoch": 0.13475521085797382, + "grad_norm": 1.5781245231628418, + "learning_rate": 4.981967840681154e-05, + "loss": 1.6137, + "step": 278 + }, + { + "epoch": 0.13523994183228308, + "grad_norm": 1.79483962059021, + "learning_rate": 4.981456948708014e-05, + "loss": 1.6445, + "step": 279 + }, + { + "epoch": 0.13572467280659234, + "grad_norm": 1.5061801671981812, + "learning_rate": 4.980938947048508e-05, + "loss": 1.4163, + "step": 280 + }, + { + "epoch": 0.1362094037809016, + "grad_norm": 1.5987025499343872, + "learning_rate": 4.980413837186775e-05, + "loss": 1.531, + "step": 281 + }, + { + "epoch": 0.13669413475521086, + "grad_norm": 1.7168915271759033, + "learning_rate": 4.979881620627322e-05, + "loss": 1.4682, + "step": 282 + }, + { + "epoch": 0.13717886572952012, + "grad_norm": 1.4458422660827637, + "learning_rate": 4.979342298895016e-05, + "loss": 1.2247, + "step": 283 + }, + { + "epoch": 0.13766359670382938, + "grad_norm": 1.6409187316894531, + "learning_rate": 4.9787958735350816e-05, + "loss": 1.7199, + "step": 284 + }, + { + "epoch": 0.13814832767813864, + "grad_norm": 1.5449514389038086, + "learning_rate": 4.978242346113095e-05, + "loss": 1.4311, + "step": 285 + }, + { + "epoch": 0.1386330586524479, + "grad_norm": 1.5891187191009521, + "learning_rate": 4.977681718214984e-05, + "loss": 1.8869, + "step": 286 + }, + { + "epoch": 0.13911778962675714, + "grad_norm": 3.7609283924102783, + "learning_rate": 4.977113991447017e-05, + "loss": 1.7382, + "step": 287 + }, + { + "epoch": 0.1396025206010664, + "grad_norm": 1.7186360359191895, + "learning_rate": 4.976539167435803e-05, + "loss": 1.699, + "step": 288 + }, + { + "epoch": 0.14008725157537566, + "grad_norm": 1.4953521490097046, + "learning_rate": 4.9759572478282846e-05, + "loss": 1.4215, + "step": 289 + }, + { + "epoch": 0.14057198254968492, + "grad_norm": 1.5115928649902344, + "learning_rate": 4.975368234291734e-05, + "loss": 1.2988, + "step": 290 + }, + { + "epoch": 0.14105671352399418, + "grad_norm": 2.540882110595703, + "learning_rate": 4.974772128513751e-05, + "loss": 1.651, + "step": 291 + }, + { + "epoch": 0.14154144449830344, + "grad_norm": 1.6582648754119873, + "learning_rate": 4.974168932202252e-05, + "loss": 1.7001, + "step": 292 + }, + { + "epoch": 0.1420261754726127, + "grad_norm": 1.5049525499343872, + "learning_rate": 4.973558647085472e-05, + "loss": 1.7229, + "step": 293 + }, + { + "epoch": 0.14251090644692196, + "grad_norm": 1.6676872968673706, + "learning_rate": 4.972941274911953e-05, + "loss": 1.5541, + "step": 294 + }, + { + "epoch": 0.14299563742123123, + "grad_norm": 1.5429099798202515, + "learning_rate": 4.972316817450544e-05, + "loss": 1.813, + "step": 295 + }, + { + "epoch": 0.1434803683955405, + "grad_norm": 1.6391764879226685, + "learning_rate": 4.9716852764903955e-05, + "loss": 1.9432, + "step": 296 + }, + { + "epoch": 0.14396509936984975, + "grad_norm": 1.6778995990753174, + "learning_rate": 4.9710466538409505e-05, + "loss": 1.5883, + "step": 297 + }, + { + "epoch": 0.14444983034415898, + "grad_norm": 1.508912205696106, + "learning_rate": 4.9704009513319444e-05, + "loss": 1.6739, + "step": 298 + }, + { + "epoch": 0.14493456131846824, + "grad_norm": 1.5249568223953247, + "learning_rate": 4.9697481708133955e-05, + "loss": 1.3977, + "step": 299 + }, + { + "epoch": 0.1454192922927775, + "grad_norm": 1.429032802581787, + "learning_rate": 4.969088314155602e-05, + "loss": 1.3599, + "step": 300 + }, + { + "epoch": 0.14590402326708676, + "grad_norm": 1.425854206085205, + "learning_rate": 4.968421383249137e-05, + "loss": 1.4904, + "step": 301 + }, + { + "epoch": 0.14638875424139602, + "grad_norm": 1.5954697132110596, + "learning_rate": 4.967747380004839e-05, + "loss": 1.6036, + "step": 302 + }, + { + "epoch": 0.14687348521570529, + "grad_norm": 1.6121957302093506, + "learning_rate": 4.967066306353816e-05, + "loss": 1.5024, + "step": 303 + }, + { + "epoch": 0.14735821619001455, + "grad_norm": 1.502057433128357, + "learning_rate": 4.966378164247426e-05, + "loss": 1.4643, + "step": 304 + }, + { + "epoch": 0.1478429471643238, + "grad_norm": 1.4162192344665527, + "learning_rate": 4.965682955657286e-05, + "loss": 1.279, + "step": 305 + }, + { + "epoch": 0.14832767813863307, + "grad_norm": 1.5923371315002441, + "learning_rate": 4.964980682575253e-05, + "loss": 1.4768, + "step": 306 + }, + { + "epoch": 0.14881240911294233, + "grad_norm": 1.5782544612884521, + "learning_rate": 4.964271347013431e-05, + "loss": 1.3391, + "step": 307 + }, + { + "epoch": 0.14929714008725156, + "grad_norm": 2.188934326171875, + "learning_rate": 4.9635549510041516e-05, + "loss": 1.6648, + "step": 308 + }, + { + "epoch": 0.14978187106156082, + "grad_norm": 1.6566468477249146, + "learning_rate": 4.9628314965999835e-05, + "loss": 1.6523, + "step": 309 + }, + { + "epoch": 0.15026660203587008, + "grad_norm": 1.46741783618927, + "learning_rate": 4.9621009858737116e-05, + "loss": 1.5349, + "step": 310 + }, + { + "epoch": 0.15075133301017934, + "grad_norm": 1.486839771270752, + "learning_rate": 4.961363420918342e-05, + "loss": 1.4895, + "step": 311 + }, + { + "epoch": 0.1512360639844886, + "grad_norm": 1.4840891361236572, + "learning_rate": 4.960618803847092e-05, + "loss": 1.5155, + "step": 312 + }, + { + "epoch": 0.15172079495879787, + "grad_norm": 1.578395962715149, + "learning_rate": 4.959867136793384e-05, + "loss": 1.5057, + "step": 313 + }, + { + "epoch": 0.15220552593310713, + "grad_norm": 1.5518931150436401, + "learning_rate": 4.959108421910835e-05, + "loss": 1.8778, + "step": 314 + }, + { + "epoch": 0.1526902569074164, + "grad_norm": 1.491754412651062, + "learning_rate": 4.958342661373262e-05, + "loss": 1.5156, + "step": 315 + }, + { + "epoch": 0.15317498788172565, + "grad_norm": 1.492876648902893, + "learning_rate": 4.957569857374664e-05, + "loss": 1.5804, + "step": 316 + }, + { + "epoch": 0.1536597188560349, + "grad_norm": 1.7110258340835571, + "learning_rate": 4.956790012129221e-05, + "loss": 1.8366, + "step": 317 + }, + { + "epoch": 0.15414444983034417, + "grad_norm": 1.5975233316421509, + "learning_rate": 4.9560031278712896e-05, + "loss": 1.5372, + "step": 318 + }, + { + "epoch": 0.1546291808046534, + "grad_norm": 1.5784556865692139, + "learning_rate": 4.95520920685539e-05, + "loss": 1.7814, + "step": 319 + }, + { + "epoch": 0.15511391177896267, + "grad_norm": 1.4783300161361694, + "learning_rate": 4.9544082513562076e-05, + "loss": 1.495, + "step": 320 + }, + { + "epoch": 0.15559864275327193, + "grad_norm": 1.75221586227417, + "learning_rate": 4.95360026366858e-05, + "loss": 1.7768, + "step": 321 + }, + { + "epoch": 0.1560833737275812, + "grad_norm": 2.064201831817627, + "learning_rate": 4.952785246107494e-05, + "loss": 1.983, + "step": 322 + }, + { + "epoch": 0.15656810470189045, + "grad_norm": 1.7069522142410278, + "learning_rate": 4.951963201008076e-05, + "loss": 1.6382, + "step": 323 + }, + { + "epoch": 0.1570528356761997, + "grad_norm": 1.5614511966705322, + "learning_rate": 4.951134130725591e-05, + "loss": 1.6066, + "step": 324 + }, + { + "epoch": 0.15753756665050897, + "grad_norm": 1.510345697402954, + "learning_rate": 4.950298037635428e-05, + "loss": 1.6226, + "step": 325 + }, + { + "epoch": 0.15802229762481823, + "grad_norm": 1.7443574666976929, + "learning_rate": 4.949454924133098e-05, + "loss": 1.8012, + "step": 326 + }, + { + "epoch": 0.1585070285991275, + "grad_norm": 1.6052006483078003, + "learning_rate": 4.948604792634229e-05, + "loss": 1.7394, + "step": 327 + }, + { + "epoch": 0.15899175957343675, + "grad_norm": 1.6979362964630127, + "learning_rate": 4.947747645574555e-05, + "loss": 1.4477, + "step": 328 + }, + { + "epoch": 0.159476490547746, + "grad_norm": 2.022580146789551, + "learning_rate": 4.9468834854099095e-05, + "loss": 1.5245, + "step": 329 + }, + { + "epoch": 0.15996122152205525, + "grad_norm": 1.485262155532837, + "learning_rate": 4.94601231461622e-05, + "loss": 1.4005, + "step": 330 + }, + { + "epoch": 0.1604459524963645, + "grad_norm": 1.5557737350463867, + "learning_rate": 4.9451341356895e-05, + "loss": 1.5077, + "step": 331 + }, + { + "epoch": 0.16093068347067377, + "grad_norm": 1.7877156734466553, + "learning_rate": 4.9442489511458426e-05, + "loss": 1.62, + "step": 332 + }, + { + "epoch": 0.16141541444498303, + "grad_norm": 1.6162307262420654, + "learning_rate": 4.943356763521414e-05, + "loss": 1.7224, + "step": 333 + }, + { + "epoch": 0.1619001454192923, + "grad_norm": 1.500104308128357, + "learning_rate": 4.942457575372443e-05, + "loss": 1.4062, + "step": 334 + }, + { + "epoch": 0.16238487639360155, + "grad_norm": 1.463844656944275, + "learning_rate": 4.941551389275217e-05, + "loss": 1.6271, + "step": 335 + }, + { + "epoch": 0.1628696073679108, + "grad_norm": 1.591209888458252, + "learning_rate": 4.940638207826074e-05, + "loss": 1.5389, + "step": 336 + }, + { + "epoch": 0.16335433834222007, + "grad_norm": 1.4040523767471313, + "learning_rate": 4.9397180336413915e-05, + "loss": 1.355, + "step": 337 + }, + { + "epoch": 0.16383906931652933, + "grad_norm": 1.4741119146347046, + "learning_rate": 4.938790869357587e-05, + "loss": 1.5721, + "step": 338 + }, + { + "epoch": 0.1643238002908386, + "grad_norm": 1.6420493125915527, + "learning_rate": 4.937856717631102e-05, + "loss": 1.7241, + "step": 339 + }, + { + "epoch": 0.16480853126514786, + "grad_norm": 1.6836682558059692, + "learning_rate": 4.936915581138398e-05, + "loss": 1.5049, + "step": 340 + }, + { + "epoch": 0.1652932622394571, + "grad_norm": 1.529339075088501, + "learning_rate": 4.935967462575949e-05, + "loss": 1.4003, + "step": 341 + }, + { + "epoch": 0.16577799321376635, + "grad_norm": 1.4345910549163818, + "learning_rate": 4.9350123646602356e-05, + "loss": 1.5476, + "step": 342 + }, + { + "epoch": 0.1662627241880756, + "grad_norm": 1.5849047899246216, + "learning_rate": 4.934050290127733e-05, + "loss": 1.6484, + "step": 343 + }, + { + "epoch": 0.16674745516238487, + "grad_norm": 1.5803264379501343, + "learning_rate": 4.933081241734905e-05, + "loss": 1.5366, + "step": 344 + }, + { + "epoch": 0.16723218613669413, + "grad_norm": 1.4626617431640625, + "learning_rate": 4.9321052222581976e-05, + "loss": 1.5589, + "step": 345 + }, + { + "epoch": 0.1677169171110034, + "grad_norm": 1.4863779544830322, + "learning_rate": 4.93112223449403e-05, + "loss": 1.3602, + "step": 346 + }, + { + "epoch": 0.16820164808531265, + "grad_norm": 3.0919888019561768, + "learning_rate": 4.930132281258785e-05, + "loss": 1.602, + "step": 347 + }, + { + "epoch": 0.16868637905962192, + "grad_norm": 1.61482834815979, + "learning_rate": 4.929135365388804e-05, + "loss": 1.594, + "step": 348 + }, + { + "epoch": 0.16917111003393118, + "grad_norm": 1.4651432037353516, + "learning_rate": 4.928131489740375e-05, + "loss": 1.353, + "step": 349 + }, + { + "epoch": 0.16965584100824044, + "grad_norm": 1.6565533876419067, + "learning_rate": 4.9271206571897286e-05, + "loss": 1.7263, + "step": 350 + }, + { + "epoch": 0.1701405719825497, + "grad_norm": 1.62041175365448, + "learning_rate": 4.926102870633029e-05, + "loss": 1.7091, + "step": 351 + }, + { + "epoch": 0.17062530295685893, + "grad_norm": 1.470937728881836, + "learning_rate": 4.9250781329863606e-05, + "loss": 1.4829, + "step": 352 + }, + { + "epoch": 0.1711100339311682, + "grad_norm": 1.6846436262130737, + "learning_rate": 4.924046447185726e-05, + "loss": 1.6661, + "step": 353 + }, + { + "epoch": 0.17159476490547745, + "grad_norm": 1.5719223022460938, + "learning_rate": 4.923007816187035e-05, + "loss": 1.3444, + "step": 354 + }, + { + "epoch": 0.17207949587978671, + "grad_norm": 1.6110163927078247, + "learning_rate": 4.921962242966097e-05, + "loss": 1.789, + "step": 355 + }, + { + "epoch": 0.17256422685409598, + "grad_norm": 1.6574710607528687, + "learning_rate": 4.9209097305186094e-05, + "loss": 1.4593, + "step": 356 + }, + { + "epoch": 0.17304895782840524, + "grad_norm": 1.5048030614852905, + "learning_rate": 4.9198502818601547e-05, + "loss": 1.3008, + "step": 357 + }, + { + "epoch": 0.1735336888027145, + "grad_norm": 1.6137561798095703, + "learning_rate": 4.918783900026184e-05, + "loss": 1.4641, + "step": 358 + }, + { + "epoch": 0.17401841977702376, + "grad_norm": 1.6105793714523315, + "learning_rate": 4.9177105880720173e-05, + "loss": 1.6433, + "step": 359 + }, + { + "epoch": 0.17450315075133302, + "grad_norm": 1.8704934120178223, + "learning_rate": 4.916630349072828e-05, + "loss": 1.9321, + "step": 360 + }, + { + "epoch": 0.17498788172564228, + "grad_norm": 1.8688757419586182, + "learning_rate": 4.915543186123636e-05, + "loss": 1.6639, + "step": 361 + }, + { + "epoch": 0.1754726126999515, + "grad_norm": 1.411590576171875, + "learning_rate": 4.9144491023393016e-05, + "loss": 1.4361, + "step": 362 + }, + { + "epoch": 0.17595734367426077, + "grad_norm": 1.503797173500061, + "learning_rate": 4.913348100854511e-05, + "loss": 1.4521, + "step": 363 + }, + { + "epoch": 0.17644207464857004, + "grad_norm": 2.0284149646759033, + "learning_rate": 4.912240184823772e-05, + "loss": 1.6762, + "step": 364 + }, + { + "epoch": 0.1769268056228793, + "grad_norm": 1.6562719345092773, + "learning_rate": 4.911125357421405e-05, + "loss": 1.5597, + "step": 365 + }, + { + "epoch": 0.17741153659718856, + "grad_norm": 1.6344953775405884, + "learning_rate": 4.9100036218415285e-05, + "loss": 1.6212, + "step": 366 + }, + { + "epoch": 0.17789626757149782, + "grad_norm": 1.5039825439453125, + "learning_rate": 4.908874981298057e-05, + "loss": 1.4205, + "step": 367 + }, + { + "epoch": 0.17838099854580708, + "grad_norm": 1.5786499977111816, + "learning_rate": 4.907739439024689e-05, + "loss": 1.6106, + "step": 368 + }, + { + "epoch": 0.17886572952011634, + "grad_norm": 1.7766309976577759, + "learning_rate": 4.9065969982748946e-05, + "loss": 1.5542, + "step": 369 + }, + { + "epoch": 0.1793504604944256, + "grad_norm": 1.6327425241470337, + "learning_rate": 4.9054476623219104e-05, + "loss": 1.5334, + "step": 370 + }, + { + "epoch": 0.17983519146873486, + "grad_norm": 1.6132947206497192, + "learning_rate": 4.904291434458729e-05, + "loss": 1.5541, + "step": 371 + }, + { + "epoch": 0.18031992244304412, + "grad_norm": 1.458174467086792, + "learning_rate": 4.9031283179980874e-05, + "loss": 1.3287, + "step": 372 + }, + { + "epoch": 0.18080465341735336, + "grad_norm": 1.5462307929992676, + "learning_rate": 4.901958316272462e-05, + "loss": 1.6874, + "step": 373 + }, + { + "epoch": 0.18128938439166262, + "grad_norm": 1.3828489780426025, + "learning_rate": 4.9007814326340544e-05, + "loss": 1.4119, + "step": 374 + }, + { + "epoch": 0.18177411536597188, + "grad_norm": 1.6115537881851196, + "learning_rate": 4.899597670454785e-05, + "loss": 1.6936, + "step": 375 + }, + { + "epoch": 0.18225884634028114, + "grad_norm": 1.5310516357421875, + "learning_rate": 4.89840703312628e-05, + "loss": 1.4026, + "step": 376 + }, + { + "epoch": 0.1827435773145904, + "grad_norm": 1.604029655456543, + "learning_rate": 4.897209524059866e-05, + "loss": 1.5988, + "step": 377 + }, + { + "epoch": 0.18322830828889966, + "grad_norm": 1.6310497522354126, + "learning_rate": 4.896005146686558e-05, + "loss": 1.5445, + "step": 378 + }, + { + "epoch": 0.18371303926320892, + "grad_norm": 1.655869483947754, + "learning_rate": 4.8947939044570467e-05, + "loss": 1.5755, + "step": 379 + }, + { + "epoch": 0.18419777023751818, + "grad_norm": 1.5540688037872314, + "learning_rate": 4.893575800841695e-05, + "loss": 1.5128, + "step": 380 + }, + { + "epoch": 0.18468250121182744, + "grad_norm": 1.5631048679351807, + "learning_rate": 4.892350839330522e-05, + "loss": 1.645, + "step": 381 + }, + { + "epoch": 0.1851672321861367, + "grad_norm": 1.522678017616272, + "learning_rate": 4.891119023433198e-05, + "loss": 1.5037, + "step": 382 + }, + { + "epoch": 0.18565196316044597, + "grad_norm": 1.7026262283325195, + "learning_rate": 4.8898803566790296e-05, + "loss": 1.6548, + "step": 383 + }, + { + "epoch": 0.1861366941347552, + "grad_norm": 2.272061824798584, + "learning_rate": 4.888634842616953e-05, + "loss": 1.6816, + "step": 384 + }, + { + "epoch": 0.18662142510906446, + "grad_norm": 1.7039604187011719, + "learning_rate": 4.887382484815522e-05, + "loss": 1.8058, + "step": 385 + }, + { + "epoch": 0.18710615608337372, + "grad_norm": 1.421434760093689, + "learning_rate": 4.8861232868628994e-05, + "loss": 1.2899, + "step": 386 + }, + { + "epoch": 0.18759088705768298, + "grad_norm": 1.4786161184310913, + "learning_rate": 4.884857252366847e-05, + "loss": 1.5044, + "step": 387 + }, + { + "epoch": 0.18807561803199224, + "grad_norm": 1.6115511655807495, + "learning_rate": 4.8835843849547126e-05, + "loss": 1.4588, + "step": 388 + }, + { + "epoch": 0.1885603490063015, + "grad_norm": 1.4734134674072266, + "learning_rate": 4.88230468827342e-05, + "loss": 1.5804, + "step": 389 + }, + { + "epoch": 0.18904507998061076, + "grad_norm": 1.677364468574524, + "learning_rate": 4.8810181659894635e-05, + "loss": 1.5134, + "step": 390 + }, + { + "epoch": 0.18952981095492002, + "grad_norm": 1.5314624309539795, + "learning_rate": 4.879724821788889e-05, + "loss": 1.7543, + "step": 391 + }, + { + "epoch": 0.19001454192922929, + "grad_norm": 1.8517158031463623, + "learning_rate": 4.878424659377292e-05, + "loss": 1.641, + "step": 392 + }, + { + "epoch": 0.19049927290353855, + "grad_norm": 1.5239332914352417, + "learning_rate": 4.8771176824798006e-05, + "loss": 1.2542, + "step": 393 + }, + { + "epoch": 0.1909840038778478, + "grad_norm": 1.6505666971206665, + "learning_rate": 4.875803894841069e-05, + "loss": 1.5467, + "step": 394 + }, + { + "epoch": 0.19146873485215704, + "grad_norm": 1.5365902185440063, + "learning_rate": 4.8744833002252625e-05, + "loss": 1.3391, + "step": 395 + }, + { + "epoch": 0.1919534658264663, + "grad_norm": 1.8492168188095093, + "learning_rate": 4.8731559024160524e-05, + "loss": 1.6368, + "step": 396 + }, + { + "epoch": 0.19243819680077556, + "grad_norm": 1.471468210220337, + "learning_rate": 4.8718217052165985e-05, + "loss": 1.2187, + "step": 397 + }, + { + "epoch": 0.19292292777508482, + "grad_norm": 1.6151539087295532, + "learning_rate": 4.870480712449546e-05, + "loss": 1.6523, + "step": 398 + }, + { + "epoch": 0.19340765874939408, + "grad_norm": 1.5233840942382812, + "learning_rate": 4.869132927957007e-05, + "loss": 1.6125, + "step": 399 + }, + { + "epoch": 0.19389238972370335, + "grad_norm": 1.4746246337890625, + "learning_rate": 4.8677783556005515e-05, + "loss": 1.5412, + "step": 400 + }, + { + "epoch": 0.1943771206980126, + "grad_norm": 1.7926899194717407, + "learning_rate": 4.8664169992612035e-05, + "loss": 1.7692, + "step": 401 + }, + { + "epoch": 0.19486185167232187, + "grad_norm": 1.5407027006149292, + "learning_rate": 4.865048862839417e-05, + "loss": 1.3119, + "step": 402 + }, + { + "epoch": 0.19534658264663113, + "grad_norm": 1.5474193096160889, + "learning_rate": 4.8636739502550775e-05, + "loss": 1.4238, + "step": 403 + }, + { + "epoch": 0.1958313136209404, + "grad_norm": 1.5824707746505737, + "learning_rate": 4.862292265447481e-05, + "loss": 1.5708, + "step": 404 + }, + { + "epoch": 0.19631604459524965, + "grad_norm": 1.5270860195159912, + "learning_rate": 4.860903812375329e-05, + "loss": 1.5826, + "step": 405 + }, + { + "epoch": 0.19680077556955888, + "grad_norm": 1.4273207187652588, + "learning_rate": 4.859508595016713e-05, + "loss": 1.3348, + "step": 406 + }, + { + "epoch": 0.19728550654386814, + "grad_norm": 1.4353327751159668, + "learning_rate": 4.8581066173691074e-05, + "loss": 1.2723, + "step": 407 + }, + { + "epoch": 0.1977702375181774, + "grad_norm": 1.475465178489685, + "learning_rate": 4.856697883449355e-05, + "loss": 1.5006, + "step": 408 + }, + { + "epoch": 0.19825496849248667, + "grad_norm": 1.4205036163330078, + "learning_rate": 4.8552823972936545e-05, + "loss": 1.2869, + "step": 409 + }, + { + "epoch": 0.19873969946679593, + "grad_norm": 1.5155439376831055, + "learning_rate": 4.853860162957552e-05, + "loss": 1.4603, + "step": 410 + }, + { + "epoch": 0.1992244304411052, + "grad_norm": 1.5754328966140747, + "learning_rate": 4.8524311845159286e-05, + "loss": 1.4744, + "step": 411 + }, + { + "epoch": 0.19970916141541445, + "grad_norm": 1.5085046291351318, + "learning_rate": 4.850995466062988e-05, + "loss": 1.4843, + "step": 412 + }, + { + "epoch": 0.2001938923897237, + "grad_norm": 1.3275054693222046, + "learning_rate": 4.849553011712241e-05, + "loss": 1.4272, + "step": 413 + }, + { + "epoch": 0.20067862336403297, + "grad_norm": 1.424777626991272, + "learning_rate": 4.848103825596504e-05, + "loss": 1.4182, + "step": 414 + }, + { + "epoch": 0.20116335433834223, + "grad_norm": 1.498131513595581, + "learning_rate": 4.8466479118678766e-05, + "loss": 1.4618, + "step": 415 + }, + { + "epoch": 0.20164808531265146, + "grad_norm": 1.4873474836349487, + "learning_rate": 4.845185274697734e-05, + "loss": 1.2547, + "step": 416 + }, + { + "epoch": 0.20213281628696073, + "grad_norm": 1.4218641519546509, + "learning_rate": 4.843715918276717e-05, + "loss": 1.2078, + "step": 417 + }, + { + "epoch": 0.20261754726127, + "grad_norm": 1.585609793663025, + "learning_rate": 4.842239846814716e-05, + "loss": 1.3015, + "step": 418 + }, + { + "epoch": 0.20310227823557925, + "grad_norm": 1.5259923934936523, + "learning_rate": 4.840757064540862e-05, + "loss": 1.8969, + "step": 419 + }, + { + "epoch": 0.2035870092098885, + "grad_norm": 1.5763602256774902, + "learning_rate": 4.8392675757035114e-05, + "loss": 1.6627, + "step": 420 + }, + { + "epoch": 0.20407174018419777, + "grad_norm": 1.4582678079605103, + "learning_rate": 4.837771384570238e-05, + "loss": 1.4835, + "step": 421 + }, + { + "epoch": 0.20455647115850703, + "grad_norm": 1.6100239753723145, + "learning_rate": 4.8362684954278174e-05, + "loss": 1.3861, + "step": 422 + }, + { + "epoch": 0.2050412021328163, + "grad_norm": 1.6313788890838623, + "learning_rate": 4.834758912582217e-05, + "loss": 1.4019, + "step": 423 + }, + { + "epoch": 0.20552593310712555, + "grad_norm": 1.7470930814743042, + "learning_rate": 4.8332426403585805e-05, + "loss": 1.7705, + "step": 424 + }, + { + "epoch": 0.2060106640814348, + "grad_norm": 1.4268667697906494, + "learning_rate": 4.831719683101219e-05, + "loss": 1.4434, + "step": 425 + }, + { + "epoch": 0.20649539505574407, + "grad_norm": 1.5203051567077637, + "learning_rate": 4.830190045173596e-05, + "loss": 1.7992, + "step": 426 + }, + { + "epoch": 0.2069801260300533, + "grad_norm": 4.344082355499268, + "learning_rate": 4.828653730958318e-05, + "loss": 1.4597, + "step": 427 + }, + { + "epoch": 0.20746485700436257, + "grad_norm": 1.5200762748718262, + "learning_rate": 4.827110744857117e-05, + "loss": 1.4586, + "step": 428 + }, + { + "epoch": 0.20794958797867183, + "grad_norm": 1.4414129257202148, + "learning_rate": 4.825561091290844e-05, + "loss": 1.2286, + "step": 429 + }, + { + "epoch": 0.2084343189529811, + "grad_norm": 1.9879165887832642, + "learning_rate": 4.82400477469945e-05, + "loss": 1.6122, + "step": 430 + }, + { + "epoch": 0.20891904992729035, + "grad_norm": 1.4587631225585938, + "learning_rate": 4.822441799541979e-05, + "loss": 1.5816, + "step": 431 + }, + { + "epoch": 0.2094037809015996, + "grad_norm": 1.6219898462295532, + "learning_rate": 4.82087217029655e-05, + "loss": 1.4554, + "step": 432 + }, + { + "epoch": 0.20988851187590887, + "grad_norm": 1.6090373992919922, + "learning_rate": 4.819295891460349e-05, + "loss": 1.8978, + "step": 433 + }, + { + "epoch": 0.21037324285021813, + "grad_norm": 1.6244341135025024, + "learning_rate": 4.817712967549614e-05, + "loss": 1.5749, + "step": 434 + }, + { + "epoch": 0.2108579738245274, + "grad_norm": 1.5403733253479004, + "learning_rate": 4.8161234030996204e-05, + "loss": 1.5755, + "step": 435 + }, + { + "epoch": 0.21134270479883666, + "grad_norm": 1.6046067476272583, + "learning_rate": 4.81452720266467e-05, + "loss": 1.8461, + "step": 436 + }, + { + "epoch": 0.21182743577314592, + "grad_norm": 1.4461714029312134, + "learning_rate": 4.8129243708180785e-05, + "loss": 1.4177, + "step": 437 + }, + { + "epoch": 0.21231216674745515, + "grad_norm": 1.6043570041656494, + "learning_rate": 4.81131491215216e-05, + "loss": 1.5736, + "step": 438 + }, + { + "epoch": 0.2127968977217644, + "grad_norm": 1.4942891597747803, + "learning_rate": 4.8096988312782174e-05, + "loss": 1.3317, + "step": 439 + }, + { + "epoch": 0.21328162869607367, + "grad_norm": 1.5307039022445679, + "learning_rate": 4.808076132826524e-05, + "loss": 1.493, + "step": 440 + }, + { + "epoch": 0.21376635967038293, + "grad_norm": 1.3940438032150269, + "learning_rate": 4.806446821446317e-05, + "loss": 1.4157, + "step": 441 + }, + { + "epoch": 0.2142510906446922, + "grad_norm": 1.4989084005355835, + "learning_rate": 4.8048109018057776e-05, + "loss": 1.6034, + "step": 442 + }, + { + "epoch": 0.21473582161900145, + "grad_norm": 1.5145782232284546, + "learning_rate": 4.80316837859202e-05, + "loss": 1.6784, + "step": 443 + }, + { + "epoch": 0.21522055259331072, + "grad_norm": 1.4529794454574585, + "learning_rate": 4.801519256511082e-05, + "loss": 1.4246, + "step": 444 + }, + { + "epoch": 0.21570528356761998, + "grad_norm": 1.5570049285888672, + "learning_rate": 4.799863540287905e-05, + "loss": 1.582, + "step": 445 + }, + { + "epoch": 0.21619001454192924, + "grad_norm": 1.42445707321167, + "learning_rate": 4.798201234666324e-05, + "loss": 1.3466, + "step": 446 + }, + { + "epoch": 0.2166747455162385, + "grad_norm": 1.5558465719223022, + "learning_rate": 4.796532344409055e-05, + "loss": 1.5421, + "step": 447 + }, + { + "epoch": 0.21715947649054776, + "grad_norm": 4.552513599395752, + "learning_rate": 4.794856874297676e-05, + "loss": 2.2261, + "step": 448 + }, + { + "epoch": 0.217644207464857, + "grad_norm": 1.374784231185913, + "learning_rate": 4.793174829132623e-05, + "loss": 1.5355, + "step": 449 + }, + { + "epoch": 0.21812893843916625, + "grad_norm": 1.9197838306427002, + "learning_rate": 4.791486213733164e-05, + "loss": 1.7625, + "step": 450 + }, + { + "epoch": 0.21861366941347551, + "grad_norm": 1.4595524072647095, + "learning_rate": 4.789791032937397e-05, + "loss": 1.2965, + "step": 451 + }, + { + "epoch": 0.21909840038778478, + "grad_norm": 1.4804373979568481, + "learning_rate": 4.7880892916022265e-05, + "loss": 1.3907, + "step": 452 + }, + { + "epoch": 0.21958313136209404, + "grad_norm": 1.4374562501907349, + "learning_rate": 4.786380994603356e-05, + "loss": 1.4291, + "step": 453 + }, + { + "epoch": 0.2200678623364033, + "grad_norm": 1.697147250175476, + "learning_rate": 4.7846661468352716e-05, + "loss": 1.8611, + "step": 454 + }, + { + "epoch": 0.22055259331071256, + "grad_norm": 1.5372896194458008, + "learning_rate": 4.782944753211228e-05, + "loss": 1.5396, + "step": 455 + }, + { + "epoch": 0.22103732428502182, + "grad_norm": 1.520835518836975, + "learning_rate": 4.781216818663234e-05, + "loss": 1.5921, + "step": 456 + }, + { + "epoch": 0.22152205525933108, + "grad_norm": 1.7574188709259033, + "learning_rate": 4.7794823481420406e-05, + "loss": 1.644, + "step": 457 + }, + { + "epoch": 0.22200678623364034, + "grad_norm": 1.384774923324585, + "learning_rate": 4.7777413466171227e-05, + "loss": 1.542, + "step": 458 + }, + { + "epoch": 0.2224915172079496, + "grad_norm": 1.520736575126648, + "learning_rate": 4.7759938190766694e-05, + "loss": 1.674, + "step": 459 + }, + { + "epoch": 0.22297624818225883, + "grad_norm": 1.4717646837234497, + "learning_rate": 4.7742397705275665e-05, + "loss": 1.3583, + "step": 460 + }, + { + "epoch": 0.2234609791565681, + "grad_norm": 1.4752681255340576, + "learning_rate": 4.772479205995385e-05, + "loss": 1.4931, + "step": 461 + }, + { + "epoch": 0.22394571013087736, + "grad_norm": 1.5523077249526978, + "learning_rate": 4.7707121305243623e-05, + "loss": 1.4731, + "step": 462 + }, + { + "epoch": 0.22443044110518662, + "grad_norm": 1.4732838869094849, + "learning_rate": 4.768938549177393e-05, + "loss": 1.4059, + "step": 463 + }, + { + "epoch": 0.22491517207949588, + "grad_norm": 1.555053472518921, + "learning_rate": 4.7671584670360105e-05, + "loss": 1.3137, + "step": 464 + }, + { + "epoch": 0.22539990305380514, + "grad_norm": 1.4933258295059204, + "learning_rate": 4.765371889200373e-05, + "loss": 1.4378, + "step": 465 + }, + { + "epoch": 0.2258846340281144, + "grad_norm": 1.409632921218872, + "learning_rate": 4.763578820789253e-05, + "loss": 1.4902, + "step": 466 + }, + { + "epoch": 0.22636936500242366, + "grad_norm": 1.4202258586883545, + "learning_rate": 4.761779266940015e-05, + "loss": 1.3832, + "step": 467 + }, + { + "epoch": 0.22685409597673292, + "grad_norm": 1.3944295644760132, + "learning_rate": 4.759973232808609e-05, + "loss": 1.5828, + "step": 468 + }, + { + "epoch": 0.22733882695104218, + "grad_norm": 1.5394325256347656, + "learning_rate": 4.758160723569548e-05, + "loss": 1.207, + "step": 469 + }, + { + "epoch": 0.22782355792535142, + "grad_norm": 1.7582964897155762, + "learning_rate": 4.756341744415901e-05, + "loss": 1.7935, + "step": 470 + }, + { + "epoch": 0.22830828889966068, + "grad_norm": 1.4797354936599731, + "learning_rate": 4.754516300559271e-05, + "loss": 1.5384, + "step": 471 + }, + { + "epoch": 0.22879301987396994, + "grad_norm": 1.5122283697128296, + "learning_rate": 4.752684397229784e-05, + "loss": 1.8131, + "step": 472 + }, + { + "epoch": 0.2292777508482792, + "grad_norm": 1.4401196241378784, + "learning_rate": 4.750846039676075e-05, + "loss": 1.5188, + "step": 473 + }, + { + "epoch": 0.22976248182258846, + "grad_norm": 1.4823417663574219, + "learning_rate": 4.7490012331652675e-05, + "loss": 1.2987, + "step": 474 + }, + { + "epoch": 0.23024721279689772, + "grad_norm": 1.5985757112503052, + "learning_rate": 4.7471499829829666e-05, + "loss": 1.7577, + "step": 475 + }, + { + "epoch": 0.23073194377120698, + "grad_norm": 1.5306726694107056, + "learning_rate": 4.7452922944332355e-05, + "loss": 1.4455, + "step": 476 + }, + { + "epoch": 0.23121667474551624, + "grad_norm": 1.585697889328003, + "learning_rate": 4.7434281728385867e-05, + "loss": 1.4526, + "step": 477 + }, + { + "epoch": 0.2317014057198255, + "grad_norm": 1.5287288427352905, + "learning_rate": 4.741557623539962e-05, + "loss": 1.8082, + "step": 478 + }, + { + "epoch": 0.23218613669413476, + "grad_norm": 1.835727334022522, + "learning_rate": 4.73968065189672e-05, + "loss": 1.5274, + "step": 479 + }, + { + "epoch": 0.23267086766844403, + "grad_norm": 1.5724575519561768, + "learning_rate": 4.7377972632866226e-05, + "loss": 1.8953, + "step": 480 + }, + { + "epoch": 0.23315559864275326, + "grad_norm": 1.598510503768921, + "learning_rate": 4.7359074631058134e-05, + "loss": 1.7241, + "step": 481 + }, + { + "epoch": 0.23364032961706252, + "grad_norm": 1.5090588331222534, + "learning_rate": 4.7340112567688085e-05, + "loss": 1.3641, + "step": 482 + }, + { + "epoch": 0.23412506059137178, + "grad_norm": 1.5519758462905884, + "learning_rate": 4.732108649708478e-05, + "loss": 1.5011, + "step": 483 + }, + { + "epoch": 0.23460979156568104, + "grad_norm": 1.5860713720321655, + "learning_rate": 4.7301996473760304e-05, + "loss": 1.5786, + "step": 484 + }, + { + "epoch": 0.2350945225399903, + "grad_norm": 1.534163236618042, + "learning_rate": 4.728284255240996e-05, + "loss": 1.4667, + "step": 485 + }, + { + "epoch": 0.23557925351429956, + "grad_norm": 1.3986639976501465, + "learning_rate": 4.726362478791217e-05, + "loss": 1.4603, + "step": 486 + }, + { + "epoch": 0.23606398448860882, + "grad_norm": 1.4852182865142822, + "learning_rate": 4.724434323532821e-05, + "loss": 1.6303, + "step": 487 + }, + { + "epoch": 0.23654871546291809, + "grad_norm": 1.4321929216384888, + "learning_rate": 4.7224997949902186e-05, + "loss": 1.3958, + "step": 488 + }, + { + "epoch": 0.23703344643722735, + "grad_norm": 1.8100051879882812, + "learning_rate": 4.720558898706077e-05, + "loss": 1.4998, + "step": 489 + }, + { + "epoch": 0.2375181774115366, + "grad_norm": 1.4117873907089233, + "learning_rate": 4.7186116402413064e-05, + "loss": 1.3552, + "step": 490 + }, + { + "epoch": 0.23800290838584587, + "grad_norm": 1.5080662965774536, + "learning_rate": 4.716658025175049e-05, + "loss": 1.5167, + "step": 491 + }, + { + "epoch": 0.2384876393601551, + "grad_norm": 1.5356587171554565, + "learning_rate": 4.714698059104658e-05, + "loss": 1.3864, + "step": 492 + }, + { + "epoch": 0.23897237033446436, + "grad_norm": 1.686954140663147, + "learning_rate": 4.712731747645682e-05, + "loss": 1.6785, + "step": 493 + }, + { + "epoch": 0.23945710130877362, + "grad_norm": 1.4700133800506592, + "learning_rate": 4.7107590964318505e-05, + "loss": 1.2093, + "step": 494 + }, + { + "epoch": 0.23994183228308288, + "grad_norm": 1.4735498428344727, + "learning_rate": 4.708780111115057e-05, + "loss": 1.6332, + "step": 495 + }, + { + "epoch": 0.24042656325739215, + "grad_norm": 1.572034239768982, + "learning_rate": 4.706794797365346e-05, + "loss": 1.5175, + "step": 496 + }, + { + "epoch": 0.2409112942317014, + "grad_norm": 1.4558137655258179, + "learning_rate": 4.7048031608708876e-05, + "loss": 1.2924, + "step": 497 + }, + { + "epoch": 0.24139602520601067, + "grad_norm": 1.6685311794281006, + "learning_rate": 4.702805207337974e-05, + "loss": 1.0251, + "step": 498 + }, + { + "epoch": 0.24188075618031993, + "grad_norm": 1.5901682376861572, + "learning_rate": 4.7008009424909917e-05, + "loss": 2.1593, + "step": 499 + }, + { + "epoch": 0.2423654871546292, + "grad_norm": 1.4795153141021729, + "learning_rate": 4.698790372072411e-05, + "loss": 1.7071, + "step": 500 + }, + { + "epoch": 0.24285021812893845, + "grad_norm": 1.9734545946121216, + "learning_rate": 4.696773501842771e-05, + "loss": 1.4482, + "step": 501 + }, + { + "epoch": 0.2433349491032477, + "grad_norm": 1.4583733081817627, + "learning_rate": 4.694750337580659e-05, + "loss": 1.5411, + "step": 502 + }, + { + "epoch": 0.24381968007755694, + "grad_norm": 1.5093872547149658, + "learning_rate": 4.6927208850826925e-05, + "loss": 1.414, + "step": 503 + }, + { + "epoch": 0.2443044110518662, + "grad_norm": 1.376407504081726, + "learning_rate": 4.6906851501635106e-05, + "loss": 1.2979, + "step": 504 + }, + { + "epoch": 0.24478914202617547, + "grad_norm": 1.5103840827941895, + "learning_rate": 4.688643138655748e-05, + "loss": 1.502, + "step": 505 + }, + { + "epoch": 0.24527387300048473, + "grad_norm": 1.4392768144607544, + "learning_rate": 4.686594856410027e-05, + "loss": 1.4235, + "step": 506 + }, + { + "epoch": 0.245758603974794, + "grad_norm": 1.5379340648651123, + "learning_rate": 4.684540309294932e-05, + "loss": 1.5077, + "step": 507 + }, + { + "epoch": 0.24624333494910325, + "grad_norm": 1.5546420812606812, + "learning_rate": 4.682479503197001e-05, + "loss": 1.5388, + "step": 508 + }, + { + "epoch": 0.2467280659234125, + "grad_norm": 1.6801774501800537, + "learning_rate": 4.6804124440207e-05, + "loss": 1.7391, + "step": 509 + }, + { + "epoch": 0.24721279689772177, + "grad_norm": 1.4841091632843018, + "learning_rate": 4.678339137688416e-05, + "loss": 1.72, + "step": 510 + }, + { + "epoch": 0.24769752787203103, + "grad_norm": 1.4708201885223389, + "learning_rate": 4.67625959014043e-05, + "loss": 1.5798, + "step": 511 + }, + { + "epoch": 0.2481822588463403, + "grad_norm": 1.5190343856811523, + "learning_rate": 4.67417380733491e-05, + "loss": 1.6462, + "step": 512 + }, + { + "epoch": 0.24866698982064955, + "grad_norm": 1.479689121246338, + "learning_rate": 4.6720817952478854e-05, + "loss": 1.5912, + "step": 513 + }, + { + "epoch": 0.2491517207949588, + "grad_norm": 1.4845134019851685, + "learning_rate": 4.6699835598732325e-05, + "loss": 1.4543, + "step": 514 + }, + { + "epoch": 0.24963645176926805, + "grad_norm": 1.5402343273162842, + "learning_rate": 4.667879107222662e-05, + "loss": 1.4791, + "step": 515 + }, + { + "epoch": 0.2501211827435773, + "grad_norm": 1.627976417541504, + "learning_rate": 4.6657684433256934e-05, + "loss": 1.4067, + "step": 516 + }, + { + "epoch": 0.2506059137178866, + "grad_norm": 1.530739665031433, + "learning_rate": 4.6636515742296464e-05, + "loss": 1.9333, + "step": 517 + }, + { + "epoch": 0.25109064469219583, + "grad_norm": 1.4607384204864502, + "learning_rate": 4.661528505999615e-05, + "loss": 1.537, + "step": 518 + }, + { + "epoch": 0.25157537566650506, + "grad_norm": 1.5360444784164429, + "learning_rate": 4.6593992447184586e-05, + "loss": 1.3789, + "step": 519 + }, + { + "epoch": 0.25206010664081435, + "grad_norm": 1.4390000104904175, + "learning_rate": 4.6572637964867776e-05, + "loss": 1.4374, + "step": 520 + }, + { + "epoch": 0.2525448376151236, + "grad_norm": 1.4811269044876099, + "learning_rate": 4.6551221674229003e-05, + "loss": 1.561, + "step": 521 + }, + { + "epoch": 0.2530295685894329, + "grad_norm": 1.4047800302505493, + "learning_rate": 4.652974363662864e-05, + "loss": 1.2907, + "step": 522 + }, + { + "epoch": 0.2535142995637421, + "grad_norm": 1.6080119609832764, + "learning_rate": 4.650820391360396e-05, + "loss": 1.4566, + "step": 523 + }, + { + "epoch": 0.2539990305380514, + "grad_norm": 1.5174490213394165, + "learning_rate": 4.6486602566868975e-05, + "loss": 1.4285, + "step": 524 + }, + { + "epoch": 0.25448376151236063, + "grad_norm": 1.5314956903457642, + "learning_rate": 4.6464939658314274e-05, + "loss": 1.3154, + "step": 525 + }, + { + "epoch": 0.2549684924866699, + "grad_norm": 1.4693747758865356, + "learning_rate": 4.6443215250006806e-05, + "loss": 1.4333, + "step": 526 + }, + { + "epoch": 0.25545322346097915, + "grad_norm": 1.5617380142211914, + "learning_rate": 4.642142940418973e-05, + "loss": 1.4411, + "step": 527 + }, + { + "epoch": 0.25593795443528844, + "grad_norm": 1.4139310121536255, + "learning_rate": 4.6399582183282256e-05, + "loss": 1.6047, + "step": 528 + }, + { + "epoch": 0.2564226854095977, + "grad_norm": 1.6654372215270996, + "learning_rate": 4.6377673649879396e-05, + "loss": 1.7693, + "step": 529 + }, + { + "epoch": 0.2569074163839069, + "grad_norm": 1.595267415046692, + "learning_rate": 4.635570386675186e-05, + "loss": 1.458, + "step": 530 + }, + { + "epoch": 0.2573921473582162, + "grad_norm": 1.4914833307266235, + "learning_rate": 4.633367289684586e-05, + "loss": 1.4662, + "step": 531 + }, + { + "epoch": 0.2578768783325254, + "grad_norm": 1.764076828956604, + "learning_rate": 4.631158080328287e-05, + "loss": 2.023, + "step": 532 + }, + { + "epoch": 0.2583616093068347, + "grad_norm": 1.4962358474731445, + "learning_rate": 4.628942764935954e-05, + "loss": 1.1698, + "step": 533 + }, + { + "epoch": 0.25884634028114395, + "grad_norm": 1.628941535949707, + "learning_rate": 4.626721349854742e-05, + "loss": 1.6603, + "step": 534 + }, + { + "epoch": 0.25933107125545324, + "grad_norm": 1.473333477973938, + "learning_rate": 4.6244938414492875e-05, + "loss": 1.4744, + "step": 535 + }, + { + "epoch": 0.25981580222976247, + "grad_norm": 1.5255992412567139, + "learning_rate": 4.62226024610168e-05, + "loss": 1.6574, + "step": 536 + }, + { + "epoch": 0.26030053320407176, + "grad_norm": 1.4754759073257446, + "learning_rate": 4.6200205702114526e-05, + "loss": 1.3515, + "step": 537 + }, + { + "epoch": 0.260785264178381, + "grad_norm": 1.539143681526184, + "learning_rate": 4.617774820195557e-05, + "loss": 1.361, + "step": 538 + }, + { + "epoch": 0.2612699951526903, + "grad_norm": 1.4834741353988647, + "learning_rate": 4.615523002488352e-05, + "loss": 1.4823, + "step": 539 + }, + { + "epoch": 0.2617547261269995, + "grad_norm": 1.541680932044983, + "learning_rate": 4.6132651235415764e-05, + "loss": 1.6218, + "step": 540 + }, + { + "epoch": 0.26223945710130875, + "grad_norm": 1.802275538444519, + "learning_rate": 4.6110011898243374e-05, + "loss": 1.2527, + "step": 541 + }, + { + "epoch": 0.26272418807561804, + "grad_norm": 1.4547120332717896, + "learning_rate": 4.608731207823093e-05, + "loss": 1.2674, + "step": 542 + }, + { + "epoch": 0.26320891904992727, + "grad_norm": 1.4830158948898315, + "learning_rate": 4.606455184041622e-05, + "loss": 1.3814, + "step": 543 + }, + { + "epoch": 0.26369365002423656, + "grad_norm": 1.5940104722976685, + "learning_rate": 4.6041731250010246e-05, + "loss": 1.644, + "step": 544 + }, + { + "epoch": 0.2641783809985458, + "grad_norm": 1.4886668920516968, + "learning_rate": 4.601885037239683e-05, + "loss": 1.3262, + "step": 545 + }, + { + "epoch": 0.2646631119728551, + "grad_norm": 1.414720058441162, + "learning_rate": 4.5995909273132587e-05, + "loss": 1.3218, + "step": 546 + }, + { + "epoch": 0.2651478429471643, + "grad_norm": 1.3138704299926758, + "learning_rate": 4.597290801794664e-05, + "loss": 1.161, + "step": 547 + }, + { + "epoch": 0.2656325739214736, + "grad_norm": 1.401702880859375, + "learning_rate": 4.594984667274048e-05, + "loss": 1.6609, + "step": 548 + }, + { + "epoch": 0.26611730489578284, + "grad_norm": 1.4239600896835327, + "learning_rate": 4.592672530358777e-05, + "loss": 1.4492, + "step": 549 + }, + { + "epoch": 0.2666020358700921, + "grad_norm": 1.3764070272445679, + "learning_rate": 4.5903543976734145e-05, + "loss": 1.1719, + "step": 550 + }, + { + "epoch": 0.26708676684440136, + "grad_norm": 1.6861745119094849, + "learning_rate": 4.5880302758597e-05, + "loss": 1.4993, + "step": 551 + }, + { + "epoch": 0.2675714978187106, + "grad_norm": 1.3549953699111938, + "learning_rate": 4.585700171576538e-05, + "loss": 1.2464, + "step": 552 + }, + { + "epoch": 0.2680562287930199, + "grad_norm": 1.4663500785827637, + "learning_rate": 4.583364091499968e-05, + "loss": 1.7526, + "step": 553 + }, + { + "epoch": 0.2685409597673291, + "grad_norm": 1.8229775428771973, + "learning_rate": 4.581022042323155e-05, + "loss": 1.2188, + "step": 554 + }, + { + "epoch": 0.2690256907416384, + "grad_norm": 1.5518858432769775, + "learning_rate": 4.5786740307563636e-05, + "loss": 1.4877, + "step": 555 + }, + { + "epoch": 0.26951042171594763, + "grad_norm": 1.3994567394256592, + "learning_rate": 4.576320063526942e-05, + "loss": 1.1313, + "step": 556 + }, + { + "epoch": 0.2699951526902569, + "grad_norm": 1.5038384199142456, + "learning_rate": 4.573960147379304e-05, + "loss": 1.4847, + "step": 557 + }, + { + "epoch": 0.27047988366456616, + "grad_norm": 1.4688293933868408, + "learning_rate": 4.5715942890749045e-05, + "loss": 1.3024, + "step": 558 + }, + { + "epoch": 0.27096461463887545, + "grad_norm": 1.4721691608428955, + "learning_rate": 4.5692224953922266e-05, + "loss": 1.7076, + "step": 559 + }, + { + "epoch": 0.2714493456131847, + "grad_norm": 1.452632188796997, + "learning_rate": 4.566844773126757e-05, + "loss": 1.4929, + "step": 560 + }, + { + "epoch": 0.27193407658749397, + "grad_norm": 1.4588583707809448, + "learning_rate": 4.564461129090969e-05, + "loss": 1.2069, + "step": 561 + }, + { + "epoch": 0.2724188075618032, + "grad_norm": 1.426776647567749, + "learning_rate": 4.562071570114304e-05, + "loss": 1.2884, + "step": 562 + }, + { + "epoch": 0.27290353853611243, + "grad_norm": 1.4680659770965576, + "learning_rate": 4.5596761030431465e-05, + "loss": 1.8103, + "step": 563 + }, + { + "epoch": 0.2733882695104217, + "grad_norm": 1.7482470273971558, + "learning_rate": 4.557274734740813e-05, + "loss": 1.1736, + "step": 564 + }, + { + "epoch": 0.27387300048473096, + "grad_norm": 1.5107123851776123, + "learning_rate": 4.554867472087525e-05, + "loss": 1.6481, + "step": 565 + }, + { + "epoch": 0.27435773145904024, + "grad_norm": 1.5711710453033447, + "learning_rate": 4.552454321980394e-05, + "loss": 1.2276, + "step": 566 + }, + { + "epoch": 0.2748424624333495, + "grad_norm": 1.6438961029052734, + "learning_rate": 4.5500352913333974e-05, + "loss": 1.4729, + "step": 567 + }, + { + "epoch": 0.27532719340765877, + "grad_norm": 1.5576547384262085, + "learning_rate": 4.547610387077363e-05, + "loss": 1.6697, + "step": 568 + }, + { + "epoch": 0.275811924381968, + "grad_norm": 1.587242841720581, + "learning_rate": 4.5451796161599466e-05, + "loss": 1.727, + "step": 569 + }, + { + "epoch": 0.2762966553562773, + "grad_norm": 1.4595431089401245, + "learning_rate": 4.5427429855456125e-05, + "loss": 1.5131, + "step": 570 + }, + { + "epoch": 0.2767813863305865, + "grad_norm": 1.8416346311569214, + "learning_rate": 4.5403005022156145e-05, + "loss": 1.241, + "step": 571 + }, + { + "epoch": 0.2772661173048958, + "grad_norm": 1.4765833616256714, + "learning_rate": 4.5378521731679735e-05, + "loss": 1.268, + "step": 572 + }, + { + "epoch": 0.27775084827920504, + "grad_norm": 1.4958488941192627, + "learning_rate": 4.535398005417461e-05, + "loss": 1.2896, + "step": 573 + }, + { + "epoch": 0.2782355792535143, + "grad_norm": 1.6208523511886597, + "learning_rate": 4.5329380059955776e-05, + "loss": 1.5778, + "step": 574 + }, + { + "epoch": 0.27872031022782356, + "grad_norm": 1.4747179746627808, + "learning_rate": 4.530472181950528e-05, + "loss": 1.3638, + "step": 575 + }, + { + "epoch": 0.2792050412021328, + "grad_norm": 1.5383862257003784, + "learning_rate": 4.528000540347212e-05, + "loss": 1.2982, + "step": 576 + }, + { + "epoch": 0.2796897721764421, + "grad_norm": 1.5441182851791382, + "learning_rate": 4.52552308826719e-05, + "loss": 1.4339, + "step": 577 + }, + { + "epoch": 0.2801745031507513, + "grad_norm": 1.4504390954971313, + "learning_rate": 4.523039832808677e-05, + "loss": 1.6174, + "step": 578 + }, + { + "epoch": 0.2806592341250606, + "grad_norm": 1.409399151802063, + "learning_rate": 4.520550781086511e-05, + "loss": 1.1168, + "step": 579 + }, + { + "epoch": 0.28114396509936984, + "grad_norm": 1.5246732234954834, + "learning_rate": 4.5180559402321385e-05, + "loss": 1.5579, + "step": 580 + }, + { + "epoch": 0.28162869607367913, + "grad_norm": 1.6616579294204712, + "learning_rate": 4.515555317393593e-05, + "loss": 1.8633, + "step": 581 + }, + { + "epoch": 0.28211342704798836, + "grad_norm": 1.4933806657791138, + "learning_rate": 4.5130489197354734e-05, + "loss": 1.5078, + "step": 582 + }, + { + "epoch": 0.28259815802229765, + "grad_norm": 1.469255805015564, + "learning_rate": 4.510536754438923e-05, + "loss": 1.4949, + "step": 583 + }, + { + "epoch": 0.2830828889966069, + "grad_norm": 1.32322096824646, + "learning_rate": 4.508018828701612e-05, + "loss": 1.1069, + "step": 584 + }, + { + "epoch": 0.2835676199709161, + "grad_norm": 1.438407063484192, + "learning_rate": 4.5054951497377165e-05, + "loss": 1.587, + "step": 585 + }, + { + "epoch": 0.2840523509452254, + "grad_norm": 1.5011217594146729, + "learning_rate": 4.502965724777891e-05, + "loss": 1.7561, + "step": 586 + }, + { + "epoch": 0.28453708191953464, + "grad_norm": 1.436948537826538, + "learning_rate": 4.500430561069259e-05, + "loss": 1.3473, + "step": 587 + }, + { + "epoch": 0.28502181289384393, + "grad_norm": 1.5629382133483887, + "learning_rate": 4.497889665875382e-05, + "loss": 1.8745, + "step": 588 + }, + { + "epoch": 0.28550654386815316, + "grad_norm": 1.4830938577651978, + "learning_rate": 4.495343046476245e-05, + "loss": 1.5765, + "step": 589 + }, + { + "epoch": 0.28599127484246245, + "grad_norm": 1.4268252849578857, + "learning_rate": 4.492790710168233e-05, + "loss": 1.3325, + "step": 590 + }, + { + "epoch": 0.2864760058167717, + "grad_norm": 1.4147472381591797, + "learning_rate": 4.4902326642641095e-05, + "loss": 1.3003, + "step": 591 + }, + { + "epoch": 0.286960736791081, + "grad_norm": 1.4721652269363403, + "learning_rate": 4.487668916093e-05, + "loss": 1.3235, + "step": 592 + }, + { + "epoch": 0.2874454677653902, + "grad_norm": 1.5141065120697021, + "learning_rate": 4.4850994730003634e-05, + "loss": 1.2092, + "step": 593 + }, + { + "epoch": 0.2879301987396995, + "grad_norm": 1.4203892946243286, + "learning_rate": 4.482524342347978e-05, + "loss": 1.8299, + "step": 594 + }, + { + "epoch": 0.2884149297140087, + "grad_norm": 2.1075599193573, + "learning_rate": 4.479943531513918e-05, + "loss": 1.9049, + "step": 595 + }, + { + "epoch": 0.28889966068831796, + "grad_norm": 1.3928402662277222, + "learning_rate": 4.477357047892531e-05, + "loss": 1.4002, + "step": 596 + }, + { + "epoch": 0.28938439166262725, + "grad_norm": 1.4497021436691284, + "learning_rate": 4.474764898894418e-05, + "loss": 1.3542, + "step": 597 + }, + { + "epoch": 0.2898691226369365, + "grad_norm": 1.6207205057144165, + "learning_rate": 4.472167091946411e-05, + "loss": 1.8706, + "step": 598 + }, + { + "epoch": 0.29035385361124577, + "grad_norm": 1.4122729301452637, + "learning_rate": 4.469563634491554e-05, + "loss": 1.3707, + "step": 599 + }, + { + "epoch": 0.290838584585555, + "grad_norm": 1.7664668560028076, + "learning_rate": 4.4669545339890814e-05, + "loss": 1.2924, + "step": 600 + }, + { + "epoch": 0.2913233155598643, + "grad_norm": 1.4191179275512695, + "learning_rate": 4.464339797914393e-05, + "loss": 1.4986, + "step": 601 + }, + { + "epoch": 0.2918080465341735, + "grad_norm": 1.3508961200714111, + "learning_rate": 4.4617194337590376e-05, + "loss": 1.3907, + "step": 602 + }, + { + "epoch": 0.2922927775084828, + "grad_norm": 1.5529752969741821, + "learning_rate": 4.459093449030688e-05, + "loss": 1.3911, + "step": 603 + }, + { + "epoch": 0.29277750848279205, + "grad_norm": 1.4372109174728394, + "learning_rate": 4.4564618512531206e-05, + "loss": 1.3964, + "step": 604 + }, + { + "epoch": 0.2932622394571013, + "grad_norm": 1.628930687904358, + "learning_rate": 4.4538246479661936e-05, + "loss": 1.8534, + "step": 605 + }, + { + "epoch": 0.29374697043141057, + "grad_norm": 1.4418190717697144, + "learning_rate": 4.451181846725827e-05, + "loss": 1.4604, + "step": 606 + }, + { + "epoch": 0.2942317014057198, + "grad_norm": 1.503642201423645, + "learning_rate": 4.448533455103979e-05, + "loss": 1.6738, + "step": 607 + }, + { + "epoch": 0.2947164323800291, + "grad_norm": 1.5811353921890259, + "learning_rate": 4.445879480688625e-05, + "loss": 1.2007, + "step": 608 + }, + { + "epoch": 0.2952011633543383, + "grad_norm": 1.4607658386230469, + "learning_rate": 4.443219931083734e-05, + "loss": 1.3633, + "step": 609 + }, + { + "epoch": 0.2956858943286476, + "grad_norm": 1.4199997186660767, + "learning_rate": 4.440554813909252e-05, + "loss": 1.6503, + "step": 610 + }, + { + "epoch": 0.29617062530295685, + "grad_norm": 1.5280699729919434, + "learning_rate": 4.437884136801074e-05, + "loss": 1.4311, + "step": 611 + }, + { + "epoch": 0.29665535627726614, + "grad_norm": 1.4121835231781006, + "learning_rate": 4.435207907411026e-05, + "loss": 1.3614, + "step": 612 + }, + { + "epoch": 0.29714008725157537, + "grad_norm": 1.6203902959823608, + "learning_rate": 4.4325261334068426e-05, + "loss": 1.6557, + "step": 613 + }, + { + "epoch": 0.29762481822588466, + "grad_norm": 1.4249413013458252, + "learning_rate": 4.4298388224721435e-05, + "loss": 1.4862, + "step": 614 + }, + { + "epoch": 0.2981095492001939, + "grad_norm": 1.6460726261138916, + "learning_rate": 4.427145982306412e-05, + "loss": 1.7338, + "step": 615 + }, + { + "epoch": 0.2985942801745031, + "grad_norm": 1.5498179197311401, + "learning_rate": 4.4244476206249745e-05, + "loss": 1.1775, + "step": 616 + }, + { + "epoch": 0.2990790111488124, + "grad_norm": 1.4984955787658691, + "learning_rate": 4.421743745158977e-05, + "loss": 1.5588, + "step": 617 + }, + { + "epoch": 0.29956374212312165, + "grad_norm": 1.8243629932403564, + "learning_rate": 4.419034363655362e-05, + "loss": 1.6428, + "step": 618 + }, + { + "epoch": 0.30004847309743093, + "grad_norm": 1.5599781274795532, + "learning_rate": 4.4163194838768495e-05, + "loss": 1.7171, + "step": 619 + }, + { + "epoch": 0.30053320407174017, + "grad_norm": 1.6111764907836914, + "learning_rate": 4.4135991136019106e-05, + "loss": 1.8379, + "step": 620 + }, + { + "epoch": 0.30101793504604946, + "grad_norm": 1.54121994972229, + "learning_rate": 4.4108732606247495e-05, + "loss": 1.3949, + "step": 621 + }, + { + "epoch": 0.3015026660203587, + "grad_norm": 1.552783489227295, + "learning_rate": 4.408141932755277e-05, + "loss": 1.3488, + "step": 622 + }, + { + "epoch": 0.301987396994668, + "grad_norm": 1.6675286293029785, + "learning_rate": 4.4054051378190915e-05, + "loss": 1.2843, + "step": 623 + }, + { + "epoch": 0.3024721279689772, + "grad_norm": 1.6017731428146362, + "learning_rate": 4.402662883657454e-05, + "loss": 1.6162, + "step": 624 + }, + { + "epoch": 0.3029568589432865, + "grad_norm": 1.6486562490463257, + "learning_rate": 4.3999151781272694e-05, + "loss": 1.723, + "step": 625 + }, + { + "epoch": 0.30344158991759573, + "grad_norm": 1.474776268005371, + "learning_rate": 4.397162029101058e-05, + "loss": 1.7979, + "step": 626 + }, + { + "epoch": 0.30392632089190497, + "grad_norm": 1.4462790489196777, + "learning_rate": 4.3944034444669405e-05, + "loss": 1.3842, + "step": 627 + }, + { + "epoch": 0.30441105186621426, + "grad_norm": 1.8951647281646729, + "learning_rate": 4.391639432128606e-05, + "loss": 1.4682, + "step": 628 + }, + { + "epoch": 0.3048957828405235, + "grad_norm": 1.4978829622268677, + "learning_rate": 4.3888700000052996e-05, + "loss": 1.4582, + "step": 629 + }, + { + "epoch": 0.3053805138148328, + "grad_norm": 1.5500818490982056, + "learning_rate": 4.386095156031792e-05, + "loss": 1.6484, + "step": 630 + }, + { + "epoch": 0.305865244789142, + "grad_norm": 1.9269793033599854, + "learning_rate": 4.3833149081583604e-05, + "loss": 1.5815, + "step": 631 + }, + { + "epoch": 0.3063499757634513, + "grad_norm": 2.429781436920166, + "learning_rate": 4.3805292643507644e-05, + "loss": 1.935, + "step": 632 + }, + { + "epoch": 0.30683470673776053, + "grad_norm": 1.5993130207061768, + "learning_rate": 4.377738232590225e-05, + "loss": 1.3461, + "step": 633 + }, + { + "epoch": 0.3073194377120698, + "grad_norm": 1.588075041770935, + "learning_rate": 4.374941820873399e-05, + "loss": 1.617, + "step": 634 + }, + { + "epoch": 0.30780416868637905, + "grad_norm": 1.6482924222946167, + "learning_rate": 4.372140037212357e-05, + "loss": 1.4718, + "step": 635 + }, + { + "epoch": 0.30828889966068834, + "grad_norm": 1.5200806856155396, + "learning_rate": 4.369332889634563e-05, + "loss": 1.5187, + "step": 636 + }, + { + "epoch": 0.3087736306349976, + "grad_norm": 1.4226235151290894, + "learning_rate": 4.366520386182846e-05, + "loss": 1.2748, + "step": 637 + }, + { + "epoch": 0.3092583616093068, + "grad_norm": 1.404143214225769, + "learning_rate": 4.363702534915385e-05, + "loss": 1.3319, + "step": 638 + }, + { + "epoch": 0.3097430925836161, + "grad_norm": 1.511928677558899, + "learning_rate": 4.360879343905676e-05, + "loss": 1.8477, + "step": 639 + }, + { + "epoch": 0.31022782355792533, + "grad_norm": 1.4452472925186157, + "learning_rate": 4.358050821242517e-05, + "loss": 1.222, + "step": 640 + }, + { + "epoch": 0.3107125545322346, + "grad_norm": 1.5438624620437622, + "learning_rate": 4.3552169750299835e-05, + "loss": 1.5466, + "step": 641 + }, + { + "epoch": 0.31119728550654385, + "grad_norm": 4.010799884796143, + "learning_rate": 4.352377813387398e-05, + "loss": 2.1084, + "step": 642 + }, + { + "epoch": 0.31168201648085314, + "grad_norm": 1.4538034200668335, + "learning_rate": 4.349533344449318e-05, + "loss": 1.5272, + "step": 643 + }, + { + "epoch": 0.3121667474551624, + "grad_norm": 1.4150643348693848, + "learning_rate": 4.346683576365505e-05, + "loss": 1.5093, + "step": 644 + }, + { + "epoch": 0.31265147842947166, + "grad_norm": 1.3800829648971558, + "learning_rate": 4.3438285173009006e-05, + "loss": 1.6719, + "step": 645 + }, + { + "epoch": 0.3131362094037809, + "grad_norm": 1.3637222051620483, + "learning_rate": 4.340968175435611e-05, + "loss": 1.2698, + "step": 646 + }, + { + "epoch": 0.3136209403780902, + "grad_norm": 1.470177173614502, + "learning_rate": 4.338102558964876e-05, + "loss": 1.3927, + "step": 647 + }, + { + "epoch": 0.3141056713523994, + "grad_norm": 2.070209264755249, + "learning_rate": 4.335231676099044e-05, + "loss": 1.3481, + "step": 648 + }, + { + "epoch": 0.31459040232670865, + "grad_norm": 1.404061198234558, + "learning_rate": 4.332355535063559e-05, + "loss": 1.514, + "step": 649 + }, + { + "epoch": 0.31507513330101794, + "grad_norm": 3.9239437580108643, + "learning_rate": 4.329474144098924e-05, + "loss": 1.341, + "step": 650 + }, + { + "epoch": 0.3155598642753272, + "grad_norm": 2.510849952697754, + "learning_rate": 4.32658751146069e-05, + "loss": 1.4483, + "step": 651 + }, + { + "epoch": 0.31604459524963646, + "grad_norm": 1.472462773323059, + "learning_rate": 4.323695645419419e-05, + "loss": 1.7946, + "step": 652 + }, + { + "epoch": 0.3165293262239457, + "grad_norm": 1.3601329326629639, + "learning_rate": 4.320798554260674e-05, + "loss": 1.1157, + "step": 653 + }, + { + "epoch": 0.317014057198255, + "grad_norm": 1.462933897972107, + "learning_rate": 4.3178962462849835e-05, + "loss": 1.3288, + "step": 654 + }, + { + "epoch": 0.3174987881725642, + "grad_norm": 1.6446117162704468, + "learning_rate": 4.3149887298078276e-05, + "loss": 1.9576, + "step": 655 + }, + { + "epoch": 0.3179835191468735, + "grad_norm": 1.4152432680130005, + "learning_rate": 4.312076013159604e-05, + "loss": 1.2452, + "step": 656 + }, + { + "epoch": 0.31846825012118274, + "grad_norm": 2.2185819149017334, + "learning_rate": 4.309158104685614e-05, + "loss": 1.8676, + "step": 657 + }, + { + "epoch": 0.318952981095492, + "grad_norm": 1.3394452333450317, + "learning_rate": 4.3062350127460325e-05, + "loss": 1.2346, + "step": 658 + }, + { + "epoch": 0.31943771206980126, + "grad_norm": 1.6615955829620361, + "learning_rate": 4.303306745715885e-05, + "loss": 1.5486, + "step": 659 + }, + { + "epoch": 0.3199224430441105, + "grad_norm": 1.4415209293365479, + "learning_rate": 4.3003733119850256e-05, + "loss": 1.5477, + "step": 660 + }, + { + "epoch": 0.3204071740184198, + "grad_norm": 1.4911268949508667, + "learning_rate": 4.29743471995811e-05, + "loss": 1.3075, + "step": 661 + }, + { + "epoch": 0.320891904992729, + "grad_norm": 1.4862326383590698, + "learning_rate": 4.2944909780545754e-05, + "loss": 1.5706, + "step": 662 + }, + { + "epoch": 0.3213766359670383, + "grad_norm": 1.5849559307098389, + "learning_rate": 4.291542094708612e-05, + "loss": 1.4566, + "step": 663 + }, + { + "epoch": 0.32186136694134754, + "grad_norm": 1.786057949066162, + "learning_rate": 4.288588078369141e-05, + "loss": 1.7439, + "step": 664 + }, + { + "epoch": 0.3223460979156568, + "grad_norm": 1.5245951414108276, + "learning_rate": 4.2856289374997927e-05, + "loss": 1.524, + "step": 665 + }, + { + "epoch": 0.32283082888996606, + "grad_norm": 1.8819663524627686, + "learning_rate": 4.282664680578876e-05, + "loss": 1.4255, + "step": 666 + }, + { + "epoch": 0.32331555986427535, + "grad_norm": 1.5141843557357788, + "learning_rate": 4.2796953160993616e-05, + "loss": 1.7939, + "step": 667 + }, + { + "epoch": 0.3238002908385846, + "grad_norm": 1.4404217004776, + "learning_rate": 4.276720852568851e-05, + "loss": 1.7917, + "step": 668 + }, + { + "epoch": 0.32428502181289387, + "grad_norm": 1.4955337047576904, + "learning_rate": 4.273741298509557e-05, + "loss": 1.7536, + "step": 669 + }, + { + "epoch": 0.3247697527872031, + "grad_norm": 1.5283381938934326, + "learning_rate": 4.2707566624582774e-05, + "loss": 1.3797, + "step": 670 + }, + { + "epoch": 0.32525448376151234, + "grad_norm": 1.550523281097412, + "learning_rate": 4.267766952966369e-05, + "loss": 1.6906, + "step": 671 + }, + { + "epoch": 0.3257392147358216, + "grad_norm": 1.4687964916229248, + "learning_rate": 4.264772178599726e-05, + "loss": 1.4744, + "step": 672 + }, + { + "epoch": 0.32622394571013086, + "grad_norm": 1.485487937927246, + "learning_rate": 4.261772347938754e-05, + "loss": 1.675, + "step": 673 + }, + { + "epoch": 0.32670867668444015, + "grad_norm": 1.8627375364303589, + "learning_rate": 4.258767469578345e-05, + "loss": 1.5248, + "step": 674 + }, + { + "epoch": 0.3271934076587494, + "grad_norm": 1.4943833351135254, + "learning_rate": 4.255757552127855e-05, + "loss": 1.3291, + "step": 675 + }, + { + "epoch": 0.32767813863305867, + "grad_norm": 1.4833825826644897, + "learning_rate": 4.252742604211073e-05, + "loss": 1.3374, + "step": 676 + }, + { + "epoch": 0.3281628696073679, + "grad_norm": 1.7539558410644531, + "learning_rate": 4.2497226344662065e-05, + "loss": 1.9051, + "step": 677 + }, + { + "epoch": 0.3286476005816772, + "grad_norm": 1.5023777484893799, + "learning_rate": 4.2466976515458484e-05, + "loss": 1.416, + "step": 678 + }, + { + "epoch": 0.3291323315559864, + "grad_norm": 1.383313775062561, + "learning_rate": 4.243667664116956e-05, + "loss": 1.6164, + "step": 679 + }, + { + "epoch": 0.3296170625302957, + "grad_norm": 1.8263195753097534, + "learning_rate": 4.2406326808608225e-05, + "loss": 1.6341, + "step": 680 + }, + { + "epoch": 0.33010179350460495, + "grad_norm": 2.7321722507476807, + "learning_rate": 4.237592710473059e-05, + "loss": 1.2566, + "step": 681 + }, + { + "epoch": 0.3305865244789142, + "grad_norm": 1.4957518577575684, + "learning_rate": 4.234547761663562e-05, + "loss": 1.5391, + "step": 682 + }, + { + "epoch": 0.33107125545322347, + "grad_norm": 1.3792263269424438, + "learning_rate": 4.2314978431564923e-05, + "loss": 1.2249, + "step": 683 + }, + { + "epoch": 0.3315559864275327, + "grad_norm": 1.4482512474060059, + "learning_rate": 4.228442963690252e-05, + "loss": 1.3106, + "step": 684 + }, + { + "epoch": 0.332040717401842, + "grad_norm": 1.5233250856399536, + "learning_rate": 4.2253831320174534e-05, + "loss": 1.4784, + "step": 685 + }, + { + "epoch": 0.3325254483761512, + "grad_norm": 1.4404493570327759, + "learning_rate": 4.2223183569049005e-05, + "loss": 1.3078, + "step": 686 + }, + { + "epoch": 0.3330101793504605, + "grad_norm": 1.585357904434204, + "learning_rate": 4.2192486471335585e-05, + "loss": 1.6884, + "step": 687 + }, + { + "epoch": 0.33349491032476974, + "grad_norm": 1.4535739421844482, + "learning_rate": 4.216174011498533e-05, + "loss": 1.8015, + "step": 688 + }, + { + "epoch": 0.33397964129907903, + "grad_norm": 1.4156732559204102, + "learning_rate": 4.2130944588090415e-05, + "loss": 1.5549, + "step": 689 + }, + { + "epoch": 0.33446437227338827, + "grad_norm": 1.4804260730743408, + "learning_rate": 4.2100099978883896e-05, + "loss": 1.4739, + "step": 690 + }, + { + "epoch": 0.33494910324769755, + "grad_norm": 1.4360147714614868, + "learning_rate": 4.206920637573946e-05, + "loss": 1.3966, + "step": 691 + }, + { + "epoch": 0.3354338342220068, + "grad_norm": 1.5461288690567017, + "learning_rate": 4.203826386717118e-05, + "loss": 1.4189, + "step": 692 + }, + { + "epoch": 0.335918565196316, + "grad_norm": 1.321059226989746, + "learning_rate": 4.200727254183322e-05, + "loss": 1.1196, + "step": 693 + }, + { + "epoch": 0.3364032961706253, + "grad_norm": 1.325295329093933, + "learning_rate": 4.1976232488519626e-05, + "loss": 1.2674, + "step": 694 + }, + { + "epoch": 0.33688802714493454, + "grad_norm": 1.6588038206100464, + "learning_rate": 4.1945143796164076e-05, + "loss": 1.7719, + "step": 695 + }, + { + "epoch": 0.33737275811924383, + "grad_norm": 1.321679711341858, + "learning_rate": 4.191400655383956e-05, + "loss": 1.238, + "step": 696 + }, + { + "epoch": 0.33785748909355307, + "grad_norm": 1.4317057132720947, + "learning_rate": 4.188282085075821e-05, + "loss": 1.5644, + "step": 697 + }, + { + "epoch": 0.33834222006786235, + "grad_norm": 1.4975212812423706, + "learning_rate": 4.185158677627099e-05, + "loss": 1.5107, + "step": 698 + }, + { + "epoch": 0.3388269510421716, + "grad_norm": 1.4152145385742188, + "learning_rate": 4.182030441986744e-05, + "loss": 1.5376, + "step": 699 + }, + { + "epoch": 0.3393116820164809, + "grad_norm": 1.418017864227295, + "learning_rate": 4.178897387117546e-05, + "loss": 1.3491, + "step": 700 + }, + { + "epoch": 0.3397964129907901, + "grad_norm": 1.5809285640716553, + "learning_rate": 4.175759521996101e-05, + "loss": 1.487, + "step": 701 + }, + { + "epoch": 0.3402811439650994, + "grad_norm": 1.5660691261291504, + "learning_rate": 4.172616855612787e-05, + "loss": 1.6579, + "step": 702 + }, + { + "epoch": 0.34076587493940863, + "grad_norm": 1.5755140781402588, + "learning_rate": 4.169469396971739e-05, + "loss": 1.2482, + "step": 703 + }, + { + "epoch": 0.34125060591371786, + "grad_norm": 1.782956838607788, + "learning_rate": 4.166317155090822e-05, + "loss": 1.5582, + "step": 704 + }, + { + "epoch": 0.34173533688802715, + "grad_norm": 1.4416836500167847, + "learning_rate": 4.1631601390016055e-05, + "loss": 1.653, + "step": 705 + }, + { + "epoch": 0.3422200678623364, + "grad_norm": 1.537095069885254, + "learning_rate": 4.159998357749338e-05, + "loss": 1.4798, + "step": 706 + }, + { + "epoch": 0.3427047988366457, + "grad_norm": 1.4685359001159668, + "learning_rate": 4.1568318203929195e-05, + "loss": 1.3947, + "step": 707 + }, + { + "epoch": 0.3431895298109549, + "grad_norm": 1.462533712387085, + "learning_rate": 4.1536605360048795e-05, + "loss": 1.4598, + "step": 708 + }, + { + "epoch": 0.3436742607852642, + "grad_norm": 1.5037108659744263, + "learning_rate": 4.150484513671346e-05, + "loss": 1.5225, + "step": 709 + }, + { + "epoch": 0.34415899175957343, + "grad_norm": 1.4552758932113647, + "learning_rate": 4.147303762492022e-05, + "loss": 1.433, + "step": 710 + }, + { + "epoch": 0.3446437227338827, + "grad_norm": 1.6252245903015137, + "learning_rate": 4.144118291580161e-05, + "loss": 1.6058, + "step": 711 + }, + { + "epoch": 0.34512845370819195, + "grad_norm": 1.6239243745803833, + "learning_rate": 4.140928110062538e-05, + "loss": 1.3355, + "step": 712 + }, + { + "epoch": 0.3456131846825012, + "grad_norm": 1.3930360078811646, + "learning_rate": 4.137733227079423e-05, + "loss": 1.4171, + "step": 713 + }, + { + "epoch": 0.3460979156568105, + "grad_norm": 1.9757750034332275, + "learning_rate": 4.134533651784559e-05, + "loss": 1.3524, + "step": 714 + }, + { + "epoch": 0.3465826466311197, + "grad_norm": 1.4614371061325073, + "learning_rate": 4.131329393345131e-05, + "loss": 1.7791, + "step": 715 + }, + { + "epoch": 0.347067377605429, + "grad_norm": 1.3664659261703491, + "learning_rate": 4.1281204609417435e-05, + "loss": 1.2993, + "step": 716 + }, + { + "epoch": 0.34755210857973823, + "grad_norm": 1.4222404956817627, + "learning_rate": 4.1249068637683906e-05, + "loss": 1.3012, + "step": 717 + }, + { + "epoch": 0.3480368395540475, + "grad_norm": 1.4118279218673706, + "learning_rate": 4.1216886110324324e-05, + "loss": 1.3078, + "step": 718 + }, + { + "epoch": 0.34852157052835675, + "grad_norm": 1.4675920009613037, + "learning_rate": 4.118465711954569e-05, + "loss": 1.4492, + "step": 719 + }, + { + "epoch": 0.34900630150266604, + "grad_norm": 1.490893006324768, + "learning_rate": 4.115238175768812e-05, + "loss": 1.4637, + "step": 720 + }, + { + "epoch": 0.34949103247697527, + "grad_norm": 1.3924874067306519, + "learning_rate": 4.1120060117224566e-05, + "loss": 1.3949, + "step": 721 + }, + { + "epoch": 0.34997576345128456, + "grad_norm": 1.4966955184936523, + "learning_rate": 4.108769229076061e-05, + "loss": 1.5634, + "step": 722 + }, + { + "epoch": 0.3504604944255938, + "grad_norm": 1.4051393270492554, + "learning_rate": 4.105527837103414e-05, + "loss": 1.2924, + "step": 723 + }, + { + "epoch": 0.350945225399903, + "grad_norm": 1.4678040742874146, + "learning_rate": 4.102281845091512e-05, + "loss": 1.6048, + "step": 724 + }, + { + "epoch": 0.3514299563742123, + "grad_norm": 1.4021576642990112, + "learning_rate": 4.0990312623405305e-05, + "loss": 1.3513, + "step": 725 + }, + { + "epoch": 0.35191468734852155, + "grad_norm": 1.5062463283538818, + "learning_rate": 4.095776098163798e-05, + "loss": 1.489, + "step": 726 + }, + { + "epoch": 0.35239941832283084, + "grad_norm": 1.4043464660644531, + "learning_rate": 4.0925163618877695e-05, + "loss": 1.497, + "step": 727 + }, + { + "epoch": 0.35288414929714007, + "grad_norm": 1.5384563207626343, + "learning_rate": 4.0892520628519985e-05, + "loss": 1.8048, + "step": 728 + }, + { + "epoch": 0.35336888027144936, + "grad_norm": 1.4933316707611084, + "learning_rate": 4.085983210409114e-05, + "loss": 1.2942, + "step": 729 + }, + { + "epoch": 0.3538536112457586, + "grad_norm": 1.570822834968567, + "learning_rate": 4.082709813924789e-05, + "loss": 1.6173, + "step": 730 + }, + { + "epoch": 0.3543383422200679, + "grad_norm": 1.5000869035720825, + "learning_rate": 4.079431882777715e-05, + "loss": 1.3644, + "step": 731 + }, + { + "epoch": 0.3548230731943771, + "grad_norm": 1.4136881828308105, + "learning_rate": 4.0761494263595796e-05, + "loss": 1.7214, + "step": 732 + }, + { + "epoch": 0.3553078041686864, + "grad_norm": 1.5652875900268555, + "learning_rate": 4.072862454075031e-05, + "loss": 1.6255, + "step": 733 + }, + { + "epoch": 0.35579253514299564, + "grad_norm": 1.6801156997680664, + "learning_rate": 4.06957097534166e-05, + "loss": 1.7948, + "step": 734 + }, + { + "epoch": 0.35627726611730487, + "grad_norm": 1.3107026815414429, + "learning_rate": 4.0662749995899666e-05, + "loss": 1.3926, + "step": 735 + }, + { + "epoch": 0.35676199709161416, + "grad_norm": 1.503405213356018, + "learning_rate": 4.062974536263336e-05, + "loss": 1.5015, + "step": 736 + }, + { + "epoch": 0.3572467280659234, + "grad_norm": 1.5230603218078613, + "learning_rate": 4.0596695948180116e-05, + "loss": 1.3572, + "step": 737 + }, + { + "epoch": 0.3577314590402327, + "grad_norm": 1.4852657318115234, + "learning_rate": 4.056360184723065e-05, + "loss": 1.5162, + "step": 738 + }, + { + "epoch": 0.3582161900145419, + "grad_norm": 1.5229320526123047, + "learning_rate": 4.0530463154603747e-05, + "loss": 1.5939, + "step": 739 + }, + { + "epoch": 0.3587009209888512, + "grad_norm": 1.5715887546539307, + "learning_rate": 4.049727996524591e-05, + "loss": 1.4526, + "step": 740 + }, + { + "epoch": 0.35918565196316043, + "grad_norm": 1.3569464683532715, + "learning_rate": 4.046405237423116e-05, + "loss": 1.3617, + "step": 741 + }, + { + "epoch": 0.3596703829374697, + "grad_norm": 1.477789282798767, + "learning_rate": 4.043078047676072e-05, + "loss": 1.4437, + "step": 742 + }, + { + "epoch": 0.36015511391177896, + "grad_norm": 1.3856958150863647, + "learning_rate": 4.039746436816277e-05, + "loss": 1.3219, + "step": 743 + }, + { + "epoch": 0.36063984488608825, + "grad_norm": 1.4407496452331543, + "learning_rate": 4.036410414389215e-05, + "loss": 1.2868, + "step": 744 + }, + { + "epoch": 0.3611245758603975, + "grad_norm": 1.449094533920288, + "learning_rate": 4.03306998995301e-05, + "loss": 1.3048, + "step": 745 + }, + { + "epoch": 0.3616093068347067, + "grad_norm": 1.4655970335006714, + "learning_rate": 4.029725173078398e-05, + "loss": 1.4586, + "step": 746 + }, + { + "epoch": 0.362094037809016, + "grad_norm": 1.4294497966766357, + "learning_rate": 4.0263759733487015e-05, + "loss": 1.417, + "step": 747 + }, + { + "epoch": 0.36257876878332523, + "grad_norm": 1.5399290323257446, + "learning_rate": 4.023022400359797e-05, + "loss": 1.6722, + "step": 748 + }, + { + "epoch": 0.3630634997576345, + "grad_norm": 1.4106359481811523, + "learning_rate": 4.019664463720094e-05, + "loss": 1.5666, + "step": 749 + }, + { + "epoch": 0.36354823073194376, + "grad_norm": 1.5670846700668335, + "learning_rate": 4.0163021730505045e-05, + "loss": 1.6455, + "step": 750 + }, + { + "epoch": 0.36403296170625304, + "grad_norm": 1.5435312986373901, + "learning_rate": 4.012935537984414e-05, + "loss": 1.7314, + "step": 751 + }, + { + "epoch": 0.3645176926805623, + "grad_norm": 1.43980872631073, + "learning_rate": 4.009564568167653e-05, + "loss": 1.1749, + "step": 752 + }, + { + "epoch": 0.36500242365487157, + "grad_norm": 1.5904608964920044, + "learning_rate": 4.006189273258477e-05, + "loss": 1.4091, + "step": 753 + }, + { + "epoch": 0.3654871546291808, + "grad_norm": 1.569663643836975, + "learning_rate": 4.00280966292753e-05, + "loss": 1.447, + "step": 754 + }, + { + "epoch": 0.3659718856034901, + "grad_norm": 1.4279499053955078, + "learning_rate": 3.99942574685782e-05, + "loss": 1.4849, + "step": 755 + }, + { + "epoch": 0.3664566165777993, + "grad_norm": 1.4076343774795532, + "learning_rate": 3.9960375347446934e-05, + "loss": 1.3214, + "step": 756 + }, + { + "epoch": 0.36694134755210855, + "grad_norm": 1.4956542253494263, + "learning_rate": 3.9926450362958024e-05, + "loss": 1.4805, + "step": 757 + }, + { + "epoch": 0.36742607852641784, + "grad_norm": 1.563628911972046, + "learning_rate": 3.9892482612310836e-05, + "loss": 1.8152, + "step": 758 + }, + { + "epoch": 0.3679108095007271, + "grad_norm": 1.4647592306137085, + "learning_rate": 3.985847219282725e-05, + "loss": 1.4635, + "step": 759 + }, + { + "epoch": 0.36839554047503636, + "grad_norm": 1.4081557989120483, + "learning_rate": 3.982441920195138e-05, + "loss": 1.5819, + "step": 760 + }, + { + "epoch": 0.3688802714493456, + "grad_norm": 1.5153309106826782, + "learning_rate": 3.9790323737249346e-05, + "loss": 1.5674, + "step": 761 + }, + { + "epoch": 0.3693650024236549, + "grad_norm": 1.4655232429504395, + "learning_rate": 3.975618589640894e-05, + "loss": 1.4397, + "step": 762 + }, + { + "epoch": 0.3698497333979641, + "grad_norm": 1.60316801071167, + "learning_rate": 3.9722005777239354e-05, + "loss": 1.5732, + "step": 763 + }, + { + "epoch": 0.3703344643722734, + "grad_norm": 1.594388723373413, + "learning_rate": 3.9687783477670966e-05, + "loss": 1.6924, + "step": 764 + }, + { + "epoch": 0.37081919534658264, + "grad_norm": 1.587856411933899, + "learning_rate": 3.9653519095754934e-05, + "loss": 1.7377, + "step": 765 + }, + { + "epoch": 0.37130392632089193, + "grad_norm": 1.4031965732574463, + "learning_rate": 3.961921272966305e-05, + "loss": 1.5464, + "step": 766 + }, + { + "epoch": 0.37178865729520116, + "grad_norm": 1.3980727195739746, + "learning_rate": 3.958486447768736e-05, + "loss": 1.3223, + "step": 767 + }, + { + "epoch": 0.3722733882695104, + "grad_norm": 1.411486268043518, + "learning_rate": 3.95504744382399e-05, + "loss": 1.5066, + "step": 768 + }, + { + "epoch": 0.3727581192438197, + "grad_norm": 1.5289703607559204, + "learning_rate": 3.9516042709852506e-05, + "loss": 1.4962, + "step": 769 + }, + { + "epoch": 0.3732428502181289, + "grad_norm": 1.3834885358810425, + "learning_rate": 3.948156939117639e-05, + "loss": 1.3076, + "step": 770 + }, + { + "epoch": 0.3737275811924382, + "grad_norm": 1.5126419067382812, + "learning_rate": 3.944705458098194e-05, + "loss": 1.203, + "step": 771 + }, + { + "epoch": 0.37421231216674744, + "grad_norm": 1.4430164098739624, + "learning_rate": 3.9412498378158446e-05, + "loss": 1.2171, + "step": 772 + }, + { + "epoch": 0.37469704314105673, + "grad_norm": 1.4813838005065918, + "learning_rate": 3.9377900881713764e-05, + "loss": 1.4277, + "step": 773 + }, + { + "epoch": 0.37518177411536596, + "grad_norm": 1.431311011314392, + "learning_rate": 3.9343262190774076e-05, + "loss": 1.5013, + "step": 774 + }, + { + "epoch": 0.37566650508967525, + "grad_norm": 1.891334891319275, + "learning_rate": 3.93085824045836e-05, + "loss": 2.4066, + "step": 775 + }, + { + "epoch": 0.3761512360639845, + "grad_norm": 1.4745622873306274, + "learning_rate": 3.927386162250427e-05, + "loss": 1.3788, + "step": 776 + }, + { + "epoch": 0.3766359670382938, + "grad_norm": 1.437791109085083, + "learning_rate": 3.923909994401551e-05, + "loss": 1.565, + "step": 777 + }, + { + "epoch": 0.377120698012603, + "grad_norm": 1.3869900703430176, + "learning_rate": 3.92042974687139e-05, + "loss": 1.5087, + "step": 778 + }, + { + "epoch": 0.37760542898691224, + "grad_norm": 1.3274116516113281, + "learning_rate": 3.916945429631289e-05, + "loss": 1.5359, + "step": 779 + }, + { + "epoch": 0.37809015996122153, + "grad_norm": 1.4933420419692993, + "learning_rate": 3.9134570526642594e-05, + "loss": 1.5366, + "step": 780 + }, + { + "epoch": 0.37857489093553076, + "grad_norm": 1.4309301376342773, + "learning_rate": 3.9099646259649364e-05, + "loss": 1.3898, + "step": 781 + }, + { + "epoch": 0.37905962190984005, + "grad_norm": 1.476951241493225, + "learning_rate": 3.9064681595395634e-05, + "loss": 1.4312, + "step": 782 + }, + { + "epoch": 0.3795443528841493, + "grad_norm": 1.872909426689148, + "learning_rate": 3.902967663405956e-05, + "loss": 1.4043, + "step": 783 + }, + { + "epoch": 0.38002908385845857, + "grad_norm": 1.5244536399841309, + "learning_rate": 3.8994631475934775e-05, + "loss": 1.382, + "step": 784 + }, + { + "epoch": 0.3805138148327678, + "grad_norm": 1.5926803350448608, + "learning_rate": 3.895954622143004e-05, + "loss": 1.3973, + "step": 785 + }, + { + "epoch": 0.3809985458070771, + "grad_norm": 1.583483338356018, + "learning_rate": 3.8924420971069055e-05, + "loss": 1.6504, + "step": 786 + }, + { + "epoch": 0.3814832767813863, + "grad_norm": 1.3475099802017212, + "learning_rate": 3.888925582549006e-05, + "loss": 1.4933, + "step": 787 + }, + { + "epoch": 0.3819680077556956, + "grad_norm": 1.3581137657165527, + "learning_rate": 3.885405088544563e-05, + "loss": 1.2155, + "step": 788 + }, + { + "epoch": 0.38245273873000485, + "grad_norm": 1.462459921836853, + "learning_rate": 3.8818806251802334e-05, + "loss": 1.446, + "step": 789 + }, + { + "epoch": 0.3829374697043141, + "grad_norm": 1.4395034313201904, + "learning_rate": 3.878352202554051e-05, + "loss": 1.3661, + "step": 790 + }, + { + "epoch": 0.38342220067862337, + "grad_norm": 1.3227758407592773, + "learning_rate": 3.8748198307753874e-05, + "loss": 1.2516, + "step": 791 + }, + { + "epoch": 0.3839069316529326, + "grad_norm": 1.5062460899353027, + "learning_rate": 3.871283519964935e-05, + "loss": 1.7564, + "step": 792 + }, + { + "epoch": 0.3843916626272419, + "grad_norm": 1.4209660291671753, + "learning_rate": 3.867743280254666e-05, + "loss": 1.5597, + "step": 793 + }, + { + "epoch": 0.3848763936015511, + "grad_norm": 1.5597984790802002, + "learning_rate": 3.8641991217878154e-05, + "loss": 1.4486, + "step": 794 + }, + { + "epoch": 0.3853611245758604, + "grad_norm": 1.390090823173523, + "learning_rate": 3.8606510547188425e-05, + "loss": 1.361, + "step": 795 + }, + { + "epoch": 0.38584585555016965, + "grad_norm": 1.511312484741211, + "learning_rate": 3.857099089213405e-05, + "loss": 1.4659, + "step": 796 + }, + { + "epoch": 0.38633058652447894, + "grad_norm": 1.4220378398895264, + "learning_rate": 3.8535432354483313e-05, + "loss": 1.8297, + "step": 797 + }, + { + "epoch": 0.38681531749878817, + "grad_norm": 1.4622609615325928, + "learning_rate": 3.849983503611591e-05, + "loss": 1.6799, + "step": 798 + }, + { + "epoch": 0.38730004847309746, + "grad_norm": 1.4184774160385132, + "learning_rate": 3.8464199039022605e-05, + "loss": 1.3475, + "step": 799 + }, + { + "epoch": 0.3877847794474067, + "grad_norm": 1.6148196458816528, + "learning_rate": 3.842852446530505e-05, + "loss": 1.5014, + "step": 800 + }, + { + "epoch": 0.3882695104217159, + "grad_norm": 1.4410502910614014, + "learning_rate": 3.839281141717538e-05, + "loss": 1.6122, + "step": 801 + }, + { + "epoch": 0.3887542413960252, + "grad_norm": 1.4458303451538086, + "learning_rate": 3.835705999695595e-05, + "loss": 1.3593, + "step": 802 + }, + { + "epoch": 0.38923897237033445, + "grad_norm": 1.4514504671096802, + "learning_rate": 3.832127030707909e-05, + "loss": 1.2312, + "step": 803 + }, + { + "epoch": 0.38972370334464373, + "grad_norm": 1.5186152458190918, + "learning_rate": 3.828544245008677e-05, + "loss": 1.5824, + "step": 804 + }, + { + "epoch": 0.39020843431895297, + "grad_norm": 1.2924879789352417, + "learning_rate": 3.82495765286303e-05, + "loss": 1.3355, + "step": 805 + }, + { + "epoch": 0.39069316529326226, + "grad_norm": 1.4316754341125488, + "learning_rate": 3.821367264547006e-05, + "loss": 1.4083, + "step": 806 + }, + { + "epoch": 0.3911778962675715, + "grad_norm": 1.5610601902008057, + "learning_rate": 3.817773090347519e-05, + "loss": 1.662, + "step": 807 + }, + { + "epoch": 0.3916626272418808, + "grad_norm": 1.4195690155029297, + "learning_rate": 3.8141751405623317e-05, + "loss": 1.4629, + "step": 808 + }, + { + "epoch": 0.39214735821619, + "grad_norm": 1.4146546125411987, + "learning_rate": 3.8105734255000214e-05, + "loss": 1.452, + "step": 809 + }, + { + "epoch": 0.3926320891904993, + "grad_norm": 1.45216965675354, + "learning_rate": 3.806967955479955e-05, + "loss": 1.2171, + "step": 810 + }, + { + "epoch": 0.39311682016480853, + "grad_norm": 1.468102216720581, + "learning_rate": 3.803358740832257e-05, + "loss": 1.5747, + "step": 811 + }, + { + "epoch": 0.39360155113911777, + "grad_norm": 1.314497947692871, + "learning_rate": 3.7997457918977845e-05, + "loss": 1.351, + "step": 812 + }, + { + "epoch": 0.39408628211342706, + "grad_norm": 1.4871678352355957, + "learning_rate": 3.796129119028087e-05, + "loss": 1.2889, + "step": 813 + }, + { + "epoch": 0.3945710130877363, + "grad_norm": 1.3904635906219482, + "learning_rate": 3.79250873258539e-05, + "loss": 1.3541, + "step": 814 + }, + { + "epoch": 0.3950557440620456, + "grad_norm": 1.4073387384414673, + "learning_rate": 3.7888846429425546e-05, + "loss": 1.1039, + "step": 815 + }, + { + "epoch": 0.3955404750363548, + "grad_norm": 1.6151351928710938, + "learning_rate": 3.785256860483054e-05, + "loss": 1.6067, + "step": 816 + }, + { + "epoch": 0.3960252060106641, + "grad_norm": 1.456746220588684, + "learning_rate": 3.781625395600943e-05, + "loss": 1.4839, + "step": 817 + }, + { + "epoch": 0.39650993698497333, + "grad_norm": 1.4020472764968872, + "learning_rate": 3.7779902587008225e-05, + "loss": 1.2439, + "step": 818 + }, + { + "epoch": 0.3969946679592826, + "grad_norm": 1.5153536796569824, + "learning_rate": 3.774351460197819e-05, + "loss": 1.5316, + "step": 819 + }, + { + "epoch": 0.39747939893359185, + "grad_norm": 1.8110442161560059, + "learning_rate": 3.770709010517549e-05, + "loss": 1.4559, + "step": 820 + }, + { + "epoch": 0.3979641299079011, + "grad_norm": 1.497971773147583, + "learning_rate": 3.767062920096086e-05, + "loss": 1.587, + "step": 821 + }, + { + "epoch": 0.3984488608822104, + "grad_norm": 1.3729140758514404, + "learning_rate": 3.763413199379941e-05, + "loss": 1.27, + "step": 822 + }, + { + "epoch": 0.3989335918565196, + "grad_norm": 1.5104501247406006, + "learning_rate": 3.7597598588260196e-05, + "loss": 1.315, + "step": 823 + }, + { + "epoch": 0.3994183228308289, + "grad_norm": 1.459794521331787, + "learning_rate": 3.7561029089016055e-05, + "loss": 1.4664, + "step": 824 + }, + { + "epoch": 0.39990305380513813, + "grad_norm": 1.403060793876648, + "learning_rate": 3.7524423600843186e-05, + "loss": 1.6218, + "step": 825 + }, + { + "epoch": 0.4003877847794474, + "grad_norm": 1.4324053525924683, + "learning_rate": 3.7487782228620916e-05, + "loss": 1.274, + "step": 826 + }, + { + "epoch": 0.40087251575375665, + "grad_norm": 1.4029552936553955, + "learning_rate": 3.7451105077331396e-05, + "loss": 1.464, + "step": 827 + }, + { + "epoch": 0.40135724672806594, + "grad_norm": 1.549617052078247, + "learning_rate": 3.741439225205927e-05, + "loss": 1.4068, + "step": 828 + }, + { + "epoch": 0.4018419777023752, + "grad_norm": 1.4655847549438477, + "learning_rate": 3.7377643857991416e-05, + "loss": 1.5115, + "step": 829 + }, + { + "epoch": 0.40232670867668446, + "grad_norm": 1.5959397554397583, + "learning_rate": 3.7340860000416595e-05, + "loss": 1.5153, + "step": 830 + }, + { + "epoch": 0.4028114396509937, + "grad_norm": 1.6088857650756836, + "learning_rate": 3.730404078472518e-05, + "loss": 1.7593, + "step": 831 + }, + { + "epoch": 0.40329617062530293, + "grad_norm": 1.4206966161727905, + "learning_rate": 3.726718631640888e-05, + "loss": 1.4746, + "step": 832 + }, + { + "epoch": 0.4037809015996122, + "grad_norm": 1.4388668537139893, + "learning_rate": 3.723029670106036e-05, + "loss": 1.1222, + "step": 833 + }, + { + "epoch": 0.40426563257392145, + "grad_norm": 1.5215039253234863, + "learning_rate": 3.719337204437302e-05, + "loss": 1.3856, + "step": 834 + }, + { + "epoch": 0.40475036354823074, + "grad_norm": 1.470598578453064, + "learning_rate": 3.7156412452140646e-05, + "loss": 1.2346, + "step": 835 + }, + { + "epoch": 0.40523509452254, + "grad_norm": 1.4196856021881104, + "learning_rate": 3.711941803025712e-05, + "loss": 1.2546, + "step": 836 + }, + { + "epoch": 0.40571982549684926, + "grad_norm": 1.5476642847061157, + "learning_rate": 3.708238888471611e-05, + "loss": 1.7273, + "step": 837 + }, + { + "epoch": 0.4062045564711585, + "grad_norm": 1.4293063879013062, + "learning_rate": 3.704532512161079e-05, + "loss": 1.5743, + "step": 838 + }, + { + "epoch": 0.4066892874454678, + "grad_norm": 1.4300776720046997, + "learning_rate": 3.700822684713349e-05, + "loss": 1.3128, + "step": 839 + }, + { + "epoch": 0.407174018419777, + "grad_norm": 1.404038667678833, + "learning_rate": 3.697109416757544e-05, + "loss": 1.1517, + "step": 840 + }, + { + "epoch": 0.4076587493940863, + "grad_norm": 1.432827115058899, + "learning_rate": 3.6933927189326435e-05, + "loss": 1.3499, + "step": 841 + }, + { + "epoch": 0.40814348036839554, + "grad_norm": 1.4601900577545166, + "learning_rate": 3.689672601887455e-05, + "loss": 1.4287, + "step": 842 + }, + { + "epoch": 0.4086282113427048, + "grad_norm": 1.4678421020507812, + "learning_rate": 3.685949076280583e-05, + "loss": 1.5663, + "step": 843 + }, + { + "epoch": 0.40911294231701406, + "grad_norm": 1.538097620010376, + "learning_rate": 3.6822221527803934e-05, + "loss": 1.6855, + "step": 844 + }, + { + "epoch": 0.4095976732913233, + "grad_norm": 1.4171286821365356, + "learning_rate": 3.678491842064995e-05, + "loss": 1.6271, + "step": 845 + }, + { + "epoch": 0.4100824042656326, + "grad_norm": 1.5117058753967285, + "learning_rate": 3.674758154822194e-05, + "loss": 1.356, + "step": 846 + }, + { + "epoch": 0.4105671352399418, + "grad_norm": 1.442489743232727, + "learning_rate": 3.671021101749476e-05, + "loss": 1.3713, + "step": 847 + }, + { + "epoch": 0.4110518662142511, + "grad_norm": 1.4731215238571167, + "learning_rate": 3.667280693553967e-05, + "loss": 1.4432, + "step": 848 + }, + { + "epoch": 0.41153659718856034, + "grad_norm": 1.464311122894287, + "learning_rate": 3.663536940952409e-05, + "loss": 1.5066, + "step": 849 + }, + { + "epoch": 0.4120213281628696, + "grad_norm": 1.3426711559295654, + "learning_rate": 3.659789854671122e-05, + "loss": 1.1671, + "step": 850 + }, + { + "epoch": 0.41250605913717886, + "grad_norm": 1.665104866027832, + "learning_rate": 3.6560394454459814e-05, + "loss": 1.9996, + "step": 851 + }, + { + "epoch": 0.41299079011148815, + "grad_norm": 1.2623895406723022, + "learning_rate": 3.652285724022379e-05, + "loss": 1.1562, + "step": 852 + }, + { + "epoch": 0.4134755210857974, + "grad_norm": 1.361686110496521, + "learning_rate": 3.648528701155203e-05, + "loss": 1.278, + "step": 853 + }, + { + "epoch": 0.4139602520601066, + "grad_norm": 1.4689781665802002, + "learning_rate": 3.644768387608793e-05, + "loss": 1.417, + "step": 854 + }, + { + "epoch": 0.4144449830344159, + "grad_norm": 1.5472491979599, + "learning_rate": 3.6410047941569224e-05, + "loss": 1.4856, + "step": 855 + }, + { + "epoch": 0.41492971400872514, + "grad_norm": 1.4804879426956177, + "learning_rate": 3.637237931582759e-05, + "loss": 1.7454, + "step": 856 + }, + { + "epoch": 0.4154144449830344, + "grad_norm": 1.4140907526016235, + "learning_rate": 3.633467810678839e-05, + "loss": 1.4682, + "step": 857 + }, + { + "epoch": 0.41589917595734366, + "grad_norm": 1.4889259338378906, + "learning_rate": 3.629694442247032e-05, + "loss": 1.594, + "step": 858 + }, + { + "epoch": 0.41638390693165295, + "grad_norm": 1.4880090951919556, + "learning_rate": 3.6259178370985144e-05, + "loss": 1.4029, + "step": 859 + }, + { + "epoch": 0.4168686379059622, + "grad_norm": 1.4780687093734741, + "learning_rate": 3.6221380060537333e-05, + "loss": 1.4561, + "step": 860 + }, + { + "epoch": 0.41735336888027147, + "grad_norm": 1.4494913816452026, + "learning_rate": 3.6183549599423815e-05, + "loss": 1.4076, + "step": 861 + }, + { + "epoch": 0.4178380998545807, + "grad_norm": 1.416123390197754, + "learning_rate": 3.6145687096033634e-05, + "loss": 1.3328, + "step": 862 + }, + { + "epoch": 0.41832283082889, + "grad_norm": 1.4615715742111206, + "learning_rate": 3.6107792658847595e-05, + "loss": 1.3547, + "step": 863 + }, + { + "epoch": 0.4188075618031992, + "grad_norm": 1.3456425666809082, + "learning_rate": 3.606986639643805e-05, + "loss": 1.6143, + "step": 864 + }, + { + "epoch": 0.41929229277750846, + "grad_norm": 1.4085299968719482, + "learning_rate": 3.603190841746851e-05, + "loss": 1.4381, + "step": 865 + }, + { + "epoch": 0.41977702375181775, + "grad_norm": 1.3223602771759033, + "learning_rate": 3.599391883069335e-05, + "loss": 1.1998, + "step": 866 + }, + { + "epoch": 0.420261754726127, + "grad_norm": 1.3724223375320435, + "learning_rate": 3.595589774495753e-05, + "loss": 1.3599, + "step": 867 + }, + { + "epoch": 0.42074648570043627, + "grad_norm": 1.5481332540512085, + "learning_rate": 3.591784526919624e-05, + "loss": 1.6104, + "step": 868 + }, + { + "epoch": 0.4212312166747455, + "grad_norm": 1.4483201503753662, + "learning_rate": 3.58797615124346e-05, + "loss": 1.3523, + "step": 869 + }, + { + "epoch": 0.4217159476490548, + "grad_norm": 2.079836845397949, + "learning_rate": 3.584164658378738e-05, + "loss": 1.6446, + "step": 870 + }, + { + "epoch": 0.422200678623364, + "grad_norm": 1.3948813676834106, + "learning_rate": 3.580350059245864e-05, + "loss": 1.3267, + "step": 871 + }, + { + "epoch": 0.4226854095976733, + "grad_norm": 1.3875503540039062, + "learning_rate": 3.576532364774145e-05, + "loss": 1.2546, + "step": 872 + }, + { + "epoch": 0.42317014057198254, + "grad_norm": 1.590307354927063, + "learning_rate": 3.572711585901755e-05, + "loss": 1.733, + "step": 873 + }, + { + "epoch": 0.42365487154629183, + "grad_norm": 1.4814637899398804, + "learning_rate": 3.568887733575706e-05, + "loss": 1.5166, + "step": 874 + }, + { + "epoch": 0.42413960252060107, + "grad_norm": 1.3917155265808105, + "learning_rate": 3.565060818751816e-05, + "loss": 1.2831, + "step": 875 + }, + { + "epoch": 0.4246243334949103, + "grad_norm": 1.4022222757339478, + "learning_rate": 3.561230852394679e-05, + "loss": 1.2607, + "step": 876 + }, + { + "epoch": 0.4251090644692196, + "grad_norm": 1.503482460975647, + "learning_rate": 3.55739784547763e-05, + "loss": 1.5367, + "step": 877 + }, + { + "epoch": 0.4255937954435288, + "grad_norm": 1.544471025466919, + "learning_rate": 3.553561808982715e-05, + "loss": 1.5024, + "step": 878 + }, + { + "epoch": 0.4260785264178381, + "grad_norm": 1.434191346168518, + "learning_rate": 3.5497227539006614e-05, + "loss": 1.327, + "step": 879 + }, + { + "epoch": 0.42656325739214734, + "grad_norm": 1.3471134901046753, + "learning_rate": 3.545880691230846e-05, + "loss": 1.263, + "step": 880 + }, + { + "epoch": 0.42704798836645663, + "grad_norm": 1.3368321657180786, + "learning_rate": 3.542035631981261e-05, + "loss": 1.3815, + "step": 881 + }, + { + "epoch": 0.42753271934076587, + "grad_norm": 1.9717825651168823, + "learning_rate": 3.538187587168486e-05, + "loss": 1.6113, + "step": 882 + }, + { + "epoch": 0.42801745031507515, + "grad_norm": 1.412209153175354, + "learning_rate": 3.534336567817651e-05, + "loss": 1.2464, + "step": 883 + }, + { + "epoch": 0.4285021812893844, + "grad_norm": 1.6004551649093628, + "learning_rate": 3.530482584962414e-05, + "loss": 1.7197, + "step": 884 + }, + { + "epoch": 0.4289869122636937, + "grad_norm": 1.4312201738357544, + "learning_rate": 3.5266256496449186e-05, + "loss": 1.4607, + "step": 885 + }, + { + "epoch": 0.4294716432380029, + "grad_norm": 1.418025016784668, + "learning_rate": 3.5227657729157705e-05, + "loss": 1.441, + "step": 886 + }, + { + "epoch": 0.42995637421231214, + "grad_norm": 1.4603976011276245, + "learning_rate": 3.5189029658340025e-05, + "loss": 1.472, + "step": 887 + }, + { + "epoch": 0.43044110518662143, + "grad_norm": 1.4871946573257446, + "learning_rate": 3.5150372394670426e-05, + "loss": 1.252, + "step": 888 + }, + { + "epoch": 0.43092583616093066, + "grad_norm": 2.187009811401367, + "learning_rate": 3.5111686048906835e-05, + "loss": 1.3653, + "step": 889 + }, + { + "epoch": 0.43141056713523995, + "grad_norm": 1.4901810884475708, + "learning_rate": 3.5072970731890486e-05, + "loss": 1.7895, + "step": 890 + }, + { + "epoch": 0.4318952981095492, + "grad_norm": 1.3609486818313599, + "learning_rate": 3.5034226554545656e-05, + "loss": 1.2407, + "step": 891 + }, + { + "epoch": 0.4323800290838585, + "grad_norm": 1.5160627365112305, + "learning_rate": 3.499545362787927e-05, + "loss": 1.4348, + "step": 892 + }, + { + "epoch": 0.4328647600581677, + "grad_norm": 1.4251633882522583, + "learning_rate": 3.495665206298065e-05, + "loss": 1.4585, + "step": 893 + }, + { + "epoch": 0.433349491032477, + "grad_norm": 1.4607558250427246, + "learning_rate": 3.491782197102115e-05, + "loss": 1.3888, + "step": 894 + }, + { + "epoch": 0.43383422200678623, + "grad_norm": 1.49302077293396, + "learning_rate": 3.487896346325389e-05, + "loss": 1.604, + "step": 895 + }, + { + "epoch": 0.4343189529810955, + "grad_norm": 1.5757123231887817, + "learning_rate": 3.484007665101336e-05, + "loss": 1.5465, + "step": 896 + }, + { + "epoch": 0.43480368395540475, + "grad_norm": 1.4108710289001465, + "learning_rate": 3.480116164571519e-05, + "loss": 1.4156, + "step": 897 + }, + { + "epoch": 0.435288414929714, + "grad_norm": 1.2154583930969238, + "learning_rate": 3.476221855885576e-05, + "loss": 1.4186, + "step": 898 + }, + { + "epoch": 0.4357731459040233, + "grad_norm": 1.456335425376892, + "learning_rate": 3.47232475020119e-05, + "loss": 1.5292, + "step": 899 + }, + { + "epoch": 0.4362578768783325, + "grad_norm": 1.4850857257843018, + "learning_rate": 3.468424858684061e-05, + "loss": 1.1836, + "step": 900 + }, + { + "epoch": 0.4367426078526418, + "grad_norm": 1.4295960664749146, + "learning_rate": 3.4645221925078674e-05, + "loss": 1.3047, + "step": 901 + }, + { + "epoch": 0.43722733882695103, + "grad_norm": 1.5569299459457397, + "learning_rate": 3.4606167628542395e-05, + "loss": 1.3896, + "step": 902 + }, + { + "epoch": 0.4377120698012603, + "grad_norm": 1.4047545194625854, + "learning_rate": 3.456708580912725e-05, + "loss": 1.5364, + "step": 903 + }, + { + "epoch": 0.43819680077556955, + "grad_norm": 2.0098063945770264, + "learning_rate": 3.452797657880756e-05, + "loss": 1.7044, + "step": 904 + }, + { + "epoch": 0.43868153174987884, + "grad_norm": 1.4840755462646484, + "learning_rate": 3.4488840049636195e-05, + "loss": 1.4837, + "step": 905 + }, + { + "epoch": 0.4391662627241881, + "grad_norm": 1.7989381551742554, + "learning_rate": 3.444967633374424e-05, + "loss": 1.6995, + "step": 906 + }, + { + "epoch": 0.43965099369849736, + "grad_norm": 1.4424799680709839, + "learning_rate": 3.441048554334066e-05, + "loss": 1.397, + "step": 907 + }, + { + "epoch": 0.4401357246728066, + "grad_norm": 1.501582145690918, + "learning_rate": 3.4371267790712e-05, + "loss": 1.5452, + "step": 908 + }, + { + "epoch": 0.4406204556471158, + "grad_norm": 1.461760401725769, + "learning_rate": 3.433202318822207e-05, + "loss": 1.581, + "step": 909 + }, + { + "epoch": 0.4411051866214251, + "grad_norm": 1.3767198324203491, + "learning_rate": 3.429275184831158e-05, + "loss": 1.4953, + "step": 910 + }, + { + "epoch": 0.44158991759573435, + "grad_norm": 1.3784865140914917, + "learning_rate": 3.425345388349786e-05, + "loss": 1.4367, + "step": 911 + }, + { + "epoch": 0.44207464857004364, + "grad_norm": 1.4725292921066284, + "learning_rate": 3.421412940637453e-05, + "loss": 1.3251, + "step": 912 + }, + { + "epoch": 0.44255937954435287, + "grad_norm": 1.5235271453857422, + "learning_rate": 3.417477852961116e-05, + "loss": 1.7435, + "step": 913 + }, + { + "epoch": 0.44304411051866216, + "grad_norm": 1.5075349807739258, + "learning_rate": 3.413540136595296e-05, + "loss": 1.3295, + "step": 914 + }, + { + "epoch": 0.4435288414929714, + "grad_norm": 1.5114574432373047, + "learning_rate": 3.409599802822047e-05, + "loss": 1.6123, + "step": 915 + }, + { + "epoch": 0.4440135724672807, + "grad_norm": 1.4881386756896973, + "learning_rate": 3.40565686293092e-05, + "loss": 1.5483, + "step": 916 + }, + { + "epoch": 0.4444983034415899, + "grad_norm": 1.7126438617706299, + "learning_rate": 3.401711328218934e-05, + "loss": 1.7761, + "step": 917 + }, + { + "epoch": 0.4449830344158992, + "grad_norm": 1.6697680950164795, + "learning_rate": 3.397763209990542e-05, + "loss": 1.5235, + "step": 918 + }, + { + "epoch": 0.44546776539020844, + "grad_norm": 1.4794918298721313, + "learning_rate": 3.3938125195576e-05, + "loss": 1.468, + "step": 919 + }, + { + "epoch": 0.44595249636451767, + "grad_norm": 1.6914706230163574, + "learning_rate": 3.3898592682393336e-05, + "loss": 1.5126, + "step": 920 + }, + { + "epoch": 0.44643722733882696, + "grad_norm": 1.4983173608779907, + "learning_rate": 3.3859034673623045e-05, + "loss": 1.1858, + "step": 921 + }, + { + "epoch": 0.4469219583131362, + "grad_norm": 1.5133384466171265, + "learning_rate": 3.3819451282603805e-05, + "loss": 1.3387, + "step": 922 + }, + { + "epoch": 0.4474066892874455, + "grad_norm": 1.416934609413147, + "learning_rate": 3.377984262274701e-05, + "loss": 1.3361, + "step": 923 + }, + { + "epoch": 0.4478914202617547, + "grad_norm": 1.4019713401794434, + "learning_rate": 3.3740208807536446e-05, + "loss": 1.5387, + "step": 924 + }, + { + "epoch": 0.448376151236064, + "grad_norm": 1.4528837203979492, + "learning_rate": 3.3700549950527994e-05, + "loss": 1.332, + "step": 925 + }, + { + "epoch": 0.44886088221037324, + "grad_norm": 1.435003638267517, + "learning_rate": 3.366086616534925e-05, + "loss": 1.2677, + "step": 926 + }, + { + "epoch": 0.4493456131846825, + "grad_norm": 1.4747051000595093, + "learning_rate": 3.362115756569926e-05, + "loss": 1.2977, + "step": 927 + }, + { + "epoch": 0.44983034415899176, + "grad_norm": 1.4179006814956665, + "learning_rate": 3.358142426534817e-05, + "loss": 1.3164, + "step": 928 + }, + { + "epoch": 0.450315075133301, + "grad_norm": 1.5167887210845947, + "learning_rate": 3.354166637813687e-05, + "loss": 1.8859, + "step": 929 + }, + { + "epoch": 0.4507998061076103, + "grad_norm": 1.4719829559326172, + "learning_rate": 3.350188401797672e-05, + "loss": 1.4828, + "step": 930 + }, + { + "epoch": 0.4512845370819195, + "grad_norm": 1.4412150382995605, + "learning_rate": 3.346207729884918e-05, + "loss": 1.4545, + "step": 931 + }, + { + "epoch": 0.4517692680562288, + "grad_norm": 1.426696538925171, + "learning_rate": 3.34222463348055e-05, + "loss": 1.4738, + "step": 932 + }, + { + "epoch": 0.45225399903053803, + "grad_norm": 1.7029757499694824, + "learning_rate": 3.338239123996642e-05, + "loss": 1.7937, + "step": 933 + }, + { + "epoch": 0.4527387300048473, + "grad_norm": 1.4222897291183472, + "learning_rate": 3.3342512128521794e-05, + "loss": 1.4876, + "step": 934 + }, + { + "epoch": 0.45322346097915656, + "grad_norm": 1.3874694108963013, + "learning_rate": 3.33026091147303e-05, + "loss": 1.2369, + "step": 935 + }, + { + "epoch": 0.45370819195346584, + "grad_norm": 1.348215103149414, + "learning_rate": 3.3262682312919084e-05, + "loss": 1.3298, + "step": 936 + }, + { + "epoch": 0.4541929229277751, + "grad_norm": 1.54569673538208, + "learning_rate": 3.322273183748346e-05, + "loss": 1.4331, + "step": 937 + }, + { + "epoch": 0.45467765390208437, + "grad_norm": 1.403524398803711, + "learning_rate": 3.318275780288656e-05, + "loss": 1.1923, + "step": 938 + }, + { + "epoch": 0.4551623848763936, + "grad_norm": 1.4105076789855957, + "learning_rate": 3.3142760323659036e-05, + "loss": 1.3577, + "step": 939 + }, + { + "epoch": 0.45564711585070283, + "grad_norm": 1.3613835573196411, + "learning_rate": 3.310273951439869e-05, + "loss": 1.3631, + "step": 940 + }, + { + "epoch": 0.4561318468250121, + "grad_norm": 1.5017019510269165, + "learning_rate": 3.3062695489770175e-05, + "loss": 1.4085, + "step": 941 + }, + { + "epoch": 0.45661657779932135, + "grad_norm": 1.409903883934021, + "learning_rate": 3.302262836450466e-05, + "loss": 1.3829, + "step": 942 + }, + { + "epoch": 0.45710130877363064, + "grad_norm": 1.4461843967437744, + "learning_rate": 3.29825382533995e-05, + "loss": 1.4971, + "step": 943 + }, + { + "epoch": 0.4575860397479399, + "grad_norm": 1.486220359802246, + "learning_rate": 3.2942425271317914e-05, + "loss": 1.4184, + "step": 944 + }, + { + "epoch": 0.45807077072224917, + "grad_norm": 1.5135023593902588, + "learning_rate": 3.2902289533188634e-05, + "loss": 1.4369, + "step": 945 + }, + { + "epoch": 0.4585555016965584, + "grad_norm": 1.5079814195632935, + "learning_rate": 3.28621311540056e-05, + "loss": 1.5486, + "step": 946 + }, + { + "epoch": 0.4590402326708677, + "grad_norm": 1.4888428449630737, + "learning_rate": 3.282195024882764e-05, + "loss": 1.2889, + "step": 947 + }, + { + "epoch": 0.4595249636451769, + "grad_norm": 1.3688241243362427, + "learning_rate": 3.2781746932778076e-05, + "loss": 1.2063, + "step": 948 + }, + { + "epoch": 0.4600096946194862, + "grad_norm": 1.4846850633621216, + "learning_rate": 3.274152132104447e-05, + "loss": 1.5821, + "step": 949 + }, + { + "epoch": 0.46049442559379544, + "grad_norm": 1.5739784240722656, + "learning_rate": 3.270127352887828e-05, + "loss": 1.4057, + "step": 950 + }, + { + "epoch": 0.4609791565681047, + "grad_norm": 1.5579380989074707, + "learning_rate": 3.266100367159448e-05, + "loss": 1.5043, + "step": 951 + }, + { + "epoch": 0.46146388754241396, + "grad_norm": 1.4212524890899658, + "learning_rate": 3.2620711864571274e-05, + "loss": 1.3979, + "step": 952 + }, + { + "epoch": 0.4619486185167232, + "grad_norm": 1.4628902673721313, + "learning_rate": 3.258039822324977e-05, + "loss": 1.377, + "step": 953 + }, + { + "epoch": 0.4624333494910325, + "grad_norm": 1.5454577207565308, + "learning_rate": 3.254006286313362e-05, + "loss": 1.4185, + "step": 954 + }, + { + "epoch": 0.4629180804653417, + "grad_norm": 1.8712515830993652, + "learning_rate": 3.24997058997887e-05, + "loss": 1.6019, + "step": 955 + }, + { + "epoch": 0.463402811439651, + "grad_norm": 1.3647255897521973, + "learning_rate": 3.245932744884278e-05, + "loss": 1.1018, + "step": 956 + }, + { + "epoch": 0.46388754241396024, + "grad_norm": 1.5049718618392944, + "learning_rate": 3.241892762598522e-05, + "loss": 1.5208, + "step": 957 + }, + { + "epoch": 0.46437227338826953, + "grad_norm": 1.5857036113739014, + "learning_rate": 3.237850654696659e-05, + "loss": 1.33, + "step": 958 + }, + { + "epoch": 0.46485700436257876, + "grad_norm": 1.4404864311218262, + "learning_rate": 3.233806432759837e-05, + "loss": 1.3034, + "step": 959 + }, + { + "epoch": 0.46534173533688805, + "grad_norm": 1.4808937311172485, + "learning_rate": 3.22976010837526e-05, + "loss": 1.5562, + "step": 960 + }, + { + "epoch": 0.4658264663111973, + "grad_norm": 1.636323094367981, + "learning_rate": 3.225711693136156e-05, + "loss": 1.2444, + "step": 961 + }, + { + "epoch": 0.4663111972855065, + "grad_norm": 1.4531266689300537, + "learning_rate": 3.221661198641745e-05, + "loss": 1.2632, + "step": 962 + }, + { + "epoch": 0.4667959282598158, + "grad_norm": 1.3352887630462646, + "learning_rate": 3.217608636497203e-05, + "loss": 1.2413, + "step": 963 + }, + { + "epoch": 0.46728065923412504, + "grad_norm": 1.3544855117797852, + "learning_rate": 3.213554018313631e-05, + "loss": 1.2552, + "step": 964 + }, + { + "epoch": 0.46776539020843433, + "grad_norm": 1.5176138877868652, + "learning_rate": 3.209497355708019e-05, + "loss": 1.4344, + "step": 965 + }, + { + "epoch": 0.46825012118274356, + "grad_norm": 1.3940963745117188, + "learning_rate": 3.205438660303216e-05, + "loss": 1.418, + "step": 966 + }, + { + "epoch": 0.46873485215705285, + "grad_norm": 1.6681914329528809, + "learning_rate": 3.201377943727896e-05, + "loss": 1.5961, + "step": 967 + }, + { + "epoch": 0.4692195831313621, + "grad_norm": 1.3219623565673828, + "learning_rate": 3.1973152176165224e-05, + "loss": 1.1749, + "step": 968 + }, + { + "epoch": 0.46970431410567137, + "grad_norm": 1.4592280387878418, + "learning_rate": 3.1932504936093167e-05, + "loss": 1.6801, + "step": 969 + }, + { + "epoch": 0.4701890450799806, + "grad_norm": 1.471772313117981, + "learning_rate": 3.189183783352224e-05, + "loss": 1.4001, + "step": 970 + }, + { + "epoch": 0.4706737760542899, + "grad_norm": 1.5347093343734741, + "learning_rate": 3.1851150984968814e-05, + "loss": 1.4215, + "step": 971 + }, + { + "epoch": 0.4711585070285991, + "grad_norm": 1.481303334236145, + "learning_rate": 3.1810444507005824e-05, + "loss": 1.4193, + "step": 972 + }, + { + "epoch": 0.47164323800290836, + "grad_norm": 1.6206600666046143, + "learning_rate": 3.1769718516262466e-05, + "loss": 1.4644, + "step": 973 + }, + { + "epoch": 0.47212796897721765, + "grad_norm": 1.437705159187317, + "learning_rate": 3.172897312942381e-05, + "loss": 1.4197, + "step": 974 + }, + { + "epoch": 0.4726126999515269, + "grad_norm": 1.5616474151611328, + "learning_rate": 3.168820846323053e-05, + "loss": 1.574, + "step": 975 + }, + { + "epoch": 0.47309743092583617, + "grad_norm": 1.4508112668991089, + "learning_rate": 3.16474246344785e-05, + "loss": 1.5167, + "step": 976 + }, + { + "epoch": 0.4735821619001454, + "grad_norm": 1.4133033752441406, + "learning_rate": 3.1606621760018554e-05, + "loss": 1.3912, + "step": 977 + }, + { + "epoch": 0.4740668928744547, + "grad_norm": 1.5243926048278809, + "learning_rate": 3.156579995675603e-05, + "loss": 1.1586, + "step": 978 + }, + { + "epoch": 0.4745516238487639, + "grad_norm": 1.3419501781463623, + "learning_rate": 3.152495934165055e-05, + "loss": 1.434, + "step": 979 + }, + { + "epoch": 0.4750363548230732, + "grad_norm": 1.5045238733291626, + "learning_rate": 3.148410003171561e-05, + "loss": 1.2983, + "step": 980 + }, + { + "epoch": 0.47552108579738245, + "grad_norm": 1.3928799629211426, + "learning_rate": 3.1443222144018265e-05, + "loss": 1.7629, + "step": 981 + }, + { + "epoch": 0.47600581677169174, + "grad_norm": 1.5783344507217407, + "learning_rate": 3.1402325795678814e-05, + "loss": 1.4108, + "step": 982 + }, + { + "epoch": 0.47649054774600097, + "grad_norm": 1.3899141550064087, + "learning_rate": 3.1361411103870455e-05, + "loss": 1.3194, + "step": 983 + }, + { + "epoch": 0.4769752787203102, + "grad_norm": 1.438714623451233, + "learning_rate": 3.1320478185818896e-05, + "loss": 1.6322, + "step": 984 + }, + { + "epoch": 0.4774600096946195, + "grad_norm": 1.579592227935791, + "learning_rate": 3.127952715880212e-05, + "loss": 1.7614, + "step": 985 + }, + { + "epoch": 0.4779447406689287, + "grad_norm": 1.5455819368362427, + "learning_rate": 3.1238558140149964e-05, + "loss": 1.1737, + "step": 986 + }, + { + "epoch": 0.478429471643238, + "grad_norm": 1.4268600940704346, + "learning_rate": 3.119757124724384e-05, + "loss": 1.3511, + "step": 987 + }, + { + "epoch": 0.47891420261754725, + "grad_norm": 1.4414108991622925, + "learning_rate": 3.115656659751632e-05, + "loss": 1.6027, + "step": 988 + }, + { + "epoch": 0.47939893359185654, + "grad_norm": 1.382475733757019, + "learning_rate": 3.11155443084509e-05, + "loss": 1.3433, + "step": 989 + }, + { + "epoch": 0.47988366456616577, + "grad_norm": 1.3869885206222534, + "learning_rate": 3.10745044975816e-05, + "loss": 1.5374, + "step": 990 + }, + { + "epoch": 0.48036839554047506, + "grad_norm": 1.4492113590240479, + "learning_rate": 3.1033447282492646e-05, + "loss": 1.4005, + "step": 991 + }, + { + "epoch": 0.4808531265147843, + "grad_norm": 1.4593931436538696, + "learning_rate": 3.0992372780818113e-05, + "loss": 1.4781, + "step": 992 + }, + { + "epoch": 0.4813378574890936, + "grad_norm": 1.2960480451583862, + "learning_rate": 3.0951281110241634e-05, + "loss": 1.1956, + "step": 993 + }, + { + "epoch": 0.4818225884634028, + "grad_norm": 3.1581783294677734, + "learning_rate": 3.0910172388496e-05, + "loss": 1.2586, + "step": 994 + }, + { + "epoch": 0.48230731943771205, + "grad_norm": 1.422606348991394, + "learning_rate": 3.086904673336287e-05, + "loss": 1.3854, + "step": 995 + }, + { + "epoch": 0.48279205041202133, + "grad_norm": 1.3689360618591309, + "learning_rate": 3.082790426267243e-05, + "loss": 0.9986, + "step": 996 + }, + { + "epoch": 0.48327678138633057, + "grad_norm": 1.3445098400115967, + "learning_rate": 3.0786745094303035e-05, + "loss": 1.5066, + "step": 997 + }, + { + "epoch": 0.48376151236063986, + "grad_norm": 1.4405405521392822, + "learning_rate": 3.0745569346180876e-05, + "loss": 1.4553, + "step": 998 + }, + { + "epoch": 0.4842462433349491, + "grad_norm": 1.8509117364883423, + "learning_rate": 3.070437713627965e-05, + "loss": 1.5544, + "step": 999 + }, + { + "epoch": 0.4847309743092584, + "grad_norm": 1.540792465209961, + "learning_rate": 3.066316858262023e-05, + "loss": 1.8954, + "step": 1000 + }, + { + "epoch": 0.4852157052835676, + "grad_norm": 1.587246060371399, + "learning_rate": 3.0621943803270295e-05, + "loss": 1.3981, + "step": 1001 + }, + { + "epoch": 0.4857004362578769, + "grad_norm": 1.6329646110534668, + "learning_rate": 3.058070291634403e-05, + "loss": 1.795, + "step": 1002 + }, + { + "epoch": 0.48618516723218613, + "grad_norm": 1.5801475048065186, + "learning_rate": 3.053944604000177e-05, + "loss": 1.7153, + "step": 1003 + }, + { + "epoch": 0.4866698982064954, + "grad_norm": 1.7902394533157349, + "learning_rate": 3.0498173292449643e-05, + "loss": 1.5319, + "step": 1004 + }, + { + "epoch": 0.48715462918080465, + "grad_norm": 1.4669686555862427, + "learning_rate": 3.0456884791939278e-05, + "loss": 1.5914, + "step": 1005 + }, + { + "epoch": 0.4876393601551139, + "grad_norm": 4.603597640991211, + "learning_rate": 3.041558065676742e-05, + "loss": 1.7194, + "step": 1006 + }, + { + "epoch": 0.4881240911294232, + "grad_norm": 1.4835340976715088, + "learning_rate": 3.0374261005275607e-05, + "loss": 1.673, + "step": 1007 + }, + { + "epoch": 0.4886088221037324, + "grad_norm": 1.3735729455947876, + "learning_rate": 3.0332925955849844e-05, + "loss": 1.3659, + "step": 1008 + }, + { + "epoch": 0.4890935530780417, + "grad_norm": 1.5638624429702759, + "learning_rate": 3.0291575626920243e-05, + "loss": 1.4514, + "step": 1009 + }, + { + "epoch": 0.48957828405235093, + "grad_norm": 1.4114654064178467, + "learning_rate": 3.025021013696071e-05, + "loss": 1.4997, + "step": 1010 + }, + { + "epoch": 0.4900630150266602, + "grad_norm": 1.3967055082321167, + "learning_rate": 3.0208829604488563e-05, + "loss": 1.4588, + "step": 1011 + }, + { + "epoch": 0.49054774600096945, + "grad_norm": 1.719627022743225, + "learning_rate": 3.0167434148064254e-05, + "loss": 1.6016, + "step": 1012 + }, + { + "epoch": 0.49103247697527874, + "grad_norm": 1.463747262954712, + "learning_rate": 3.0126023886290955e-05, + "loss": 1.687, + "step": 1013 + }, + { + "epoch": 0.491517207949588, + "grad_norm": 1.4238579273223877, + "learning_rate": 3.008459893781429e-05, + "loss": 1.3301, + "step": 1014 + }, + { + "epoch": 0.49200193892389726, + "grad_norm": 1.401583194732666, + "learning_rate": 3.004315942132194e-05, + "loss": 1.6042, + "step": 1015 + }, + { + "epoch": 0.4924866698982065, + "grad_norm": 1.4220138788223267, + "learning_rate": 3.0001705455543326e-05, + "loss": 1.3374, + "step": 1016 + }, + { + "epoch": 0.49297140087251573, + "grad_norm": 1.4294856786727905, + "learning_rate": 2.99602371592493e-05, + "loss": 1.4055, + "step": 1017 + }, + { + "epoch": 0.493456131846825, + "grad_norm": 1.5217885971069336, + "learning_rate": 2.9918754651251723e-05, + "loss": 1.4845, + "step": 1018 + }, + { + "epoch": 0.49394086282113425, + "grad_norm": 1.4662128686904907, + "learning_rate": 2.9877258050403212e-05, + "loss": 1.2869, + "step": 1019 + }, + { + "epoch": 0.49442559379544354, + "grad_norm": 1.4692763090133667, + "learning_rate": 2.9835747475596743e-05, + "loss": 1.3966, + "step": 1020 + }, + { + "epoch": 0.4949103247697528, + "grad_norm": 1.358508586883545, + "learning_rate": 2.979422304576534e-05, + "loss": 1.3493, + "step": 1021 + }, + { + "epoch": 0.49539505574406206, + "grad_norm": 1.510632038116455, + "learning_rate": 2.9752684879881725e-05, + "loss": 1.6699, + "step": 1022 + }, + { + "epoch": 0.4958797867183713, + "grad_norm": 1.507602334022522, + "learning_rate": 2.9711133096957962e-05, + "loss": 1.6718, + "step": 1023 + }, + { + "epoch": 0.4963645176926806, + "grad_norm": 1.3191126585006714, + "learning_rate": 2.966956781604513e-05, + "loss": 1.2832, + "step": 1024 + }, + { + "epoch": 0.4968492486669898, + "grad_norm": 1.4080357551574707, + "learning_rate": 2.9627989156233006e-05, + "loss": 1.4744, + "step": 1025 + }, + { + "epoch": 0.4973339796412991, + "grad_norm": 1.4639397859573364, + "learning_rate": 2.9586397236649666e-05, + "loss": 1.5554, + "step": 1026 + }, + { + "epoch": 0.49781871061560834, + "grad_norm": 1.427006721496582, + "learning_rate": 2.9544792176461205e-05, + "loss": 1.8437, + "step": 1027 + }, + { + "epoch": 0.4983034415899176, + "grad_norm": 1.4412407875061035, + "learning_rate": 2.9503174094871344e-05, + "loss": 1.5907, + "step": 1028 + }, + { + "epoch": 0.49878817256422686, + "grad_norm": 1.5524944067001343, + "learning_rate": 2.9461543111121128e-05, + "loss": 1.5992, + "step": 1029 + }, + { + "epoch": 0.4992729035385361, + "grad_norm": 1.46015202999115, + "learning_rate": 2.941989934448856e-05, + "loss": 1.3932, + "step": 1030 + }, + { + "epoch": 0.4997576345128454, + "grad_norm": 1.3838374614715576, + "learning_rate": 2.9378242914288272e-05, + "loss": 1.2414, + "step": 1031 + }, + { + "epoch": 0.5002423654871546, + "grad_norm": 1.40300714969635, + "learning_rate": 2.9336573939871186e-05, + "loss": 1.2298, + "step": 1032 + }, + { + "epoch": 0.5007270964614638, + "grad_norm": 1.3450630903244019, + "learning_rate": 2.9294892540624147e-05, + "loss": 1.6257, + "step": 1033 + }, + { + "epoch": 0.5012118274357732, + "grad_norm": 1.4591161012649536, + "learning_rate": 2.9253198835969607e-05, + "loss": 1.5271, + "step": 1034 + }, + { + "epoch": 0.5016965584100824, + "grad_norm": 1.4087638854980469, + "learning_rate": 2.9211492945365288e-05, + "loss": 1.2425, + "step": 1035 + }, + { + "epoch": 0.5021812893843917, + "grad_norm": 1.501910924911499, + "learning_rate": 2.9169774988303805e-05, + "loss": 1.5748, + "step": 1036 + }, + { + "epoch": 0.5026660203587009, + "grad_norm": 1.5783244371414185, + "learning_rate": 2.9128045084312344e-05, + "loss": 1.3556, + "step": 1037 + }, + { + "epoch": 0.5031507513330101, + "grad_norm": 1.54836905002594, + "learning_rate": 2.908630335295235e-05, + "loss": 1.6043, + "step": 1038 + }, + { + "epoch": 0.5036354823073195, + "grad_norm": 1.2901034355163574, + "learning_rate": 2.9044549913819124e-05, + "loss": 1.5115, + "step": 1039 + }, + { + "epoch": 0.5041202132816287, + "grad_norm": 1.3442738056182861, + "learning_rate": 2.9002784886541517e-05, + "loss": 1.4688, + "step": 1040 + }, + { + "epoch": 0.5046049442559379, + "grad_norm": 1.411210298538208, + "learning_rate": 2.8961008390781603e-05, + "loss": 1.5606, + "step": 1041 + }, + { + "epoch": 0.5050896752302472, + "grad_norm": 1.4159587621688843, + "learning_rate": 2.8919220546234282e-05, + "loss": 1.498, + "step": 1042 + }, + { + "epoch": 0.5055744062045565, + "grad_norm": 1.3815311193466187, + "learning_rate": 2.8877421472626996e-05, + "loss": 1.2571, + "step": 1043 + }, + { + "epoch": 0.5060591371788657, + "grad_norm": 1.5053796768188477, + "learning_rate": 2.8835611289719345e-05, + "loss": 1.336, + "step": 1044 + }, + { + "epoch": 0.506543868153175, + "grad_norm": 1.4755936861038208, + "learning_rate": 2.8793790117302765e-05, + "loss": 1.2908, + "step": 1045 + }, + { + "epoch": 0.5070285991274842, + "grad_norm": 1.34824800491333, + "learning_rate": 2.8751958075200185e-05, + "loss": 1.5134, + "step": 1046 + }, + { + "epoch": 0.5075133301017936, + "grad_norm": 1.3312855958938599, + "learning_rate": 2.8710115283265655e-05, + "loss": 1.3463, + "step": 1047 + }, + { + "epoch": 0.5079980610761028, + "grad_norm": 1.3728924989700317, + "learning_rate": 2.8668261861384045e-05, + "loss": 1.7027, + "step": 1048 + }, + { + "epoch": 0.508482792050412, + "grad_norm": 1.781646728515625, + "learning_rate": 2.8626397929470672e-05, + "loss": 1.2995, + "step": 1049 + }, + { + "epoch": 0.5089675230247213, + "grad_norm": 1.4653760194778442, + "learning_rate": 2.8584523607470976e-05, + "loss": 1.3966, + "step": 1050 + }, + { + "epoch": 0.5094522539990305, + "grad_norm": 1.5125970840454102, + "learning_rate": 2.854263901536015e-05, + "loss": 1.4322, + "step": 1051 + }, + { + "epoch": 0.5099369849733398, + "grad_norm": 1.3065565824508667, + "learning_rate": 2.8500744273142833e-05, + "loss": 1.4568, + "step": 1052 + }, + { + "epoch": 0.5104217159476491, + "grad_norm": 1.6204395294189453, + "learning_rate": 2.845883950085271e-05, + "loss": 1.5351, + "step": 1053 + }, + { + "epoch": 0.5109064469219583, + "grad_norm": 1.3245327472686768, + "learning_rate": 2.8416924818552238e-05, + "loss": 1.4935, + "step": 1054 + }, + { + "epoch": 0.5113911778962675, + "grad_norm": 1.4986622333526611, + "learning_rate": 2.8375000346332255e-05, + "loss": 1.3269, + "step": 1055 + }, + { + "epoch": 0.5118759088705769, + "grad_norm": 1.5058215856552124, + "learning_rate": 2.8333066204311654e-05, + "loss": 1.7093, + "step": 1056 + }, + { + "epoch": 0.5123606398448861, + "grad_norm": 1.4276849031448364, + "learning_rate": 2.829112251263702e-05, + "loss": 1.5234, + "step": 1057 + }, + { + "epoch": 0.5128453708191953, + "grad_norm": 1.4878147840499878, + "learning_rate": 2.824916939148231e-05, + "loss": 1.1303, + "step": 1058 + }, + { + "epoch": 0.5133301017935046, + "grad_norm": 1.4708224534988403, + "learning_rate": 2.8207206961048494e-05, + "loss": 1.5127, + "step": 1059 + }, + { + "epoch": 0.5138148327678138, + "grad_norm": 1.4829494953155518, + "learning_rate": 2.8165235341563212e-05, + "loss": 1.1638, + "step": 1060 + }, + { + "epoch": 0.5142995637421232, + "grad_norm": 1.3504929542541504, + "learning_rate": 2.8123254653280445e-05, + "loss": 1.5974, + "step": 1061 + }, + { + "epoch": 0.5147842947164324, + "grad_norm": 1.5481303930282593, + "learning_rate": 2.8081265016480137e-05, + "loss": 1.4454, + "step": 1062 + }, + { + "epoch": 0.5152690256907416, + "grad_norm": 1.4048713445663452, + "learning_rate": 2.8039266551467873e-05, + "loss": 1.5425, + "step": 1063 + }, + { + "epoch": 0.5157537566650509, + "grad_norm": 1.3949064016342163, + "learning_rate": 2.7997259378574564e-05, + "loss": 1.2964, + "step": 1064 + }, + { + "epoch": 0.5162384876393602, + "grad_norm": 1.4568665027618408, + "learning_rate": 2.7955243618156023e-05, + "loss": 1.5504, + "step": 1065 + }, + { + "epoch": 0.5167232186136694, + "grad_norm": 1.3728277683258057, + "learning_rate": 2.7913219390592704e-05, + "loss": 1.19, + "step": 1066 + }, + { + "epoch": 0.5172079495879787, + "grad_norm": 1.5279256105422974, + "learning_rate": 2.787118681628929e-05, + "loss": 1.737, + "step": 1067 + }, + { + "epoch": 0.5176926805622879, + "grad_norm": 1.4620921611785889, + "learning_rate": 2.7829146015674406e-05, + "loss": 1.461, + "step": 1068 + }, + { + "epoch": 0.5181774115365972, + "grad_norm": 1.5227320194244385, + "learning_rate": 2.778709710920024e-05, + "loss": 1.3925, + "step": 1069 + }, + { + "epoch": 0.5186621425109065, + "grad_norm": 1.4971587657928467, + "learning_rate": 2.7745040217342195e-05, + "loss": 1.4625, + "step": 1070 + }, + { + "epoch": 0.5191468734852157, + "grad_norm": 1.531641960144043, + "learning_rate": 2.7702975460598547e-05, + "loss": 1.3207, + "step": 1071 + }, + { + "epoch": 0.5196316044595249, + "grad_norm": 1.3242888450622559, + "learning_rate": 2.766090295949013e-05, + "loss": 1.5634, + "step": 1072 + }, + { + "epoch": 0.5201163354338342, + "grad_norm": 1.3874385356903076, + "learning_rate": 2.7618822834559947e-05, + "loss": 1.35, + "step": 1073 + }, + { + "epoch": 0.5206010664081435, + "grad_norm": 1.45573091506958, + "learning_rate": 2.757673520637285e-05, + "loss": 1.6007, + "step": 1074 + }, + { + "epoch": 0.5210857973824528, + "grad_norm": 1.45209538936615, + "learning_rate": 2.75346401955152e-05, + "loss": 1.3736, + "step": 1075 + }, + { + "epoch": 0.521570528356762, + "grad_norm": 1.4319308996200562, + "learning_rate": 2.749253792259448e-05, + "loss": 1.6778, + "step": 1076 + }, + { + "epoch": 0.5220552593310712, + "grad_norm": 1.388581395149231, + "learning_rate": 2.7450428508239024e-05, + "loss": 1.6616, + "step": 1077 + }, + { + "epoch": 0.5225399903053806, + "grad_norm": 1.3756077289581299, + "learning_rate": 2.7408312073097574e-05, + "loss": 1.1387, + "step": 1078 + }, + { + "epoch": 0.5230247212796898, + "grad_norm": 1.4797683954238892, + "learning_rate": 2.7366188737839026e-05, + "loss": 1.5972, + "step": 1079 + }, + { + "epoch": 0.523509452253999, + "grad_norm": 1.4945778846740723, + "learning_rate": 2.7324058623152056e-05, + "loss": 1.5199, + "step": 1080 + }, + { + "epoch": 0.5239941832283083, + "grad_norm": 1.3682905435562134, + "learning_rate": 2.7281921849744714e-05, + "loss": 1.3174, + "step": 1081 + }, + { + "epoch": 0.5244789142026175, + "grad_norm": 1.3681645393371582, + "learning_rate": 2.7239778538344163e-05, + "loss": 1.3481, + "step": 1082 + }, + { + "epoch": 0.5249636451769268, + "grad_norm": 1.43455171585083, + "learning_rate": 2.7197628809696306e-05, + "loss": 1.4334, + "step": 1083 + }, + { + "epoch": 0.5254483761512361, + "grad_norm": 1.50799560546875, + "learning_rate": 2.715547278456541e-05, + "loss": 1.5683, + "step": 1084 + }, + { + "epoch": 0.5259331071255453, + "grad_norm": 1.4469860792160034, + "learning_rate": 2.7113310583733797e-05, + "loss": 1.4747, + "step": 1085 + }, + { + "epoch": 0.5264178380998545, + "grad_norm": 1.420809030532837, + "learning_rate": 2.7071142328001465e-05, + "loss": 1.6305, + "step": 1086 + }, + { + "epoch": 0.5269025690741639, + "grad_norm": 1.3464183807373047, + "learning_rate": 2.7028968138185782e-05, + "loss": 1.3846, + "step": 1087 + }, + { + "epoch": 0.5273873000484731, + "grad_norm": 1.4121391773223877, + "learning_rate": 2.6986788135121106e-05, + "loss": 1.4262, + "step": 1088 + }, + { + "epoch": 0.5278720310227824, + "grad_norm": 1.4404566287994385, + "learning_rate": 2.6944602439658457e-05, + "loss": 1.7716, + "step": 1089 + }, + { + "epoch": 0.5283567619970916, + "grad_norm": 1.4350225925445557, + "learning_rate": 2.6902411172665147e-05, + "loss": 1.3612, + "step": 1090 + }, + { + "epoch": 0.5288414929714008, + "grad_norm": 1.46249520778656, + "learning_rate": 2.686021445502448e-05, + "loss": 1.699, + "step": 1091 + }, + { + "epoch": 0.5293262239457102, + "grad_norm": 1.4543901681900024, + "learning_rate": 2.681801240763535e-05, + "loss": 1.6293, + "step": 1092 + }, + { + "epoch": 0.5298109549200194, + "grad_norm": 1.5145845413208008, + "learning_rate": 2.6775805151411936e-05, + "loss": 1.4227, + "step": 1093 + }, + { + "epoch": 0.5302956858943286, + "grad_norm": 1.4717572927474976, + "learning_rate": 2.6733592807283344e-05, + "loss": 1.4983, + "step": 1094 + }, + { + "epoch": 0.5307804168686379, + "grad_norm": 1.4818400144577026, + "learning_rate": 2.6691375496193234e-05, + "loss": 1.3531, + "step": 1095 + }, + { + "epoch": 0.5312651478429472, + "grad_norm": 1.3545438051223755, + "learning_rate": 2.6649153339099524e-05, + "loss": 1.0584, + "step": 1096 + }, + { + "epoch": 0.5317498788172564, + "grad_norm": 1.457953691482544, + "learning_rate": 2.6606926456974013e-05, + "loss": 1.4133, + "step": 1097 + }, + { + "epoch": 0.5322346097915657, + "grad_norm": 1.343666434288025, + "learning_rate": 2.656469497080202e-05, + "loss": 1.3662, + "step": 1098 + }, + { + "epoch": 0.5327193407658749, + "grad_norm": 1.5306185483932495, + "learning_rate": 2.6522459001582078e-05, + "loss": 1.6159, + "step": 1099 + }, + { + "epoch": 0.5332040717401842, + "grad_norm": 1.4223085641860962, + "learning_rate": 2.648021867032554e-05, + "loss": 1.3616, + "step": 1100 + }, + { + "epoch": 0.5336888027144935, + "grad_norm": 1.4389088153839111, + "learning_rate": 2.643797409805628e-05, + "loss": 1.2718, + "step": 1101 + }, + { + "epoch": 0.5341735336888027, + "grad_norm": 1.5863919258117676, + "learning_rate": 2.6395725405810307e-05, + "loss": 1.5832, + "step": 1102 + }, + { + "epoch": 0.534658264663112, + "grad_norm": 1.531956672668457, + "learning_rate": 2.635347271463544e-05, + "loss": 1.3952, + "step": 1103 + }, + { + "epoch": 0.5351429956374212, + "grad_norm": 1.63759446144104, + "learning_rate": 2.631121614559096e-05, + "loss": 1.3839, + "step": 1104 + }, + { + "epoch": 0.5356277266117305, + "grad_norm": 1.6401275396347046, + "learning_rate": 2.6268955819747247e-05, + "loss": 1.6776, + "step": 1105 + }, + { + "epoch": 0.5361124575860398, + "grad_norm": 1.4038505554199219, + "learning_rate": 2.6226691858185454e-05, + "loss": 1.4152, + "step": 1106 + }, + { + "epoch": 0.536597188560349, + "grad_norm": 1.4665429592132568, + "learning_rate": 2.6184424381997146e-05, + "loss": 1.5916, + "step": 1107 + }, + { + "epoch": 0.5370819195346582, + "grad_norm": 1.3302826881408691, + "learning_rate": 2.6142153512283968e-05, + "loss": 1.5929, + "step": 1108 + }, + { + "epoch": 0.5375666505089676, + "grad_norm": 1.4651814699172974, + "learning_rate": 2.609987937015728e-05, + "loss": 1.5793, + "step": 1109 + }, + { + "epoch": 0.5380513814832768, + "grad_norm": 1.3761016130447388, + "learning_rate": 2.605760207673781e-05, + "loss": 1.4432, + "step": 1110 + }, + { + "epoch": 0.538536112457586, + "grad_norm": 1.5528533458709717, + "learning_rate": 2.601532175315532e-05, + "loss": 1.6668, + "step": 1111 + }, + { + "epoch": 0.5390208434318953, + "grad_norm": 1.3150146007537842, + "learning_rate": 2.5973038520548266e-05, + "loss": 1.3136, + "step": 1112 + }, + { + "epoch": 0.5395055744062045, + "grad_norm": 1.4814192056655884, + "learning_rate": 2.5930752500063425e-05, + "loss": 1.721, + "step": 1113 + }, + { + "epoch": 0.5399903053805138, + "grad_norm": 1.5142635107040405, + "learning_rate": 2.5888463812855578e-05, + "loss": 1.5931, + "step": 1114 + }, + { + "epoch": 0.5404750363548231, + "grad_norm": 1.3756675720214844, + "learning_rate": 2.5846172580087112e-05, + "loss": 1.0801, + "step": 1115 + }, + { + "epoch": 0.5409597673291323, + "grad_norm": 1.4426296949386597, + "learning_rate": 2.5803878922927755e-05, + "loss": 1.4838, + "step": 1116 + }, + { + "epoch": 0.5414444983034415, + "grad_norm": 1.2579292058944702, + "learning_rate": 2.576158296255413e-05, + "loss": 1.2591, + "step": 1117 + }, + { + "epoch": 0.5419292292777509, + "grad_norm": 1.4666320085525513, + "learning_rate": 2.5719284820149503e-05, + "loss": 1.5415, + "step": 1118 + }, + { + "epoch": 0.5424139602520601, + "grad_norm": 1.5992366075515747, + "learning_rate": 2.5676984616903367e-05, + "loss": 1.327, + "step": 1119 + }, + { + "epoch": 0.5428986912263694, + "grad_norm": 1.4156771898269653, + "learning_rate": 2.5634682474011128e-05, + "loss": 1.2807, + "step": 1120 + }, + { + "epoch": 0.5433834222006786, + "grad_norm": 1.4783258438110352, + "learning_rate": 2.559237851267374e-05, + "loss": 1.5313, + "step": 1121 + }, + { + "epoch": 0.5438681531749879, + "grad_norm": 1.400573492050171, + "learning_rate": 2.555007285409739e-05, + "loss": 1.3329, + "step": 1122 + }, + { + "epoch": 0.5443528841492972, + "grad_norm": 1.3685263395309448, + "learning_rate": 2.550776561949311e-05, + "loss": 1.4368, + "step": 1123 + }, + { + "epoch": 0.5448376151236064, + "grad_norm": 1.5202267169952393, + "learning_rate": 2.5465456930076435e-05, + "loss": 1.5863, + "step": 1124 + }, + { + "epoch": 0.5453223460979156, + "grad_norm": 1.5026854276657104, + "learning_rate": 2.54231469070671e-05, + "loss": 1.6135, + "step": 1125 + }, + { + "epoch": 0.5458070770722249, + "grad_norm": 1.2926137447357178, + "learning_rate": 2.5380835671688628e-05, + "loss": 1.1667, + "step": 1126 + }, + { + "epoch": 0.5462918080465342, + "grad_norm": 1.429006814956665, + "learning_rate": 2.5338523345168048e-05, + "loss": 1.3725, + "step": 1127 + }, + { + "epoch": 0.5467765390208434, + "grad_norm": 1.605655550956726, + "learning_rate": 2.52962100487355e-05, + "loss": 1.9144, + "step": 1128 + }, + { + "epoch": 0.5472612699951527, + "grad_norm": 1.3905277252197266, + "learning_rate": 2.525389590362388e-05, + "loss": 1.4073, + "step": 1129 + }, + { + "epoch": 0.5477460009694619, + "grad_norm": 1.443237543106079, + "learning_rate": 2.521158103106856e-05, + "loss": 1.6917, + "step": 1130 + }, + { + "epoch": 0.5482307319437713, + "grad_norm": 1.544218897819519, + "learning_rate": 2.5169265552306963e-05, + "loss": 1.4208, + "step": 1131 + }, + { + "epoch": 0.5487154629180805, + "grad_norm": 1.4166316986083984, + "learning_rate": 2.5126949588578264e-05, + "loss": 1.3686, + "step": 1132 + }, + { + "epoch": 0.5492001938923897, + "grad_norm": 1.3663363456726074, + "learning_rate": 2.508463326112302e-05, + "loss": 1.3377, + "step": 1133 + }, + { + "epoch": 0.549684924866699, + "grad_norm": 1.3695237636566162, + "learning_rate": 2.504231669118283e-05, + "loss": 1.384, + "step": 1134 + }, + { + "epoch": 0.5501696558410082, + "grad_norm": 1.418594479560852, + "learning_rate": 2.5e-05, + "loss": 1.6453, + "step": 1135 + }, + { + "epoch": 0.5506543868153175, + "grad_norm": 1.4432318210601807, + "learning_rate": 2.495768330881717e-05, + "loss": 1.6813, + "step": 1136 + }, + { + "epoch": 0.5511391177896268, + "grad_norm": 1.3886134624481201, + "learning_rate": 2.4915366738876986e-05, + "loss": 1.4878, + "step": 1137 + }, + { + "epoch": 0.551623848763936, + "grad_norm": 1.3143367767333984, + "learning_rate": 2.4873050411421738e-05, + "loss": 1.1719, + "step": 1138 + }, + { + "epoch": 0.5521085797382452, + "grad_norm": 1.4947563409805298, + "learning_rate": 2.483073444769304e-05, + "loss": 1.9726, + "step": 1139 + }, + { + "epoch": 0.5525933107125546, + "grad_norm": 1.4314428567886353, + "learning_rate": 2.478841896893145e-05, + "loss": 1.4453, + "step": 1140 + }, + { + "epoch": 0.5530780416868638, + "grad_norm": 1.5481895208358765, + "learning_rate": 2.4746104096376128e-05, + "loss": 1.5185, + "step": 1141 + }, + { + "epoch": 0.553562772661173, + "grad_norm": 1.3574029207229614, + "learning_rate": 2.470378995126451e-05, + "loss": 1.2467, + "step": 1142 + }, + { + "epoch": 0.5540475036354823, + "grad_norm": 1.4265308380126953, + "learning_rate": 2.4661476654831958e-05, + "loss": 1.3669, + "step": 1143 + }, + { + "epoch": 0.5545322346097916, + "grad_norm": 1.789808988571167, + "learning_rate": 2.4619164328311374e-05, + "loss": 1.4957, + "step": 1144 + }, + { + "epoch": 0.5550169655841009, + "grad_norm": 1.401563048362732, + "learning_rate": 2.4576853092932907e-05, + "loss": 1.7368, + "step": 1145 + }, + { + "epoch": 0.5555016965584101, + "grad_norm": 1.3560460805892944, + "learning_rate": 2.4534543069923567e-05, + "loss": 1.3568, + "step": 1146 + }, + { + "epoch": 0.5559864275327193, + "grad_norm": 1.4376869201660156, + "learning_rate": 2.4492234380506894e-05, + "loss": 1.4885, + "step": 1147 + }, + { + "epoch": 0.5564711585070286, + "grad_norm": 1.4365642070770264, + "learning_rate": 2.4449927145902606e-05, + "loss": 1.2875, + "step": 1148 + }, + { + "epoch": 0.5569558894813379, + "grad_norm": 1.4295824766159058, + "learning_rate": 2.4407621487326255e-05, + "loss": 1.4832, + "step": 1149 + }, + { + "epoch": 0.5574406204556471, + "grad_norm": 1.3713922500610352, + "learning_rate": 2.4365317525988885e-05, + "loss": 1.1781, + "step": 1150 + }, + { + "epoch": 0.5579253514299564, + "grad_norm": 1.5723742246627808, + "learning_rate": 2.4323015383096643e-05, + "loss": 1.4846, + "step": 1151 + }, + { + "epoch": 0.5584100824042656, + "grad_norm": 1.2833575010299683, + "learning_rate": 2.4280715179850506e-05, + "loss": 1.1875, + "step": 1152 + }, + { + "epoch": 0.5588948133785749, + "grad_norm": 1.355686068534851, + "learning_rate": 2.4238417037445875e-05, + "loss": 1.1902, + "step": 1153 + }, + { + "epoch": 0.5593795443528842, + "grad_norm": 1.415571689605713, + "learning_rate": 2.419612107707225e-05, + "loss": 1.5119, + "step": 1154 + }, + { + "epoch": 0.5598642753271934, + "grad_norm": 1.4047833681106567, + "learning_rate": 2.415382741991289e-05, + "loss": 1.3724, + "step": 1155 + }, + { + "epoch": 0.5603490063015026, + "grad_norm": 1.8079580068588257, + "learning_rate": 2.4111536187144425e-05, + "loss": 1.6128, + "step": 1156 + }, + { + "epoch": 0.5608337372758119, + "grad_norm": 1.410330057144165, + "learning_rate": 2.406924749993657e-05, + "loss": 1.3897, + "step": 1157 + }, + { + "epoch": 0.5613184682501212, + "grad_norm": 1.3198529481887817, + "learning_rate": 2.4026961479451733e-05, + "loss": 1.0365, + "step": 1158 + }, + { + "epoch": 0.5618031992244304, + "grad_norm": 1.3807947635650635, + "learning_rate": 2.3984678246844677e-05, + "loss": 1.3613, + "step": 1159 + }, + { + "epoch": 0.5622879301987397, + "grad_norm": 3.3659908771514893, + "learning_rate": 2.3942397923262204e-05, + "loss": 1.4527, + "step": 1160 + }, + { + "epoch": 0.5627726611730489, + "grad_norm": 1.3476786613464355, + "learning_rate": 2.3900120629842732e-05, + "loss": 1.1218, + "step": 1161 + }, + { + "epoch": 0.5632573921473583, + "grad_norm": 1.6585768461227417, + "learning_rate": 2.3857846487716038e-05, + "loss": 2.0664, + "step": 1162 + }, + { + "epoch": 0.5637421231216675, + "grad_norm": 1.4534820318222046, + "learning_rate": 2.3815575618002856e-05, + "loss": 1.466, + "step": 1163 + }, + { + "epoch": 0.5642268540959767, + "grad_norm": 1.3563114404678345, + "learning_rate": 2.3773308141814552e-05, + "loss": 1.3486, + "step": 1164 + }, + { + "epoch": 0.564711585070286, + "grad_norm": 1.4399093389511108, + "learning_rate": 2.3731044180252756e-05, + "loss": 1.3011, + "step": 1165 + }, + { + "epoch": 0.5651963160445953, + "grad_norm": 1.3951159715652466, + "learning_rate": 2.3688783854409045e-05, + "loss": 1.159, + "step": 1166 + }, + { + "epoch": 0.5656810470189045, + "grad_norm": 1.4155405759811401, + "learning_rate": 2.3646527285364565e-05, + "loss": 1.3806, + "step": 1167 + }, + { + "epoch": 0.5661657779932138, + "grad_norm": 1.3713849782943726, + "learning_rate": 2.3604274594189695e-05, + "loss": 1.4719, + "step": 1168 + }, + { + "epoch": 0.566650508967523, + "grad_norm": 1.5539053678512573, + "learning_rate": 2.3562025901943726e-05, + "loss": 1.5451, + "step": 1169 + }, + { + "epoch": 0.5671352399418322, + "grad_norm": 1.4450373649597168, + "learning_rate": 2.351978132967447e-05, + "loss": 1.3855, + "step": 1170 + }, + { + "epoch": 0.5676199709161416, + "grad_norm": 1.4404420852661133, + "learning_rate": 2.347754099841793e-05, + "loss": 1.7035, + "step": 1171 + }, + { + "epoch": 0.5681047018904508, + "grad_norm": 1.4115735292434692, + "learning_rate": 2.3435305029197984e-05, + "loss": 1.4108, + "step": 1172 + }, + { + "epoch": 0.56858943286476, + "grad_norm": 1.4475492238998413, + "learning_rate": 2.3393073543025996e-05, + "loss": 1.2907, + "step": 1173 + }, + { + "epoch": 0.5690741638390693, + "grad_norm": 1.3643134832382202, + "learning_rate": 2.335084666090048e-05, + "loss": 1.3279, + "step": 1174 + }, + { + "epoch": 0.5695588948133786, + "grad_norm": 1.480979323387146, + "learning_rate": 2.3308624503806772e-05, + "loss": 1.6189, + "step": 1175 + }, + { + "epoch": 0.5700436257876879, + "grad_norm": 1.3969759941101074, + "learning_rate": 2.3266407192716666e-05, + "loss": 1.472, + "step": 1176 + }, + { + "epoch": 0.5705283567619971, + "grad_norm": 1.454965591430664, + "learning_rate": 2.3224194848588066e-05, + "loss": 1.0431, + "step": 1177 + }, + { + "epoch": 0.5710130877363063, + "grad_norm": 1.3337841033935547, + "learning_rate": 2.3181987592364655e-05, + "loss": 1.2462, + "step": 1178 + }, + { + "epoch": 0.5714978187106156, + "grad_norm": 1.3526654243469238, + "learning_rate": 2.3139785544975527e-05, + "loss": 1.3556, + "step": 1179 + }, + { + "epoch": 0.5719825496849249, + "grad_norm": 1.464104413986206, + "learning_rate": 2.309758882733486e-05, + "loss": 1.594, + "step": 1180 + }, + { + "epoch": 0.5724672806592341, + "grad_norm": 1.5070477724075317, + "learning_rate": 2.305539756034155e-05, + "loss": 1.2799, + "step": 1181 + }, + { + "epoch": 0.5729520116335434, + "grad_norm": 1.747273325920105, + "learning_rate": 2.30132118648789e-05, + "loss": 1.4735, + "step": 1182 + }, + { + "epoch": 0.5734367426078526, + "grad_norm": 2.815830945968628, + "learning_rate": 2.2971031861814223e-05, + "loss": 1.3718, + "step": 1183 + }, + { + "epoch": 0.573921473582162, + "grad_norm": 1.4527679681777954, + "learning_rate": 2.2928857671998538e-05, + "loss": 1.4325, + "step": 1184 + }, + { + "epoch": 0.5744062045564712, + "grad_norm": 1.5000076293945312, + "learning_rate": 2.288668941626621e-05, + "loss": 1.5461, + "step": 1185 + }, + { + "epoch": 0.5748909355307804, + "grad_norm": 1.3602912425994873, + "learning_rate": 2.2844527215434592e-05, + "loss": 1.403, + "step": 1186 + }, + { + "epoch": 0.5753756665050896, + "grad_norm": 1.4031354188919067, + "learning_rate": 2.2802371190303696e-05, + "loss": 1.2408, + "step": 1187 + }, + { + "epoch": 0.575860397479399, + "grad_norm": 1.385236382484436, + "learning_rate": 2.2760221461655833e-05, + "loss": 1.4098, + "step": 1188 + }, + { + "epoch": 0.5763451284537082, + "grad_norm": 1.3685550689697266, + "learning_rate": 2.27180781502553e-05, + "loss": 1.2458, + "step": 1189 + }, + { + "epoch": 0.5768298594280175, + "grad_norm": 1.4121043682098389, + "learning_rate": 2.267594137684796e-05, + "loss": 1.6139, + "step": 1190 + }, + { + "epoch": 0.5773145904023267, + "grad_norm": 1.40248703956604, + "learning_rate": 2.2633811262160977e-05, + "loss": 1.6284, + "step": 1191 + }, + { + "epoch": 0.5777993213766359, + "grad_norm": 1.5229443311691284, + "learning_rate": 2.2591687926902432e-05, + "loss": 1.619, + "step": 1192 + }, + { + "epoch": 0.5782840523509453, + "grad_norm": 1.4838701486587524, + "learning_rate": 2.2549571491760986e-05, + "loss": 1.4843, + "step": 1193 + }, + { + "epoch": 0.5787687833252545, + "grad_norm": 1.4755678176879883, + "learning_rate": 2.2507462077405523e-05, + "loss": 1.4363, + "step": 1194 + }, + { + "epoch": 0.5792535142995637, + "grad_norm": 1.4962295293807983, + "learning_rate": 2.2465359804484806e-05, + "loss": 1.3032, + "step": 1195 + }, + { + "epoch": 0.579738245273873, + "grad_norm": 1.3037718534469604, + "learning_rate": 2.2423264793627148e-05, + "loss": 1.3224, + "step": 1196 + }, + { + "epoch": 0.5802229762481823, + "grad_norm": 1.3326340913772583, + "learning_rate": 2.2381177165440055e-05, + "loss": 1.5501, + "step": 1197 + }, + { + "epoch": 0.5807077072224915, + "grad_norm": 1.484641432762146, + "learning_rate": 2.2339097040509882e-05, + "loss": 1.5076, + "step": 1198 + }, + { + "epoch": 0.5811924381968008, + "grad_norm": 1.4163663387298584, + "learning_rate": 2.2297024539401463e-05, + "loss": 1.482, + "step": 1199 + }, + { + "epoch": 0.58167716917111, + "grad_norm": 1.3082435131072998, + "learning_rate": 2.225495978265782e-05, + "loss": 1.2917, + "step": 1200 + }, + { + "epoch": 0.5821619001454192, + "grad_norm": 1.4738914966583252, + "learning_rate": 2.2212902890799767e-05, + "loss": 1.6231, + "step": 1201 + }, + { + "epoch": 0.5826466311197286, + "grad_norm": 1.4668748378753662, + "learning_rate": 2.2170853984325597e-05, + "loss": 1.2994, + "step": 1202 + }, + { + "epoch": 0.5831313620940378, + "grad_norm": 1.4222160577774048, + "learning_rate": 2.2128813183710716e-05, + "loss": 1.4073, + "step": 1203 + }, + { + "epoch": 0.583616093068347, + "grad_norm": 1.6111295223236084, + "learning_rate": 2.2086780609407305e-05, + "loss": 1.1423, + "step": 1204 + }, + { + "epoch": 0.5841008240426563, + "grad_norm": 1.3994250297546387, + "learning_rate": 2.2044756381843983e-05, + "loss": 1.6045, + "step": 1205 + }, + { + "epoch": 0.5845855550169656, + "grad_norm": 1.3738783597946167, + "learning_rate": 2.2002740621425442e-05, + "loss": 1.2844, + "step": 1206 + }, + { + "epoch": 0.5850702859912749, + "grad_norm": 1.3349037170410156, + "learning_rate": 2.1960733448532126e-05, + "loss": 1.3563, + "step": 1207 + }, + { + "epoch": 0.5855550169655841, + "grad_norm": 1.464147925376892, + "learning_rate": 2.1918734983519873e-05, + "loss": 1.5385, + "step": 1208 + }, + { + "epoch": 0.5860397479398933, + "grad_norm": 1.2928396463394165, + "learning_rate": 2.1876745346719567e-05, + "loss": 1.1307, + "step": 1209 + }, + { + "epoch": 0.5865244789142026, + "grad_norm": 1.4599294662475586, + "learning_rate": 2.1834764658436797e-05, + "loss": 1.3856, + "step": 1210 + }, + { + "epoch": 0.5870092098885119, + "grad_norm": 1.4603444337844849, + "learning_rate": 2.1792793038951515e-05, + "loss": 1.4164, + "step": 1211 + }, + { + "epoch": 0.5874939408628211, + "grad_norm": 1.4382938146591187, + "learning_rate": 2.1750830608517696e-05, + "loss": 1.661, + "step": 1212 + }, + { + "epoch": 0.5879786718371304, + "grad_norm": 1.448847770690918, + "learning_rate": 2.1708877487362987e-05, + "loss": 1.498, + "step": 1213 + }, + { + "epoch": 0.5884634028114396, + "grad_norm": 1.4043089151382446, + "learning_rate": 2.1666933795688352e-05, + "loss": 1.4716, + "step": 1214 + }, + { + "epoch": 0.588948133785749, + "grad_norm": 1.4823040962219238, + "learning_rate": 2.1624999653667747e-05, + "loss": 1.4944, + "step": 1215 + }, + { + "epoch": 0.5894328647600582, + "grad_norm": 1.5355263948440552, + "learning_rate": 2.1583075181447764e-05, + "loss": 1.5809, + "step": 1216 + }, + { + "epoch": 0.5899175957343674, + "grad_norm": 1.4335788488388062, + "learning_rate": 2.1541160499147297e-05, + "loss": 1.4292, + "step": 1217 + }, + { + "epoch": 0.5904023267086767, + "grad_norm": 1.435947299003601, + "learning_rate": 2.1499255726857183e-05, + "loss": 1.5298, + "step": 1218 + }, + { + "epoch": 0.590887057682986, + "grad_norm": 1.5709877014160156, + "learning_rate": 2.1457360984639853e-05, + "loss": 1.304, + "step": 1219 + }, + { + "epoch": 0.5913717886572952, + "grad_norm": 1.3494329452514648, + "learning_rate": 2.141547639252903e-05, + "loss": 1.4771, + "step": 1220 + }, + { + "epoch": 0.5918565196316045, + "grad_norm": 1.6605241298675537, + "learning_rate": 2.137360207052933e-05, + "loss": 1.8418, + "step": 1221 + }, + { + "epoch": 0.5923412506059137, + "grad_norm": 1.2385108470916748, + "learning_rate": 2.1331738138615958e-05, + "loss": 1.3536, + "step": 1222 + }, + { + "epoch": 0.5928259815802229, + "grad_norm": 1.4509235620498657, + "learning_rate": 2.1289884716734347e-05, + "loss": 1.4231, + "step": 1223 + }, + { + "epoch": 0.5933107125545323, + "grad_norm": 1.551924705505371, + "learning_rate": 2.124804192479982e-05, + "loss": 1.3416, + "step": 1224 + }, + { + "epoch": 0.5937954435288415, + "grad_norm": 1.4182485342025757, + "learning_rate": 2.1206209882697234e-05, + "loss": 1.7183, + "step": 1225 + }, + { + "epoch": 0.5942801745031507, + "grad_norm": 2.135904550552368, + "learning_rate": 2.1164388710280654e-05, + "loss": 1.2838, + "step": 1226 + }, + { + "epoch": 0.59476490547746, + "grad_norm": 1.4602768421173096, + "learning_rate": 2.1122578527373016e-05, + "loss": 1.3902, + "step": 1227 + }, + { + "epoch": 0.5952496364517693, + "grad_norm": 1.485456943511963, + "learning_rate": 2.1080779453765727e-05, + "loss": 1.4738, + "step": 1228 + }, + { + "epoch": 0.5957343674260785, + "grad_norm": 1.4163349866867065, + "learning_rate": 2.1038991609218407e-05, + "loss": 1.2601, + "step": 1229 + }, + { + "epoch": 0.5962190984003878, + "grad_norm": 1.474471926689148, + "learning_rate": 2.099721511345849e-05, + "loss": 1.633, + "step": 1230 + }, + { + "epoch": 0.596703829374697, + "grad_norm": 1.3297244310379028, + "learning_rate": 2.0955450086180882e-05, + "loss": 1.1103, + "step": 1231 + }, + { + "epoch": 0.5971885603490062, + "grad_norm": 1.4120020866394043, + "learning_rate": 2.091369664704766e-05, + "loss": 1.469, + "step": 1232 + }, + { + "epoch": 0.5976732913233156, + "grad_norm": 1.4585497379302979, + "learning_rate": 2.0871954915687658e-05, + "loss": 1.4362, + "step": 1233 + }, + { + "epoch": 0.5981580222976248, + "grad_norm": 1.4092267751693726, + "learning_rate": 2.08302250116962e-05, + "loss": 1.3138, + "step": 1234 + }, + { + "epoch": 0.5986427532719341, + "grad_norm": 1.3591123819351196, + "learning_rate": 2.0788507054634714e-05, + "loss": 1.3283, + "step": 1235 + }, + { + "epoch": 0.5991274842462433, + "grad_norm": 1.3640435934066772, + "learning_rate": 2.074680116403039e-05, + "loss": 1.4952, + "step": 1236 + }, + { + "epoch": 0.5996122152205526, + "grad_norm": 1.4771053791046143, + "learning_rate": 2.070510745937586e-05, + "loss": 1.6309, + "step": 1237 + }, + { + "epoch": 0.6000969461948619, + "grad_norm": 1.4691507816314697, + "learning_rate": 2.066342606012882e-05, + "loss": 1.348, + "step": 1238 + }, + { + "epoch": 0.6005816771691711, + "grad_norm": 1.3989927768707275, + "learning_rate": 2.0621757085711734e-05, + "loss": 1.4557, + "step": 1239 + }, + { + "epoch": 0.6010664081434803, + "grad_norm": 1.4642021656036377, + "learning_rate": 2.058010065551145e-05, + "loss": 1.5403, + "step": 1240 + }, + { + "epoch": 0.6015511391177897, + "grad_norm": 1.473580241203308, + "learning_rate": 2.0538456888878878e-05, + "loss": 1.3097, + "step": 1241 + }, + { + "epoch": 0.6020358700920989, + "grad_norm": 1.3591980934143066, + "learning_rate": 2.0496825905128665e-05, + "loss": 1.4441, + "step": 1242 + }, + { + "epoch": 0.6025206010664081, + "grad_norm": 1.3612762689590454, + "learning_rate": 2.04552078235388e-05, + "loss": 1.2061, + "step": 1243 + }, + { + "epoch": 0.6030053320407174, + "grad_norm": 1.4053421020507812, + "learning_rate": 2.0413602763350337e-05, + "loss": 1.3619, + "step": 1244 + }, + { + "epoch": 0.6034900630150266, + "grad_norm": 1.2905552387237549, + "learning_rate": 2.0372010843766996e-05, + "loss": 1.1999, + "step": 1245 + }, + { + "epoch": 0.603974793989336, + "grad_norm": 1.4193109273910522, + "learning_rate": 2.0330432183954867e-05, + "loss": 1.512, + "step": 1246 + }, + { + "epoch": 0.6044595249636452, + "grad_norm": 1.3693450689315796, + "learning_rate": 2.0288866903042054e-05, + "loss": 1.3532, + "step": 1247 + }, + { + "epoch": 0.6049442559379544, + "grad_norm": 1.4601976871490479, + "learning_rate": 2.0247315120118284e-05, + "loss": 1.4071, + "step": 1248 + }, + { + "epoch": 0.6054289869122637, + "grad_norm": 1.480519413948059, + "learning_rate": 2.0205776954234663e-05, + "loss": 1.5318, + "step": 1249 + }, + { + "epoch": 0.605913717886573, + "grad_norm": 1.3617873191833496, + "learning_rate": 2.0164252524403263e-05, + "loss": 1.3624, + "step": 1250 + }, + { + "epoch": 0.6063984488608822, + "grad_norm": 1.3958122730255127, + "learning_rate": 2.0122741949596797e-05, + "loss": 1.0867, + "step": 1251 + }, + { + "epoch": 0.6068831798351915, + "grad_norm": 1.5380889177322388, + "learning_rate": 2.0081245348748286e-05, + "loss": 1.7665, + "step": 1252 + }, + { + "epoch": 0.6073679108095007, + "grad_norm": 1.4936906099319458, + "learning_rate": 2.0039762840750707e-05, + "loss": 1.1883, + "step": 1253 + }, + { + "epoch": 0.6078526417838099, + "grad_norm": 1.3765724897384644, + "learning_rate": 1.999829454445667e-05, + "loss": 1.2834, + "step": 1254 + }, + { + "epoch": 0.6083373727581193, + "grad_norm": 1.4031294584274292, + "learning_rate": 1.995684057867806e-05, + "loss": 1.2124, + "step": 1255 + }, + { + "epoch": 0.6088221037324285, + "grad_norm": 1.514238715171814, + "learning_rate": 1.991540106218572e-05, + "loss": 1.6542, + "step": 1256 + }, + { + "epoch": 0.6093068347067377, + "grad_norm": 1.4218300580978394, + "learning_rate": 1.9873976113709048e-05, + "loss": 1.6589, + "step": 1257 + }, + { + "epoch": 0.609791565681047, + "grad_norm": 1.5005664825439453, + "learning_rate": 1.983256585193575e-05, + "loss": 1.445, + "step": 1258 + }, + { + "epoch": 0.6102762966553563, + "grad_norm": 1.571577548980713, + "learning_rate": 1.979117039551144e-05, + "loss": 1.7245, + "step": 1259 + }, + { + "epoch": 0.6107610276296656, + "grad_norm": 1.4720330238342285, + "learning_rate": 1.9749789863039297e-05, + "loss": 1.4244, + "step": 1260 + }, + { + "epoch": 0.6112457586039748, + "grad_norm": 1.5714093446731567, + "learning_rate": 1.970842437307976e-05, + "loss": 1.6333, + "step": 1261 + }, + { + "epoch": 0.611730489578284, + "grad_norm": 1.4255337715148926, + "learning_rate": 1.9667074044150165e-05, + "loss": 1.5165, + "step": 1262 + }, + { + "epoch": 0.6122152205525934, + "grad_norm": 1.590695858001709, + "learning_rate": 1.96257389947244e-05, + "loss": 1.33, + "step": 1263 + }, + { + "epoch": 0.6126999515269026, + "grad_norm": 1.4347730875015259, + "learning_rate": 1.9584419343232584e-05, + "loss": 1.3907, + "step": 1264 + }, + { + "epoch": 0.6131846825012118, + "grad_norm": 1.46970796585083, + "learning_rate": 1.954311520806072e-05, + "loss": 1.3961, + "step": 1265 + }, + { + "epoch": 0.6136694134755211, + "grad_norm": 1.3484935760498047, + "learning_rate": 1.9501826707550366e-05, + "loss": 1.4027, + "step": 1266 + }, + { + "epoch": 0.6141541444498303, + "grad_norm": 1.3133972883224487, + "learning_rate": 1.9460553959998244e-05, + "loss": 1.3246, + "step": 1267 + }, + { + "epoch": 0.6146388754241396, + "grad_norm": 1.3001095056533813, + "learning_rate": 1.9419297083655976e-05, + "loss": 1.3312, + "step": 1268 + }, + { + "epoch": 0.6151236063984489, + "grad_norm": 1.4388257265090942, + "learning_rate": 1.937805619672971e-05, + "loss": 1.4682, + "step": 1269 + }, + { + "epoch": 0.6156083373727581, + "grad_norm": 1.4479858875274658, + "learning_rate": 1.9336831417379777e-05, + "loss": 1.4709, + "step": 1270 + }, + { + "epoch": 0.6160930683470673, + "grad_norm": 1.2688299417495728, + "learning_rate": 1.9295622863720356e-05, + "loss": 0.9973, + "step": 1271 + }, + { + "epoch": 0.6165777993213767, + "grad_norm": 1.4602196216583252, + "learning_rate": 1.9254430653819127e-05, + "loss": 1.5403, + "step": 1272 + }, + { + "epoch": 0.6170625302956859, + "grad_norm": 1.4173598289489746, + "learning_rate": 1.9213254905696964e-05, + "loss": 1.3238, + "step": 1273 + }, + { + "epoch": 0.6175472612699952, + "grad_norm": 1.3910014629364014, + "learning_rate": 1.9172095737327566e-05, + "loss": 1.347, + "step": 1274 + }, + { + "epoch": 0.6180319922443044, + "grad_norm": 1.4783943891525269, + "learning_rate": 1.9130953266637127e-05, + "loss": 1.6262, + "step": 1275 + }, + { + "epoch": 0.6185167232186136, + "grad_norm": 1.5285893678665161, + "learning_rate": 1.9089827611504013e-05, + "loss": 1.624, + "step": 1276 + }, + { + "epoch": 0.619001454192923, + "grad_norm": 1.4484494924545288, + "learning_rate": 1.9048718889758375e-05, + "loss": 1.3973, + "step": 1277 + }, + { + "epoch": 0.6194861851672322, + "grad_norm": 1.3599704504013062, + "learning_rate": 1.900762721918189e-05, + "loss": 1.3223, + "step": 1278 + }, + { + "epoch": 0.6199709161415414, + "grad_norm": 1.4407846927642822, + "learning_rate": 1.8966552717507364e-05, + "loss": 1.2713, + "step": 1279 + }, + { + "epoch": 0.6204556471158507, + "grad_norm": 1.4449760913848877, + "learning_rate": 1.8925495502418406e-05, + "loss": 1.6689, + "step": 1280 + }, + { + "epoch": 0.62094037809016, + "grad_norm": 1.5764880180358887, + "learning_rate": 1.8884455691549105e-05, + "loss": 1.5437, + "step": 1281 + }, + { + "epoch": 0.6214251090644692, + "grad_norm": 1.5581961870193481, + "learning_rate": 1.8843433402483683e-05, + "loss": 1.4253, + "step": 1282 + }, + { + "epoch": 0.6219098400387785, + "grad_norm": 1.384125828742981, + "learning_rate": 1.8802428752756172e-05, + "loss": 1.4686, + "step": 1283 + }, + { + "epoch": 0.6223945710130877, + "grad_norm": 1.4908874034881592, + "learning_rate": 1.876144185985003e-05, + "loss": 1.3817, + "step": 1284 + }, + { + "epoch": 0.622879301987397, + "grad_norm": 1.441921591758728, + "learning_rate": 1.8720472841197884e-05, + "loss": 1.1528, + "step": 1285 + }, + { + "epoch": 0.6233640329617063, + "grad_norm": 5.794642925262451, + "learning_rate": 1.867952181418111e-05, + "loss": 1.5045, + "step": 1286 + }, + { + "epoch": 0.6238487639360155, + "grad_norm": 1.430221676826477, + "learning_rate": 1.8638588896129557e-05, + "loss": 1.3444, + "step": 1287 + }, + { + "epoch": 0.6243334949103247, + "grad_norm": 1.3817723989486694, + "learning_rate": 1.8597674204321185e-05, + "loss": 1.4123, + "step": 1288 + }, + { + "epoch": 0.624818225884634, + "grad_norm": 1.4198659658432007, + "learning_rate": 1.8556777855981737e-05, + "loss": 1.76, + "step": 1289 + }, + { + "epoch": 0.6253029568589433, + "grad_norm": 1.387041449546814, + "learning_rate": 1.85158999682844e-05, + "loss": 1.4046, + "step": 1290 + }, + { + "epoch": 0.6257876878332526, + "grad_norm": 1.4337729215621948, + "learning_rate": 1.8475040658349454e-05, + "loss": 1.6913, + "step": 1291 + }, + { + "epoch": 0.6262724188075618, + "grad_norm": 1.2845624685287476, + "learning_rate": 1.843420004324397e-05, + "loss": 1.3359, + "step": 1292 + }, + { + "epoch": 0.626757149781871, + "grad_norm": 1.3890070915222168, + "learning_rate": 1.839337823998145e-05, + "loss": 1.3585, + "step": 1293 + }, + { + "epoch": 0.6272418807561804, + "grad_norm": 1.6244319677352905, + "learning_rate": 1.8352575365521503e-05, + "loss": 1.302, + "step": 1294 + }, + { + "epoch": 0.6277266117304896, + "grad_norm": 1.373167634010315, + "learning_rate": 1.8311791536769483e-05, + "loss": 1.3026, + "step": 1295 + }, + { + "epoch": 0.6282113427047988, + "grad_norm": 1.3994495868682861, + "learning_rate": 1.8271026870576197e-05, + "loss": 1.273, + "step": 1296 + }, + { + "epoch": 0.6286960736791081, + "grad_norm": 1.373883843421936, + "learning_rate": 1.8230281483737537e-05, + "loss": 1.5305, + "step": 1297 + }, + { + "epoch": 0.6291808046534173, + "grad_norm": 1.4813038110733032, + "learning_rate": 1.818955549299418e-05, + "loss": 1.244, + "step": 1298 + }, + { + "epoch": 0.6296655356277266, + "grad_norm": 1.3685561418533325, + "learning_rate": 1.8148849015031195e-05, + "loss": 1.4916, + "step": 1299 + }, + { + "epoch": 0.6301502666020359, + "grad_norm": 1.4443938732147217, + "learning_rate": 1.8108162166477766e-05, + "loss": 1.2453, + "step": 1300 + }, + { + "epoch": 0.6306349975763451, + "grad_norm": 1.4075669050216675, + "learning_rate": 1.806749506390684e-05, + "loss": 1.5644, + "step": 1301 + }, + { + "epoch": 0.6311197285506543, + "grad_norm": 1.4213589429855347, + "learning_rate": 1.802684782383478e-05, + "loss": 1.3007, + "step": 1302 + }, + { + "epoch": 0.6316044595249637, + "grad_norm": 1.3473411798477173, + "learning_rate": 1.798622056272104e-05, + "loss": 1.1438, + "step": 1303 + }, + { + "epoch": 0.6320891904992729, + "grad_norm": 1.3807307481765747, + "learning_rate": 1.7945613396967837e-05, + "loss": 1.3414, + "step": 1304 + }, + { + "epoch": 0.6325739214735822, + "grad_norm": 1.6303166151046753, + "learning_rate": 1.790502644291982e-05, + "loss": 1.7151, + "step": 1305 + }, + { + "epoch": 0.6330586524478914, + "grad_norm": 1.4952675104141235, + "learning_rate": 1.78644598168637e-05, + "loss": 1.66, + "step": 1306 + }, + { + "epoch": 0.6335433834222006, + "grad_norm": 1.3126227855682373, + "learning_rate": 1.7823913635027973e-05, + "loss": 1.1817, + "step": 1307 + }, + { + "epoch": 0.63402811439651, + "grad_norm": 1.4245887994766235, + "learning_rate": 1.7783388013582553e-05, + "loss": 1.4009, + "step": 1308 + }, + { + "epoch": 0.6345128453708192, + "grad_norm": 1.420230507850647, + "learning_rate": 1.7742883068638447e-05, + "loss": 1.408, + "step": 1309 + }, + { + "epoch": 0.6349975763451284, + "grad_norm": 1.3932969570159912, + "learning_rate": 1.770239891624741e-05, + "loss": 1.5593, + "step": 1310 + }, + { + "epoch": 0.6354823073194377, + "grad_norm": 1.392712116241455, + "learning_rate": 1.7661935672401632e-05, + "loss": 1.2642, + "step": 1311 + }, + { + "epoch": 0.635967038293747, + "grad_norm": 1.4296149015426636, + "learning_rate": 1.7621493453033405e-05, + "loss": 1.4418, + "step": 1312 + }, + { + "epoch": 0.6364517692680562, + "grad_norm": 1.367656946182251, + "learning_rate": 1.7581072374014777e-05, + "loss": 1.3111, + "step": 1313 + }, + { + "epoch": 0.6369365002423655, + "grad_norm": 1.353005290031433, + "learning_rate": 1.7540672551157227e-05, + "loss": 1.1314, + "step": 1314 + }, + { + "epoch": 0.6374212312166747, + "grad_norm": 1.514885663986206, + "learning_rate": 1.7500294100211315e-05, + "loss": 1.265, + "step": 1315 + }, + { + "epoch": 0.637905962190984, + "grad_norm": 1.365065336227417, + "learning_rate": 1.7459937136866392e-05, + "loss": 1.3412, + "step": 1316 + }, + { + "epoch": 0.6383906931652933, + "grad_norm": 1.6574945449829102, + "learning_rate": 1.7419601776750237e-05, + "loss": 1.5467, + "step": 1317 + }, + { + "epoch": 0.6388754241396025, + "grad_norm": 1.4217700958251953, + "learning_rate": 1.737928813542873e-05, + "loss": 1.5412, + "step": 1318 + }, + { + "epoch": 0.6393601551139118, + "grad_norm": 1.3700860738754272, + "learning_rate": 1.7338996328405526e-05, + "loss": 1.3197, + "step": 1319 + }, + { + "epoch": 0.639844886088221, + "grad_norm": 1.380444049835205, + "learning_rate": 1.7298726471121723e-05, + "loss": 1.6721, + "step": 1320 + }, + { + "epoch": 0.6403296170625303, + "grad_norm": 1.3262513875961304, + "learning_rate": 1.725847867895553e-05, + "loss": 1.274, + "step": 1321 + }, + { + "epoch": 0.6408143480368396, + "grad_norm": 1.5101852416992188, + "learning_rate": 1.7218253067221933e-05, + "loss": 1.4451, + "step": 1322 + }, + { + "epoch": 0.6412990790111488, + "grad_norm": 1.5045435428619385, + "learning_rate": 1.7178049751172366e-05, + "loss": 1.5808, + "step": 1323 + }, + { + "epoch": 0.641783809985458, + "grad_norm": 1.5001122951507568, + "learning_rate": 1.7137868845994397e-05, + "loss": 1.3329, + "step": 1324 + }, + { + "epoch": 0.6422685409597674, + "grad_norm": 1.3572721481323242, + "learning_rate": 1.709771046681137e-05, + "loss": 1.0717, + "step": 1325 + }, + { + "epoch": 0.6427532719340766, + "grad_norm": 1.542609453201294, + "learning_rate": 1.7057574728682095e-05, + "loss": 1.3739, + "step": 1326 + }, + { + "epoch": 0.6432380029083858, + "grad_norm": 1.4278963804244995, + "learning_rate": 1.7017461746600506e-05, + "loss": 1.37, + "step": 1327 + }, + { + "epoch": 0.6437227338826951, + "grad_norm": 1.5175418853759766, + "learning_rate": 1.6977371635495347e-05, + "loss": 1.9564, + "step": 1328 + }, + { + "epoch": 0.6442074648570043, + "grad_norm": 1.5065232515335083, + "learning_rate": 1.6937304510229834e-05, + "loss": 1.6646, + "step": 1329 + }, + { + "epoch": 0.6446921958313137, + "grad_norm": 1.5196707248687744, + "learning_rate": 1.6897260485601318e-05, + "loss": 1.4758, + "step": 1330 + }, + { + "epoch": 0.6451769268056229, + "grad_norm": 1.4596264362335205, + "learning_rate": 1.685723967634097e-05, + "loss": 1.3208, + "step": 1331 + }, + { + "epoch": 0.6456616577799321, + "grad_norm": 1.43583345413208, + "learning_rate": 1.681724219711344e-05, + "loss": 1.5051, + "step": 1332 + }, + { + "epoch": 0.6461463887542414, + "grad_norm": 1.985482931137085, + "learning_rate": 1.6777268162516548e-05, + "loss": 1.5165, + "step": 1333 + }, + { + "epoch": 0.6466311197285507, + "grad_norm": 1.4382468461990356, + "learning_rate": 1.6737317687080922e-05, + "loss": 1.889, + "step": 1334 + }, + { + "epoch": 0.6471158507028599, + "grad_norm": 1.4122154712677002, + "learning_rate": 1.6697390885269705e-05, + "loss": 1.3209, + "step": 1335 + }, + { + "epoch": 0.6476005816771692, + "grad_norm": 1.277051568031311, + "learning_rate": 1.6657487871478212e-05, + "loss": 1.1666, + "step": 1336 + }, + { + "epoch": 0.6480853126514784, + "grad_norm": 1.3974394798278809, + "learning_rate": 1.661760876003358e-05, + "loss": 1.6071, + "step": 1337 + }, + { + "epoch": 0.6485700436257877, + "grad_norm": 1.503825068473816, + "learning_rate": 1.65777536651945e-05, + "loss": 1.1412, + "step": 1338 + }, + { + "epoch": 0.649054774600097, + "grad_norm": 1.4791207313537598, + "learning_rate": 1.6537922701150828e-05, + "loss": 1.2004, + "step": 1339 + }, + { + "epoch": 0.6495395055744062, + "grad_norm": 1.4980095624923706, + "learning_rate": 1.6498115982023285e-05, + "loss": 1.5021, + "step": 1340 + }, + { + "epoch": 0.6500242365487154, + "grad_norm": 1.372834324836731, + "learning_rate": 1.645833362186313e-05, + "loss": 1.318, + "step": 1341 + }, + { + "epoch": 0.6505089675230247, + "grad_norm": 1.4026696681976318, + "learning_rate": 1.6418575734651832e-05, + "loss": 1.4896, + "step": 1342 + }, + { + "epoch": 0.650993698497334, + "grad_norm": 1.454119324684143, + "learning_rate": 1.6378842434300746e-05, + "loss": 1.3353, + "step": 1343 + }, + { + "epoch": 0.6514784294716433, + "grad_norm": 1.4449537992477417, + "learning_rate": 1.633913383465076e-05, + "loss": 1.7903, + "step": 1344 + }, + { + "epoch": 0.6519631604459525, + "grad_norm": 1.4829782247543335, + "learning_rate": 1.6299450049472022e-05, + "loss": 1.4147, + "step": 1345 + }, + { + "epoch": 0.6524478914202617, + "grad_norm": 1.4531413316726685, + "learning_rate": 1.6259791192463557e-05, + "loss": 1.0013, + "step": 1346 + }, + { + "epoch": 0.6529326223945711, + "grad_norm": 1.5042320489883423, + "learning_rate": 1.6220157377252994e-05, + "loss": 1.7003, + "step": 1347 + }, + { + "epoch": 0.6534173533688803, + "grad_norm": 1.499081015586853, + "learning_rate": 1.6180548717396198e-05, + "loss": 1.4914, + "step": 1348 + }, + { + "epoch": 0.6539020843431895, + "grad_norm": 1.2727124691009521, + "learning_rate": 1.6140965326376954e-05, + "loss": 1.0353, + "step": 1349 + }, + { + "epoch": 0.6543868153174988, + "grad_norm": 1.4354982376098633, + "learning_rate": 1.6101407317606666e-05, + "loss": 1.2793, + "step": 1350 + }, + { + "epoch": 0.654871546291808, + "grad_norm": 1.458020806312561, + "learning_rate": 1.6061874804424e-05, + "loss": 1.502, + "step": 1351 + }, + { + "epoch": 0.6553562772661173, + "grad_norm": 1.4475706815719604, + "learning_rate": 1.602236790009458e-05, + "loss": 1.6151, + "step": 1352 + }, + { + "epoch": 0.6558410082404266, + "grad_norm": 1.4461567401885986, + "learning_rate": 1.5982886717810676e-05, + "loss": 1.6717, + "step": 1353 + }, + { + "epoch": 0.6563257392147358, + "grad_norm": 1.4085007905960083, + "learning_rate": 1.5943431370690815e-05, + "loss": 1.4512, + "step": 1354 + }, + { + "epoch": 0.656810470189045, + "grad_norm": 1.3767906427383423, + "learning_rate": 1.590400197177954e-05, + "loss": 1.2589, + "step": 1355 + }, + { + "epoch": 0.6572952011633544, + "grad_norm": 1.5227607488632202, + "learning_rate": 1.5864598634047046e-05, + "loss": 1.4397, + "step": 1356 + }, + { + "epoch": 0.6577799321376636, + "grad_norm": 1.4575637578964233, + "learning_rate": 1.5825221470388847e-05, + "loss": 1.3085, + "step": 1357 + }, + { + "epoch": 0.6582646631119728, + "grad_norm": 1.2845321893692017, + "learning_rate": 1.5785870593625472e-05, + "loss": 1.3451, + "step": 1358 + }, + { + "epoch": 0.6587493940862821, + "grad_norm": 1.3598212003707886, + "learning_rate": 1.574654611650214e-05, + "loss": 1.4667, + "step": 1359 + }, + { + "epoch": 0.6592341250605914, + "grad_norm": 1.384535551071167, + "learning_rate": 1.5707248151688424e-05, + "loss": 1.3087, + "step": 1360 + }, + { + "epoch": 0.6597188560349007, + "grad_norm": 1.3146976232528687, + "learning_rate": 1.5667976811777932e-05, + "loss": 1.4136, + "step": 1361 + }, + { + "epoch": 0.6602035870092099, + "grad_norm": 1.5279932022094727, + "learning_rate": 1.5628732209287993e-05, + "loss": 1.3477, + "step": 1362 + }, + { + "epoch": 0.6606883179835191, + "grad_norm": 1.3585039377212524, + "learning_rate": 1.558951445665935e-05, + "loss": 1.3886, + "step": 1363 + }, + { + "epoch": 0.6611730489578284, + "grad_norm": 1.3408361673355103, + "learning_rate": 1.555032366625577e-05, + "loss": 1.677, + "step": 1364 + }, + { + "epoch": 0.6616577799321377, + "grad_norm": 1.3672118186950684, + "learning_rate": 1.5511159950363814e-05, + "loss": 1.3577, + "step": 1365 + }, + { + "epoch": 0.6621425109064469, + "grad_norm": 1.3929879665374756, + "learning_rate": 1.5472023421192445e-05, + "loss": 1.3506, + "step": 1366 + }, + { + "epoch": 0.6626272418807562, + "grad_norm": 1.4646409749984741, + "learning_rate": 1.5432914190872757e-05, + "loss": 1.6497, + "step": 1367 + }, + { + "epoch": 0.6631119728550654, + "grad_norm": 1.426300287246704, + "learning_rate": 1.539383237145761e-05, + "loss": 1.3643, + "step": 1368 + }, + { + "epoch": 0.6635967038293747, + "grad_norm": 1.4488145112991333, + "learning_rate": 1.5354778074921332e-05, + "loss": 1.2896, + "step": 1369 + }, + { + "epoch": 0.664081434803684, + "grad_norm": 1.503942847251892, + "learning_rate": 1.5315751413159394e-05, + "loss": 1.5616, + "step": 1370 + }, + { + "epoch": 0.6645661657779932, + "grad_norm": 1.4608261585235596, + "learning_rate": 1.52767524979881e-05, + "loss": 1.3562, + "step": 1371 + }, + { + "epoch": 0.6650508967523024, + "grad_norm": 1.4247395992279053, + "learning_rate": 1.5237781441144256e-05, + "loss": 1.4961, + "step": 1372 + }, + { + "epoch": 0.6655356277266117, + "grad_norm": 1.3943641185760498, + "learning_rate": 1.5198838354284817e-05, + "loss": 1.5362, + "step": 1373 + }, + { + "epoch": 0.666020358700921, + "grad_norm": 1.42288339138031, + "learning_rate": 1.515992334898664e-05, + "loss": 1.2119, + "step": 1374 + }, + { + "epoch": 0.6665050896752303, + "grad_norm": 1.3145508766174316, + "learning_rate": 1.512103653674612e-05, + "loss": 1.6042, + "step": 1375 + }, + { + "epoch": 0.6669898206495395, + "grad_norm": 1.4619483947753906, + "learning_rate": 1.5082178028978853e-05, + "loss": 1.4172, + "step": 1376 + }, + { + "epoch": 0.6674745516238487, + "grad_norm": 1.375780463218689, + "learning_rate": 1.5043347937019358e-05, + "loss": 1.4588, + "step": 1377 + }, + { + "epoch": 0.6679592825981581, + "grad_norm": 1.339187502861023, + "learning_rate": 1.5004546372120736e-05, + "loss": 1.4457, + "step": 1378 + }, + { + "epoch": 0.6684440135724673, + "grad_norm": 1.310111165046692, + "learning_rate": 1.4965773445454349e-05, + "loss": 1.1335, + "step": 1379 + }, + { + "epoch": 0.6689287445467765, + "grad_norm": 1.339221477508545, + "learning_rate": 1.492702926810951e-05, + "loss": 1.2751, + "step": 1380 + }, + { + "epoch": 0.6694134755210858, + "grad_norm": 6.000275611877441, + "learning_rate": 1.4888313951093169e-05, + "loss": 1.1046, + "step": 1381 + }, + { + "epoch": 0.6698982064953951, + "grad_norm": 1.5101951360702515, + "learning_rate": 1.4849627605329583e-05, + "loss": 1.496, + "step": 1382 + }, + { + "epoch": 0.6703829374697043, + "grad_norm": 1.4664740562438965, + "learning_rate": 1.481097034165998e-05, + "loss": 1.5279, + "step": 1383 + }, + { + "epoch": 0.6708676684440136, + "grad_norm": 1.461575984954834, + "learning_rate": 1.4772342270842299e-05, + "loss": 1.3002, + "step": 1384 + }, + { + "epoch": 0.6713523994183228, + "grad_norm": 1.4403537511825562, + "learning_rate": 1.4733743503550818e-05, + "loss": 1.7751, + "step": 1385 + }, + { + "epoch": 0.671837130392632, + "grad_norm": 1.3830691576004028, + "learning_rate": 1.4695174150375865e-05, + "loss": 1.2238, + "step": 1386 + }, + { + "epoch": 0.6723218613669414, + "grad_norm": 1.3348417282104492, + "learning_rate": 1.4656634321823493e-05, + "loss": 1.3973, + "step": 1387 + }, + { + "epoch": 0.6728065923412506, + "grad_norm": 1.3746894598007202, + "learning_rate": 1.461812412831515e-05, + "loss": 1.114, + "step": 1388 + }, + { + "epoch": 0.6732913233155599, + "grad_norm": 1.3570497035980225, + "learning_rate": 1.457964368018739e-05, + "loss": 1.4, + "step": 1389 + }, + { + "epoch": 0.6737760542898691, + "grad_norm": 1.3715555667877197, + "learning_rate": 1.4541193087691535e-05, + "loss": 1.2383, + "step": 1390 + }, + { + "epoch": 0.6742607852641784, + "grad_norm": 1.3852076530456543, + "learning_rate": 1.4502772460993385e-05, + "loss": 1.2747, + "step": 1391 + }, + { + "epoch": 0.6747455162384877, + "grad_norm": 1.203576683998108, + "learning_rate": 1.4464381910172858e-05, + "loss": 0.9592, + "step": 1392 + }, + { + "epoch": 0.6752302472127969, + "grad_norm": 1.6935615539550781, + "learning_rate": 1.4426021545223712e-05, + "loss": 1.7171, + "step": 1393 + }, + { + "epoch": 0.6757149781871061, + "grad_norm": 1.4629912376403809, + "learning_rate": 1.438769147605322e-05, + "loss": 1.4803, + "step": 1394 + }, + { + "epoch": 0.6761997091614154, + "grad_norm": 1.3970519304275513, + "learning_rate": 1.434939181248184e-05, + "loss": 1.5205, + "step": 1395 + }, + { + "epoch": 0.6766844401357247, + "grad_norm": 1.4061120748519897, + "learning_rate": 1.4311122664242954e-05, + "loss": 1.6476, + "step": 1396 + }, + { + "epoch": 0.6771691711100339, + "grad_norm": 1.41211998462677, + "learning_rate": 1.4272884140982462e-05, + "loss": 1.456, + "step": 1397 + }, + { + "epoch": 0.6776539020843432, + "grad_norm": 1.3315012454986572, + "learning_rate": 1.423467635225856e-05, + "loss": 1.4234, + "step": 1398 + }, + { + "epoch": 0.6781386330586524, + "grad_norm": 1.5095267295837402, + "learning_rate": 1.4196499407541359e-05, + "loss": 1.4583, + "step": 1399 + }, + { + "epoch": 0.6786233640329618, + "grad_norm": 1.3793443441390991, + "learning_rate": 1.4158353416212622e-05, + "loss": 1.2173, + "step": 1400 + }, + { + "epoch": 0.679108095007271, + "grad_norm": 1.3824712038040161, + "learning_rate": 1.4120238487565402e-05, + "loss": 1.3028, + "step": 1401 + }, + { + "epoch": 0.6795928259815802, + "grad_norm": 1.375754952430725, + "learning_rate": 1.4082154730803774e-05, + "loss": 1.2412, + "step": 1402 + }, + { + "epoch": 0.6800775569558895, + "grad_norm": 1.4535313844680786, + "learning_rate": 1.4044102255042475e-05, + "loss": 1.5615, + "step": 1403 + }, + { + "epoch": 0.6805622879301988, + "grad_norm": 1.379002332687378, + "learning_rate": 1.4006081169306656e-05, + "loss": 1.5593, + "step": 1404 + }, + { + "epoch": 0.681047018904508, + "grad_norm": 1.3821449279785156, + "learning_rate": 1.3968091582531495e-05, + "loss": 1.2261, + "step": 1405 + }, + { + "epoch": 0.6815317498788173, + "grad_norm": 1.4238389730453491, + "learning_rate": 1.3930133603561957e-05, + "loss": 1.2891, + "step": 1406 + }, + { + "epoch": 0.6820164808531265, + "grad_norm": 1.3251632452011108, + "learning_rate": 1.3892207341152416e-05, + "loss": 1.5867, + "step": 1407 + }, + { + "epoch": 0.6825012118274357, + "grad_norm": 3.028153896331787, + "learning_rate": 1.3854312903966377e-05, + "loss": 1.4604, + "step": 1408 + }, + { + "epoch": 0.6829859428017451, + "grad_norm": 1.363827109336853, + "learning_rate": 1.381645040057619e-05, + "loss": 1.1675, + "step": 1409 + }, + { + "epoch": 0.6834706737760543, + "grad_norm": 1.4435791969299316, + "learning_rate": 1.3778619939462667e-05, + "loss": 1.6013, + "step": 1410 + }, + { + "epoch": 0.6839554047503635, + "grad_norm": 1.4359902143478394, + "learning_rate": 1.3740821629014874e-05, + "loss": 1.6007, + "step": 1411 + }, + { + "epoch": 0.6844401357246728, + "grad_norm": 1.3930972814559937, + "learning_rate": 1.3703055577529686e-05, + "loss": 1.479, + "step": 1412 + }, + { + "epoch": 0.6849248666989821, + "grad_norm": 1.5467122793197632, + "learning_rate": 1.3665321893211618e-05, + "loss": 1.3401, + "step": 1413 + }, + { + "epoch": 0.6854095976732913, + "grad_norm": 1.3341065645217896, + "learning_rate": 1.3627620684172407e-05, + "loss": 1.4378, + "step": 1414 + }, + { + "epoch": 0.6858943286476006, + "grad_norm": 1.5038310289382935, + "learning_rate": 1.3589952058430778e-05, + "loss": 1.3485, + "step": 1415 + }, + { + "epoch": 0.6863790596219098, + "grad_norm": 1.3664665222167969, + "learning_rate": 1.3552316123912063e-05, + "loss": 1.4734, + "step": 1416 + }, + { + "epoch": 0.686863790596219, + "grad_norm": 1.4052238464355469, + "learning_rate": 1.3514712988447972e-05, + "loss": 1.3877, + "step": 1417 + }, + { + "epoch": 0.6873485215705284, + "grad_norm": 1.478548526763916, + "learning_rate": 1.3477142759776207e-05, + "loss": 1.6317, + "step": 1418 + }, + { + "epoch": 0.6878332525448376, + "grad_norm": 1.338326096534729, + "learning_rate": 1.343960554554019e-05, + "loss": 1.3528, + "step": 1419 + }, + { + "epoch": 0.6883179835191469, + "grad_norm": 1.3133000135421753, + "learning_rate": 1.3402101453288785e-05, + "loss": 1.3344, + "step": 1420 + }, + { + "epoch": 0.6888027144934561, + "grad_norm": 1.4202029705047607, + "learning_rate": 1.3364630590475923e-05, + "loss": 1.3458, + "step": 1421 + }, + { + "epoch": 0.6892874454677654, + "grad_norm": 1.3237619400024414, + "learning_rate": 1.3327193064460342e-05, + "loss": 1.2891, + "step": 1422 + }, + { + "epoch": 0.6897721764420747, + "grad_norm": 1.4759186506271362, + "learning_rate": 1.328978898250525e-05, + "loss": 1.7053, + "step": 1423 + }, + { + "epoch": 0.6902569074163839, + "grad_norm": 1.4690028429031372, + "learning_rate": 1.325241845177807e-05, + "loss": 1.5294, + "step": 1424 + }, + { + "epoch": 0.6907416383906931, + "grad_norm": 1.433915376663208, + "learning_rate": 1.3215081579350058e-05, + "loss": 1.2603, + "step": 1425 + }, + { + "epoch": 0.6912263693650024, + "grad_norm": 1.3608239889144897, + "learning_rate": 1.3177778472196068e-05, + "loss": 1.1412, + "step": 1426 + }, + { + "epoch": 0.6917111003393117, + "grad_norm": 1.4877392053604126, + "learning_rate": 1.3140509237194176e-05, + "loss": 1.8585, + "step": 1427 + }, + { + "epoch": 0.692195831313621, + "grad_norm": 1.3863829374313354, + "learning_rate": 1.3103273981125447e-05, + "loss": 1.6264, + "step": 1428 + }, + { + "epoch": 0.6926805622879302, + "grad_norm": 1.4459201097488403, + "learning_rate": 1.3066072810673557e-05, + "loss": 1.4349, + "step": 1429 + }, + { + "epoch": 0.6931652932622394, + "grad_norm": 1.4497578144073486, + "learning_rate": 1.302890583242457e-05, + "loss": 1.2253, + "step": 1430 + }, + { + "epoch": 0.6936500242365488, + "grad_norm": 1.4433369636535645, + "learning_rate": 1.2991773152866515e-05, + "loss": 1.1806, + "step": 1431 + }, + { + "epoch": 0.694134755210858, + "grad_norm": 1.3377454280853271, + "learning_rate": 1.2954674878389223e-05, + "loss": 1.2198, + "step": 1432 + }, + { + "epoch": 0.6946194861851672, + "grad_norm": 1.4302222728729248, + "learning_rate": 1.2917611115283901e-05, + "loss": 1.4984, + "step": 1433 + }, + { + "epoch": 0.6951042171594765, + "grad_norm": 1.5178338289260864, + "learning_rate": 1.2880581969742886e-05, + "loss": 1.3589, + "step": 1434 + }, + { + "epoch": 0.6955889481337858, + "grad_norm": 1.3676424026489258, + "learning_rate": 1.2843587547859361e-05, + "loss": 1.18, + "step": 1435 + }, + { + "epoch": 0.696073679108095, + "grad_norm": 1.4502981901168823, + "learning_rate": 1.2806627955626982e-05, + "loss": 1.3871, + "step": 1436 + }, + { + "epoch": 0.6965584100824043, + "grad_norm": 1.550480604171753, + "learning_rate": 1.2769703298939646e-05, + "loss": 1.2154, + "step": 1437 + }, + { + "epoch": 0.6970431410567135, + "grad_norm": 1.3464187383651733, + "learning_rate": 1.2732813683591121e-05, + "loss": 1.3843, + "step": 1438 + }, + { + "epoch": 0.6975278720310227, + "grad_norm": 1.508142352104187, + "learning_rate": 1.2695959215274816e-05, + "loss": 1.3424, + "step": 1439 + }, + { + "epoch": 0.6980126030053321, + "grad_norm": 1.3689768314361572, + "learning_rate": 1.2659139999583414e-05, + "loss": 1.1724, + "step": 1440 + }, + { + "epoch": 0.6984973339796413, + "grad_norm": 1.5775456428527832, + "learning_rate": 1.2622356142008593e-05, + "loss": 1.2543, + "step": 1441 + }, + { + "epoch": 0.6989820649539505, + "grad_norm": 1.415062427520752, + "learning_rate": 1.2585607747940729e-05, + "loss": 1.4339, + "step": 1442 + }, + { + "epoch": 0.6994667959282598, + "grad_norm": 1.458363652229309, + "learning_rate": 1.2548894922668612e-05, + "loss": 1.499, + "step": 1443 + }, + { + "epoch": 0.6999515269025691, + "grad_norm": 1.39089834690094, + "learning_rate": 1.2512217771379087e-05, + "loss": 1.274, + "step": 1444 + }, + { + "epoch": 0.7004362578768784, + "grad_norm": 1.365566611289978, + "learning_rate": 1.2475576399156825e-05, + "loss": 1.2721, + "step": 1445 + }, + { + "epoch": 0.7009209888511876, + "grad_norm": 1.3721204996109009, + "learning_rate": 1.2438970910983957e-05, + "loss": 1.4607, + "step": 1446 + }, + { + "epoch": 0.7014057198254968, + "grad_norm": 1.3698453903198242, + "learning_rate": 1.2402401411739806e-05, + "loss": 1.5212, + "step": 1447 + }, + { + "epoch": 0.701890450799806, + "grad_norm": 2.067800521850586, + "learning_rate": 1.2365868006200603e-05, + "loss": 1.5141, + "step": 1448 + }, + { + "epoch": 0.7023751817741154, + "grad_norm": 1.5352057218551636, + "learning_rate": 1.232937079903914e-05, + "loss": 1.5916, + "step": 1449 + }, + { + "epoch": 0.7028599127484246, + "grad_norm": 1.3993531465530396, + "learning_rate": 1.2292909894824528e-05, + "loss": 1.3013, + "step": 1450 + }, + { + "epoch": 0.7033446437227339, + "grad_norm": 1.529046654701233, + "learning_rate": 1.2256485398021808e-05, + "loss": 1.1088, + "step": 1451 + }, + { + "epoch": 0.7038293746970431, + "grad_norm": 1.5416908264160156, + "learning_rate": 1.222009741299178e-05, + "loss": 1.5297, + "step": 1452 + }, + { + "epoch": 0.7043141056713524, + "grad_norm": 1.4917817115783691, + "learning_rate": 1.2183746043990577e-05, + "loss": 1.2723, + "step": 1453 + }, + { + "epoch": 0.7047988366456617, + "grad_norm": 1.3837028741836548, + "learning_rate": 1.2147431395169459e-05, + "loss": 1.2728, + "step": 1454 + }, + { + "epoch": 0.7052835676199709, + "grad_norm": 1.434131383895874, + "learning_rate": 1.2111153570574454e-05, + "loss": 1.3624, + "step": 1455 + }, + { + "epoch": 0.7057682985942801, + "grad_norm": 1.430051326751709, + "learning_rate": 1.2074912674146107e-05, + "loss": 1.3779, + "step": 1456 + }, + { + "epoch": 0.7062530295685895, + "grad_norm": 1.4436354637145996, + "learning_rate": 1.2038708809719137e-05, + "loss": 1.3387, + "step": 1457 + }, + { + "epoch": 0.7067377605428987, + "grad_norm": 1.3345868587493896, + "learning_rate": 1.2002542081022165e-05, + "loss": 1.2027, + "step": 1458 + }, + { + "epoch": 0.707222491517208, + "grad_norm": 1.6215115785598755, + "learning_rate": 1.196641259167743e-05, + "loss": 1.6066, + "step": 1459 + }, + { + "epoch": 0.7077072224915172, + "grad_norm": 1.4360243082046509, + "learning_rate": 1.1930320445200463e-05, + "loss": 1.1322, + "step": 1460 + }, + { + "epoch": 0.7081919534658264, + "grad_norm": 1.3410156965255737, + "learning_rate": 1.1894265744999802e-05, + "loss": 1.1842, + "step": 1461 + }, + { + "epoch": 0.7086766844401358, + "grad_norm": 1.6034032106399536, + "learning_rate": 1.185824859437669e-05, + "loss": 1.7069, + "step": 1462 + }, + { + "epoch": 0.709161415414445, + "grad_norm": 1.647525668144226, + "learning_rate": 1.1822269096524812e-05, + "loss": 1.6953, + "step": 1463 + }, + { + "epoch": 0.7096461463887542, + "grad_norm": 1.4244221448898315, + "learning_rate": 1.1786327354529941e-05, + "loss": 1.4405, + "step": 1464 + }, + { + "epoch": 0.7101308773630635, + "grad_norm": 1.4078463315963745, + "learning_rate": 1.1750423471369703e-05, + "loss": 1.2905, + "step": 1465 + }, + { + "epoch": 0.7106156083373728, + "grad_norm": 1.5614463090896606, + "learning_rate": 1.1714557549913229e-05, + "loss": 1.8514, + "step": 1466 + }, + { + "epoch": 0.711100339311682, + "grad_norm": 1.4428095817565918, + "learning_rate": 1.1678729692920911e-05, + "loss": 1.2046, + "step": 1467 + }, + { + "epoch": 0.7115850702859913, + "grad_norm": 1.3544590473175049, + "learning_rate": 1.164294000304406e-05, + "loss": 1.2275, + "step": 1468 + }, + { + "epoch": 0.7120698012603005, + "grad_norm": 1.4303261041641235, + "learning_rate": 1.1607188582824635e-05, + "loss": 1.3526, + "step": 1469 + }, + { + "epoch": 0.7125545322346097, + "grad_norm": 1.548387050628662, + "learning_rate": 1.1571475534694951e-05, + "loss": 1.1652, + "step": 1470 + }, + { + "epoch": 0.7130392632089191, + "grad_norm": 1.4661262035369873, + "learning_rate": 1.1535800960977397e-05, + "loss": 1.7172, + "step": 1471 + }, + { + "epoch": 0.7135239941832283, + "grad_norm": 1.3347164392471313, + "learning_rate": 1.1500164963884107e-05, + "loss": 1.397, + "step": 1472 + }, + { + "epoch": 0.7140087251575375, + "grad_norm": 1.4761581420898438, + "learning_rate": 1.146456764551669e-05, + "loss": 1.7281, + "step": 1473 + }, + { + "epoch": 0.7144934561318468, + "grad_norm": 1.443963646888733, + "learning_rate": 1.142900910786596e-05, + "loss": 1.5084, + "step": 1474 + }, + { + "epoch": 0.7149781871061561, + "grad_norm": 1.5460129976272583, + "learning_rate": 1.139348945281158e-05, + "loss": 1.4503, + "step": 1475 + }, + { + "epoch": 0.7154629180804654, + "grad_norm": 1.3878977298736572, + "learning_rate": 1.1358008782121848e-05, + "loss": 1.4604, + "step": 1476 + }, + { + "epoch": 0.7159476490547746, + "grad_norm": 1.199691653251648, + "learning_rate": 1.1322567197453338e-05, + "loss": 1.0213, + "step": 1477 + }, + { + "epoch": 0.7164323800290838, + "grad_norm": 1.3305445909500122, + "learning_rate": 1.128716480035066e-05, + "loss": 1.3534, + "step": 1478 + }, + { + "epoch": 0.7169171110033932, + "grad_norm": 1.2963858842849731, + "learning_rate": 1.125180169224613e-05, + "loss": 1.1662, + "step": 1479 + }, + { + "epoch": 0.7174018419777024, + "grad_norm": 1.676682472229004, + "learning_rate": 1.1216477974459505e-05, + "loss": 1.652, + "step": 1480 + }, + { + "epoch": 0.7178865729520116, + "grad_norm": 1.3726409673690796, + "learning_rate": 1.1181193748197667e-05, + "loss": 1.2268, + "step": 1481 + }, + { + "epoch": 0.7183713039263209, + "grad_norm": 1.3873904943466187, + "learning_rate": 1.114594911455438e-05, + "loss": 1.2706, + "step": 1482 + }, + { + "epoch": 0.7188560349006301, + "grad_norm": 1.3303292989730835, + "learning_rate": 1.1110744174509952e-05, + "loss": 1.256, + "step": 1483 + }, + { + "epoch": 0.7193407658749394, + "grad_norm": 1.3690974712371826, + "learning_rate": 1.107557902893095e-05, + "loss": 1.1057, + "step": 1484 + }, + { + "epoch": 0.7198254968492487, + "grad_norm": 1.5268480777740479, + "learning_rate": 1.1040453778569961e-05, + "loss": 1.3311, + "step": 1485 + }, + { + "epoch": 0.7203102278235579, + "grad_norm": 1.5212996006011963, + "learning_rate": 1.100536852406523e-05, + "loss": 1.3117, + "step": 1486 + }, + { + "epoch": 0.7207949587978671, + "grad_norm": 1.370766282081604, + "learning_rate": 1.0970323365940444e-05, + "loss": 1.4976, + "step": 1487 + }, + { + "epoch": 0.7212796897721765, + "grad_norm": 1.3523683547973633, + "learning_rate": 1.0935318404604375e-05, + "loss": 1.365, + "step": 1488 + }, + { + "epoch": 0.7217644207464857, + "grad_norm": 1.9818627834320068, + "learning_rate": 1.090035374035065e-05, + "loss": 1.1948, + "step": 1489 + }, + { + "epoch": 0.722249151720795, + "grad_norm": 1.465571641921997, + "learning_rate": 1.0865429473357414e-05, + "loss": 1.6397, + "step": 1490 + }, + { + "epoch": 0.7227338826951042, + "grad_norm": 1.360374093055725, + "learning_rate": 1.0830545703687109e-05, + "loss": 1.3702, + "step": 1491 + }, + { + "epoch": 0.7232186136694134, + "grad_norm": 1.2991136312484741, + "learning_rate": 1.0795702531286106e-05, + "loss": 1.2215, + "step": 1492 + }, + { + "epoch": 0.7237033446437228, + "grad_norm": 1.4738467931747437, + "learning_rate": 1.0760900055984496e-05, + "loss": 1.7089, + "step": 1493 + }, + { + "epoch": 0.724188075618032, + "grad_norm": 1.5092021226882935, + "learning_rate": 1.0726138377495728e-05, + "loss": 1.5763, + "step": 1494 + }, + { + "epoch": 0.7246728065923412, + "grad_norm": 1.505959391593933, + "learning_rate": 1.0691417595416407e-05, + "loss": 1.4419, + "step": 1495 + }, + { + "epoch": 0.7251575375666505, + "grad_norm": 1.417377233505249, + "learning_rate": 1.0656737809225928e-05, + "loss": 1.264, + "step": 1496 + }, + { + "epoch": 0.7256422685409598, + "grad_norm": 1.3735849857330322, + "learning_rate": 1.0622099118286239e-05, + "loss": 1.3909, + "step": 1497 + }, + { + "epoch": 0.726126999515269, + "grad_norm": 1.5356085300445557, + "learning_rate": 1.0587501621841558e-05, + "loss": 1.4821, + "step": 1498 + }, + { + "epoch": 0.7266117304895783, + "grad_norm": 1.3162624835968018, + "learning_rate": 1.0552945419018065e-05, + "loss": 1.2017, + "step": 1499 + }, + { + "epoch": 0.7270964614638875, + "grad_norm": 1.4844067096710205, + "learning_rate": 1.0518430608823621e-05, + "loss": 1.5073, + "step": 1500 + }, + { + "epoch": 0.7275811924381969, + "grad_norm": 1.652967929840088, + "learning_rate": 1.0483957290147494e-05, + "loss": 1.5602, + "step": 1501 + }, + { + "epoch": 0.7280659234125061, + "grad_norm": 1.4265861511230469, + "learning_rate": 1.0449525561760098e-05, + "loss": 1.5582, + "step": 1502 + }, + { + "epoch": 0.7285506543868153, + "grad_norm": 1.723618984222412, + "learning_rate": 1.041513552231265e-05, + "loss": 1.869, + "step": 1503 + }, + { + "epoch": 0.7290353853611246, + "grad_norm": 1.3224735260009766, + "learning_rate": 1.0380787270336955e-05, + "loss": 1.3943, + "step": 1504 + }, + { + "epoch": 0.7295201163354338, + "grad_norm": 1.6832599639892578, + "learning_rate": 1.034648090424506e-05, + "loss": 1.4809, + "step": 1505 + }, + { + "epoch": 0.7300048473097431, + "grad_norm": 1.4091854095458984, + "learning_rate": 1.0312216522329038e-05, + "loss": 1.426, + "step": 1506 + }, + { + "epoch": 0.7304895782840524, + "grad_norm": 1.501326560974121, + "learning_rate": 1.0277994222760645e-05, + "loss": 1.6879, + "step": 1507 + }, + { + "epoch": 0.7309743092583616, + "grad_norm": 1.2879735231399536, + "learning_rate": 1.0243814103591074e-05, + "loss": 1.4322, + "step": 1508 + }, + { + "epoch": 0.7314590402326708, + "grad_norm": 1.6264989376068115, + "learning_rate": 1.0209676262750658e-05, + "loss": 1.2868, + "step": 1509 + }, + { + "epoch": 0.7319437712069802, + "grad_norm": 1.4507060050964355, + "learning_rate": 1.0175580798048625e-05, + "loss": 1.5371, + "step": 1510 + }, + { + "epoch": 0.7324285021812894, + "grad_norm": 1.3660101890563965, + "learning_rate": 1.0141527807172766e-05, + "loss": 1.4603, + "step": 1511 + }, + { + "epoch": 0.7329132331555986, + "grad_norm": 1.5483052730560303, + "learning_rate": 1.0107517387689166e-05, + "loss": 1.2558, + "step": 1512 + }, + { + "epoch": 0.7333979641299079, + "grad_norm": 1.5407285690307617, + "learning_rate": 1.0073549637041985e-05, + "loss": 1.3025, + "step": 1513 + }, + { + "epoch": 0.7338826951042171, + "grad_norm": 1.3726648092269897, + "learning_rate": 1.0039624652553073e-05, + "loss": 1.5513, + "step": 1514 + }, + { + "epoch": 0.7343674260785265, + "grad_norm": 1.3971195220947266, + "learning_rate": 1.0005742531421805e-05, + "loss": 1.2535, + "step": 1515 + }, + { + "epoch": 0.7348521570528357, + "grad_norm": 1.4116908311843872, + "learning_rate": 9.9719033707247e-06, + "loss": 1.1999, + "step": 1516 + }, + { + "epoch": 0.7353368880271449, + "grad_norm": 1.2860015630722046, + "learning_rate": 9.938107267415238e-06, + "loss": 1.4093, + "step": 1517 + }, + { + "epoch": 0.7358216190014542, + "grad_norm": 1.4017292261123657, + "learning_rate": 9.904354318323474e-06, + "loss": 1.3041, + "step": 1518 + }, + { + "epoch": 0.7363063499757635, + "grad_norm": 1.479777216911316, + "learning_rate": 9.870644620155877e-06, + "loss": 1.4719, + "step": 1519 + }, + { + "epoch": 0.7367910809500727, + "grad_norm": 1.596462368965149, + "learning_rate": 9.836978269494956e-06, + "loss": 1.4582, + "step": 1520 + }, + { + "epoch": 0.737275811924382, + "grad_norm": 1.3571925163269043, + "learning_rate": 9.80335536279906e-06, + "loss": 1.4515, + "step": 1521 + }, + { + "epoch": 0.7377605428986912, + "grad_norm": 1.3262391090393066, + "learning_rate": 9.76977599640204e-06, + "loss": 1.5619, + "step": 1522 + }, + { + "epoch": 0.7382452738730004, + "grad_norm": 1.3788524866104126, + "learning_rate": 9.736240266512992e-06, + "loss": 1.2029, + "step": 1523 + }, + { + "epoch": 0.7387300048473098, + "grad_norm": 1.3858743906021118, + "learning_rate": 9.702748269216021e-06, + "loss": 1.466, + "step": 1524 + }, + { + "epoch": 0.739214735821619, + "grad_norm": 1.5450936555862427, + "learning_rate": 9.669300100469902e-06, + "loss": 1.4527, + "step": 1525 + }, + { + "epoch": 0.7396994667959282, + "grad_norm": 1.3684203624725342, + "learning_rate": 9.635895856107855e-06, + "loss": 1.2464, + "step": 1526 + }, + { + "epoch": 0.7401841977702375, + "grad_norm": 1.3800790309906006, + "learning_rate": 9.60253563183724e-06, + "loss": 1.2337, + "step": 1527 + }, + { + "epoch": 0.7406689287445468, + "grad_norm": 1.3781445026397705, + "learning_rate": 9.569219523239292e-06, + "loss": 1.4606, + "step": 1528 + }, + { + "epoch": 0.741153659718856, + "grad_norm": 1.3597383499145508, + "learning_rate": 9.535947625768851e-06, + "loss": 1.2775, + "step": 1529 + }, + { + "epoch": 0.7416383906931653, + "grad_norm": 1.5373189449310303, + "learning_rate": 9.5027200347541e-06, + "loss": 1.4209, + "step": 1530 + }, + { + "epoch": 0.7421231216674745, + "grad_norm": 1.3820744752883911, + "learning_rate": 9.46953684539626e-06, + "loss": 1.482, + "step": 1531 + }, + { + "epoch": 0.7426078526417839, + "grad_norm": 1.2832772731781006, + "learning_rate": 9.436398152769349e-06, + "loss": 0.9302, + "step": 1532 + }, + { + "epoch": 0.7430925836160931, + "grad_norm": 1.4201370477676392, + "learning_rate": 9.403304051819883e-06, + "loss": 1.6234, + "step": 1533 + }, + { + "epoch": 0.7435773145904023, + "grad_norm": 1.4880331754684448, + "learning_rate": 9.370254637366638e-06, + "loss": 1.3413, + "step": 1534 + }, + { + "epoch": 0.7440620455647116, + "grad_norm": 1.386278510093689, + "learning_rate": 9.337250004100337e-06, + "loss": 1.5433, + "step": 1535 + }, + { + "epoch": 0.7445467765390208, + "grad_norm": 1.4576743841171265, + "learning_rate": 9.304290246583398e-06, + "loss": 1.686, + "step": 1536 + }, + { + "epoch": 0.7450315075133301, + "grad_norm": 1.4495551586151123, + "learning_rate": 9.271375459249698e-06, + "loss": 1.4784, + "step": 1537 + }, + { + "epoch": 0.7455162384876394, + "grad_norm": 1.3655641078948975, + "learning_rate": 9.238505736404212e-06, + "loss": 1.4092, + "step": 1538 + }, + { + "epoch": 0.7460009694619486, + "grad_norm": 1.4185867309570312, + "learning_rate": 9.205681172222854e-06, + "loss": 1.4356, + "step": 1539 + }, + { + "epoch": 0.7464857004362578, + "grad_norm": 1.4596229791641235, + "learning_rate": 9.172901860752117e-06, + "loss": 1.3854, + "step": 1540 + }, + { + "epoch": 0.7469704314105672, + "grad_norm": 1.3605375289916992, + "learning_rate": 9.140167895908867e-06, + "loss": 1.283, + "step": 1541 + }, + { + "epoch": 0.7474551623848764, + "grad_norm": 1.421411156654358, + "learning_rate": 9.107479371480016e-06, + "loss": 1.5658, + "step": 1542 + }, + { + "epoch": 0.7479398933591856, + "grad_norm": 1.4605516195297241, + "learning_rate": 9.074836381122312e-06, + "loss": 1.5596, + "step": 1543 + }, + { + "epoch": 0.7484246243334949, + "grad_norm": 1.7730802297592163, + "learning_rate": 9.04223901836202e-06, + "loss": 1.5225, + "step": 1544 + }, + { + "epoch": 0.7489093553078041, + "grad_norm": 1.4480133056640625, + "learning_rate": 9.009687376594694e-06, + "loss": 1.0663, + "step": 1545 + }, + { + "epoch": 0.7493940862821135, + "grad_norm": 1.3374335765838623, + "learning_rate": 8.977181549084884e-06, + "loss": 1.0286, + "step": 1546 + }, + { + "epoch": 0.7498788172564227, + "grad_norm": 1.3053975105285645, + "learning_rate": 8.944721628965868e-06, + "loss": 1.3729, + "step": 1547 + }, + { + "epoch": 0.7503635482307319, + "grad_norm": 1.3824571371078491, + "learning_rate": 8.912307709239394e-06, + "loss": 1.3485, + "step": 1548 + }, + { + "epoch": 0.7508482792050412, + "grad_norm": 1.3053250312805176, + "learning_rate": 8.879939882775443e-06, + "loss": 1.314, + "step": 1549 + }, + { + "epoch": 0.7513330101793505, + "grad_norm": 1.4634974002838135, + "learning_rate": 8.847618242311895e-06, + "loss": 1.3767, + "step": 1550 + }, + { + "epoch": 0.7518177411536597, + "grad_norm": 1.7195087671279907, + "learning_rate": 8.815342880454311e-06, + "loss": 1.7894, + "step": 1551 + }, + { + "epoch": 0.752302472127969, + "grad_norm": 1.4599210023880005, + "learning_rate": 8.783113889675679e-06, + "loss": 1.4189, + "step": 1552 + }, + { + "epoch": 0.7527872031022782, + "grad_norm": 1.5068151950836182, + "learning_rate": 8.750931362316094e-06, + "loss": 1.4675, + "step": 1553 + }, + { + "epoch": 0.7532719340765875, + "grad_norm": 1.356566071510315, + "learning_rate": 8.718795390582569e-06, + "loss": 1.0757, + "step": 1554 + }, + { + "epoch": 0.7537566650508968, + "grad_norm": 1.467347264289856, + "learning_rate": 8.686706066548686e-06, + "loss": 1.4609, + "step": 1555 + }, + { + "epoch": 0.754241396025206, + "grad_norm": 1.4406360387802124, + "learning_rate": 8.654663482154419e-06, + "loss": 1.4178, + "step": 1556 + }, + { + "epoch": 0.7547261269995152, + "grad_norm": 1.337432861328125, + "learning_rate": 8.622667729205771e-06, + "loss": 1.1877, + "step": 1557 + }, + { + "epoch": 0.7552108579738245, + "grad_norm": 1.2744303941726685, + "learning_rate": 8.590718899374628e-06, + "loss": 1.3257, + "step": 1558 + }, + { + "epoch": 0.7556955889481338, + "grad_norm": 1.3614236116409302, + "learning_rate": 8.558817084198387e-06, + "loss": 1.2649, + "step": 1559 + }, + { + "epoch": 0.7561803199224431, + "grad_norm": 1.5040825605392456, + "learning_rate": 8.52696237507978e-06, + "loss": 1.4065, + "step": 1560 + }, + { + "epoch": 0.7566650508967523, + "grad_norm": 1.427902102470398, + "learning_rate": 8.495154863286548e-06, + "loss": 1.6126, + "step": 1561 + }, + { + "epoch": 0.7571497818710615, + "grad_norm": 1.4786641597747803, + "learning_rate": 8.463394639951206e-06, + "loss": 1.6583, + "step": 1562 + }, + { + "epoch": 0.7576345128453709, + "grad_norm": 1.4081834554672241, + "learning_rate": 8.431681796070809e-06, + "loss": 1.3911, + "step": 1563 + }, + { + "epoch": 0.7581192438196801, + "grad_norm": 1.4340317249298096, + "learning_rate": 8.400016422506624e-06, + "loss": 1.5016, + "step": 1564 + }, + { + "epoch": 0.7586039747939893, + "grad_norm": 1.3345164060592651, + "learning_rate": 8.368398609983945e-06, + "loss": 1.0097, + "step": 1565 + }, + { + "epoch": 0.7590887057682986, + "grad_norm": 1.5775506496429443, + "learning_rate": 8.336828449091786e-06, + "loss": 1.7549, + "step": 1566 + }, + { + "epoch": 0.7595734367426078, + "grad_norm": 1.4792371988296509, + "learning_rate": 8.305306030282617e-06, + "loss": 1.3905, + "step": 1567 + }, + { + "epoch": 0.7600581677169171, + "grad_norm": 1.5295783281326294, + "learning_rate": 8.273831443872132e-06, + "loss": 1.6053, + "step": 1568 + }, + { + "epoch": 0.7605428986912264, + "grad_norm": 1.3806251287460327, + "learning_rate": 8.242404780038996e-06, + "loss": 1.2319, + "step": 1569 + }, + { + "epoch": 0.7610276296655356, + "grad_norm": 1.5314702987670898, + "learning_rate": 8.211026128824539e-06, + "loss": 1.4693, + "step": 1570 + }, + { + "epoch": 0.7615123606398448, + "grad_norm": 1.3532733917236328, + "learning_rate": 8.179695580132563e-06, + "loss": 1.4102, + "step": 1571 + }, + { + "epoch": 0.7619970916141542, + "grad_norm": 1.3659793138504028, + "learning_rate": 8.14841322372901e-06, + "loss": 1.041, + "step": 1572 + }, + { + "epoch": 0.7624818225884634, + "grad_norm": 1.439633846282959, + "learning_rate": 8.117179149241788e-06, + "loss": 1.4129, + "step": 1573 + }, + { + "epoch": 0.7629665535627727, + "grad_norm": 1.3845553398132324, + "learning_rate": 8.085993446160442e-06, + "loss": 1.2368, + "step": 1574 + }, + { + "epoch": 0.7634512845370819, + "grad_norm": 1.5613845586776733, + "learning_rate": 8.054856203835934e-06, + "loss": 1.5857, + "step": 1575 + }, + { + "epoch": 0.7639360155113912, + "grad_norm": 1.3105788230895996, + "learning_rate": 8.023767511480378e-06, + "loss": 1.2001, + "step": 1576 + }, + { + "epoch": 0.7644207464857005, + "grad_norm": 1.5210412740707397, + "learning_rate": 7.992727458166788e-06, + "loss": 1.6572, + "step": 1577 + }, + { + "epoch": 0.7649054774600097, + "grad_norm": 1.3836086988449097, + "learning_rate": 7.96173613282883e-06, + "loss": 1.3445, + "step": 1578 + }, + { + "epoch": 0.7653902084343189, + "grad_norm": 1.2600528001785278, + "learning_rate": 7.93079362426054e-06, + "loss": 1.184, + "step": 1579 + }, + { + "epoch": 0.7658749394086282, + "grad_norm": 1.443926453590393, + "learning_rate": 7.89990002111611e-06, + "loss": 1.3783, + "step": 1580 + }, + { + "epoch": 0.7663596703829375, + "grad_norm": 1.4408482313156128, + "learning_rate": 7.86905541190959e-06, + "loss": 1.4508, + "step": 1581 + }, + { + "epoch": 0.7668444013572467, + "grad_norm": 1.4853960275650024, + "learning_rate": 7.838259885014676e-06, + "loss": 1.5522, + "step": 1582 + }, + { + "epoch": 0.767329132331556, + "grad_norm": 1.9142934083938599, + "learning_rate": 7.807513528664414e-06, + "loss": 1.2533, + "step": 1583 + }, + { + "epoch": 0.7678138633058652, + "grad_norm": 1.4642645120620728, + "learning_rate": 7.776816430950997e-06, + "loss": 1.2509, + "step": 1584 + }, + { + "epoch": 0.7682985942801746, + "grad_norm": 1.3161667585372925, + "learning_rate": 7.746168679825468e-06, + "loss": 1.1, + "step": 1585 + }, + { + "epoch": 0.7687833252544838, + "grad_norm": 1.469278335571289, + "learning_rate": 7.715570363097487e-06, + "loss": 1.3898, + "step": 1586 + }, + { + "epoch": 0.769268056228793, + "grad_norm": 1.4469612836837769, + "learning_rate": 7.685021568435074e-06, + "loss": 1.358, + "step": 1587 + }, + { + "epoch": 0.7697527872031023, + "grad_norm": 1.5007667541503906, + "learning_rate": 7.654522383364387e-06, + "loss": 1.5311, + "step": 1588 + }, + { + "epoch": 0.7702375181774115, + "grad_norm": 1.4023587703704834, + "learning_rate": 7.624072895269418e-06, + "loss": 1.3025, + "step": 1589 + }, + { + "epoch": 0.7707222491517208, + "grad_norm": 1.348415493965149, + "learning_rate": 7.593673191391776e-06, + "loss": 1.2927, + "step": 1590 + }, + { + "epoch": 0.7712069801260301, + "grad_norm": 1.487410068511963, + "learning_rate": 7.563323358830448e-06, + "loss": 1.8384, + "step": 1591 + }, + { + "epoch": 0.7716917111003393, + "grad_norm": 1.5158098936080933, + "learning_rate": 7.533023484541513e-06, + "loss": 1.3989, + "step": 1592 + }, + { + "epoch": 0.7721764420746485, + "grad_norm": 1.2837767601013184, + "learning_rate": 7.502773655337936e-06, + "loss": 1.0266, + "step": 1593 + }, + { + "epoch": 0.7726611730489579, + "grad_norm": 1.5535281896591187, + "learning_rate": 7.472573957889267e-06, + "loss": 1.5511, + "step": 1594 + }, + { + "epoch": 0.7731459040232671, + "grad_norm": 1.699720025062561, + "learning_rate": 7.4424244787214656e-06, + "loss": 1.3947, + "step": 1595 + }, + { + "epoch": 0.7736306349975763, + "grad_norm": 1.3902910947799683, + "learning_rate": 7.4123253042165495e-06, + "loss": 1.5077, + "step": 1596 + }, + { + "epoch": 0.7741153659718856, + "grad_norm": 1.4567160606384277, + "learning_rate": 7.382276520612463e-06, + "loss": 1.377, + "step": 1597 + }, + { + "epoch": 0.7746000969461949, + "grad_norm": 1.6392182111740112, + "learning_rate": 7.352278214002739e-06, + "loss": 1.7202, + "step": 1598 + }, + { + "epoch": 0.7750848279205041, + "grad_norm": 1.4235200881958008, + "learning_rate": 7.3223304703363135e-06, + "loss": 1.3698, + "step": 1599 + }, + { + "epoch": 0.7755695588948134, + "grad_norm": 1.3485275506973267, + "learning_rate": 7.292433375417232e-06, + "loss": 1.1991, + "step": 1600 + }, + { + "epoch": 0.7760542898691226, + "grad_norm": 1.5387061834335327, + "learning_rate": 7.262587014904429e-06, + "loss": 1.414, + "step": 1601 + }, + { + "epoch": 0.7765390208434318, + "grad_norm": 1.4910553693771362, + "learning_rate": 7.232791474311493e-06, + "loss": 1.3939, + "step": 1602 + }, + { + "epoch": 0.7770237518177412, + "grad_norm": 1.3060449361801147, + "learning_rate": 7.203046839006383e-06, + "loss": 1.3913, + "step": 1603 + }, + { + "epoch": 0.7775084827920504, + "grad_norm": 1.4318301677703857, + "learning_rate": 7.173353194211247e-06, + "loss": 1.3158, + "step": 1604 + }, + { + "epoch": 0.7779932137663597, + "grad_norm": 1.4487580060958862, + "learning_rate": 7.143710625002078e-06, + "loss": 1.4206, + "step": 1605 + }, + { + "epoch": 0.7784779447406689, + "grad_norm": 2.377192258834839, + "learning_rate": 7.114119216308593e-06, + "loss": 1.3553, + "step": 1606 + }, + { + "epoch": 0.7789626757149782, + "grad_norm": 1.4438358545303345, + "learning_rate": 7.084579052913884e-06, + "loss": 1.6249, + "step": 1607 + }, + { + "epoch": 0.7794474066892875, + "grad_norm": 1.6939022541046143, + "learning_rate": 7.0550902194542525e-06, + "loss": 1.5022, + "step": 1608 + }, + { + "epoch": 0.7799321376635967, + "grad_norm": 1.4086045026779175, + "learning_rate": 7.0256528004188995e-06, + "loss": 1.486, + "step": 1609 + }, + { + "epoch": 0.7804168686379059, + "grad_norm": 1.3695285320281982, + "learning_rate": 6.996266880149749e-06, + "loss": 1.2197, + "step": 1610 + }, + { + "epoch": 0.7809015996122152, + "grad_norm": 1.3833764791488647, + "learning_rate": 6.966932542841156e-06, + "loss": 1.2276, + "step": 1611 + }, + { + "epoch": 0.7813863305865245, + "grad_norm": 1.2639784812927246, + "learning_rate": 6.937649872539675e-06, + "loss": 1.1809, + "step": 1612 + }, + { + "epoch": 0.7818710615608337, + "grad_norm": 1.495296597480774, + "learning_rate": 6.908418953143861e-06, + "loss": 1.3947, + "step": 1613 + }, + { + "epoch": 0.782355792535143, + "grad_norm": 1.2453677654266357, + "learning_rate": 6.879239868403964e-06, + "loss": 1.3538, + "step": 1614 + }, + { + "epoch": 0.7828405235094522, + "grad_norm": 1.3305635452270508, + "learning_rate": 6.8501127019217346e-06, + "loss": 1.3287, + "step": 1615 + }, + { + "epoch": 0.7833252544837616, + "grad_norm": 1.4396018981933594, + "learning_rate": 6.8210375371501625e-06, + "loss": 1.4193, + "step": 1616 + }, + { + "epoch": 0.7838099854580708, + "grad_norm": 1.4076224565505981, + "learning_rate": 6.7920144573932695e-06, + "loss": 1.3454, + "step": 1617 + }, + { + "epoch": 0.78429471643238, + "grad_norm": 1.4297701120376587, + "learning_rate": 6.7630435458058114e-06, + "loss": 1.4004, + "step": 1618 + }, + { + "epoch": 0.7847794474066893, + "grad_norm": 1.3946882486343384, + "learning_rate": 6.734124885393111e-06, + "loss": 1.5085, + "step": 1619 + }, + { + "epoch": 0.7852641783809986, + "grad_norm": 1.538183331489563, + "learning_rate": 6.705258559010755e-06, + "loss": 1.7725, + "step": 1620 + }, + { + "epoch": 0.7857489093553078, + "grad_norm": 1.3094892501831055, + "learning_rate": 6.676444649364416e-06, + "loss": 1.3554, + "step": 1621 + }, + { + "epoch": 0.7862336403296171, + "grad_norm": 1.6407068967819214, + "learning_rate": 6.647683239009556e-06, + "loss": 1.5119, + "step": 1622 + }, + { + "epoch": 0.7867183713039263, + "grad_norm": 1.463111400604248, + "learning_rate": 6.618974410351247e-06, + "loss": 1.4116, + "step": 1623 + }, + { + "epoch": 0.7872031022782355, + "grad_norm": 1.4133925437927246, + "learning_rate": 6.590318245643887e-06, + "loss": 1.3629, + "step": 1624 + }, + { + "epoch": 0.7876878332525449, + "grad_norm": 1.5195677280426025, + "learning_rate": 6.561714826990998e-06, + "loss": 1.8831, + "step": 1625 + }, + { + "epoch": 0.7881725642268541, + "grad_norm": 1.3426353931427002, + "learning_rate": 6.533164236344966e-06, + "loss": 1.1815, + "step": 1626 + }, + { + "epoch": 0.7886572952011633, + "grad_norm": 1.381893277168274, + "learning_rate": 6.504666555506825e-06, + "loss": 1.4206, + "step": 1627 + }, + { + "epoch": 0.7891420261754726, + "grad_norm": 1.40556800365448, + "learning_rate": 6.476221866126029e-06, + "loss": 1.2124, + "step": 1628 + }, + { + "epoch": 0.7896267571497819, + "grad_norm": 1.4516103267669678, + "learning_rate": 6.447830249700174e-06, + "loss": 1.2965, + "step": 1629 + }, + { + "epoch": 0.7901114881240912, + "grad_norm": 1.3030951023101807, + "learning_rate": 6.41949178757483e-06, + "loss": 1.2891, + "step": 1630 + }, + { + "epoch": 0.7905962190984004, + "grad_norm": 1.4862583875656128, + "learning_rate": 6.3912065609432415e-06, + "loss": 1.4867, + "step": 1631 + }, + { + "epoch": 0.7910809500727096, + "grad_norm": 1.3782804012298584, + "learning_rate": 6.362974650846157e-06, + "loss": 1.4044, + "step": 1632 + }, + { + "epoch": 0.7915656810470189, + "grad_norm": 1.3330339193344116, + "learning_rate": 6.334796138171542e-06, + "loss": 1.4485, + "step": 1633 + }, + { + "epoch": 0.7920504120213282, + "grad_norm": 1.527451753616333, + "learning_rate": 6.306671103654382e-06, + "loss": 1.5054, + "step": 1634 + }, + { + "epoch": 0.7925351429956374, + "grad_norm": 1.3172121047973633, + "learning_rate": 6.278599627876433e-06, + "loss": 1.3527, + "step": 1635 + }, + { + "epoch": 0.7930198739699467, + "grad_norm": 1.3731809854507446, + "learning_rate": 6.250581791266019e-06, + "loss": 1.4858, + "step": 1636 + }, + { + "epoch": 0.7935046049442559, + "grad_norm": 1.3459367752075195, + "learning_rate": 6.22261767409775e-06, + "loss": 1.3209, + "step": 1637 + }, + { + "epoch": 0.7939893359185652, + "grad_norm": 2.2093305587768555, + "learning_rate": 6.1947073564923576e-06, + "loss": 1.2707, + "step": 1638 + }, + { + "epoch": 0.7944740668928745, + "grad_norm": 1.42824387550354, + "learning_rate": 6.166850918416406e-06, + "loss": 1.3117, + "step": 1639 + }, + { + "epoch": 0.7949587978671837, + "grad_norm": 1.436257243156433, + "learning_rate": 6.139048439682085e-06, + "loss": 1.4706, + "step": 1640 + }, + { + "epoch": 0.7954435288414929, + "grad_norm": 1.4861083030700684, + "learning_rate": 6.111299999947009e-06, + "loss": 1.4673, + "step": 1641 + }, + { + "epoch": 0.7959282598158022, + "grad_norm": 1.4589564800262451, + "learning_rate": 6.083605678713939e-06, + "loss": 1.4155, + "step": 1642 + }, + { + "epoch": 0.7964129907901115, + "grad_norm": 1.4518787860870361, + "learning_rate": 6.055965555330606e-06, + "loss": 1.3371, + "step": 1643 + }, + { + "epoch": 0.7968977217644208, + "grad_norm": 1.3608945608139038, + "learning_rate": 6.028379708989418e-06, + "loss": 1.2777, + "step": 1644 + }, + { + "epoch": 0.79738245273873, + "grad_norm": 1.3667936325073242, + "learning_rate": 6.000848218727312e-06, + "loss": 1.2991, + "step": 1645 + }, + { + "epoch": 0.7978671837130392, + "grad_norm": 1.4242197275161743, + "learning_rate": 5.973371163425456e-06, + "loss": 1.4603, + "step": 1646 + }, + { + "epoch": 0.7983519146873486, + "grad_norm": 1.4276293516159058, + "learning_rate": 5.945948621809091e-06, + "loss": 1.214, + "step": 1647 + }, + { + "epoch": 0.7988366456616578, + "grad_norm": 1.4498988389968872, + "learning_rate": 5.91858067244723e-06, + "loss": 1.3601, + "step": 1648 + }, + { + "epoch": 0.799321376635967, + "grad_norm": 1.4795246124267578, + "learning_rate": 5.891267393752509e-06, + "loss": 1.5095, + "step": 1649 + }, + { + "epoch": 0.7998061076102763, + "grad_norm": 1.371968388557434, + "learning_rate": 5.864008863980897e-06, + "loss": 1.562, + "step": 1650 + }, + { + "epoch": 0.8002908385845856, + "grad_norm": 1.39975106716156, + "learning_rate": 5.836805161231507e-06, + "loss": 1.4923, + "step": 1651 + }, + { + "epoch": 0.8007755695588948, + "grad_norm": 1.4395267963409424, + "learning_rate": 5.809656363446381e-06, + "loss": 1.2459, + "step": 1652 + }, + { + "epoch": 0.8012603005332041, + "grad_norm": 1.383536696434021, + "learning_rate": 5.782562548410236e-06, + "loss": 1.1381, + "step": 1653 + }, + { + "epoch": 0.8017450315075133, + "grad_norm": 1.2943404912948608, + "learning_rate": 5.7555237937502616e-06, + "loss": 1.2698, + "step": 1654 + }, + { + "epoch": 0.8022297624818225, + "grad_norm": 1.4058960676193237, + "learning_rate": 5.7285401769358845e-06, + "loss": 1.6954, + "step": 1655 + }, + { + "epoch": 0.8027144934561319, + "grad_norm": 1.4720020294189453, + "learning_rate": 5.701611775278573e-06, + "loss": 1.5013, + "step": 1656 + }, + { + "epoch": 0.8031992244304411, + "grad_norm": 1.3335487842559814, + "learning_rate": 5.674738665931575e-06, + "loss": 1.239, + "step": 1657 + }, + { + "epoch": 0.8036839554047503, + "grad_norm": 1.3698316812515259, + "learning_rate": 5.647920925889744e-06, + "loss": 1.0645, + "step": 1658 + }, + { + "epoch": 0.8041686863790596, + "grad_norm": 1.3854635953903198, + "learning_rate": 5.6211586319892625e-06, + "loss": 1.3054, + "step": 1659 + }, + { + "epoch": 0.8046534173533689, + "grad_norm": 1.5994597673416138, + "learning_rate": 5.594451860907485e-06, + "loss": 1.411, + "step": 1660 + }, + { + "epoch": 0.8051381483276782, + "grad_norm": 1.4336647987365723, + "learning_rate": 5.567800689162658e-06, + "loss": 1.4892, + "step": 1661 + }, + { + "epoch": 0.8056228793019874, + "grad_norm": 1.753812551498413, + "learning_rate": 5.541205193113763e-06, + "loss": 1.6698, + "step": 1662 + }, + { + "epoch": 0.8061076102762966, + "grad_norm": 1.2079178094863892, + "learning_rate": 5.51466544896021e-06, + "loss": 1.0658, + "step": 1663 + }, + { + "epoch": 0.8065923412506059, + "grad_norm": 1.3493430614471436, + "learning_rate": 5.488181532741732e-06, + "loss": 1.3803, + "step": 1664 + }, + { + "epoch": 0.8070770722249152, + "grad_norm": 1.5290086269378662, + "learning_rate": 5.46175352033807e-06, + "loss": 1.5866, + "step": 1665 + }, + { + "epoch": 0.8075618031992244, + "grad_norm": 1.3983334302902222, + "learning_rate": 5.435381487468799e-06, + "loss": 1.595, + "step": 1666 + }, + { + "epoch": 0.8080465341735337, + "grad_norm": 1.5040364265441895, + "learning_rate": 5.409065509693126e-06, + "loss": 1.4392, + "step": 1667 + }, + { + "epoch": 0.8085312651478429, + "grad_norm": 1.456047534942627, + "learning_rate": 5.382805662409623e-06, + "loss": 1.504, + "step": 1668 + }, + { + "epoch": 0.8090159961221522, + "grad_norm": 1.338990330696106, + "learning_rate": 5.356602020856072e-06, + "loss": 1.5103, + "step": 1669 + }, + { + "epoch": 0.8095007270964615, + "grad_norm": 1.4504512548446655, + "learning_rate": 5.330454660109185e-06, + "loss": 1.0707, + "step": 1670 + }, + { + "epoch": 0.8099854580707707, + "grad_norm": 1.4192159175872803, + "learning_rate": 5.30436365508446e-06, + "loss": 1.6072, + "step": 1671 + }, + { + "epoch": 0.81047018904508, + "grad_norm": 1.5492674112319946, + "learning_rate": 5.278329080535896e-06, + "loss": 1.6888, + "step": 1672 + }, + { + "epoch": 0.8109549200193893, + "grad_norm": 1.4452522993087769, + "learning_rate": 5.252351011055831e-06, + "loss": 1.4031, + "step": 1673 + }, + { + "epoch": 0.8114396509936985, + "grad_norm": 1.4986207485198975, + "learning_rate": 5.226429521074691e-06, + "loss": 1.5358, + "step": 1674 + }, + { + "epoch": 0.8119243819680078, + "grad_norm": 1.3144514560699463, + "learning_rate": 5.20056468486082e-06, + "loss": 1.5166, + "step": 1675 + }, + { + "epoch": 0.812409112942317, + "grad_norm": 1.328102946281433, + "learning_rate": 5.174756576520218e-06, + "loss": 0.9864, + "step": 1676 + }, + { + "epoch": 0.8128938439166262, + "grad_norm": 1.3630974292755127, + "learning_rate": 5.149005269996374e-06, + "loss": 1.5685, + "step": 1677 + }, + { + "epoch": 0.8133785748909356, + "grad_norm": 1.7009714841842651, + "learning_rate": 5.123310839070011e-06, + "loss": 1.2827, + "step": 1678 + }, + { + "epoch": 0.8138633058652448, + "grad_norm": 1.3941096067428589, + "learning_rate": 5.097673357358907e-06, + "loss": 1.3748, + "step": 1679 + }, + { + "epoch": 0.814348036839554, + "grad_norm": 1.5804961919784546, + "learning_rate": 5.072092898317679e-06, + "loss": 1.9325, + "step": 1680 + }, + { + "epoch": 0.8148327678138633, + "grad_norm": 1.3486220836639404, + "learning_rate": 5.04656953523755e-06, + "loss": 1.0934, + "step": 1681 + }, + { + "epoch": 0.8153174987881726, + "grad_norm": 1.475723385810852, + "learning_rate": 5.021103341246186e-06, + "loss": 1.1359, + "step": 1682 + }, + { + "epoch": 0.8158022297624818, + "grad_norm": 1.3328871726989746, + "learning_rate": 4.995694389307412e-06, + "loss": 1.1081, + "step": 1683 + }, + { + "epoch": 0.8162869607367911, + "grad_norm": 1.5600857734680176, + "learning_rate": 4.9703427522210914e-06, + "loss": 1.7475, + "step": 1684 + }, + { + "epoch": 0.8167716917111003, + "grad_norm": 1.3647994995117188, + "learning_rate": 4.94504850262284e-06, + "loss": 1.35, + "step": 1685 + }, + { + "epoch": 0.8172564226854095, + "grad_norm": 1.3073344230651855, + "learning_rate": 4.91981171298388e-06, + "loss": 1.2369, + "step": 1686 + }, + { + "epoch": 0.8177411536597189, + "grad_norm": 1.335671067237854, + "learning_rate": 4.894632455610773e-06, + "loss": 1.2816, + "step": 1687 + }, + { + "epoch": 0.8182258846340281, + "grad_norm": 1.4057748317718506, + "learning_rate": 4.8695108026452745e-06, + "loss": 1.1653, + "step": 1688 + }, + { + "epoch": 0.8187106156083374, + "grad_norm": 1.358802318572998, + "learning_rate": 4.8444468260640755e-06, + "loss": 1.5163, + "step": 1689 + }, + { + "epoch": 0.8191953465826466, + "grad_norm": 1.3890354633331299, + "learning_rate": 4.819440597678612e-06, + "loss": 1.3894, + "step": 1690 + }, + { + "epoch": 0.8196800775569559, + "grad_norm": 1.4253144264221191, + "learning_rate": 4.794492189134892e-06, + "loss": 1.4175, + "step": 1691 + }, + { + "epoch": 0.8201648085312652, + "grad_norm": 1.2631031274795532, + "learning_rate": 4.769601671913234e-06, + "loss": 1.0669, + "step": 1692 + }, + { + "epoch": 0.8206495395055744, + "grad_norm": 1.477522611618042, + "learning_rate": 4.744769117328107e-06, + "loss": 1.6204, + "step": 1693 + }, + { + "epoch": 0.8211342704798836, + "grad_norm": 1.3935991525650024, + "learning_rate": 4.719994596527894e-06, + "loss": 1.2437, + "step": 1694 + }, + { + "epoch": 0.821619001454193, + "grad_norm": 1.2869259119033813, + "learning_rate": 4.695278180494725e-06, + "loss": 1.383, + "step": 1695 + }, + { + "epoch": 0.8221037324285022, + "grad_norm": 1.3717503547668457, + "learning_rate": 4.670619940044233e-06, + "loss": 1.3475, + "step": 1696 + }, + { + "epoch": 0.8225884634028114, + "grad_norm": 1.3983089923858643, + "learning_rate": 4.646019945825392e-06, + "loss": 1.5844, + "step": 1697 + }, + { + "epoch": 0.8230731943771207, + "grad_norm": 1.434476375579834, + "learning_rate": 4.621478268320265e-06, + "loss": 1.3396, + "step": 1698 + }, + { + "epoch": 0.8235579253514299, + "grad_norm": 1.392899990081787, + "learning_rate": 4.5969949778438575e-06, + "loss": 1.595, + "step": 1699 + }, + { + "epoch": 0.8240426563257393, + "grad_norm": 1.4684665203094482, + "learning_rate": 4.5725701445438775e-06, + "loss": 1.4212, + "step": 1700 + }, + { + "epoch": 0.8245273873000485, + "grad_norm": 1.4998910427093506, + "learning_rate": 4.548203838400539e-06, + "loss": 1.7012, + "step": 1701 + }, + { + "epoch": 0.8250121182743577, + "grad_norm": 1.3950318098068237, + "learning_rate": 4.523896129226371e-06, + "loss": 1.4352, + "step": 1702 + }, + { + "epoch": 0.825496849248667, + "grad_norm": 1.3992888927459717, + "learning_rate": 4.499647086666029e-06, + "loss": 1.3451, + "step": 1703 + }, + { + "epoch": 0.8259815802229763, + "grad_norm": 1.4925978183746338, + "learning_rate": 4.475456780196066e-06, + "loss": 1.5352, + "step": 1704 + }, + { + "epoch": 0.8264663111972855, + "grad_norm": 1.5991015434265137, + "learning_rate": 4.451325279124749e-06, + "loss": 1.5298, + "step": 1705 + }, + { + "epoch": 0.8269510421715948, + "grad_norm": 1.3293036222457886, + "learning_rate": 4.427252652591876e-06, + "loss": 1.1902, + "step": 1706 + }, + { + "epoch": 0.827435773145904, + "grad_norm": 1.5411173105239868, + "learning_rate": 4.40323896956854e-06, + "loss": 1.3795, + "step": 1707 + }, + { + "epoch": 0.8279205041202132, + "grad_norm": 1.4103530645370483, + "learning_rate": 4.379284298856973e-06, + "loss": 1.5143, + "step": 1708 + }, + { + "epoch": 0.8284052350945226, + "grad_norm": 1.4393373727798462, + "learning_rate": 4.3553887090903075e-06, + "loss": 1.4624, + "step": 1709 + }, + { + "epoch": 0.8288899660688318, + "grad_norm": 1.8007562160491943, + "learning_rate": 4.331552268732433e-06, + "loss": 1.6461, + "step": 1710 + }, + { + "epoch": 0.829374697043141, + "grad_norm": 1.4097933769226074, + "learning_rate": 4.307775046077739e-06, + "loss": 1.1364, + "step": 1711 + }, + { + "epoch": 0.8298594280174503, + "grad_norm": 1.2913793325424194, + "learning_rate": 4.284057109250961e-06, + "loss": 1.37, + "step": 1712 + }, + { + "epoch": 0.8303441589917596, + "grad_norm": 1.320348858833313, + "learning_rate": 4.2603985262069656e-06, + "loss": 1.6305, + "step": 1713 + }, + { + "epoch": 0.8308288899660689, + "grad_norm": 1.3655133247375488, + "learning_rate": 4.236799364730582e-06, + "loss": 1.4158, + "step": 1714 + }, + { + "epoch": 0.8313136209403781, + "grad_norm": 1.4090162515640259, + "learning_rate": 4.213259692436367e-06, + "loss": 1.4813, + "step": 1715 + }, + { + "epoch": 0.8317983519146873, + "grad_norm": 1.4175288677215576, + "learning_rate": 4.189779576768454e-06, + "loss": 1.6809, + "step": 1716 + }, + { + "epoch": 0.8322830828889967, + "grad_norm": 1.5763258934020996, + "learning_rate": 4.166359085000324e-06, + "loss": 1.8811, + "step": 1717 + }, + { + "epoch": 0.8327678138633059, + "grad_norm": 1.3878095149993896, + "learning_rate": 4.142998284234622e-06, + "loss": 1.35, + "step": 1718 + }, + { + "epoch": 0.8332525448376151, + "grad_norm": 1.4590060710906982, + "learning_rate": 4.119697241402998e-06, + "loss": 1.3375, + "step": 1719 + }, + { + "epoch": 0.8337372758119244, + "grad_norm": 1.3511408567428589, + "learning_rate": 4.096456023265866e-06, + "loss": 1.384, + "step": 1720 + }, + { + "epoch": 0.8342220067862336, + "grad_norm": 1.4394463300704956, + "learning_rate": 4.073274696412235e-06, + "loss": 1.6411, + "step": 1721 + }, + { + "epoch": 0.8347067377605429, + "grad_norm": 1.3683598041534424, + "learning_rate": 4.05015332725952e-06, + "loss": 1.2006, + "step": 1722 + }, + { + "epoch": 0.8351914687348522, + "grad_norm": 1.4397015571594238, + "learning_rate": 4.027091982053369e-06, + "loss": 1.4946, + "step": 1723 + }, + { + "epoch": 0.8356761997091614, + "grad_norm": 1.5546510219573975, + "learning_rate": 4.004090726867416e-06, + "loss": 1.5381, + "step": 1724 + }, + { + "epoch": 0.8361609306834706, + "grad_norm": 1.4166737794876099, + "learning_rate": 3.98114962760317e-06, + "loss": 1.4553, + "step": 1725 + }, + { + "epoch": 0.83664566165778, + "grad_norm": 1.4045753479003906, + "learning_rate": 3.9582687499897545e-06, + "loss": 1.3718, + "step": 1726 + }, + { + "epoch": 0.8371303926320892, + "grad_norm": 1.3510545492172241, + "learning_rate": 3.935448159583774e-06, + "loss": 1.4418, + "step": 1727 + }, + { + "epoch": 0.8376151236063984, + "grad_norm": 1.3021389245986938, + "learning_rate": 3.912687921769082e-06, + "loss": 1.183, + "step": 1728 + }, + { + "epoch": 0.8380998545807077, + "grad_norm": 1.3169225454330444, + "learning_rate": 3.88998810175662e-06, + "loss": 1.5796, + "step": 1729 + }, + { + "epoch": 0.8385845855550169, + "grad_norm": 1.3681930303573608, + "learning_rate": 3.8673487645842415e-06, + "loss": 1.2193, + "step": 1730 + }, + { + "epoch": 0.8390693165293263, + "grad_norm": 1.4157400131225586, + "learning_rate": 3.844769975116488e-06, + "loss": 1.3846, + "step": 1731 + }, + { + "epoch": 0.8395540475036355, + "grad_norm": 1.3580200672149658, + "learning_rate": 3.8222517980444325e-06, + "loss": 1.0942, + "step": 1732 + }, + { + "epoch": 0.8400387784779447, + "grad_norm": 1.3693870306015015, + "learning_rate": 3.7997942978854785e-06, + "loss": 1.2999, + "step": 1733 + }, + { + "epoch": 0.840523509452254, + "grad_norm": 1.290262222290039, + "learning_rate": 3.7773975389832043e-06, + "loss": 1.3288, + "step": 1734 + }, + { + "epoch": 0.8410082404265633, + "grad_norm": 1.3575208187103271, + "learning_rate": 3.7550615855071277e-06, + "loss": 1.5665, + "step": 1735 + }, + { + "epoch": 0.8414929714008725, + "grad_norm": 1.3926094770431519, + "learning_rate": 3.7327865014525787e-06, + "loss": 1.3471, + "step": 1736 + }, + { + "epoch": 0.8419777023751818, + "grad_norm": 1.4269956350326538, + "learning_rate": 3.710572350640465e-06, + "loss": 1.618, + "step": 1737 + }, + { + "epoch": 0.842462433349491, + "grad_norm": 1.4620566368103027, + "learning_rate": 3.6884191967171327e-06, + "loss": 1.4229, + "step": 1738 + }, + { + "epoch": 0.8429471643238002, + "grad_norm": 1.3834177255630493, + "learning_rate": 3.666327103154149e-06, + "loss": 1.5917, + "step": 1739 + }, + { + "epoch": 0.8434318952981096, + "grad_norm": 1.575303077697754, + "learning_rate": 3.644296133248143e-06, + "loss": 2.0171, + "step": 1740 + }, + { + "epoch": 0.8439166262724188, + "grad_norm": 1.4172624349594116, + "learning_rate": 3.6223263501206113e-06, + "loss": 1.7448, + "step": 1741 + }, + { + "epoch": 0.844401357246728, + "grad_norm": 1.4966297149658203, + "learning_rate": 3.600417816717755e-06, + "loss": 1.1987, + "step": 1742 + }, + { + "epoch": 0.8448860882210373, + "grad_norm": 1.4303022623062134, + "learning_rate": 3.578570595810274e-06, + "loss": 1.5639, + "step": 1743 + }, + { + "epoch": 0.8453708191953466, + "grad_norm": 1.5101985931396484, + "learning_rate": 3.5567847499932e-06, + "loss": 1.6489, + "step": 1744 + }, + { + "epoch": 0.8458555501696559, + "grad_norm": 1.4596188068389893, + "learning_rate": 3.535060341685731e-06, + "loss": 1.5892, + "step": 1745 + }, + { + "epoch": 0.8463402811439651, + "grad_norm": 1.283461332321167, + "learning_rate": 3.513397433131024e-06, + "loss": 1.3714, + "step": 1746 + }, + { + "epoch": 0.8468250121182743, + "grad_norm": 1.439650535583496, + "learning_rate": 3.491796086396043e-06, + "loss": 1.2675, + "step": 1747 + }, + { + "epoch": 0.8473097430925837, + "grad_norm": 1.3690756559371948, + "learning_rate": 3.4702563633713577e-06, + "loss": 1.255, + "step": 1748 + }, + { + "epoch": 0.8477944740668929, + "grad_norm": 1.511044979095459, + "learning_rate": 3.4487783257710015e-06, + "loss": 1.1604, + "step": 1749 + }, + { + "epoch": 0.8482792050412021, + "grad_norm": 1.4531290531158447, + "learning_rate": 3.4273620351322257e-06, + "loss": 1.4041, + "step": 1750 + }, + { + "epoch": 0.8487639360155114, + "grad_norm": 1.4664944410324097, + "learning_rate": 3.406007552815421e-06, + "loss": 1.4089, + "step": 1751 + }, + { + "epoch": 0.8492486669898206, + "grad_norm": 1.3850915431976318, + "learning_rate": 3.3847149400038527e-06, + "loss": 1.3229, + "step": 1752 + }, + { + "epoch": 0.8497333979641299, + "grad_norm": 1.419942021369934, + "learning_rate": 3.3634842577035447e-06, + "loss": 1.423, + "step": 1753 + }, + { + "epoch": 0.8502181289384392, + "grad_norm": 1.2842981815338135, + "learning_rate": 3.3423155667430708e-06, + "loss": 1.1818, + "step": 1754 + }, + { + "epoch": 0.8507028599127484, + "grad_norm": 1.3830044269561768, + "learning_rate": 3.321208927773384e-06, + "loss": 1.3562, + "step": 1755 + }, + { + "epoch": 0.8511875908870576, + "grad_norm": 1.5061596632003784, + "learning_rate": 3.3001644012676773e-06, + "loss": 1.2338, + "step": 1756 + }, + { + "epoch": 0.851672321861367, + "grad_norm": 1.3219194412231445, + "learning_rate": 3.279182047521151e-06, + "loss": 1.3248, + "step": 1757 + }, + { + "epoch": 0.8521570528356762, + "grad_norm": 1.2967970371246338, + "learning_rate": 3.258261926650902e-06, + "loss": 1.1634, + "step": 1758 + }, + { + "epoch": 0.8526417838099855, + "grad_norm": 1.3681789636611938, + "learning_rate": 3.2374040985957004e-06, + "loss": 1.3534, + "step": 1759 + }, + { + "epoch": 0.8531265147842947, + "grad_norm": 1.4028301239013672, + "learning_rate": 3.216608623115852e-06, + "loss": 1.4824, + "step": 1760 + }, + { + "epoch": 0.8536112457586039, + "grad_norm": 1.3395938873291016, + "learning_rate": 3.1958755597930017e-06, + "loss": 1.3257, + "step": 1761 + }, + { + "epoch": 0.8540959767329133, + "grad_norm": 1.5202916860580444, + "learning_rate": 3.175204968029999e-06, + "loss": 1.4002, + "step": 1762 + }, + { + "epoch": 0.8545807077072225, + "grad_norm": 1.3687208890914917, + "learning_rate": 3.1545969070506747e-06, + "loss": 1.3911, + "step": 1763 + }, + { + "epoch": 0.8550654386815317, + "grad_norm": 1.5068252086639404, + "learning_rate": 3.1340514358997293e-06, + "loss": 1.427, + "step": 1764 + }, + { + "epoch": 0.855550169655841, + "grad_norm": 1.3460315465927124, + "learning_rate": 3.1135686134425134e-06, + "loss": 1.2252, + "step": 1765 + }, + { + "epoch": 0.8560349006301503, + "grad_norm": 1.3183640241622925, + "learning_rate": 3.093148498364898e-06, + "loss": 1.1255, + "step": 1766 + }, + { + "epoch": 0.8565196316044595, + "grad_norm": 1.3901616334915161, + "learning_rate": 3.0727911491730764e-06, + "loss": 1.5205, + "step": 1767 + }, + { + "epoch": 0.8570043625787688, + "grad_norm": 1.479112982749939, + "learning_rate": 3.0524966241934153e-06, + "loss": 1.7212, + "step": 1768 + }, + { + "epoch": 0.857489093553078, + "grad_norm": 1.5991971492767334, + "learning_rate": 3.0322649815722915e-06, + "loss": 1.5865, + "step": 1769 + }, + { + "epoch": 0.8579738245273874, + "grad_norm": 1.4487693309783936, + "learning_rate": 3.012096279275892e-06, + "loss": 1.5127, + "step": 1770 + }, + { + "epoch": 0.8584585555016966, + "grad_norm": 1.623945951461792, + "learning_rate": 2.991990575090095e-06, + "loss": 1.5826, + "step": 1771 + }, + { + "epoch": 0.8589432864760058, + "grad_norm": 1.5717607736587524, + "learning_rate": 2.9719479266202664e-06, + "loss": 1.505, + "step": 1772 + }, + { + "epoch": 0.859428017450315, + "grad_norm": 1.4932787418365479, + "learning_rate": 2.9519683912911266e-06, + "loss": 1.4719, + "step": 1773 + }, + { + "epoch": 0.8599127484246243, + "grad_norm": 1.3678524494171143, + "learning_rate": 2.9320520263465463e-06, + "loss": 1.3981, + "step": 1774 + }, + { + "epoch": 0.8603974793989336, + "grad_norm": 1.2875282764434814, + "learning_rate": 2.9121988888494297e-06, + "loss": 1.2622, + "step": 1775 + }, + { + "epoch": 0.8608822103732429, + "grad_norm": 1.5293846130371094, + "learning_rate": 2.892409035681498e-06, + "loss": 1.6768, + "step": 1776 + }, + { + "epoch": 0.8613669413475521, + "grad_norm": 1.5416409969329834, + "learning_rate": 2.872682523543185e-06, + "loss": 1.4337, + "step": 1777 + }, + { + "epoch": 0.8618516723218613, + "grad_norm": 1.515479326248169, + "learning_rate": 2.8530194089534225e-06, + "loss": 1.3993, + "step": 1778 + }, + { + "epoch": 0.8623364032961707, + "grad_norm": 1.349339246749878, + "learning_rate": 2.833419748249511e-06, + "loss": 1.4137, + "step": 1779 + }, + { + "epoch": 0.8628211342704799, + "grad_norm": 1.4984550476074219, + "learning_rate": 2.8138835975869358e-06, + "loss": 1.7335, + "step": 1780 + }, + { + "epoch": 0.8633058652447891, + "grad_norm": 1.4722235202789307, + "learning_rate": 2.794411012939238e-06, + "loss": 1.2962, + "step": 1781 + }, + { + "epoch": 0.8637905962190984, + "grad_norm": 1.4962884187698364, + "learning_rate": 2.7750020500978193e-06, + "loss": 1.3812, + "step": 1782 + }, + { + "epoch": 0.8642753271934076, + "grad_norm": 1.2720054388046265, + "learning_rate": 2.7556567646717907e-06, + "loss": 1.2682, + "step": 1783 + }, + { + "epoch": 0.864760058167717, + "grad_norm": 1.5404802560806274, + "learning_rate": 2.7363752120878437e-06, + "loss": 1.5413, + "step": 1784 + }, + { + "epoch": 0.8652447891420262, + "grad_norm": 1.352376103401184, + "learning_rate": 2.717157447590041e-06, + "loss": 1.3569, + "step": 1785 + }, + { + "epoch": 0.8657295201163354, + "grad_norm": 1.4405183792114258, + "learning_rate": 2.6980035262397037e-06, + "loss": 1.4793, + "step": 1786 + }, + { + "epoch": 0.8662142510906446, + "grad_norm": 1.5227733850479126, + "learning_rate": 2.6789135029152173e-06, + "loss": 1.4665, + "step": 1787 + }, + { + "epoch": 0.866698982064954, + "grad_norm": 1.4717252254486084, + "learning_rate": 2.659887432311917e-06, + "loss": 1.2643, + "step": 1788 + }, + { + "epoch": 0.8671837130392632, + "grad_norm": 1.542318344116211, + "learning_rate": 2.6409253689418656e-06, + "loss": 1.4383, + "step": 1789 + }, + { + "epoch": 0.8676684440135725, + "grad_norm": 1.4374393224716187, + "learning_rate": 2.6220273671337807e-06, + "loss": 1.2678, + "step": 1790 + }, + { + "epoch": 0.8681531749878817, + "grad_norm": 1.5301721096038818, + "learning_rate": 2.603193481032801e-06, + "loss": 1.7229, + "step": 1791 + }, + { + "epoch": 0.868637905962191, + "grad_norm": 1.2812771797180176, + "learning_rate": 2.584423764600391e-06, + "loss": 1.6145, + "step": 1792 + }, + { + "epoch": 0.8691226369365003, + "grad_norm": 1.3902934789657593, + "learning_rate": 2.5657182716141452e-06, + "loss": 1.3589, + "step": 1793 + }, + { + "epoch": 0.8696073679108095, + "grad_norm": 1.4703373908996582, + "learning_rate": 2.547077055667646e-06, + "loss": 1.4929, + "step": 1794 + }, + { + "epoch": 0.8700920988851187, + "grad_norm": 1.8839432001113892, + "learning_rate": 2.528500170170339e-06, + "loss": 1.3907, + "step": 1795 + }, + { + "epoch": 0.870576829859428, + "grad_norm": 1.3980306386947632, + "learning_rate": 2.5099876683473244e-06, + "loss": 1.5614, + "step": 1796 + }, + { + "epoch": 0.8710615608337373, + "grad_norm": 1.3951743841171265, + "learning_rate": 2.4915396032392567e-06, + "loss": 1.3248, + "step": 1797 + }, + { + "epoch": 0.8715462918080465, + "grad_norm": 1.453779697418213, + "learning_rate": 2.473156027702164e-06, + "loss": 1.4249, + "step": 1798 + }, + { + "epoch": 0.8720310227823558, + "grad_norm": 1.3146569728851318, + "learning_rate": 2.4548369944073004e-06, + "loss": 1.2215, + "step": 1799 + }, + { + "epoch": 0.872515753756665, + "grad_norm": 1.4370759725570679, + "learning_rate": 2.4365825558409966e-06, + "loss": 1.5261, + "step": 1800 + }, + { + "epoch": 0.8730004847309744, + "grad_norm": 1.3861037492752075, + "learning_rate": 2.4183927643045253e-06, + "loss": 1.3451, + "step": 1801 + }, + { + "epoch": 0.8734852157052836, + "grad_norm": 1.2893579006195068, + "learning_rate": 2.4002676719139166e-06, + "loss": 1.2465, + "step": 1802 + }, + { + "epoch": 0.8739699466795928, + "grad_norm": 1.4540156126022339, + "learning_rate": 2.3822073305998534e-06, + "loss": 1.3423, + "step": 1803 + }, + { + "epoch": 0.8744546776539021, + "grad_norm": 1.4508488178253174, + "learning_rate": 2.3642117921074734e-06, + "loss": 1.527, + "step": 1804 + }, + { + "epoch": 0.8749394086282113, + "grad_norm": 1.4460153579711914, + "learning_rate": 2.3462811079962705e-06, + "loss": 1.7063, + "step": 1805 + }, + { + "epoch": 0.8754241396025206, + "grad_norm": 1.468252420425415, + "learning_rate": 2.328415329639902e-06, + "loss": 1.5137, + "step": 1806 + }, + { + "epoch": 0.8759088705768299, + "grad_norm": 1.3156002759933472, + "learning_rate": 2.310614508226078e-06, + "loss": 1.2969, + "step": 1807 + }, + { + "epoch": 0.8763936015511391, + "grad_norm": 1.447495698928833, + "learning_rate": 2.292878694756384e-06, + "loss": 1.3326, + "step": 1808 + }, + { + "epoch": 0.8768783325254483, + "grad_norm": 1.4856112003326416, + "learning_rate": 2.2752079400461564e-06, + "loss": 1.3841, + "step": 1809 + }, + { + "epoch": 0.8773630634997577, + "grad_norm": 1.4724615812301636, + "learning_rate": 2.257602294724337e-06, + "loss": 1.5061, + "step": 1810 + }, + { + "epoch": 0.8778477944740669, + "grad_norm": 1.5109659433364868, + "learning_rate": 2.24006180923331e-06, + "loss": 1.5312, + "step": 1811 + }, + { + "epoch": 0.8783325254483761, + "grad_norm": 1.8590974807739258, + "learning_rate": 2.222586533828777e-06, + "loss": 1.6278, + "step": 1812 + }, + { + "epoch": 0.8788172564226854, + "grad_norm": 1.5733957290649414, + "learning_rate": 2.2051765185795965e-06, + "loss": 1.323, + "step": 1813 + }, + { + "epoch": 0.8793019873969947, + "grad_norm": 1.3470954895019531, + "learning_rate": 2.1878318133676607e-06, + "loss": 1.1649, + "step": 1814 + }, + { + "epoch": 0.879786718371304, + "grad_norm": 1.4746617078781128, + "learning_rate": 2.170552467887721e-06, + "loss": 1.7258, + "step": 1815 + }, + { + "epoch": 0.8802714493456132, + "grad_norm": 1.3480514287948608, + "learning_rate": 2.1533385316472864e-06, + "loss": 1.1594, + "step": 1816 + }, + { + "epoch": 0.8807561803199224, + "grad_norm": 1.4327815771102905, + "learning_rate": 2.136190053966444e-06, + "loss": 1.3378, + "step": 1817 + }, + { + "epoch": 0.8812409112942317, + "grad_norm": 1.433623194694519, + "learning_rate": 2.119107083977742e-06, + "loss": 1.5589, + "step": 1818 + }, + { + "epoch": 0.881725642268541, + "grad_norm": 1.3920966386795044, + "learning_rate": 2.1020896706260367e-06, + "loss": 1.4108, + "step": 1819 + }, + { + "epoch": 0.8822103732428502, + "grad_norm": 1.4291924238204956, + "learning_rate": 2.08513786266836e-06, + "loss": 1.4014, + "step": 1820 + }, + { + "epoch": 0.8826951042171595, + "grad_norm": 2.1731436252593994, + "learning_rate": 2.068251708673777e-06, + "loss": 1.4262, + "step": 1821 + }, + { + "epoch": 0.8831798351914687, + "grad_norm": 1.3496067523956299, + "learning_rate": 2.051431257023237e-06, + "loss": 1.2465, + "step": 1822 + }, + { + "epoch": 0.883664566165778, + "grad_norm": 1.3288339376449585, + "learning_rate": 2.0346765559094567e-06, + "loss": 1.0933, + "step": 1823 + }, + { + "epoch": 0.8841492971400873, + "grad_norm": 1.2372487783432007, + "learning_rate": 2.0179876533367587e-06, + "loss": 1.2749, + "step": 1824 + }, + { + "epoch": 0.8846340281143965, + "grad_norm": 1.4109827280044556, + "learning_rate": 2.0013645971209527e-06, + "loss": 1.3048, + "step": 1825 + }, + { + "epoch": 0.8851187590887057, + "grad_norm": 1.4247568845748901, + "learning_rate": 1.984807434889177e-06, + "loss": 1.3601, + "step": 1826 + }, + { + "epoch": 0.885603490063015, + "grad_norm": 1.3820182085037231, + "learning_rate": 1.9683162140798045e-06, + "loss": 1.288, + "step": 1827 + }, + { + "epoch": 0.8860882210373243, + "grad_norm": 1.4704811573028564, + "learning_rate": 1.9518909819422336e-06, + "loss": 1.4445, + "step": 1828 + }, + { + "epoch": 0.8865729520116336, + "grad_norm": 1.5304688215255737, + "learning_rate": 1.935531785536834e-06, + "loss": 1.8307, + "step": 1829 + }, + { + "epoch": 0.8870576829859428, + "grad_norm": 1.4468873739242554, + "learning_rate": 1.919238671734758e-06, + "loss": 1.3208, + "step": 1830 + }, + { + "epoch": 0.887542413960252, + "grad_norm": 1.4901260137557983, + "learning_rate": 1.9030116872178316e-06, + "loss": 1.5234, + "step": 1831 + }, + { + "epoch": 0.8880271449345614, + "grad_norm": 1.4315540790557861, + "learning_rate": 1.886850878478405e-06, + "loss": 1.2287, + "step": 1832 + }, + { + "epoch": 0.8885118759088706, + "grad_norm": 1.2842153310775757, + "learning_rate": 1.87075629181922e-06, + "loss": 1.4168, + "step": 1833 + }, + { + "epoch": 0.8889966068831798, + "grad_norm": 1.3464667797088623, + "learning_rate": 1.8547279733533042e-06, + "loss": 1.122, + "step": 1834 + }, + { + "epoch": 0.8894813378574891, + "grad_norm": 1.4574693441390991, + "learning_rate": 1.8387659690038e-06, + "loss": 1.496, + "step": 1835 + }, + { + "epoch": 0.8899660688317984, + "grad_norm": 1.374718427658081, + "learning_rate": 1.822870324503867e-06, + "loss": 1.4209, + "step": 1836 + }, + { + "epoch": 0.8904507998061076, + "grad_norm": 1.4177495241165161, + "learning_rate": 1.8070410853965104e-06, + "loss": 1.4393, + "step": 1837 + }, + { + "epoch": 0.8909355307804169, + "grad_norm": 1.4920076131820679, + "learning_rate": 1.7912782970345044e-06, + "loss": 1.4351, + "step": 1838 + }, + { + "epoch": 0.8914202617547261, + "grad_norm": 1.3668889999389648, + "learning_rate": 1.7755820045802145e-06, + "loss": 1.4363, + "step": 1839 + }, + { + "epoch": 0.8919049927290353, + "grad_norm": 1.2889249324798584, + "learning_rate": 1.7599522530055006e-06, + "loss": 1.335, + "step": 1840 + }, + { + "epoch": 0.8923897237033447, + "grad_norm": 1.3367114067077637, + "learning_rate": 1.744389087091558e-06, + "loss": 1.4699, + "step": 1841 + }, + { + "epoch": 0.8928744546776539, + "grad_norm": 1.393117904663086, + "learning_rate": 1.7288925514288262e-06, + "loss": 1.5868, + "step": 1842 + }, + { + "epoch": 0.8933591856519632, + "grad_norm": 1.229596734046936, + "learning_rate": 1.7134626904168228e-06, + "loss": 1.0758, + "step": 1843 + }, + { + "epoch": 0.8938439166262724, + "grad_norm": 1.3445252180099487, + "learning_rate": 1.6980995482640373e-06, + "loss": 1.5242, + "step": 1844 + }, + { + "epoch": 0.8943286476005817, + "grad_norm": 1.4988356828689575, + "learning_rate": 1.682803168987815e-06, + "loss": 1.5926, + "step": 1845 + }, + { + "epoch": 0.894813378574891, + "grad_norm": 1.3614110946655273, + "learning_rate": 1.6675735964142015e-06, + "loss": 1.3875, + "step": 1846 + }, + { + "epoch": 0.8952981095492002, + "grad_norm": 1.330135703086853, + "learning_rate": 1.6524108741778372e-06, + "loss": 1.4712, + "step": 1847 + }, + { + "epoch": 0.8957828405235094, + "grad_norm": 1.4532225131988525, + "learning_rate": 1.6373150457218267e-06, + "loss": 1.6974, + "step": 1848 + }, + { + "epoch": 0.8962675714978187, + "grad_norm": 1.4383496046066284, + "learning_rate": 1.6222861542976252e-06, + "loss": 1.637, + "step": 1849 + }, + { + "epoch": 0.896752302472128, + "grad_norm": 1.4017609357833862, + "learning_rate": 1.6073242429648916e-06, + "loss": 1.5166, + "step": 1850 + }, + { + "epoch": 0.8972370334464372, + "grad_norm": 1.5737212896347046, + "learning_rate": 1.5924293545913876e-06, + "loss": 1.4628, + "step": 1851 + }, + { + "epoch": 0.8977217644207465, + "grad_norm": 1.3844492435455322, + "learning_rate": 1.5776015318528403e-06, + "loss": 1.2499, + "step": 1852 + }, + { + "epoch": 0.8982064953950557, + "grad_norm": 1.4061801433563232, + "learning_rate": 1.5628408172328301e-06, + "loss": 1.5193, + "step": 1853 + }, + { + "epoch": 0.898691226369365, + "grad_norm": 1.3358350992202759, + "learning_rate": 1.5481472530226554e-06, + "loss": 1.1172, + "step": 1854 + }, + { + "epoch": 0.8991759573436743, + "grad_norm": 1.4957354068756104, + "learning_rate": 1.5335208813212375e-06, + "loss": 1.5576, + "step": 1855 + }, + { + "epoch": 0.8996606883179835, + "grad_norm": 1.3805766105651855, + "learning_rate": 1.5189617440349635e-06, + "loss": 1.4324, + "step": 1856 + }, + { + "epoch": 0.9001454192922927, + "grad_norm": 1.4622647762298584, + "learning_rate": 1.5044698828775932e-06, + "loss": 1.2463, + "step": 1857 + }, + { + "epoch": 0.900630150266602, + "grad_norm": 1.4765523672103882, + "learning_rate": 1.4900453393701358e-06, + "loss": 1.6936, + "step": 1858 + }, + { + "epoch": 0.9011148812409113, + "grad_norm": 1.4264954328536987, + "learning_rate": 1.4756881548407153e-06, + "loss": 1.4019, + "step": 1859 + }, + { + "epoch": 0.9015996122152206, + "grad_norm": 1.4087258577346802, + "learning_rate": 1.4613983704244826e-06, + "loss": 1.4478, + "step": 1860 + }, + { + "epoch": 0.9020843431895298, + "grad_norm": 1.7178231477737427, + "learning_rate": 1.44717602706346e-06, + "loss": 1.3309, + "step": 1861 + }, + { + "epoch": 0.902569074163839, + "grad_norm": 1.351010799407959, + "learning_rate": 1.4330211655064568e-06, + "loss": 1.3857, + "step": 1862 + }, + { + "epoch": 0.9030538051381484, + "grad_norm": 1.4305837154388428, + "learning_rate": 1.4189338263089241e-06, + "loss": 1.3635, + "step": 1863 + }, + { + "epoch": 0.9035385361124576, + "grad_norm": 1.2843518257141113, + "learning_rate": 1.4049140498328728e-06, + "loss": 1.1113, + "step": 1864 + }, + { + "epoch": 0.9040232670867668, + "grad_norm": 1.3791704177856445, + "learning_rate": 1.3909618762467186e-06, + "loss": 1.1953, + "step": 1865 + }, + { + "epoch": 0.9045079980610761, + "grad_norm": 1.3165531158447266, + "learning_rate": 1.3770773455251935e-06, + "loss": 1.1486, + "step": 1866 + }, + { + "epoch": 0.9049927290353854, + "grad_norm": 1.3703712224960327, + "learning_rate": 1.3632604974492257e-06, + "loss": 1.4523, + "step": 1867 + }, + { + "epoch": 0.9054774600096946, + "grad_norm": 1.28277587890625, + "learning_rate": 1.3495113716058272e-06, + "loss": 1.506, + "step": 1868 + }, + { + "epoch": 0.9059621909840039, + "grad_norm": 1.3652846813201904, + "learning_rate": 1.335830007387967e-06, + "loss": 1.3893, + "step": 1869 + }, + { + "epoch": 0.9064469219583131, + "grad_norm": 1.3692935705184937, + "learning_rate": 1.3222164439944811e-06, + "loss": 1.644, + "step": 1870 + }, + { + "epoch": 0.9069316529326223, + "grad_norm": 1.5364172458648682, + "learning_rate": 1.3086707204299414e-06, + "loss": 1.6484, + "step": 1871 + }, + { + "epoch": 0.9074163839069317, + "grad_norm": 1.4699918031692505, + "learning_rate": 1.2951928755045417e-06, + "loss": 1.593, + "step": 1872 + }, + { + "epoch": 0.9079011148812409, + "grad_norm": 1.437369465827942, + "learning_rate": 1.281782947834015e-06, + "loss": 1.5027, + "step": 1873 + }, + { + "epoch": 0.9083858458555502, + "grad_norm": 1.4166172742843628, + "learning_rate": 1.26844097583948e-06, + "loss": 1.506, + "step": 1874 + }, + { + "epoch": 0.9088705768298594, + "grad_norm": 1.340541124343872, + "learning_rate": 1.2551669977473813e-06, + "loss": 1.4797, + "step": 1875 + }, + { + "epoch": 0.9093553078041687, + "grad_norm": 1.5571600198745728, + "learning_rate": 1.241961051589316e-06, + "loss": 1.5784, + "step": 1876 + }, + { + "epoch": 0.909840038778478, + "grad_norm": 1.496361494064331, + "learning_rate": 1.2288231752019956e-06, + "loss": 1.3185, + "step": 1877 + }, + { + "epoch": 0.9103247697527872, + "grad_norm": 1.3229130506515503, + "learning_rate": 1.2157534062270798e-06, + "loss": 1.4291, + "step": 1878 + }, + { + "epoch": 0.9108095007270964, + "grad_norm": 1.5197927951812744, + "learning_rate": 1.2027517821111112e-06, + "loss": 1.3644, + "step": 1879 + }, + { + "epoch": 0.9112942317014057, + "grad_norm": 1.3992700576782227, + "learning_rate": 1.1898183401053697e-06, + "loss": 1.1666, + "step": 1880 + }, + { + "epoch": 0.911778962675715, + "grad_norm": 1.4355274438858032, + "learning_rate": 1.1769531172658e-06, + "loss": 1.487, + "step": 1881 + }, + { + "epoch": 0.9122636936500242, + "grad_norm": 1.310941219329834, + "learning_rate": 1.1641561504528803e-06, + "loss": 1.3596, + "step": 1882 + }, + { + "epoch": 0.9127484246243335, + "grad_norm": 1.495992660522461, + "learning_rate": 1.1514274763315292e-06, + "loss": 1.8062, + "step": 1883 + }, + { + "epoch": 0.9132331555986427, + "grad_norm": 2.58510684967041, + "learning_rate": 1.1387671313710075e-06, + "loss": 1.4036, + "step": 1884 + }, + { + "epoch": 0.913717886572952, + "grad_norm": 1.5952808856964111, + "learning_rate": 1.1261751518447882e-06, + "loss": 1.6931, + "step": 1885 + }, + { + "epoch": 0.9142026175472613, + "grad_norm": 1.2862507104873657, + "learning_rate": 1.113651573830482e-06, + "loss": 1.1187, + "step": 1886 + }, + { + "epoch": 0.9146873485215705, + "grad_norm": 1.285225510597229, + "learning_rate": 1.1011964332097114e-06, + "loss": 1.5083, + "step": 1887 + }, + { + "epoch": 0.9151720794958798, + "grad_norm": 1.5734096765518188, + "learning_rate": 1.0888097656680253e-06, + "loss": 1.646, + "step": 1888 + }, + { + "epoch": 0.9156568104701891, + "grad_norm": 1.4412444829940796, + "learning_rate": 1.0764916066947794e-06, + "loss": 1.5821, + "step": 1889 + }, + { + "epoch": 0.9161415414444983, + "grad_norm": 1.3290297985076904, + "learning_rate": 1.0642419915830537e-06, + "loss": 1.1081, + "step": 1890 + }, + { + "epoch": 0.9166262724188076, + "grad_norm": 1.4718834161758423, + "learning_rate": 1.0520609554295346e-06, + "loss": 1.6698, + "step": 1891 + }, + { + "epoch": 0.9171110033931168, + "grad_norm": 1.3476266860961914, + "learning_rate": 1.0399485331344273e-06, + "loss": 1.2459, + "step": 1892 + }, + { + "epoch": 0.917595734367426, + "grad_norm": 1.4318119287490845, + "learning_rate": 1.027904759401338e-06, + "loss": 1.3108, + "step": 1893 + }, + { + "epoch": 0.9180804653417354, + "grad_norm": 1.476616621017456, + "learning_rate": 1.0159296687372034e-06, + "loss": 1.7684, + "step": 1894 + }, + { + "epoch": 0.9185651963160446, + "grad_norm": 1.3496558666229248, + "learning_rate": 1.0040232954521557e-06, + "loss": 1.4406, + "step": 1895 + }, + { + "epoch": 0.9190499272903538, + "grad_norm": 1.3763827085494995, + "learning_rate": 9.92185673659457e-07, + "loss": 1.2898, + "step": 1896 + }, + { + "epoch": 0.9195346582646631, + "grad_norm": 1.430915117263794, + "learning_rate": 9.804168372753858e-07, + "loss": 1.4238, + "step": 1897 + }, + { + "epoch": 0.9200193892389724, + "grad_norm": 1.3764996528625488, + "learning_rate": 9.687168200191304e-07, + "loss": 1.5283, + "step": 1898 + }, + { + "epoch": 0.9205041202132817, + "grad_norm": 1.4979218244552612, + "learning_rate": 9.570856554127205e-07, + "loss": 1.853, + "step": 1899 + }, + { + "epoch": 0.9209888511875909, + "grad_norm": 1.4959334135055542, + "learning_rate": 9.455233767808991e-07, + "loss": 1.7275, + "step": 1900 + }, + { + "epoch": 0.9214735821619001, + "grad_norm": 1.2733063697814941, + "learning_rate": 9.340300172510586e-07, + "loss": 1.4255, + "step": 1901 + }, + { + "epoch": 0.9219583131362094, + "grad_norm": 1.451502799987793, + "learning_rate": 9.226056097531105e-07, + "loss": 1.1018, + "step": 1902 + }, + { + "epoch": 0.9224430441105187, + "grad_norm": 1.3837565183639526, + "learning_rate": 9.112501870194273e-07, + "loss": 1.7232, + "step": 1903 + }, + { + "epoch": 0.9229277750848279, + "grad_norm": 1.4380730390548706, + "learning_rate": 8.9996378158472e-07, + "loss": 1.2878, + "step": 1904 + }, + { + "epoch": 0.9234125060591372, + "grad_norm": 1.2848694324493408, + "learning_rate": 8.887464257859579e-07, + "loss": 1.2934, + "step": 1905 + }, + { + "epoch": 0.9238972370334464, + "grad_norm": 1.2803257703781128, + "learning_rate": 8.775981517622794e-07, + "loss": 1.297, + "step": 1906 + }, + { + "epoch": 0.9243819680077557, + "grad_norm": 1.3022807836532593, + "learning_rate": 8.665189914548955e-07, + "loss": 1.2849, + "step": 1907 + }, + { + "epoch": 0.924866698982065, + "grad_norm": 1.3697919845581055, + "learning_rate": 8.555089766069891e-07, + "loss": 1.3761, + "step": 1908 + }, + { + "epoch": 0.9253514299563742, + "grad_norm": 1.5067558288574219, + "learning_rate": 8.445681387636406e-07, + "loss": 1.31, + "step": 1909 + }, + { + "epoch": 0.9258361609306834, + "grad_norm": 1.403795838356018, + "learning_rate": 8.336965092717281e-07, + "loss": 1.4403, + "step": 1910 + }, + { + "epoch": 0.9263208919049928, + "grad_norm": 1.440237283706665, + "learning_rate": 8.228941192798323e-07, + "loss": 1.7129, + "step": 1911 + }, + { + "epoch": 0.926805622879302, + "grad_norm": 1.4832885265350342, + "learning_rate": 8.121609997381652e-07, + "loss": 1.5841, + "step": 1912 + }, + { + "epoch": 0.9272903538536112, + "grad_norm": 1.4952884912490845, + "learning_rate": 8.014971813984611e-07, + "loss": 1.6965, + "step": 1913 + }, + { + "epoch": 0.9277750848279205, + "grad_norm": 1.588004231452942, + "learning_rate": 7.909026948139081e-07, + "loss": 1.5482, + "step": 1914 + }, + { + "epoch": 0.9282598158022297, + "grad_norm": 1.387260913848877, + "learning_rate": 7.803775703390359e-07, + "loss": 1.2515, + "step": 1915 + }, + { + "epoch": 0.9287445467765391, + "grad_norm": 1.292482852935791, + "learning_rate": 7.699218381296531e-07, + "loss": 1.3326, + "step": 1916 + }, + { + "epoch": 0.9292292777508483, + "grad_norm": 1.427830696105957, + "learning_rate": 7.595355281427435e-07, + "loss": 1.5905, + "step": 1917 + }, + { + "epoch": 0.9297140087251575, + "grad_norm": 1.3774360418319702, + "learning_rate": 7.492186701364007e-07, + "loss": 1.6201, + "step": 1918 + }, + { + "epoch": 0.9301987396994668, + "grad_norm": 1.4959042072296143, + "learning_rate": 7.389712936697129e-07, + "loss": 1.4392, + "step": 1919 + }, + { + "epoch": 0.9306834706737761, + "grad_norm": 1.2562229633331299, + "learning_rate": 7.287934281027114e-07, + "loss": 1.2257, + "step": 1920 + }, + { + "epoch": 0.9311682016480853, + "grad_norm": 1.3972541093826294, + "learning_rate": 7.186851025962532e-07, + "loss": 1.1484, + "step": 1921 + }, + { + "epoch": 0.9316529326223946, + "grad_norm": 1.3372113704681396, + "learning_rate": 7.086463461119658e-07, + "loss": 1.2652, + "step": 1922 + }, + { + "epoch": 0.9321376635967038, + "grad_norm": 1.3388514518737793, + "learning_rate": 6.9867718741215e-07, + "loss": 1.5144, + "step": 1923 + }, + { + "epoch": 0.932622394571013, + "grad_norm": 1.3194801807403564, + "learning_rate": 6.887776550597025e-07, + "loss": 1.1255, + "step": 1924 + }, + { + "epoch": 0.9331071255453224, + "grad_norm": 1.4161765575408936, + "learning_rate": 6.789477774180236e-07, + "loss": 1.7097, + "step": 1925 + }, + { + "epoch": 0.9335918565196316, + "grad_norm": 1.3846325874328613, + "learning_rate": 6.691875826509514e-07, + "loss": 1.445, + "step": 1926 + }, + { + "epoch": 0.9340765874939408, + "grad_norm": 1.311930537223816, + "learning_rate": 6.59497098722675e-07, + "loss": 1.406, + "step": 1927 + }, + { + "epoch": 0.9345613184682501, + "grad_norm": 1.3026055097579956, + "learning_rate": 6.498763533976437e-07, + "loss": 1.2652, + "step": 1928 + }, + { + "epoch": 0.9350460494425594, + "grad_norm": 1.4192198514938354, + "learning_rate": 6.403253742405107e-07, + "loss": 1.6227, + "step": 1929 + }, + { + "epoch": 0.9355307804168687, + "grad_norm": 1.3232550621032715, + "learning_rate": 6.308441886160254e-07, + "loss": 1.4756, + "step": 1930 + }, + { + "epoch": 0.9360155113911779, + "grad_norm": 1.3774874210357666, + "learning_rate": 6.214328236889861e-07, + "loss": 1.3581, + "step": 1931 + }, + { + "epoch": 0.9365002423654871, + "grad_norm": 1.381434679031372, + "learning_rate": 6.120913064241313e-07, + "loss": 1.2108, + "step": 1932 + }, + { + "epoch": 0.9369849733397965, + "grad_norm": 1.3841496706008911, + "learning_rate": 6.02819663586085e-07, + "loss": 1.5833, + "step": 1933 + }, + { + "epoch": 0.9374697043141057, + "grad_norm": 1.4273308515548706, + "learning_rate": 5.936179217392673e-07, + "loss": 1.3397, + "step": 1934 + }, + { + "epoch": 0.9379544352884149, + "grad_norm": 1.4026015996932983, + "learning_rate": 5.844861072478336e-07, + "loss": 1.6483, + "step": 1935 + }, + { + "epoch": 0.9384391662627242, + "grad_norm": 1.4809978008270264, + "learning_rate": 5.754242462755771e-07, + "loss": 1.4236, + "step": 1936 + }, + { + "epoch": 0.9389238972370334, + "grad_norm": 1.4197194576263428, + "learning_rate": 5.664323647858655e-07, + "loss": 1.5785, + "step": 1937 + }, + { + "epoch": 0.9394086282113427, + "grad_norm": 1.4222776889801025, + "learning_rate": 5.575104885415794e-07, + "loss": 1.5917, + "step": 1938 + }, + { + "epoch": 0.939893359185652, + "grad_norm": 1.4109939336776733, + "learning_rate": 5.486586431050072e-07, + "loss": 1.5329, + "step": 1939 + }, + { + "epoch": 0.9403780901599612, + "grad_norm": 1.3708782196044922, + "learning_rate": 5.398768538378063e-07, + "loss": 1.1433, + "step": 1940 + }, + { + "epoch": 0.9408628211342704, + "grad_norm": 1.3859466314315796, + "learning_rate": 5.311651459009054e-07, + "loss": 1.4556, + "step": 1941 + }, + { + "epoch": 0.9413475521085798, + "grad_norm": 1.3821712732315063, + "learning_rate": 5.225235442544468e-07, + "loss": 1.654, + "step": 1942 + }, + { + "epoch": 0.941832283082889, + "grad_norm": 1.4499861001968384, + "learning_rate": 5.139520736577058e-07, + "loss": 1.7619, + "step": 1943 + }, + { + "epoch": 0.9423170140571983, + "grad_norm": 1.6326532363891602, + "learning_rate": 5.05450758669021e-07, + "loss": 1.8829, + "step": 1944 + }, + { + "epoch": 0.9428017450315075, + "grad_norm": 1.4821033477783203, + "learning_rate": 4.97019623645728e-07, + "loss": 1.3496, + "step": 1945 + }, + { + "epoch": 0.9432864760058167, + "grad_norm": 1.308059573173523, + "learning_rate": 4.886586927440956e-07, + "loss": 1.3063, + "step": 1946 + }, + { + "epoch": 0.9437712069801261, + "grad_norm": 1.3683174848556519, + "learning_rate": 4.803679899192392e-07, + "loss": 1.4177, + "step": 1947 + }, + { + "epoch": 0.9442559379544353, + "grad_norm": 1.246372938156128, + "learning_rate": 4.7214753892506625e-07, + "loss": 1.2904, + "step": 1948 + }, + { + "epoch": 0.9447406689287445, + "grad_norm": 1.3812105655670166, + "learning_rate": 4.6399736331420305e-07, + "loss": 1.4883, + "step": 1949 + }, + { + "epoch": 0.9452253999030538, + "grad_norm": 1.4763976335525513, + "learning_rate": 4.559174864379234e-07, + "loss": 1.5377, + "step": 1950 + }, + { + "epoch": 0.9457101308773631, + "grad_norm": 1.4200667142868042, + "learning_rate": 4.4790793144610097e-07, + "loss": 1.2448, + "step": 1951 + }, + { + "epoch": 0.9461948618516723, + "grad_norm": 1.5260330438613892, + "learning_rate": 4.399687212871123e-07, + "loss": 1.33, + "step": 1952 + }, + { + "epoch": 0.9466795928259816, + "grad_norm": 1.5376918315887451, + "learning_rate": 4.320998787077923e-07, + "loss": 1.3794, + "step": 1953 + }, + { + "epoch": 0.9471643238002908, + "grad_norm": 1.408273696899414, + "learning_rate": 4.243014262533679e-07, + "loss": 1.1804, + "step": 1954 + }, + { + "epoch": 0.9476490547746, + "grad_norm": 1.395007848739624, + "learning_rate": 4.165733862673854e-07, + "loss": 1.2267, + "step": 1955 + }, + { + "epoch": 0.9481337857489094, + "grad_norm": 1.4566540718078613, + "learning_rate": 4.0891578089164996e-07, + "loss": 1.2163, + "step": 1956 + }, + { + "epoch": 0.9486185167232186, + "grad_norm": 1.295602560043335, + "learning_rate": 4.0132863206616965e-07, + "loss": 1.1429, + "step": 1957 + }, + { + "epoch": 0.9491032476975279, + "grad_norm": 1.410298228263855, + "learning_rate": 3.938119615290753e-07, + "loss": 1.5398, + "step": 1958 + }, + { + "epoch": 0.9495879786718371, + "grad_norm": 1.3872941732406616, + "learning_rate": 3.8636579081657577e-07, + "loss": 1.1388, + "step": 1959 + }, + { + "epoch": 0.9500727096461464, + "grad_norm": 1.5268107652664185, + "learning_rate": 3.7899014126288876e-07, + "loss": 1.8655, + "step": 1960 + }, + { + "epoch": 0.9505574406204557, + "grad_norm": 1.472521424293518, + "learning_rate": 3.716850340001715e-07, + "loss": 1.7315, + "step": 1961 + }, + { + "epoch": 0.9510421715947649, + "grad_norm": 1.4450392723083496, + "learning_rate": 3.644504899584844e-07, + "loss": 1.3071, + "step": 1962 + }, + { + "epoch": 0.9515269025690741, + "grad_norm": 1.436614990234375, + "learning_rate": 3.5728652986570245e-07, + "loss": 1.4648, + "step": 1963 + }, + { + "epoch": 0.9520116335433835, + "grad_norm": 1.3284350633621216, + "learning_rate": 3.5019317424747064e-07, + "loss": 1.1009, + "step": 1964 + }, + { + "epoch": 0.9524963645176927, + "grad_norm": 1.3217493295669556, + "learning_rate": 3.43170443427146e-07, + "loss": 1.303, + "step": 1965 + }, + { + "epoch": 0.9529810954920019, + "grad_norm": 1.4290056228637695, + "learning_rate": 3.3621835752573884e-07, + "loss": 1.4079, + "step": 1966 + }, + { + "epoch": 0.9534658264663112, + "grad_norm": 1.4261441230773926, + "learning_rate": 3.293369364618465e-07, + "loss": 1.241, + "step": 1967 + }, + { + "epoch": 0.9539505574406204, + "grad_norm": 1.4014172554016113, + "learning_rate": 3.2252619995160885e-07, + "loss": 1.641, + "step": 1968 + }, + { + "epoch": 0.9544352884149298, + "grad_norm": 1.4024168252944946, + "learning_rate": 3.1578616750863875e-07, + "loss": 1.3255, + "step": 1969 + }, + { + "epoch": 0.954920019389239, + "grad_norm": 1.462719440460205, + "learning_rate": 3.0911685844398353e-07, + "loss": 1.3231, + "step": 1970 + }, + { + "epoch": 0.9554047503635482, + "grad_norm": 1.350813865661621, + "learning_rate": 3.025182918660496e-07, + "loss": 1.1891, + "step": 1971 + }, + { + "epoch": 0.9558894813378574, + "grad_norm": 1.3725783824920654, + "learning_rate": 2.9599048668055853e-07, + "loss": 1.2594, + "step": 1972 + }, + { + "epoch": 0.9563742123121668, + "grad_norm": 1.4037284851074219, + "learning_rate": 2.8953346159049375e-07, + "loss": 1.3825, + "step": 1973 + }, + { + "epoch": 0.956858943286476, + "grad_norm": 2.3289217948913574, + "learning_rate": 2.831472350960485e-07, + "loss": 1.4553, + "step": 1974 + }, + { + "epoch": 0.9573436742607853, + "grad_norm": 1.3470871448516846, + "learning_rate": 2.7683182549456123e-07, + "loss": 1.6123, + "step": 1975 + }, + { + "epoch": 0.9578284052350945, + "grad_norm": 1.5660077333450317, + "learning_rate": 2.705872508804747e-07, + "loss": 1.5749, + "step": 1976 + }, + { + "epoch": 0.9583131362094037, + "grad_norm": 1.4341754913330078, + "learning_rate": 2.644135291452854e-07, + "loss": 1.1978, + "step": 1977 + }, + { + "epoch": 0.9587978671837131, + "grad_norm": 1.4906240701675415, + "learning_rate": 2.5831067797747746e-07, + "loss": 1.6585, + "step": 1978 + }, + { + "epoch": 0.9592825981580223, + "grad_norm": 1.3474236726760864, + "learning_rate": 2.5227871486249164e-07, + "loss": 1.3896, + "step": 1979 + }, + { + "epoch": 0.9597673291323315, + "grad_norm": 1.4273277521133423, + "learning_rate": 2.463176570826592e-07, + "loss": 1.5367, + "step": 1980 + }, + { + "epoch": 0.9602520601066408, + "grad_norm": 1.3798199892044067, + "learning_rate": 2.404275217171625e-07, + "loss": 1.129, + "step": 1981 + }, + { + "epoch": 0.9607367910809501, + "grad_norm": 1.284221887588501, + "learning_rate": 2.3460832564197455e-07, + "loss": 1.2853, + "step": 1982 + }, + { + "epoch": 0.9612215220552593, + "grad_norm": 1.366292953491211, + "learning_rate": 2.288600855298306e-07, + "loss": 1.2893, + "step": 1983 + }, + { + "epoch": 0.9617062530295686, + "grad_norm": 1.3929351568222046, + "learning_rate": 2.2318281785015936e-07, + "loss": 1.5092, + "step": 1984 + }, + { + "epoch": 0.9621909840038778, + "grad_norm": 1.2522450685501099, + "learning_rate": 2.1757653886904927e-07, + "loss": 1.39, + "step": 1985 + }, + { + "epoch": 0.9626757149781872, + "grad_norm": 1.3403209447860718, + "learning_rate": 2.120412646491904e-07, + "loss": 1.3051, + "step": 1986 + }, + { + "epoch": 0.9631604459524964, + "grad_norm": 1.3888131380081177, + "learning_rate": 2.0657701104984384e-07, + "loss": 1.4746, + "step": 1987 + }, + { + "epoch": 0.9636451769268056, + "grad_norm": 1.4653536081314087, + "learning_rate": 2.0118379372678354e-07, + "loss": 1.8005, + "step": 1988 + }, + { + "epoch": 0.9641299079011149, + "grad_norm": 1.3622173070907593, + "learning_rate": 1.9586162813225174e-07, + "loss": 1.366, + "step": 1989 + }, + { + "epoch": 0.9646146388754241, + "grad_norm": 1.3387054204940796, + "learning_rate": 1.9061052951492575e-07, + "loss": 1.4404, + "step": 1990 + }, + { + "epoch": 0.9650993698497334, + "grad_norm": 1.361503005027771, + "learning_rate": 1.8543051291986247e-07, + "loss": 1.4713, + "step": 1991 + }, + { + "epoch": 0.9655841008240427, + "grad_norm": 1.3547799587249756, + "learning_rate": 1.80321593188465e-07, + "loss": 1.2779, + "step": 1992 + }, + { + "epoch": 0.9660688317983519, + "grad_norm": 1.6397618055343628, + "learning_rate": 1.7528378495842435e-07, + "loss": 1.4227, + "step": 1993 + }, + { + "epoch": 0.9665535627726611, + "grad_norm": 1.4830430746078491, + "learning_rate": 1.7031710266370016e-07, + "loss": 1.5416, + "step": 1994 + }, + { + "epoch": 0.9670382937469705, + "grad_norm": 1.4233936071395874, + "learning_rate": 1.6542156053446223e-07, + "loss": 1.2741, + "step": 1995 + }, + { + "epoch": 0.9675230247212797, + "grad_norm": 1.3970363140106201, + "learning_rate": 1.6059717259705175e-07, + "loss": 1.2843, + "step": 1996 + }, + { + "epoch": 0.968007755695589, + "grad_norm": 1.4266377687454224, + "learning_rate": 1.5584395267394802e-07, + "loss": 1.2624, + "step": 1997 + }, + { + "epoch": 0.9684924866698982, + "grad_norm": 1.3139270544052124, + "learning_rate": 1.5116191438372394e-07, + "loss": 1.3138, + "step": 1998 + }, + { + "epoch": 0.9689772176442074, + "grad_norm": 1.3843278884887695, + "learning_rate": 1.4655107114101007e-07, + "loss": 1.3294, + "step": 1999 + }, + { + "epoch": 0.9694619486185168, + "grad_norm": 1.2902089357376099, + "learning_rate": 1.4201143615645006e-07, + "loss": 1.3585, + "step": 2000 + }, + { + "epoch": 0.969946679592826, + "grad_norm": 1.469571590423584, + "learning_rate": 1.3754302243667304e-07, + "loss": 1.4797, + "step": 2001 + }, + { + "epoch": 0.9704314105671352, + "grad_norm": 1.3052209615707397, + "learning_rate": 1.331458427842408e-07, + "loss": 1.3906, + "step": 2002 + }, + { + "epoch": 0.9709161415414445, + "grad_norm": 1.5683739185333252, + "learning_rate": 1.2881990979763393e-07, + "loss": 1.5129, + "step": 2003 + }, + { + "epoch": 0.9714008725157538, + "grad_norm": 1.4021413326263428, + "learning_rate": 1.2456523587118517e-07, + "loss": 1.2832, + "step": 2004 + }, + { + "epoch": 0.971885603490063, + "grad_norm": 1.4269837141036987, + "learning_rate": 1.2038183319507955e-07, + "loss": 1.3047, + "step": 2005 + }, + { + "epoch": 0.9723703344643723, + "grad_norm": 1.4196062088012695, + "learning_rate": 1.1626971375528484e-07, + "loss": 1.2863, + "step": 2006 + }, + { + "epoch": 0.9728550654386815, + "grad_norm": 1.4747346639633179, + "learning_rate": 1.1222888933354602e-07, + "loss": 1.4186, + "step": 2007 + }, + { + "epoch": 0.9733397964129908, + "grad_norm": 1.3612735271453857, + "learning_rate": 1.0825937150732989e-07, + "loss": 1.5173, + "step": 2008 + }, + { + "epoch": 0.9738245273873001, + "grad_norm": 1.586757779121399, + "learning_rate": 1.043611716498083e-07, + "loss": 1.3961, + "step": 2009 + }, + { + "epoch": 0.9743092583616093, + "grad_norm": 1.2929972410202026, + "learning_rate": 1.0053430092981097e-07, + "loss": 1.0006, + "step": 2010 + }, + { + "epoch": 0.9747939893359185, + "grad_norm": 1.3124991655349731, + "learning_rate": 9.677877031180615e-08, + "loss": 1.1774, + "step": 2011 + }, + { + "epoch": 0.9752787203102278, + "grad_norm": 1.4618499279022217, + "learning_rate": 9.30945905558589e-08, + "loss": 1.5174, + "step": 2012 + }, + { + "epoch": 0.9757634512845371, + "grad_norm": 1.536896824836731, + "learning_rate": 8.948177221760889e-08, + "loss": 2.057, + "step": 2013 + }, + { + "epoch": 0.9762481822588464, + "grad_norm": 1.474232792854309, + "learning_rate": 8.594032564823717e-08, + "loss": 1.3491, + "step": 2014 + }, + { + "epoch": 0.9767329132331556, + "grad_norm": 1.3744844198226929, + "learning_rate": 8.247026099443277e-08, + "loss": 1.4189, + "step": 2015 + }, + { + "epoch": 0.9772176442074648, + "grad_norm": 1.3619568347930908, + "learning_rate": 7.907158819836503e-08, + "loss": 1.4581, + "step": 2016 + }, + { + "epoch": 0.9777023751817742, + "grad_norm": 1.3401083946228027, + "learning_rate": 7.574431699766127e-08, + "loss": 1.523, + "step": 2017 + }, + { + "epoch": 0.9781871061560834, + "grad_norm": 1.4377765655517578, + "learning_rate": 7.248845692537088e-08, + "loss": 1.4517, + "step": 2018 + }, + { + "epoch": 0.9786718371303926, + "grad_norm": 1.4789650440216064, + "learning_rate": 6.930401730994573e-08, + "loss": 1.5122, + "step": 2019 + }, + { + "epoch": 0.9791565681047019, + "grad_norm": 1.4197337627410889, + "learning_rate": 6.619100727520422e-08, + "loss": 1.2408, + "step": 2020 + }, + { + "epoch": 0.9796412990790111, + "grad_norm": 1.294545292854309, + "learning_rate": 6.314943574030896e-08, + "loss": 1.4684, + "step": 2021 + }, + { + "epoch": 0.9801260300533204, + "grad_norm": 1.500753402709961, + "learning_rate": 6.01793114197502e-08, + "loss": 1.6566, + "step": 2022 + }, + { + "epoch": 0.9806107610276297, + "grad_norm": 1.4268614053726196, + "learning_rate": 5.728064282330137e-08, + "loss": 1.4571, + "step": 2023 + }, + { + "epoch": 0.9810954920019389, + "grad_norm": 1.3109159469604492, + "learning_rate": 5.4453438256019115e-08, + "loss": 1.17, + "step": 2024 + }, + { + "epoch": 0.9815802229762481, + "grad_norm": 1.3804876804351807, + "learning_rate": 5.169770581819888e-08, + "loss": 1.3375, + "step": 2025 + }, + { + "epoch": 0.9820649539505575, + "grad_norm": 1.758589267730713, + "learning_rate": 4.901345340535824e-08, + "loss": 1.6656, + "step": 2026 + }, + { + "epoch": 0.9825496849248667, + "grad_norm": 1.500661015510559, + "learning_rate": 4.6400688708217455e-08, + "loss": 1.522, + "step": 2027 + }, + { + "epoch": 0.983034415899176, + "grad_norm": 1.3010058403015137, + "learning_rate": 4.385941921268011e-08, + "loss": 1.075, + "step": 2028 + }, + { + "epoch": 0.9835191468734852, + "grad_norm": 1.2376115322113037, + "learning_rate": 4.138965219979973e-08, + "loss": 1.0122, + "step": 2029 + }, + { + "epoch": 0.9840038778477945, + "grad_norm": 1.4325613975524902, + "learning_rate": 3.8991394745771516e-08, + "loss": 1.5664, + "step": 2030 + }, + { + "epoch": 0.9844886088221038, + "grad_norm": 1.3779929876327515, + "learning_rate": 3.666465372190453e-08, + "loss": 1.2317, + "step": 2031 + }, + { + "epoch": 0.984973339796413, + "grad_norm": 1.4506980180740356, + "learning_rate": 3.440943579460232e-08, + "loss": 1.3087, + "step": 2032 + }, + { + "epoch": 0.9854580707707222, + "grad_norm": 1.5038971900939941, + "learning_rate": 3.2225747425351785e-08, + "loss": 1.6422, + "step": 2033 + }, + { + "epoch": 0.9859428017450315, + "grad_norm": 1.4041410684585571, + "learning_rate": 3.011359487068987e-08, + "loss": 1.5016, + "step": 2034 + }, + { + "epoch": 0.9864275327193408, + "grad_norm": 1.475483775138855, + "learning_rate": 2.807298418220361e-08, + "loss": 1.5435, + "step": 2035 + }, + { + "epoch": 0.98691226369365, + "grad_norm": 1.4113868474960327, + "learning_rate": 2.6103921206499517e-08, + "loss": 1.5201, + "step": 2036 + }, + { + "epoch": 0.9873969946679593, + "grad_norm": 1.4753867387771606, + "learning_rate": 2.4206411585186996e-08, + "loss": 1.6761, + "step": 2037 + }, + { + "epoch": 0.9878817256422685, + "grad_norm": 1.380028247833252, + "learning_rate": 2.2380460754875544e-08, + "loss": 1.2831, + "step": 2038 + }, + { + "epoch": 0.9883664566165778, + "grad_norm": 1.2903367280960083, + "learning_rate": 2.0626073947138668e-08, + "loss": 1.235, + "step": 2039 + }, + { + "epoch": 0.9888511875908871, + "grad_norm": 1.7924153804779053, + "learning_rate": 1.8943256188516667e-08, + "loss": 1.4112, + "step": 2040 + }, + { + "epoch": 0.9893359185651963, + "grad_norm": 1.4166916608810425, + "learning_rate": 1.7332012300494417e-08, + "loss": 1.651, + "step": 2041 + }, + { + "epoch": 0.9898206495395055, + "grad_norm": 1.2969902753829956, + "learning_rate": 1.5792346899490275e-08, + "loss": 1.2091, + "step": 2042 + }, + { + "epoch": 0.9903053805138148, + "grad_norm": 1.426344633102417, + "learning_rate": 1.4324264396836651e-08, + "loss": 1.4629, + "step": 2043 + }, + { + "epoch": 0.9907901114881241, + "grad_norm": 1.4453415870666504, + "learning_rate": 1.2927768998774458e-08, + "loss": 1.4487, + "step": 2044 + }, + { + "epoch": 0.9912748424624334, + "grad_norm": 1.3389601707458496, + "learning_rate": 1.1602864706442008e-08, + "loss": 1.2715, + "step": 2045 + }, + { + "epoch": 0.9917595734367426, + "grad_norm": 1.382049560546875, + "learning_rate": 1.0349555315855574e-08, + "loss": 1.3052, + "step": 2046 + }, + { + "epoch": 0.9922443044110518, + "grad_norm": 1.3508886098861694, + "learning_rate": 9.167844417901083e-09, + "loss": 1.4453, + "step": 2047 + }, + { + "epoch": 0.9927290353853612, + "grad_norm": 1.3657429218292236, + "learning_rate": 8.057735398331324e-09, + "loss": 1.4893, + "step": 2048 + }, + { + "epoch": 0.9932137663596704, + "grad_norm": 1.395776629447937, + "learning_rate": 7.0192314377520715e-09, + "loss": 1.283, + "step": 2049 + }, + { + "epoch": 0.9936984973339796, + "grad_norm": 1.4177159070968628, + "learning_rate": 6.052335511599893e-09, + "loss": 1.2549, + "step": 2050 + }, + { + "epoch": 0.9941832283082889, + "grad_norm": 1.4473752975463867, + "learning_rate": 5.1570503901587905e-09, + "loss": 1.49, + "step": 2051 + }, + { + "epoch": 0.9946679592825982, + "grad_norm": 1.3230009078979492, + "learning_rate": 4.333378638532448e-09, + "loss": 1.7049, + "step": 2052 + }, + { + "epoch": 0.9951526902569074, + "grad_norm": 1.2757775783538818, + "learning_rate": 3.581322616641458e-09, + "loss": 1.1337, + "step": 2053 + }, + { + "epoch": 0.9956374212312167, + "grad_norm": 1.4444496631622314, + "learning_rate": 2.9008844792260957e-09, + "loss": 1.6056, + "step": 2054 + }, + { + "epoch": 0.9961221522055259, + "grad_norm": 1.566375970840454, + "learning_rate": 2.292066175821339e-09, + "loss": 1.3957, + "step": 2055 + }, + { + "epoch": 0.9966068831798351, + "grad_norm": 1.4305633306503296, + "learning_rate": 1.754869450773522e-09, + "loss": 1.4048, + "step": 2056 + }, + { + "epoch": 0.9970916141541445, + "grad_norm": 1.446074366569519, + "learning_rate": 1.2892958432153546e-09, + "loss": 1.6426, + "step": 2057 + }, + { + "epoch": 0.9975763451284537, + "grad_norm": 1.418178915977478, + "learning_rate": 8.953466870742499e-10, + "loss": 1.2807, + "step": 2058 + }, + { + "epoch": 0.998061076102763, + "grad_norm": 1.4988263845443726, + "learning_rate": 5.730231110639972e-10, + "loss": 1.61, + "step": 2059 + }, + { + "epoch": 0.9985458070770722, + "grad_norm": 1.3278765678405762, + "learning_rate": 3.2232603868476153e-10, + "loss": 1.1255, + "step": 2060 + }, + { + "epoch": 0.9990305380513815, + "grad_norm": 1.492288589477539, + "learning_rate": 1.4325618821198207e-10, + "loss": 1.5154, + "step": 2061 + }, + { + "epoch": 0.9995152690256908, + "grad_norm": 1.3655840158462524, + "learning_rate": 3.5814072707474054e-11, + "loss": 1.4809, + "step": 2062 + }, + { + "epoch": 1.0, + "grad_norm": 1.489664912223816, + "learning_rate": 0.0, + "loss": 1.4304, + "step": 2063 + } + ], + "logging_steps": 1, + "max_steps": 2063, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8633303665737728.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}