diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15079 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998837479655894, + "eval_steps": 500, + "global_step": 2148, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013950244129272262, + "grad_norm": 9.061795042102073, + "learning_rate": 0.0, + "loss": 1.2425, + "step": 1 + }, + { + "epoch": 0.0027900488258544524, + "grad_norm": 7.74265471964034, + "learning_rate": 4.6511627906976744e-08, + "loss": 1.1023, + "step": 2 + }, + { + "epoch": 0.0041850732387816785, + "grad_norm": 6.389720437917294, + "learning_rate": 9.302325581395349e-08, + "loss": 0.9673, + "step": 3 + }, + { + "epoch": 0.005580097651708905, + "grad_norm": 8.421817121439247, + "learning_rate": 1.3953488372093024e-07, + "loss": 1.1736, + "step": 4 + }, + { + "epoch": 0.006975122064636131, + "grad_norm": 7.580574098658975, + "learning_rate": 1.8604651162790698e-07, + "loss": 1.0903, + "step": 5 + }, + { + "epoch": 0.008370146477563357, + "grad_norm": 8.104772232891781, + "learning_rate": 2.3255813953488374e-07, + "loss": 1.1323, + "step": 6 + }, + { + "epoch": 0.009765170890490584, + "grad_norm": 8.156214672673578, + "learning_rate": 2.790697674418605e-07, + "loss": 1.218, + "step": 7 + }, + { + "epoch": 0.01116019530341781, + "grad_norm": 8.526741313250225, + "learning_rate": 3.2558139534883724e-07, + "loss": 1.231, + "step": 8 + }, + { + "epoch": 0.012555219716345037, + "grad_norm": 6.989791978173938, + "learning_rate": 3.7209302325581396e-07, + "loss": 1.0199, + "step": 9 + }, + { + "epoch": 0.013950244129272262, + "grad_norm": 7.652571631562263, + "learning_rate": 4.186046511627907e-07, + "loss": 1.0509, + "step": 10 + }, + { + "epoch": 0.015345268542199489, + "grad_norm": 8.029656913285312, + "learning_rate": 4.651162790697675e-07, + "loss": 1.1003, + "step": 11 + }, + { + "epoch": 0.016740292955126714, + "grad_norm": 6.822970986263167, + "learning_rate": 5.116279069767442e-07, + "loss": 1.0076, + "step": 12 + }, + { + "epoch": 0.01813531736805394, + "grad_norm": 7.446027562629797, + "learning_rate": 5.58139534883721e-07, + "loss": 1.1319, + "step": 13 + }, + { + "epoch": 0.01953034178098117, + "grad_norm": 8.32595611951422, + "learning_rate": 6.046511627906977e-07, + "loss": 1.1838, + "step": 14 + }, + { + "epoch": 0.020925366193908394, + "grad_norm": 6.223433909698243, + "learning_rate": 6.511627906976745e-07, + "loss": 0.9731, + "step": 15 + }, + { + "epoch": 0.02232039060683562, + "grad_norm": 7.0235311966869585, + "learning_rate": 6.976744186046513e-07, + "loss": 1.0504, + "step": 16 + }, + { + "epoch": 0.023715415019762844, + "grad_norm": 5.211705448894807, + "learning_rate": 7.441860465116279e-07, + "loss": 0.8548, + "step": 17 + }, + { + "epoch": 0.025110439432690073, + "grad_norm": 7.825683956735391, + "learning_rate": 7.906976744186047e-07, + "loss": 1.1983, + "step": 18 + }, + { + "epoch": 0.0265054638456173, + "grad_norm": 5.0106067786805575, + "learning_rate": 8.372093023255814e-07, + "loss": 0.9202, + "step": 19 + }, + { + "epoch": 0.027900488258544524, + "grad_norm": 6.830548485045446, + "learning_rate": 8.837209302325582e-07, + "loss": 1.0445, + "step": 20 + }, + { + "epoch": 0.029295512671471752, + "grad_norm": 5.726619544487671, + "learning_rate": 9.30232558139535e-07, + "loss": 0.9837, + "step": 21 + }, + { + "epoch": 0.030690537084398978, + "grad_norm": 5.8000958536016265, + "learning_rate": 9.767441860465117e-07, + "loss": 1.0272, + "step": 22 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 5.703395850340747, + "learning_rate": 1.0232558139534884e-06, + "loss": 1.0211, + "step": 23 + }, + { + "epoch": 0.03348058591025343, + "grad_norm": 3.586220906876636, + "learning_rate": 1.0697674418604653e-06, + "loss": 0.834, + "step": 24 + }, + { + "epoch": 0.03487561032318066, + "grad_norm": 4.9748168215918, + "learning_rate": 1.116279069767442e-06, + "loss": 0.9711, + "step": 25 + }, + { + "epoch": 0.03627063473610788, + "grad_norm": 4.018635863811052, + "learning_rate": 1.1627906976744188e-06, + "loss": 0.9558, + "step": 26 + }, + { + "epoch": 0.03766565914903511, + "grad_norm": 5.03749104808541, + "learning_rate": 1.2093023255813954e-06, + "loss": 1.0273, + "step": 27 + }, + { + "epoch": 0.03906068356196234, + "grad_norm": 3.9324782587964053, + "learning_rate": 1.2558139534883723e-06, + "loss": 0.9129, + "step": 28 + }, + { + "epoch": 0.04045570797488956, + "grad_norm": 3.550424744823837, + "learning_rate": 1.302325581395349e-06, + "loss": 0.8896, + "step": 29 + }, + { + "epoch": 0.04185073238781679, + "grad_norm": 3.446948058404404, + "learning_rate": 1.3488372093023258e-06, + "loss": 0.8867, + "step": 30 + }, + { + "epoch": 0.043245756800744016, + "grad_norm": 4.032262199203236, + "learning_rate": 1.3953488372093025e-06, + "loss": 0.8187, + "step": 31 + }, + { + "epoch": 0.04464078121367124, + "grad_norm": 4.125895000157052, + "learning_rate": 1.4418604651162794e-06, + "loss": 0.899, + "step": 32 + }, + { + "epoch": 0.04603580562659847, + "grad_norm": 3.8704377788889506, + "learning_rate": 1.4883720930232558e-06, + "loss": 0.9593, + "step": 33 + }, + { + "epoch": 0.04743083003952569, + "grad_norm": 3.9800203887919126, + "learning_rate": 1.534883720930233e-06, + "loss": 0.9171, + "step": 34 + }, + { + "epoch": 0.04882585445245292, + "grad_norm": 3.6904834931603605, + "learning_rate": 1.5813953488372093e-06, + "loss": 0.913, + "step": 35 + }, + { + "epoch": 0.050220878865380146, + "grad_norm": 3.63034156122338, + "learning_rate": 1.6279069767441862e-06, + "loss": 0.9465, + "step": 36 + }, + { + "epoch": 0.05161590327830737, + "grad_norm": 3.1225625019387353, + "learning_rate": 1.6744186046511629e-06, + "loss": 0.8543, + "step": 37 + }, + { + "epoch": 0.0530109276912346, + "grad_norm": 3.0282628625387824, + "learning_rate": 1.7209302325581397e-06, + "loss": 0.7939, + "step": 38 + }, + { + "epoch": 0.054405952104161825, + "grad_norm": 3.1020850997672964, + "learning_rate": 1.7674418604651164e-06, + "loss": 0.9034, + "step": 39 + }, + { + "epoch": 0.05580097651708905, + "grad_norm": 3.352063702118774, + "learning_rate": 1.8139534883720933e-06, + "loss": 0.8522, + "step": 40 + }, + { + "epoch": 0.057196000930016276, + "grad_norm": 3.1699146178254916, + "learning_rate": 1.86046511627907e-06, + "loss": 0.8169, + "step": 41 + }, + { + "epoch": 0.058591025342943505, + "grad_norm": 3.158963197691139, + "learning_rate": 1.9069767441860468e-06, + "loss": 0.8102, + "step": 42 + }, + { + "epoch": 0.05998604975587073, + "grad_norm": 3.2617484927812606, + "learning_rate": 1.9534883720930235e-06, + "loss": 0.792, + "step": 43 + }, + { + "epoch": 0.061381074168797956, + "grad_norm": 3.087539810163175, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.826, + "step": 44 + }, + { + "epoch": 0.06277609858172518, + "grad_norm": 3.3949003321789726, + "learning_rate": 2.0465116279069768e-06, + "loss": 0.8794, + "step": 45 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 2.9635898587108325, + "learning_rate": 2.0930232558139536e-06, + "loss": 0.7587, + "step": 46 + }, + { + "epoch": 0.06556614740757963, + "grad_norm": 3.089235385596214, + "learning_rate": 2.1395348837209305e-06, + "loss": 0.8536, + "step": 47 + }, + { + "epoch": 0.06696117182050686, + "grad_norm": 2.800805655960869, + "learning_rate": 2.1860465116279074e-06, + "loss": 0.8456, + "step": 48 + }, + { + "epoch": 0.06835619623343409, + "grad_norm": 2.6203629396683437, + "learning_rate": 2.232558139534884e-06, + "loss": 0.8775, + "step": 49 + }, + { + "epoch": 0.06975122064636131, + "grad_norm": 2.8227128567921405, + "learning_rate": 2.2790697674418607e-06, + "loss": 0.8134, + "step": 50 + }, + { + "epoch": 0.07114624505928854, + "grad_norm": 2.7691797695343614, + "learning_rate": 2.3255813953488376e-06, + "loss": 0.8683, + "step": 51 + }, + { + "epoch": 0.07254126947221576, + "grad_norm": 2.5774573516975074, + "learning_rate": 2.3720930232558144e-06, + "loss": 0.8091, + "step": 52 + }, + { + "epoch": 0.07393629388514299, + "grad_norm": 2.813557786400301, + "learning_rate": 2.418604651162791e-06, + "loss": 0.8208, + "step": 53 + }, + { + "epoch": 0.07533131829807022, + "grad_norm": 2.9437387416240823, + "learning_rate": 2.4651162790697678e-06, + "loss": 0.8564, + "step": 54 + }, + { + "epoch": 0.07672634271099744, + "grad_norm": 2.6877272303391324, + "learning_rate": 2.5116279069767446e-06, + "loss": 0.7985, + "step": 55 + }, + { + "epoch": 0.07812136712392467, + "grad_norm": 2.6939943804651314, + "learning_rate": 2.558139534883721e-06, + "loss": 0.8415, + "step": 56 + }, + { + "epoch": 0.07951639153685189, + "grad_norm": 2.680875520170419, + "learning_rate": 2.604651162790698e-06, + "loss": 0.8192, + "step": 57 + }, + { + "epoch": 0.08091141594977912, + "grad_norm": 2.5440723987372666, + "learning_rate": 2.6511627906976744e-06, + "loss": 0.7869, + "step": 58 + }, + { + "epoch": 0.08230644036270635, + "grad_norm": 2.7357452934000164, + "learning_rate": 2.6976744186046517e-06, + "loss": 0.8903, + "step": 59 + }, + { + "epoch": 0.08370146477563357, + "grad_norm": 2.4887440165658283, + "learning_rate": 2.744186046511628e-06, + "loss": 0.7487, + "step": 60 + }, + { + "epoch": 0.0850964891885608, + "grad_norm": 2.606394012411958, + "learning_rate": 2.790697674418605e-06, + "loss": 0.7484, + "step": 61 + }, + { + "epoch": 0.08649151360148803, + "grad_norm": 2.551608507364557, + "learning_rate": 2.8372093023255815e-06, + "loss": 0.8419, + "step": 62 + }, + { + "epoch": 0.08788653801441525, + "grad_norm": 2.457113149714823, + "learning_rate": 2.8837209302325587e-06, + "loss": 0.7387, + "step": 63 + }, + { + "epoch": 0.08928156242734248, + "grad_norm": 2.6907615175482293, + "learning_rate": 2.930232558139535e-06, + "loss": 0.8337, + "step": 64 + }, + { + "epoch": 0.0906765868402697, + "grad_norm": 2.394218938845241, + "learning_rate": 2.9767441860465116e-06, + "loss": 0.7526, + "step": 65 + }, + { + "epoch": 0.09207161125319693, + "grad_norm": 2.5987172501099605, + "learning_rate": 3.0232558139534885e-06, + "loss": 0.8057, + "step": 66 + }, + { + "epoch": 0.09346663566612416, + "grad_norm": 2.5273926089912058, + "learning_rate": 3.069767441860466e-06, + "loss": 0.8628, + "step": 67 + }, + { + "epoch": 0.09486166007905138, + "grad_norm": 2.539576955247472, + "learning_rate": 3.1162790697674423e-06, + "loss": 0.8137, + "step": 68 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 2.4735207026102652, + "learning_rate": 3.1627906976744187e-06, + "loss": 0.7567, + "step": 69 + }, + { + "epoch": 0.09765170890490583, + "grad_norm": 2.329402847723333, + "learning_rate": 3.2093023255813956e-06, + "loss": 0.7229, + "step": 70 + }, + { + "epoch": 0.09904673331783306, + "grad_norm": 2.51709324412196, + "learning_rate": 3.2558139534883724e-06, + "loss": 0.7585, + "step": 71 + }, + { + "epoch": 0.10044175773076029, + "grad_norm": 2.5097007626658403, + "learning_rate": 3.3023255813953493e-06, + "loss": 0.7643, + "step": 72 + }, + { + "epoch": 0.10183678214368752, + "grad_norm": 2.516406479315615, + "learning_rate": 3.3488372093023258e-06, + "loss": 0.7743, + "step": 73 + }, + { + "epoch": 0.10323180655661474, + "grad_norm": 2.39487772273171, + "learning_rate": 3.3953488372093026e-06, + "loss": 0.7836, + "step": 74 + }, + { + "epoch": 0.10462683096954196, + "grad_norm": 2.5591412684982573, + "learning_rate": 3.4418604651162795e-06, + "loss": 0.7613, + "step": 75 + }, + { + "epoch": 0.1060218553824692, + "grad_norm": 2.4784750402218925, + "learning_rate": 3.4883720930232564e-06, + "loss": 0.6626, + "step": 76 + }, + { + "epoch": 0.10741687979539642, + "grad_norm": 2.1253121107818203, + "learning_rate": 3.534883720930233e-06, + "loss": 0.6501, + "step": 77 + }, + { + "epoch": 0.10881190420832365, + "grad_norm": 2.4808216335670537, + "learning_rate": 3.5813953488372093e-06, + "loss": 0.794, + "step": 78 + }, + { + "epoch": 0.11020692862125087, + "grad_norm": 2.326427938400137, + "learning_rate": 3.6279069767441866e-06, + "loss": 0.7634, + "step": 79 + }, + { + "epoch": 0.1116019530341781, + "grad_norm": 2.6060325903250603, + "learning_rate": 3.674418604651163e-06, + "loss": 0.8042, + "step": 80 + }, + { + "epoch": 0.11299697744710532, + "grad_norm": 2.4821373099188615, + "learning_rate": 3.72093023255814e-06, + "loss": 0.762, + "step": 81 + }, + { + "epoch": 0.11439200186003255, + "grad_norm": 2.898157554318415, + "learning_rate": 3.7674418604651163e-06, + "loss": 0.8571, + "step": 82 + }, + { + "epoch": 0.11578702627295978, + "grad_norm": 2.4843334688361374, + "learning_rate": 3.8139534883720936e-06, + "loss": 0.7576, + "step": 83 + }, + { + "epoch": 0.11718205068588701, + "grad_norm": 2.387738096347522, + "learning_rate": 3.86046511627907e-06, + "loss": 0.7543, + "step": 84 + }, + { + "epoch": 0.11857707509881422, + "grad_norm": 2.335434248204514, + "learning_rate": 3.906976744186047e-06, + "loss": 0.7395, + "step": 85 + }, + { + "epoch": 0.11997209951174145, + "grad_norm": 2.3375745829156584, + "learning_rate": 3.953488372093024e-06, + "loss": 0.6685, + "step": 86 + }, + { + "epoch": 0.12136712392466868, + "grad_norm": 2.5136417014675074, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8063, + "step": 87 + }, + { + "epoch": 0.12276214833759591, + "grad_norm": 2.5442316547663584, + "learning_rate": 4.0465116279069775e-06, + "loss": 0.8206, + "step": 88 + }, + { + "epoch": 0.12415717275052314, + "grad_norm": 2.555591985292735, + "learning_rate": 4.0930232558139536e-06, + "loss": 0.8606, + "step": 89 + }, + { + "epoch": 0.12555219716345037, + "grad_norm": 2.4405315538013808, + "learning_rate": 4.1395348837209304e-06, + "loss": 0.7434, + "step": 90 + }, + { + "epoch": 0.12694722157637758, + "grad_norm": 2.398256087733658, + "learning_rate": 4.186046511627907e-06, + "loss": 0.7421, + "step": 91 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 2.4927763742524203, + "learning_rate": 4.232558139534884e-06, + "loss": 0.7935, + "step": 92 + }, + { + "epoch": 0.12973727040223204, + "grad_norm": 2.5428069514715994, + "learning_rate": 4.279069767441861e-06, + "loss": 0.7443, + "step": 93 + }, + { + "epoch": 0.13113229481515926, + "grad_norm": 2.364288381058622, + "learning_rate": 4.325581395348837e-06, + "loss": 0.7112, + "step": 94 + }, + { + "epoch": 0.1325273192280865, + "grad_norm": 2.011169128440702, + "learning_rate": 4.372093023255815e-06, + "loss": 0.6569, + "step": 95 + }, + { + "epoch": 0.1339223436410137, + "grad_norm": 2.349408114611445, + "learning_rate": 4.418604651162791e-06, + "loss": 0.7492, + "step": 96 + }, + { + "epoch": 0.13531736805394096, + "grad_norm": 2.4540877828585677, + "learning_rate": 4.465116279069768e-06, + "loss": 0.7292, + "step": 97 + }, + { + "epoch": 0.13671239246686817, + "grad_norm": 2.3254215830382963, + "learning_rate": 4.5116279069767445e-06, + "loss": 0.6947, + "step": 98 + }, + { + "epoch": 0.13810741687979539, + "grad_norm": 2.2929641391200986, + "learning_rate": 4.558139534883721e-06, + "loss": 0.7623, + "step": 99 + }, + { + "epoch": 0.13950244129272263, + "grad_norm": 2.34717403373499, + "learning_rate": 4.604651162790698e-06, + "loss": 0.6874, + "step": 100 + }, + { + "epoch": 0.14089746570564984, + "grad_norm": 2.4456769972692065, + "learning_rate": 4.651162790697675e-06, + "loss": 0.771, + "step": 101 + }, + { + "epoch": 0.1422924901185771, + "grad_norm": 2.418110442277691, + "learning_rate": 4.697674418604651e-06, + "loss": 0.7647, + "step": 102 + }, + { + "epoch": 0.1436875145315043, + "grad_norm": 2.574913622817416, + "learning_rate": 4.744186046511629e-06, + "loss": 0.7203, + "step": 103 + }, + { + "epoch": 0.14508253894443152, + "grad_norm": 2.389331829708997, + "learning_rate": 4.790697674418605e-06, + "loss": 0.7176, + "step": 104 + }, + { + "epoch": 0.14647756335735876, + "grad_norm": 2.422530481112432, + "learning_rate": 4.837209302325582e-06, + "loss": 0.7032, + "step": 105 + }, + { + "epoch": 0.14787258777028597, + "grad_norm": 2.4343657094034574, + "learning_rate": 4.883720930232559e-06, + "loss": 0.6991, + "step": 106 + }, + { + "epoch": 0.14926761218321322, + "grad_norm": 2.3962869391793884, + "learning_rate": 4.9302325581395355e-06, + "loss": 0.7668, + "step": 107 + }, + { + "epoch": 0.15066263659614043, + "grad_norm": 2.3514460901020304, + "learning_rate": 4.976744186046512e-06, + "loss": 0.738, + "step": 108 + }, + { + "epoch": 0.15205766100906765, + "grad_norm": 2.7488872789310093, + "learning_rate": 5.023255813953489e-06, + "loss": 0.9257, + "step": 109 + }, + { + "epoch": 0.1534526854219949, + "grad_norm": 2.1877704055112397, + "learning_rate": 5.069767441860466e-06, + "loss": 0.6922, + "step": 110 + }, + { + "epoch": 0.1548477098349221, + "grad_norm": 2.292046125466671, + "learning_rate": 5.116279069767442e-06, + "loss": 0.7168, + "step": 111 + }, + { + "epoch": 0.15624273424784935, + "grad_norm": 2.39558344544093, + "learning_rate": 5.162790697674419e-06, + "loss": 0.7668, + "step": 112 + }, + { + "epoch": 0.15763775866077656, + "grad_norm": 2.37532566612421, + "learning_rate": 5.209302325581396e-06, + "loss": 0.7413, + "step": 113 + }, + { + "epoch": 0.15903278307370378, + "grad_norm": 2.570621775062793, + "learning_rate": 5.255813953488372e-06, + "loss": 0.7662, + "step": 114 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 2.437707261804497, + "learning_rate": 5.302325581395349e-06, + "loss": 0.7503, + "step": 115 + }, + { + "epoch": 0.16182283189955823, + "grad_norm": 2.3490732373961993, + "learning_rate": 5.348837209302326e-06, + "loss": 0.7234, + "step": 116 + }, + { + "epoch": 0.16321785631248548, + "grad_norm": 2.553878394018376, + "learning_rate": 5.395348837209303e-06, + "loss": 0.7924, + "step": 117 + }, + { + "epoch": 0.1646128807254127, + "grad_norm": 2.53998019160818, + "learning_rate": 5.44186046511628e-06, + "loss": 0.7456, + "step": 118 + }, + { + "epoch": 0.16600790513833993, + "grad_norm": 2.5752545753938216, + "learning_rate": 5.488372093023256e-06, + "loss": 0.7171, + "step": 119 + }, + { + "epoch": 0.16740292955126715, + "grad_norm": 2.5025495310223866, + "learning_rate": 5.534883720930233e-06, + "loss": 0.7204, + "step": 120 + }, + { + "epoch": 0.16879795396419436, + "grad_norm": 2.465879917317966, + "learning_rate": 5.58139534883721e-06, + "loss": 0.7511, + "step": 121 + }, + { + "epoch": 0.1701929783771216, + "grad_norm": 2.4842892653706583, + "learning_rate": 5.627906976744186e-06, + "loss": 0.7705, + "step": 122 + }, + { + "epoch": 0.17158800279004882, + "grad_norm": 2.656958791529201, + "learning_rate": 5.674418604651163e-06, + "loss": 0.7901, + "step": 123 + }, + { + "epoch": 0.17298302720297606, + "grad_norm": 2.5105437832639, + "learning_rate": 5.72093023255814e-06, + "loss": 0.731, + "step": 124 + }, + { + "epoch": 0.17437805161590328, + "grad_norm": 2.307050253420812, + "learning_rate": 5.7674418604651175e-06, + "loss": 0.7704, + "step": 125 + }, + { + "epoch": 0.1757730760288305, + "grad_norm": 2.4819230064286706, + "learning_rate": 5.8139534883720935e-06, + "loss": 0.7553, + "step": 126 + }, + { + "epoch": 0.17716810044175774, + "grad_norm": 2.6590280236699106, + "learning_rate": 5.86046511627907e-06, + "loss": 0.7641, + "step": 127 + }, + { + "epoch": 0.17856312485468495, + "grad_norm": 2.3396747269388696, + "learning_rate": 5.906976744186047e-06, + "loss": 0.76, + "step": 128 + }, + { + "epoch": 0.1799581492676122, + "grad_norm": 2.3959239000906263, + "learning_rate": 5.953488372093023e-06, + "loss": 0.83, + "step": 129 + }, + { + "epoch": 0.1813531736805394, + "grad_norm": 2.4068741447922277, + "learning_rate": 6e-06, + "loss": 0.7451, + "step": 130 + }, + { + "epoch": 0.18274819809346662, + "grad_norm": 2.5306239349683977, + "learning_rate": 6.046511627906977e-06, + "loss": 0.6759, + "step": 131 + }, + { + "epoch": 0.18414322250639387, + "grad_norm": 2.445947719768049, + "learning_rate": 6.093023255813954e-06, + "loss": 0.769, + "step": 132 + }, + { + "epoch": 0.18553824691932108, + "grad_norm": 2.5399331456936274, + "learning_rate": 6.139534883720932e-06, + "loss": 0.7435, + "step": 133 + }, + { + "epoch": 0.18693327133224832, + "grad_norm": 2.2974985090948845, + "learning_rate": 6.186046511627908e-06, + "loss": 0.7345, + "step": 134 + }, + { + "epoch": 0.18832829574517554, + "grad_norm": 2.6237877338647873, + "learning_rate": 6.2325581395348845e-06, + "loss": 0.8024, + "step": 135 + }, + { + "epoch": 0.18972332015810275, + "grad_norm": 2.5524074818038485, + "learning_rate": 6.279069767441861e-06, + "loss": 0.7748, + "step": 136 + }, + { + "epoch": 0.19111834457103, + "grad_norm": 2.562006189602683, + "learning_rate": 6.325581395348837e-06, + "loss": 0.753, + "step": 137 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 2.4958479653255545, + "learning_rate": 6.372093023255814e-06, + "loss": 0.7657, + "step": 138 + }, + { + "epoch": 0.19390839339688445, + "grad_norm": 2.717321817819931, + "learning_rate": 6.418604651162791e-06, + "loss": 0.71, + "step": 139 + }, + { + "epoch": 0.19530341780981167, + "grad_norm": 2.4089710684455983, + "learning_rate": 6.465116279069767e-06, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.1966984422227389, + "grad_norm": 2.477539261318324, + "learning_rate": 6.511627906976745e-06, + "loss": 0.7988, + "step": 141 + }, + { + "epoch": 0.19809346663566613, + "grad_norm": 2.6298344335984685, + "learning_rate": 6.558139534883722e-06, + "loss": 0.7494, + "step": 142 + }, + { + "epoch": 0.19948849104859334, + "grad_norm": 2.37624775861681, + "learning_rate": 6.604651162790699e-06, + "loss": 0.7409, + "step": 143 + }, + { + "epoch": 0.20088351546152058, + "grad_norm": 2.3379677755783597, + "learning_rate": 6.651162790697675e-06, + "loss": 0.7019, + "step": 144 + }, + { + "epoch": 0.2022785398744478, + "grad_norm": 2.404331847659135, + "learning_rate": 6.6976744186046515e-06, + "loss": 0.7986, + "step": 145 + }, + { + "epoch": 0.20367356428737504, + "grad_norm": 2.329041308240456, + "learning_rate": 6.744186046511628e-06, + "loss": 0.7384, + "step": 146 + }, + { + "epoch": 0.20506858870030226, + "grad_norm": 2.554766751041686, + "learning_rate": 6.790697674418605e-06, + "loss": 0.8329, + "step": 147 + }, + { + "epoch": 0.20646361311322947, + "grad_norm": 2.380435295817839, + "learning_rate": 6.837209302325581e-06, + "loss": 0.7275, + "step": 148 + }, + { + "epoch": 0.20785863752615671, + "grad_norm": 2.542245175246868, + "learning_rate": 6.883720930232559e-06, + "loss": 0.8281, + "step": 149 + }, + { + "epoch": 0.20925366193908393, + "grad_norm": 2.436384871263683, + "learning_rate": 6.930232558139536e-06, + "loss": 0.763, + "step": 150 + }, + { + "epoch": 0.21064868635201117, + "grad_norm": 2.577321451306135, + "learning_rate": 6.976744186046513e-06, + "loss": 0.7587, + "step": 151 + }, + { + "epoch": 0.2120437107649384, + "grad_norm": 2.1824061916227544, + "learning_rate": 7.023255813953489e-06, + "loss": 0.6997, + "step": 152 + }, + { + "epoch": 0.2134387351778656, + "grad_norm": 2.5006587847408013, + "learning_rate": 7.069767441860466e-06, + "loss": 0.7555, + "step": 153 + }, + { + "epoch": 0.21483375959079284, + "grad_norm": 2.357971605450965, + "learning_rate": 7.1162790697674425e-06, + "loss": 0.7273, + "step": 154 + }, + { + "epoch": 0.21622878400372006, + "grad_norm": 2.4104364154469096, + "learning_rate": 7.1627906976744185e-06, + "loss": 0.7338, + "step": 155 + }, + { + "epoch": 0.2176238084166473, + "grad_norm": 2.331567384611702, + "learning_rate": 7.209302325581395e-06, + "loss": 0.755, + "step": 156 + }, + { + "epoch": 0.21901883282957452, + "grad_norm": 2.199020523596816, + "learning_rate": 7.255813953488373e-06, + "loss": 0.6787, + "step": 157 + }, + { + "epoch": 0.22041385724250173, + "grad_norm": 2.5951329934583787, + "learning_rate": 7.30232558139535e-06, + "loss": 0.7364, + "step": 158 + }, + { + "epoch": 0.22180888165542897, + "grad_norm": 2.2810633560589597, + "learning_rate": 7.348837209302326e-06, + "loss": 0.7253, + "step": 159 + }, + { + "epoch": 0.2232039060683562, + "grad_norm": 2.4924517088895652, + "learning_rate": 7.395348837209303e-06, + "loss": 0.748, + "step": 160 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 2.4519591046477682, + "learning_rate": 7.44186046511628e-06, + "loss": 0.741, + "step": 161 + }, + { + "epoch": 0.22599395489421065, + "grad_norm": 2.427349032223749, + "learning_rate": 7.488372093023256e-06, + "loss": 0.7369, + "step": 162 + }, + { + "epoch": 0.22738897930713786, + "grad_norm": 2.4574734880892235, + "learning_rate": 7.534883720930233e-06, + "loss": 0.7681, + "step": 163 + }, + { + "epoch": 0.2287840037200651, + "grad_norm": 2.3629173763703175, + "learning_rate": 7.5813953488372095e-06, + "loss": 0.6676, + "step": 164 + }, + { + "epoch": 0.23017902813299232, + "grad_norm": 2.499777704465563, + "learning_rate": 7.627906976744187e-06, + "loss": 0.7475, + "step": 165 + }, + { + "epoch": 0.23157405254591956, + "grad_norm": 2.124247898175583, + "learning_rate": 7.674418604651164e-06, + "loss": 0.6732, + "step": 166 + }, + { + "epoch": 0.23296907695884678, + "grad_norm": 2.4833385152209417, + "learning_rate": 7.72093023255814e-06, + "loss": 0.7008, + "step": 167 + }, + { + "epoch": 0.23436410137177402, + "grad_norm": 2.440899551714562, + "learning_rate": 7.767441860465116e-06, + "loss": 0.725, + "step": 168 + }, + { + "epoch": 0.23575912578470123, + "grad_norm": 2.3266389679720123, + "learning_rate": 7.813953488372094e-06, + "loss": 0.7323, + "step": 169 + }, + { + "epoch": 0.23715415019762845, + "grad_norm": 2.1958822444892916, + "learning_rate": 7.86046511627907e-06, + "loss": 0.6617, + "step": 170 + }, + { + "epoch": 0.2385491746105557, + "grad_norm": 2.1838589739129355, + "learning_rate": 7.906976744186048e-06, + "loss": 0.7279, + "step": 171 + }, + { + "epoch": 0.2399441990234829, + "grad_norm": 2.429658187245016, + "learning_rate": 7.953488372093024e-06, + "loss": 0.7089, + "step": 172 + }, + { + "epoch": 0.24133922343641015, + "grad_norm": 2.5141806643559423, + "learning_rate": 8.000000000000001e-06, + "loss": 0.7744, + "step": 173 + }, + { + "epoch": 0.24273424784933736, + "grad_norm": 2.3532093516479073, + "learning_rate": 8.046511627906977e-06, + "loss": 0.738, + "step": 174 + }, + { + "epoch": 0.24412927226226458, + "grad_norm": 2.3567949350894892, + "learning_rate": 8.093023255813955e-06, + "loss": 0.7661, + "step": 175 + }, + { + "epoch": 0.24552429667519182, + "grad_norm": 2.422487493668288, + "learning_rate": 8.139534883720931e-06, + "loss": 0.7738, + "step": 176 + }, + { + "epoch": 0.24691932108811904, + "grad_norm": 2.533137019052819, + "learning_rate": 8.186046511627907e-06, + "loss": 0.7684, + "step": 177 + }, + { + "epoch": 0.24831434550104628, + "grad_norm": 2.5053972465039376, + "learning_rate": 8.232558139534885e-06, + "loss": 0.7735, + "step": 178 + }, + { + "epoch": 0.2497093699139735, + "grad_norm": 2.2993122862473534, + "learning_rate": 8.279069767441861e-06, + "loss": 0.7121, + "step": 179 + }, + { + "epoch": 0.25110439432690074, + "grad_norm": 2.4034617971465946, + "learning_rate": 8.325581395348837e-06, + "loss": 0.758, + "step": 180 + }, + { + "epoch": 0.25249941873982795, + "grad_norm": 2.5374194516013566, + "learning_rate": 8.372093023255815e-06, + "loss": 0.75, + "step": 181 + }, + { + "epoch": 0.25389444315275517, + "grad_norm": 2.321732284911669, + "learning_rate": 8.418604651162792e-06, + "loss": 0.7085, + "step": 182 + }, + { + "epoch": 0.2552894675656824, + "grad_norm": 2.4488456028756462, + "learning_rate": 8.465116279069768e-06, + "loss": 0.7467, + "step": 183 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 2.452894565948684, + "learning_rate": 8.511627906976744e-06, + "loss": 0.7142, + "step": 184 + }, + { + "epoch": 0.25807951639153687, + "grad_norm": 2.369770045849115, + "learning_rate": 8.558139534883722e-06, + "loss": 0.7071, + "step": 185 + }, + { + "epoch": 0.2594745408044641, + "grad_norm": 2.0567126918314877, + "learning_rate": 8.604651162790698e-06, + "loss": 0.6288, + "step": 186 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 2.350291216305672, + "learning_rate": 8.651162790697674e-06, + "loss": 0.7848, + "step": 187 + }, + { + "epoch": 0.2622645896303185, + "grad_norm": 2.314951381733544, + "learning_rate": 8.697674418604652e-06, + "loss": 0.7262, + "step": 188 + }, + { + "epoch": 0.2636596140432458, + "grad_norm": 2.304013798793699, + "learning_rate": 8.74418604651163e-06, + "loss": 0.6982, + "step": 189 + }, + { + "epoch": 0.265054638456173, + "grad_norm": 2.331062126039812, + "learning_rate": 8.790697674418606e-06, + "loss": 0.7191, + "step": 190 + }, + { + "epoch": 0.2664496628691002, + "grad_norm": 2.3981290812587024, + "learning_rate": 8.837209302325582e-06, + "loss": 0.7599, + "step": 191 + }, + { + "epoch": 0.2678446872820274, + "grad_norm": 2.3469258787402123, + "learning_rate": 8.88372093023256e-06, + "loss": 0.7005, + "step": 192 + }, + { + "epoch": 0.26923971169495464, + "grad_norm": 2.322650241728307, + "learning_rate": 8.930232558139535e-06, + "loss": 0.7553, + "step": 193 + }, + { + "epoch": 0.2706347361078819, + "grad_norm": 2.390850393607927, + "learning_rate": 8.976744186046511e-06, + "loss": 0.7011, + "step": 194 + }, + { + "epoch": 0.2720297605208091, + "grad_norm": 2.3611482677828004, + "learning_rate": 9.023255813953489e-06, + "loss": 0.715, + "step": 195 + }, + { + "epoch": 0.27342478493373634, + "grad_norm": 2.251847626785549, + "learning_rate": 9.069767441860465e-06, + "loss": 0.7798, + "step": 196 + }, + { + "epoch": 0.27481980934666356, + "grad_norm": 2.2203131238743374, + "learning_rate": 9.116279069767443e-06, + "loss": 0.7094, + "step": 197 + }, + { + "epoch": 0.27621483375959077, + "grad_norm": 2.3743505262739992, + "learning_rate": 9.162790697674419e-06, + "loss": 0.7082, + "step": 198 + }, + { + "epoch": 0.27760985817251804, + "grad_norm": 2.382614063698063, + "learning_rate": 9.209302325581397e-06, + "loss": 0.7115, + "step": 199 + }, + { + "epoch": 0.27900488258544526, + "grad_norm": 2.4087918128165775, + "learning_rate": 9.255813953488373e-06, + "loss": 0.7255, + "step": 200 + }, + { + "epoch": 0.2803999069983725, + "grad_norm": 2.2494923080443168, + "learning_rate": 9.30232558139535e-06, + "loss": 0.7212, + "step": 201 + }, + { + "epoch": 0.2817949314112997, + "grad_norm": 2.293386790821147, + "learning_rate": 9.348837209302326e-06, + "loss": 0.7484, + "step": 202 + }, + { + "epoch": 0.2831899558242269, + "grad_norm": 2.3471917637572477, + "learning_rate": 9.395348837209302e-06, + "loss": 0.7688, + "step": 203 + }, + { + "epoch": 0.2845849802371542, + "grad_norm": 2.400058271047396, + "learning_rate": 9.44186046511628e-06, + "loss": 0.7279, + "step": 204 + }, + { + "epoch": 0.2859800046500814, + "grad_norm": 2.4522386019787725, + "learning_rate": 9.488372093023258e-06, + "loss": 0.7716, + "step": 205 + }, + { + "epoch": 0.2873750290630086, + "grad_norm": 2.6239510123204477, + "learning_rate": 9.534883720930234e-06, + "loss": 0.743, + "step": 206 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 2.136688074275227, + "learning_rate": 9.58139534883721e-06, + "loss": 0.6951, + "step": 207 + }, + { + "epoch": 0.29016507788886303, + "grad_norm": 2.4603362182924076, + "learning_rate": 9.627906976744188e-06, + "loss": 0.7707, + "step": 208 + }, + { + "epoch": 0.2915601023017903, + "grad_norm": 2.3184884483119643, + "learning_rate": 9.674418604651164e-06, + "loss": 0.7577, + "step": 209 + }, + { + "epoch": 0.2929551267147175, + "grad_norm": 2.3897163298829716, + "learning_rate": 9.72093023255814e-06, + "loss": 0.767, + "step": 210 + }, + { + "epoch": 0.29435015112764473, + "grad_norm": 2.290611569859189, + "learning_rate": 9.767441860465117e-06, + "loss": 0.7195, + "step": 211 + }, + { + "epoch": 0.29574517554057195, + "grad_norm": 2.614865650081256, + "learning_rate": 9.813953488372093e-06, + "loss": 0.7677, + "step": 212 + }, + { + "epoch": 0.29714019995349916, + "grad_norm": 2.409329394194762, + "learning_rate": 9.860465116279071e-06, + "loss": 0.7396, + "step": 213 + }, + { + "epoch": 0.29853522436642643, + "grad_norm": 2.327197479223712, + "learning_rate": 9.906976744186047e-06, + "loss": 0.7102, + "step": 214 + }, + { + "epoch": 0.29993024877935365, + "grad_norm": 2.4339401095238045, + "learning_rate": 9.953488372093025e-06, + "loss": 0.7671, + "step": 215 + }, + { + "epoch": 0.30132527319228086, + "grad_norm": 2.406077578159272, + "learning_rate": 1e-05, + "loss": 0.7447, + "step": 216 + }, + { + "epoch": 0.3027202976052081, + "grad_norm": 2.4625625301213647, + "learning_rate": 9.999993396473114e-06, + "loss": 0.7896, + "step": 217 + }, + { + "epoch": 0.3041153220181353, + "grad_norm": 2.2307465016265513, + "learning_rate": 9.999973585909898e-06, + "loss": 0.6554, + "step": 218 + }, + { + "epoch": 0.30551034643106256, + "grad_norm": 2.3558678943771962, + "learning_rate": 9.99994056836268e-06, + "loss": 0.7142, + "step": 219 + }, + { + "epoch": 0.3069053708439898, + "grad_norm": 2.2935965111443273, + "learning_rate": 9.999894343918674e-06, + "loss": 0.7234, + "step": 220 + }, + { + "epoch": 0.308300395256917, + "grad_norm": 2.2050114399403333, + "learning_rate": 9.999834912699974e-06, + "loss": 0.7748, + "step": 221 + }, + { + "epoch": 0.3096954196698442, + "grad_norm": 2.2482433300501734, + "learning_rate": 9.999762274863567e-06, + "loss": 0.7253, + "step": 222 + }, + { + "epoch": 0.3110904440827714, + "grad_norm": 2.399635963689245, + "learning_rate": 9.999676430601318e-06, + "loss": 0.7806, + "step": 223 + }, + { + "epoch": 0.3124854684956987, + "grad_norm": 2.2011507392317187, + "learning_rate": 9.999577380139976e-06, + "loss": 0.7026, + "step": 224 + }, + { + "epoch": 0.3138804929086259, + "grad_norm": 2.3655833768745977, + "learning_rate": 9.999465123741172e-06, + "loss": 0.7078, + "step": 225 + }, + { + "epoch": 0.3152755173215531, + "grad_norm": 2.387043969086896, + "learning_rate": 9.999339661701424e-06, + "loss": 0.7031, + "step": 226 + }, + { + "epoch": 0.31667054173448034, + "grad_norm": 2.2360354228157666, + "learning_rate": 9.99920099435213e-06, + "loss": 0.7505, + "step": 227 + }, + { + "epoch": 0.31806556614740755, + "grad_norm": 2.4994174229622836, + "learning_rate": 9.999049122059565e-06, + "loss": 0.7708, + "step": 228 + }, + { + "epoch": 0.3194605905603348, + "grad_norm": 2.133696380023235, + "learning_rate": 9.998884045224886e-06, + "loss": 0.7241, + "step": 229 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 2.241964560461335, + "learning_rate": 9.998705764284132e-06, + "loss": 0.6949, + "step": 230 + }, + { + "epoch": 0.32225063938618925, + "grad_norm": 2.475034321702833, + "learning_rate": 9.998514279708212e-06, + "loss": 0.7826, + "step": 231 + }, + { + "epoch": 0.32364566379911647, + "grad_norm": 2.36402035615075, + "learning_rate": 9.998309592002914e-06, + "loss": 0.7745, + "step": 232 + }, + { + "epoch": 0.32504068821204374, + "grad_norm": 2.3658257454389475, + "learning_rate": 9.99809170170891e-06, + "loss": 0.6859, + "step": 233 + }, + { + "epoch": 0.32643571262497095, + "grad_norm": 2.273680448775777, + "learning_rate": 9.997860609401732e-06, + "loss": 0.7775, + "step": 234 + }, + { + "epoch": 0.32783073703789817, + "grad_norm": 2.201487231731571, + "learning_rate": 9.99761631569179e-06, + "loss": 0.7803, + "step": 235 + }, + { + "epoch": 0.3292257614508254, + "grad_norm": 2.4155650660144348, + "learning_rate": 9.997358821224365e-06, + "loss": 0.8011, + "step": 236 + }, + { + "epoch": 0.3306207858637526, + "grad_norm": 2.359732428185931, + "learning_rate": 9.997088126679607e-06, + "loss": 0.7299, + "step": 237 + }, + { + "epoch": 0.33201581027667987, + "grad_norm": 2.3093662561481367, + "learning_rate": 9.996804232772528e-06, + "loss": 0.7319, + "step": 238 + }, + { + "epoch": 0.3334108346896071, + "grad_norm": 2.272053877068559, + "learning_rate": 9.996507140253012e-06, + "loss": 0.7559, + "step": 239 + }, + { + "epoch": 0.3348058591025343, + "grad_norm": 2.1570286455643575, + "learning_rate": 9.9961968499058e-06, + "loss": 0.7401, + "step": 240 + }, + { + "epoch": 0.3362008835154615, + "grad_norm": 2.4238684062299205, + "learning_rate": 9.9958733625505e-06, + "loss": 0.7662, + "step": 241 + }, + { + "epoch": 0.3375959079283887, + "grad_norm": 2.146480418424315, + "learning_rate": 9.995536679041568e-06, + "loss": 0.7078, + "step": 242 + }, + { + "epoch": 0.338990932341316, + "grad_norm": 2.320463871074251, + "learning_rate": 9.99518680026833e-06, + "loss": 0.7589, + "step": 243 + }, + { + "epoch": 0.3403859567542432, + "grad_norm": 2.2760580968152144, + "learning_rate": 9.994823727154957e-06, + "loss": 0.7427, + "step": 244 + }, + { + "epoch": 0.34178098116717043, + "grad_norm": 2.5684664291414294, + "learning_rate": 9.994447460660473e-06, + "loss": 0.7679, + "step": 245 + }, + { + "epoch": 0.34317600558009764, + "grad_norm": 2.1635703031164977, + "learning_rate": 9.994058001778754e-06, + "loss": 0.7536, + "step": 246 + }, + { + "epoch": 0.34457102999302486, + "grad_norm": 2.282114366302871, + "learning_rate": 9.99365535153852e-06, + "loss": 0.7267, + "step": 247 + }, + { + "epoch": 0.34596605440595213, + "grad_norm": 2.231237877658349, + "learning_rate": 9.993239511003338e-06, + "loss": 0.7372, + "step": 248 + }, + { + "epoch": 0.34736107881887934, + "grad_norm": 2.481316957977788, + "learning_rate": 9.992810481271611e-06, + "loss": 0.7335, + "step": 249 + }, + { + "epoch": 0.34875610323180656, + "grad_norm": 2.302521453924701, + "learning_rate": 9.992368263476585e-06, + "loss": 0.7511, + "step": 250 + }, + { + "epoch": 0.3501511276447338, + "grad_norm": 2.2387510321179875, + "learning_rate": 9.991912858786335e-06, + "loss": 0.7613, + "step": 251 + }, + { + "epoch": 0.351546152057661, + "grad_norm": 2.361564774571516, + "learning_rate": 9.991444268403776e-06, + "loss": 0.7642, + "step": 252 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 2.343319789459909, + "learning_rate": 9.990962493566645e-06, + "loss": 0.8018, + "step": 253 + }, + { + "epoch": 0.3543362008835155, + "grad_norm": 2.444856016657922, + "learning_rate": 9.99046753554751e-06, + "loss": 0.7034, + "step": 254 + }, + { + "epoch": 0.3557312252964427, + "grad_norm": 2.295542656637622, + "learning_rate": 9.989959395653756e-06, + "loss": 0.7357, + "step": 255 + }, + { + "epoch": 0.3571262497093699, + "grad_norm": 2.1172219443470057, + "learning_rate": 9.989438075227588e-06, + "loss": 0.6318, + "step": 256 + }, + { + "epoch": 0.3585212741222971, + "grad_norm": 2.317934296315436, + "learning_rate": 9.988903575646032e-06, + "loss": 0.6826, + "step": 257 + }, + { + "epoch": 0.3599162985352244, + "grad_norm": 2.249565309628914, + "learning_rate": 9.988355898320917e-06, + "loss": 0.7302, + "step": 258 + }, + { + "epoch": 0.3613113229481516, + "grad_norm": 2.61405603733907, + "learning_rate": 9.987795044698885e-06, + "loss": 0.7599, + "step": 259 + }, + { + "epoch": 0.3627063473610788, + "grad_norm": 2.393309752919588, + "learning_rate": 9.98722101626138e-06, + "loss": 0.7434, + "step": 260 + }, + { + "epoch": 0.36410137177400603, + "grad_norm": 2.255993754480847, + "learning_rate": 9.986633814524648e-06, + "loss": 0.7009, + "step": 261 + }, + { + "epoch": 0.36549639618693325, + "grad_norm": 2.1899632672810796, + "learning_rate": 9.986033441039731e-06, + "loss": 0.7352, + "step": 262 + }, + { + "epoch": 0.3668914205998605, + "grad_norm": 2.4758174903524837, + "learning_rate": 9.985419897392459e-06, + "loss": 0.7922, + "step": 263 + }, + { + "epoch": 0.36828644501278773, + "grad_norm": 1.9484666270746862, + "learning_rate": 9.984793185203456e-06, + "loss": 0.692, + "step": 264 + }, + { + "epoch": 0.36968146942571495, + "grad_norm": 2.231671078245673, + "learning_rate": 9.984153306128124e-06, + "loss": 0.7602, + "step": 265 + }, + { + "epoch": 0.37107649383864216, + "grad_norm": 2.5406824511193933, + "learning_rate": 9.983500261856646e-06, + "loss": 0.7612, + "step": 266 + }, + { + "epoch": 0.3724715182515694, + "grad_norm": 2.215219371603928, + "learning_rate": 9.982834054113982e-06, + "loss": 0.6931, + "step": 267 + }, + { + "epoch": 0.37386654266449665, + "grad_norm": 2.4375255794403103, + "learning_rate": 9.98215468465986e-06, + "loss": 0.7192, + "step": 268 + }, + { + "epoch": 0.37526156707742386, + "grad_norm": 2.168943298360225, + "learning_rate": 9.981462155288773e-06, + "loss": 0.7385, + "step": 269 + }, + { + "epoch": 0.3766565914903511, + "grad_norm": 2.2784901454453554, + "learning_rate": 9.980756467829977e-06, + "loss": 0.6834, + "step": 270 + }, + { + "epoch": 0.3780516159032783, + "grad_norm": 2.335696005944563, + "learning_rate": 9.98003762414748e-06, + "loss": 0.7263, + "step": 271 + }, + { + "epoch": 0.3794466403162055, + "grad_norm": 2.181003569392429, + "learning_rate": 9.979305626140046e-06, + "loss": 0.6903, + "step": 272 + }, + { + "epoch": 0.3808416647291328, + "grad_norm": 2.1412825487141087, + "learning_rate": 9.978560475741181e-06, + "loss": 0.712, + "step": 273 + }, + { + "epoch": 0.38223668914206, + "grad_norm": 2.2673165084490687, + "learning_rate": 9.977802174919134e-06, + "loss": 0.7921, + "step": 274 + }, + { + "epoch": 0.3836317135549872, + "grad_norm": 2.142363448667009, + "learning_rate": 9.977030725676887e-06, + "loss": 0.6938, + "step": 275 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 2.2876816347837434, + "learning_rate": 9.976246130052157e-06, + "loss": 0.7565, + "step": 276 + }, + { + "epoch": 0.38642176238084164, + "grad_norm": 2.2736845275915187, + "learning_rate": 9.97544839011738e-06, + "loss": 0.8085, + "step": 277 + }, + { + "epoch": 0.3878167867937689, + "grad_norm": 2.1051620059141247, + "learning_rate": 9.974637507979721e-06, + "loss": 0.6934, + "step": 278 + }, + { + "epoch": 0.3892118112066961, + "grad_norm": 2.3005570106943587, + "learning_rate": 9.973813485781045e-06, + "loss": 0.7029, + "step": 279 + }, + { + "epoch": 0.39060683561962334, + "grad_norm": 2.060997419935413, + "learning_rate": 9.972976325697938e-06, + "loss": 0.6991, + "step": 280 + }, + { + "epoch": 0.39200186003255055, + "grad_norm": 2.049588415049291, + "learning_rate": 9.972126029941685e-06, + "loss": 0.7229, + "step": 281 + }, + { + "epoch": 0.3933968844454778, + "grad_norm": 2.2293739933043155, + "learning_rate": 9.97126260075826e-06, + "loss": 0.7045, + "step": 282 + }, + { + "epoch": 0.39479190885840504, + "grad_norm": 2.3316135043046327, + "learning_rate": 9.97038604042834e-06, + "loss": 0.7787, + "step": 283 + }, + { + "epoch": 0.39618693327133225, + "grad_norm": 2.2340910959336147, + "learning_rate": 9.969496351267278e-06, + "loss": 0.7174, + "step": 284 + }, + { + "epoch": 0.39758195768425947, + "grad_norm": 2.298045654117731, + "learning_rate": 9.96859353562511e-06, + "loss": 0.7462, + "step": 285 + }, + { + "epoch": 0.3989769820971867, + "grad_norm": 2.1837557728814123, + "learning_rate": 9.967677595886542e-06, + "loss": 0.7167, + "step": 286 + }, + { + "epoch": 0.40037200651011395, + "grad_norm": 2.27299808508551, + "learning_rate": 9.96674853447095e-06, + "loss": 0.7433, + "step": 287 + }, + { + "epoch": 0.40176703092304117, + "grad_norm": 2.137166870007726, + "learning_rate": 9.96580635383236e-06, + "loss": 0.6633, + "step": 288 + }, + { + "epoch": 0.4031620553359684, + "grad_norm": 1.9262822690731025, + "learning_rate": 9.964851056459465e-06, + "loss": 0.6312, + "step": 289 + }, + { + "epoch": 0.4045570797488956, + "grad_norm": 2.269824827715954, + "learning_rate": 9.963882644875594e-06, + "loss": 0.7178, + "step": 290 + }, + { + "epoch": 0.4059521041618228, + "grad_norm": 2.2048707014322, + "learning_rate": 9.96290112163872e-06, + "loss": 0.7068, + "step": 291 + }, + { + "epoch": 0.4073471285747501, + "grad_norm": 2.322124072727594, + "learning_rate": 9.961906489341452e-06, + "loss": 0.7339, + "step": 292 + }, + { + "epoch": 0.4087421529876773, + "grad_norm": 2.4571329521811367, + "learning_rate": 9.960898750611019e-06, + "loss": 0.7715, + "step": 293 + }, + { + "epoch": 0.4101371774006045, + "grad_norm": 2.5174823882242015, + "learning_rate": 9.959877908109274e-06, + "loss": 0.8336, + "step": 294 + }, + { + "epoch": 0.41153220181353173, + "grad_norm": 2.208634018316973, + "learning_rate": 9.958843964532683e-06, + "loss": 0.6829, + "step": 295 + }, + { + "epoch": 0.41292722622645894, + "grad_norm": 2.222290757312979, + "learning_rate": 9.957796922612314e-06, + "loss": 0.7138, + "step": 296 + }, + { + "epoch": 0.4143222506393862, + "grad_norm": 2.5274252464650755, + "learning_rate": 9.956736785113833e-06, + "loss": 0.7725, + "step": 297 + }, + { + "epoch": 0.41571727505231343, + "grad_norm": 2.0860385651911346, + "learning_rate": 9.955663554837503e-06, + "loss": 0.6504, + "step": 298 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 2.3408784898712787, + "learning_rate": 9.954577234618162e-06, + "loss": 0.7158, + "step": 299 + }, + { + "epoch": 0.41850732387816786, + "grad_norm": 2.2270712846507514, + "learning_rate": 9.953477827325229e-06, + "loss": 0.7358, + "step": 300 + }, + { + "epoch": 0.4199023482910951, + "grad_norm": 2.1586659413799127, + "learning_rate": 9.952365335862693e-06, + "loss": 0.7668, + "step": 301 + }, + { + "epoch": 0.42129737270402234, + "grad_norm": 2.234673416696326, + "learning_rate": 9.951239763169097e-06, + "loss": 0.7895, + "step": 302 + }, + { + "epoch": 0.42269239711694956, + "grad_norm": 2.195858274108098, + "learning_rate": 9.950101112217543e-06, + "loss": 0.7191, + "step": 303 + }, + { + "epoch": 0.4240874215298768, + "grad_norm": 2.1565944180125376, + "learning_rate": 9.948949386015677e-06, + "loss": 0.6953, + "step": 304 + }, + { + "epoch": 0.425482445942804, + "grad_norm": 2.2088440863358905, + "learning_rate": 9.947784587605678e-06, + "loss": 0.6961, + "step": 305 + }, + { + "epoch": 0.4268774703557312, + "grad_norm": 2.411909196100123, + "learning_rate": 9.946606720064257e-06, + "loss": 0.7571, + "step": 306 + }, + { + "epoch": 0.4282724947686585, + "grad_norm": 2.398065188628216, + "learning_rate": 9.945415786502649e-06, + "loss": 0.7381, + "step": 307 + }, + { + "epoch": 0.4296675191815857, + "grad_norm": 2.166520322060508, + "learning_rate": 9.944211790066597e-06, + "loss": 0.7238, + "step": 308 + }, + { + "epoch": 0.4310625435945129, + "grad_norm": 2.3660967755228746, + "learning_rate": 9.94299473393635e-06, + "loss": 0.7767, + "step": 309 + }, + { + "epoch": 0.4324575680074401, + "grad_norm": 2.239729559357496, + "learning_rate": 9.941764621326655e-06, + "loss": 0.7166, + "step": 310 + }, + { + "epoch": 0.43385259242036733, + "grad_norm": 2.118960181107485, + "learning_rate": 9.94052145548674e-06, + "loss": 0.754, + "step": 311 + }, + { + "epoch": 0.4352476168332946, + "grad_norm": 2.374426767740763, + "learning_rate": 9.939265239700321e-06, + "loss": 0.7863, + "step": 312 + }, + { + "epoch": 0.4366426412462218, + "grad_norm": 2.369429638423822, + "learning_rate": 9.93799597728558e-06, + "loss": 0.7489, + "step": 313 + }, + { + "epoch": 0.43803766565914903, + "grad_norm": 2.387324721804876, + "learning_rate": 9.936713671595158e-06, + "loss": 0.7778, + "step": 314 + }, + { + "epoch": 0.43943269007207625, + "grad_norm": 2.2609612055446093, + "learning_rate": 9.935418326016153e-06, + "loss": 0.7073, + "step": 315 + }, + { + "epoch": 0.44082771448500346, + "grad_norm": 2.194501412971286, + "learning_rate": 9.934109943970103e-06, + "loss": 0.7858, + "step": 316 + }, + { + "epoch": 0.44222273889793073, + "grad_norm": 2.2443511833058687, + "learning_rate": 9.932788528912983e-06, + "loss": 0.7556, + "step": 317 + }, + { + "epoch": 0.44361776331085795, + "grad_norm": 2.262002280386999, + "learning_rate": 9.931454084335192e-06, + "loss": 0.7557, + "step": 318 + }, + { + "epoch": 0.44501278772378516, + "grad_norm": 2.0267896100249034, + "learning_rate": 9.930106613761549e-06, + "loss": 0.6984, + "step": 319 + }, + { + "epoch": 0.4464078121367124, + "grad_norm": 2.192015174282251, + "learning_rate": 9.928746120751275e-06, + "loss": 0.7323, + "step": 320 + }, + { + "epoch": 0.4478028365496396, + "grad_norm": 2.2328778916714014, + "learning_rate": 9.927372608897992e-06, + "loss": 0.7685, + "step": 321 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 2.1611404588310323, + "learning_rate": 9.925986081829708e-06, + "loss": 0.6904, + "step": 322 + }, + { + "epoch": 0.4505928853754941, + "grad_norm": 2.3906038854954232, + "learning_rate": 9.924586543208812e-06, + "loss": 0.7651, + "step": 323 + }, + { + "epoch": 0.4519879097884213, + "grad_norm": 2.2150342718775384, + "learning_rate": 9.923173996732058e-06, + "loss": 0.7088, + "step": 324 + }, + { + "epoch": 0.4533829342013485, + "grad_norm": 2.257194983707292, + "learning_rate": 9.921748446130564e-06, + "loss": 0.7583, + "step": 325 + }, + { + "epoch": 0.4547779586142757, + "grad_norm": 2.049635371526883, + "learning_rate": 9.920309895169793e-06, + "loss": 0.7118, + "step": 326 + }, + { + "epoch": 0.456172983027203, + "grad_norm": 2.2226016906419734, + "learning_rate": 9.91885834764955e-06, + "loss": 0.7132, + "step": 327 + }, + { + "epoch": 0.4575680074401302, + "grad_norm": 2.2068467340854614, + "learning_rate": 9.917393807403965e-06, + "loss": 0.7676, + "step": 328 + }, + { + "epoch": 0.4589630318530574, + "grad_norm": 2.0506777316664775, + "learning_rate": 9.915916278301496e-06, + "loss": 0.7045, + "step": 329 + }, + { + "epoch": 0.46035805626598464, + "grad_norm": 2.24941878997737, + "learning_rate": 9.9144257642449e-06, + "loss": 0.6901, + "step": 330 + }, + { + "epoch": 0.46175308067891185, + "grad_norm": 1.9825175559087265, + "learning_rate": 9.91292226917124e-06, + "loss": 0.6572, + "step": 331 + }, + { + "epoch": 0.4631481050918391, + "grad_norm": 2.29410576307341, + "learning_rate": 9.91140579705186e-06, + "loss": 0.7534, + "step": 332 + }, + { + "epoch": 0.46454312950476634, + "grad_norm": 2.1703274382213507, + "learning_rate": 9.909876351892388e-06, + "loss": 0.769, + "step": 333 + }, + { + "epoch": 0.46593815391769355, + "grad_norm": 2.028231097517855, + "learning_rate": 9.908333937732718e-06, + "loss": 0.7213, + "step": 334 + }, + { + "epoch": 0.46733317833062077, + "grad_norm": 2.1183249761708995, + "learning_rate": 9.906778558647e-06, + "loss": 0.7089, + "step": 335 + }, + { + "epoch": 0.46872820274354804, + "grad_norm": 2.0941215519004683, + "learning_rate": 9.905210218743626e-06, + "loss": 0.7282, + "step": 336 + }, + { + "epoch": 0.47012322715647525, + "grad_norm": 2.093942221954034, + "learning_rate": 9.903628922165227e-06, + "loss": 0.7011, + "step": 337 + }, + { + "epoch": 0.47151825156940247, + "grad_norm": 2.3302808968769986, + "learning_rate": 9.902034673088656e-06, + "loss": 0.7477, + "step": 338 + }, + { + "epoch": 0.4729132759823297, + "grad_norm": 2.152602661906724, + "learning_rate": 9.90042747572498e-06, + "loss": 0.6833, + "step": 339 + }, + { + "epoch": 0.4743083003952569, + "grad_norm": 2.349715876121456, + "learning_rate": 9.898807334319471e-06, + "loss": 0.7693, + "step": 340 + }, + { + "epoch": 0.47570332480818417, + "grad_norm": 2.173833769245502, + "learning_rate": 9.897174253151583e-06, + "loss": 0.7206, + "step": 341 + }, + { + "epoch": 0.4770983492211114, + "grad_norm": 2.259839449442954, + "learning_rate": 9.895528236534957e-06, + "loss": 0.7403, + "step": 342 + }, + { + "epoch": 0.4784933736340386, + "grad_norm": 2.3578368736204207, + "learning_rate": 9.893869288817397e-06, + "loss": 0.7909, + "step": 343 + }, + { + "epoch": 0.4798883980469658, + "grad_norm": 2.009722455372278, + "learning_rate": 9.89219741438087e-06, + "loss": 0.7118, + "step": 344 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 2.2025524361064814, + "learning_rate": 9.890512617641474e-06, + "loss": 0.7331, + "step": 345 + }, + { + "epoch": 0.4826784468728203, + "grad_norm": 2.285563908098744, + "learning_rate": 9.888814903049458e-06, + "loss": 0.756, + "step": 346 + }, + { + "epoch": 0.4840734712857475, + "grad_norm": 2.2142506724911457, + "learning_rate": 9.88710427508918e-06, + "loss": 0.7708, + "step": 347 + }, + { + "epoch": 0.48546849569867473, + "grad_norm": 2.2408289991941195, + "learning_rate": 9.885380738279111e-06, + "loss": 0.6929, + "step": 348 + }, + { + "epoch": 0.48686352011160194, + "grad_norm": 2.1080662535889476, + "learning_rate": 9.883644297171821e-06, + "loss": 0.7265, + "step": 349 + }, + { + "epoch": 0.48825854452452916, + "grad_norm": 2.090284461635897, + "learning_rate": 9.881894956353963e-06, + "loss": 0.7531, + "step": 350 + }, + { + "epoch": 0.48965356893745643, + "grad_norm": 2.1569142034243027, + "learning_rate": 9.880132720446265e-06, + "loss": 0.7172, + "step": 351 + }, + { + "epoch": 0.49104859335038364, + "grad_norm": 2.217562182182237, + "learning_rate": 9.878357594103516e-06, + "loss": 0.7737, + "step": 352 + }, + { + "epoch": 0.49244361776331086, + "grad_norm": 2.136573317949441, + "learning_rate": 9.876569582014554e-06, + "loss": 0.7205, + "step": 353 + }, + { + "epoch": 0.4938386421762381, + "grad_norm": 2.170725336806224, + "learning_rate": 9.874768688902252e-06, + "loss": 0.7557, + "step": 354 + }, + { + "epoch": 0.4952336665891653, + "grad_norm": 1.9745473881575657, + "learning_rate": 9.87295491952351e-06, + "loss": 0.7539, + "step": 355 + }, + { + "epoch": 0.49662869100209256, + "grad_norm": 2.1969404437698485, + "learning_rate": 9.871128278669238e-06, + "loss": 0.7255, + "step": 356 + }, + { + "epoch": 0.4980237154150198, + "grad_norm": 2.1672166906798886, + "learning_rate": 9.869288771164344e-06, + "loss": 0.7185, + "step": 357 + }, + { + "epoch": 0.499418739827947, + "grad_norm": 2.260245290048278, + "learning_rate": 9.867436401867723e-06, + "loss": 0.7316, + "step": 358 + }, + { + "epoch": 0.5008137642408742, + "grad_norm": 2.3386260178155363, + "learning_rate": 9.865571175672245e-06, + "loss": 0.7634, + "step": 359 + }, + { + "epoch": 0.5022087886538015, + "grad_norm": 2.1433025976011373, + "learning_rate": 9.863693097504733e-06, + "loss": 0.7115, + "step": 360 + }, + { + "epoch": 0.5036038130667286, + "grad_norm": 2.130459491307474, + "learning_rate": 9.86180217232597e-06, + "loss": 0.7737, + "step": 361 + }, + { + "epoch": 0.5049988374796559, + "grad_norm": 2.419050275087982, + "learning_rate": 9.859898405130661e-06, + "loss": 0.792, + "step": 362 + }, + { + "epoch": 0.5063938618925832, + "grad_norm": 2.301514017797739, + "learning_rate": 9.85798180094744e-06, + "loss": 0.7134, + "step": 363 + }, + { + "epoch": 0.5077888863055103, + "grad_norm": 2.2579550766262795, + "learning_rate": 9.856052364838846e-06, + "loss": 0.8086, + "step": 364 + }, + { + "epoch": 0.5091839107184376, + "grad_norm": 2.1715387851235093, + "learning_rate": 9.854110101901308e-06, + "loss": 0.6743, + "step": 365 + }, + { + "epoch": 0.5105789351313648, + "grad_norm": 1.9745286906089936, + "learning_rate": 9.852155017265146e-06, + "loss": 0.6278, + "step": 366 + }, + { + "epoch": 0.511973959544292, + "grad_norm": 2.211330485942951, + "learning_rate": 9.850187116094538e-06, + "loss": 0.7201, + "step": 367 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 2.089703071837029, + "learning_rate": 9.848206403587521e-06, + "loss": 0.762, + "step": 368 + }, + { + "epoch": 0.5147640083701465, + "grad_norm": 2.291478063761164, + "learning_rate": 9.84621288497597e-06, + "loss": 0.7553, + "step": 369 + }, + { + "epoch": 0.5161590327830737, + "grad_norm": 2.1165384247792787, + "learning_rate": 9.844206565525585e-06, + "loss": 0.7479, + "step": 370 + }, + { + "epoch": 0.5175540571960009, + "grad_norm": 2.153127909928685, + "learning_rate": 9.842187450535881e-06, + "loss": 0.7315, + "step": 371 + }, + { + "epoch": 0.5189490816089282, + "grad_norm": 2.4142012063691767, + "learning_rate": 9.840155545340169e-06, + "loss": 0.7433, + "step": 372 + }, + { + "epoch": 0.5203441060218554, + "grad_norm": 2.248477383285279, + "learning_rate": 9.838110855305548e-06, + "loss": 0.71, + "step": 373 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 2.018915478548314, + "learning_rate": 9.836053385832881e-06, + "loss": 0.6496, + "step": 374 + }, + { + "epoch": 0.5231341548477099, + "grad_norm": 2.1418708589656763, + "learning_rate": 9.833983142356792e-06, + "loss": 0.7287, + "step": 375 + }, + { + "epoch": 0.524529179260637, + "grad_norm": 2.2765930572342206, + "learning_rate": 9.831900130345645e-06, + "loss": 0.702, + "step": 376 + }, + { + "epoch": 0.5259242036735643, + "grad_norm": 2.059446170325579, + "learning_rate": 9.829804355301527e-06, + "loss": 0.6754, + "step": 377 + }, + { + "epoch": 0.5273192280864916, + "grad_norm": 2.054278197112765, + "learning_rate": 9.827695822760245e-06, + "loss": 0.6766, + "step": 378 + }, + { + "epoch": 0.5287142524994187, + "grad_norm": 2.2795716839624522, + "learning_rate": 9.825574538291293e-06, + "loss": 0.7354, + "step": 379 + }, + { + "epoch": 0.530109276912346, + "grad_norm": 2.0454813982579445, + "learning_rate": 9.823440507497863e-06, + "loss": 0.6778, + "step": 380 + }, + { + "epoch": 0.5315043013252732, + "grad_norm": 2.2322626658913163, + "learning_rate": 9.821293736016802e-06, + "loss": 0.6926, + "step": 381 + }, + { + "epoch": 0.5328993257382004, + "grad_norm": 2.256035142767682, + "learning_rate": 9.819134229518617e-06, + "loss": 0.7456, + "step": 382 + }, + { + "epoch": 0.5342943501511277, + "grad_norm": 2.1446252720693253, + "learning_rate": 9.81696199370745e-06, + "loss": 0.7224, + "step": 383 + }, + { + "epoch": 0.5356893745640549, + "grad_norm": 2.122015405672622, + "learning_rate": 9.814777034321069e-06, + "loss": 0.7462, + "step": 384 + }, + { + "epoch": 0.5370843989769821, + "grad_norm": 2.215882487448917, + "learning_rate": 9.812579357130848e-06, + "loss": 0.7693, + "step": 385 + }, + { + "epoch": 0.5384794233899093, + "grad_norm": 2.0156378005077924, + "learning_rate": 9.810368967941757e-06, + "loss": 0.714, + "step": 386 + }, + { + "epoch": 0.5398744478028366, + "grad_norm": 2.175601216674339, + "learning_rate": 9.808145872592341e-06, + "loss": 0.7194, + "step": 387 + }, + { + "epoch": 0.5412694722157638, + "grad_norm": 2.1421467209857425, + "learning_rate": 9.80591007695471e-06, + "loss": 0.6899, + "step": 388 + }, + { + "epoch": 0.542664496628691, + "grad_norm": 2.168215447780632, + "learning_rate": 9.803661586934514e-06, + "loss": 0.7049, + "step": 389 + }, + { + "epoch": 0.5440595210416183, + "grad_norm": 2.2372275971583826, + "learning_rate": 9.801400408470943e-06, + "loss": 0.7563, + "step": 390 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 2.1698822817889765, + "learning_rate": 9.799126547536695e-06, + "loss": 0.7542, + "step": 391 + }, + { + "epoch": 0.5468495698674727, + "grad_norm": 2.1058241804935633, + "learning_rate": 9.796840010137972e-06, + "loss": 0.7029, + "step": 392 + }, + { + "epoch": 0.5482445942804, + "grad_norm": 2.107054044072128, + "learning_rate": 9.79454080231446e-06, + "loss": 0.8368, + "step": 393 + }, + { + "epoch": 0.5496396186933271, + "grad_norm": 2.0206549367281177, + "learning_rate": 9.79222893013931e-06, + "loss": 0.646, + "step": 394 + }, + { + "epoch": 0.5510346431062544, + "grad_norm": 2.2344446550776302, + "learning_rate": 9.789904399719124e-06, + "loss": 0.7744, + "step": 395 + }, + { + "epoch": 0.5524296675191815, + "grad_norm": 2.10009826145433, + "learning_rate": 9.787567217193944e-06, + "loss": 0.6684, + "step": 396 + }, + { + "epoch": 0.5538246919321088, + "grad_norm": 2.143962574748327, + "learning_rate": 9.785217388737232e-06, + "loss": 0.744, + "step": 397 + }, + { + "epoch": 0.5552197163450361, + "grad_norm": 2.061035666351773, + "learning_rate": 9.782854920555844e-06, + "loss": 0.6964, + "step": 398 + }, + { + "epoch": 0.5566147407579632, + "grad_norm": 1.9503185836027328, + "learning_rate": 9.780479818890032e-06, + "loss": 0.7649, + "step": 399 + }, + { + "epoch": 0.5580097651708905, + "grad_norm": 2.1497215485134293, + "learning_rate": 9.778092090013416e-06, + "loss": 0.7134, + "step": 400 + }, + { + "epoch": 0.5594047895838177, + "grad_norm": 1.9121136304128945, + "learning_rate": 9.775691740232966e-06, + "loss": 0.6743, + "step": 401 + }, + { + "epoch": 0.560799813996745, + "grad_norm": 2.0333296587405756, + "learning_rate": 9.773278775888995e-06, + "loss": 0.7461, + "step": 402 + }, + { + "epoch": 0.5621948384096722, + "grad_norm": 2.1416975619415672, + "learning_rate": 9.77085320335513e-06, + "loss": 0.706, + "step": 403 + }, + { + "epoch": 0.5635898628225994, + "grad_norm": 2.2072504287834893, + "learning_rate": 9.768415029038304e-06, + "loss": 0.7114, + "step": 404 + }, + { + "epoch": 0.5649848872355266, + "grad_norm": 2.0068520321883985, + "learning_rate": 9.76596425937874e-06, + "loss": 0.6889, + "step": 405 + }, + { + "epoch": 0.5663799116484538, + "grad_norm": 2.043527386927699, + "learning_rate": 9.763500900849926e-06, + "loss": 0.6655, + "step": 406 + }, + { + "epoch": 0.5677749360613811, + "grad_norm": 2.3563340564084028, + "learning_rate": 9.761024959958605e-06, + "loss": 0.7508, + "step": 407 + }, + { + "epoch": 0.5691699604743083, + "grad_norm": 2.318133765773976, + "learning_rate": 9.75853644324475e-06, + "loss": 0.7575, + "step": 408 + }, + { + "epoch": 0.5705649848872355, + "grad_norm": 2.336246379174633, + "learning_rate": 9.756035357281559e-06, + "loss": 0.7395, + "step": 409 + }, + { + "epoch": 0.5719600093001628, + "grad_norm": 2.0727771110869555, + "learning_rate": 9.753521708675426e-06, + "loss": 0.7023, + "step": 410 + }, + { + "epoch": 0.5733550337130899, + "grad_norm": 2.1368053878525615, + "learning_rate": 9.75099550406593e-06, + "loss": 0.7111, + "step": 411 + }, + { + "epoch": 0.5747500581260172, + "grad_norm": 2.0396312273143304, + "learning_rate": 9.748456750125817e-06, + "loss": 0.7549, + "step": 412 + }, + { + "epoch": 0.5761450825389445, + "grad_norm": 2.15230083012261, + "learning_rate": 9.745905453560976e-06, + "loss": 0.7068, + "step": 413 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 1.9402253989855511, + "learning_rate": 9.74334162111043e-06, + "loss": 0.7451, + "step": 414 + }, + { + "epoch": 0.5789351313647989, + "grad_norm": 2.0708294796332423, + "learning_rate": 9.740765259546312e-06, + "loss": 0.7244, + "step": 415 + }, + { + "epoch": 0.5803301557777261, + "grad_norm": 2.247404010227105, + "learning_rate": 9.738176375673856e-06, + "loss": 0.7391, + "step": 416 + }, + { + "epoch": 0.5817251801906533, + "grad_norm": 2.158120031551307, + "learning_rate": 9.735574976331362e-06, + "loss": 0.6581, + "step": 417 + }, + { + "epoch": 0.5831202046035806, + "grad_norm": 2.1433546708079123, + "learning_rate": 9.732961068390199e-06, + "loss": 0.7543, + "step": 418 + }, + { + "epoch": 0.5845152290165078, + "grad_norm": 2.340443501167355, + "learning_rate": 9.730334658754767e-06, + "loss": 0.8277, + "step": 419 + }, + { + "epoch": 0.585910253429435, + "grad_norm": 2.071223812481573, + "learning_rate": 9.727695754362498e-06, + "loss": 0.7457, + "step": 420 + }, + { + "epoch": 0.5873052778423622, + "grad_norm": 2.072387969114693, + "learning_rate": 9.725044362183817e-06, + "loss": 0.7159, + "step": 421 + }, + { + "epoch": 0.5887003022552895, + "grad_norm": 2.0362411373585934, + "learning_rate": 9.722380489222145e-06, + "loss": 0.6802, + "step": 422 + }, + { + "epoch": 0.5900953266682167, + "grad_norm": 1.9495420698612345, + "learning_rate": 9.71970414251386e-06, + "loss": 0.6581, + "step": 423 + }, + { + "epoch": 0.5914903510811439, + "grad_norm": 2.12038681669184, + "learning_rate": 9.717015329128294e-06, + "loss": 0.7466, + "step": 424 + }, + { + "epoch": 0.5928853754940712, + "grad_norm": 2.130743892578285, + "learning_rate": 9.714314056167711e-06, + "loss": 0.7127, + "step": 425 + }, + { + "epoch": 0.5942803999069983, + "grad_norm": 1.8415534741189161, + "learning_rate": 9.711600330767278e-06, + "loss": 0.6402, + "step": 426 + }, + { + "epoch": 0.5956754243199256, + "grad_norm": 2.1495516487854354, + "learning_rate": 9.708874160095061e-06, + "loss": 0.7193, + "step": 427 + }, + { + "epoch": 0.5970704487328529, + "grad_norm": 2.0662475547351504, + "learning_rate": 9.706135551351996e-06, + "loss": 0.7214, + "step": 428 + }, + { + "epoch": 0.59846547314578, + "grad_norm": 2.0147057581611603, + "learning_rate": 9.703384511771874e-06, + "loss": 0.6683, + "step": 429 + }, + { + "epoch": 0.5998604975587073, + "grad_norm": 2.0161707870474923, + "learning_rate": 9.700621048621322e-06, + "loss": 0.6625, + "step": 430 + }, + { + "epoch": 0.6012555219716345, + "grad_norm": 1.9250124900125618, + "learning_rate": 9.697845169199775e-06, + "loss": 0.6494, + "step": 431 + }, + { + "epoch": 0.6026505463845617, + "grad_norm": 2.2505039960336695, + "learning_rate": 9.69505688083948e-06, + "loss": 0.7627, + "step": 432 + }, + { + "epoch": 0.604045570797489, + "grad_norm": 1.9421507056472695, + "learning_rate": 9.692256190905444e-06, + "loss": 0.6568, + "step": 433 + }, + { + "epoch": 0.6054405952104162, + "grad_norm": 2.1917513336388263, + "learning_rate": 9.689443106795442e-06, + "loss": 0.7268, + "step": 434 + }, + { + "epoch": 0.6068356196233434, + "grad_norm": 2.177671026580378, + "learning_rate": 9.686617635939988e-06, + "loss": 0.6854, + "step": 435 + }, + { + "epoch": 0.6082306440362706, + "grad_norm": 2.1823436209408302, + "learning_rate": 9.683779785802306e-06, + "loss": 0.7476, + "step": 436 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 2.2350100854518016, + "learning_rate": 9.680929563878327e-06, + "loss": 0.7023, + "step": 437 + }, + { + "epoch": 0.6110206928621251, + "grad_norm": 2.210520947527818, + "learning_rate": 9.678066977696656e-06, + "loss": 0.7577, + "step": 438 + }, + { + "epoch": 0.6124157172750523, + "grad_norm": 2.096018066597859, + "learning_rate": 9.675192034818561e-06, + "loss": 0.7356, + "step": 439 + }, + { + "epoch": 0.6138107416879796, + "grad_norm": 2.1973984969358873, + "learning_rate": 9.672304742837945e-06, + "loss": 0.8018, + "step": 440 + }, + { + "epoch": 0.6152057661009067, + "grad_norm": 2.1078413941321847, + "learning_rate": 9.669405109381335e-06, + "loss": 0.7291, + "step": 441 + }, + { + "epoch": 0.616600790513834, + "grad_norm": 2.1083633709141676, + "learning_rate": 9.66649314210785e-06, + "loss": 0.7295, + "step": 442 + }, + { + "epoch": 0.6179958149267613, + "grad_norm": 1.971998147575059, + "learning_rate": 9.663568848709194e-06, + "loss": 0.7279, + "step": 443 + }, + { + "epoch": 0.6193908393396884, + "grad_norm": 2.12121490325952, + "learning_rate": 9.660632236909628e-06, + "loss": 0.7248, + "step": 444 + }, + { + "epoch": 0.6207858637526157, + "grad_norm": 1.91738211282543, + "learning_rate": 9.657683314465948e-06, + "loss": 0.6881, + "step": 445 + }, + { + "epoch": 0.6221808881655428, + "grad_norm": 2.1377260400944667, + "learning_rate": 9.65472208916747e-06, + "loss": 0.7835, + "step": 446 + }, + { + "epoch": 0.6235759125784701, + "grad_norm": 2.1471575379428147, + "learning_rate": 9.651748568836007e-06, + "loss": 0.7644, + "step": 447 + }, + { + "epoch": 0.6249709369913974, + "grad_norm": 2.0889010116447744, + "learning_rate": 9.648762761325847e-06, + "loss": 0.6891, + "step": 448 + }, + { + "epoch": 0.6263659614043245, + "grad_norm": 2.052766079746152, + "learning_rate": 9.645764674523732e-06, + "loss": 0.7842, + "step": 449 + }, + { + "epoch": 0.6277609858172518, + "grad_norm": 2.0717698815155754, + "learning_rate": 9.642754316348846e-06, + "loss": 0.7092, + "step": 450 + }, + { + "epoch": 0.629156010230179, + "grad_norm": 2.003583787136136, + "learning_rate": 9.639731694752776e-06, + "loss": 0.7502, + "step": 451 + }, + { + "epoch": 0.6305510346431062, + "grad_norm": 2.2022170633535905, + "learning_rate": 9.636696817719511e-06, + "loss": 0.7535, + "step": 452 + }, + { + "epoch": 0.6319460590560335, + "grad_norm": 2.2113641807562483, + "learning_rate": 9.633649693265406e-06, + "loss": 0.7492, + "step": 453 + }, + { + "epoch": 0.6333410834689607, + "grad_norm": 2.2244980414553717, + "learning_rate": 9.630590329439169e-06, + "loss": 0.7785, + "step": 454 + }, + { + "epoch": 0.634736107881888, + "grad_norm": 2.1937904123098346, + "learning_rate": 9.627518734321837e-06, + "loss": 0.7498, + "step": 455 + }, + { + "epoch": 0.6361311322948151, + "grad_norm": 2.1493347301981562, + "learning_rate": 9.624434916026752e-06, + "loss": 0.7419, + "step": 456 + }, + { + "epoch": 0.6375261567077424, + "grad_norm": 2.052486403142985, + "learning_rate": 9.621338882699547e-06, + "loss": 0.7394, + "step": 457 + }, + { + "epoch": 0.6389211811206696, + "grad_norm": 2.094585882639152, + "learning_rate": 9.618230642518117e-06, + "loss": 0.6976, + "step": 458 + }, + { + "epoch": 0.6403162055335968, + "grad_norm": 2.0524171839415466, + "learning_rate": 9.615110203692602e-06, + "loss": 0.7693, + "step": 459 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 2.1271109954195255, + "learning_rate": 9.61197757446536e-06, + "loss": 0.7251, + "step": 460 + }, + { + "epoch": 0.6431062543594513, + "grad_norm": 1.9893310410454985, + "learning_rate": 9.608832763110955e-06, + "loss": 0.6991, + "step": 461 + }, + { + "epoch": 0.6445012787723785, + "grad_norm": 2.1917383599900195, + "learning_rate": 9.605675777936123e-06, + "loss": 0.721, + "step": 462 + }, + { + "epoch": 0.6458963031853058, + "grad_norm": 2.065643026017452, + "learning_rate": 9.60250662727976e-06, + "loss": 0.7297, + "step": 463 + }, + { + "epoch": 0.6472913275982329, + "grad_norm": 2.110893858590918, + "learning_rate": 9.599325319512893e-06, + "loss": 0.7308, + "step": 464 + }, + { + "epoch": 0.6486863520111602, + "grad_norm": 2.0073869238725686, + "learning_rate": 9.596131863038664e-06, + "loss": 0.7128, + "step": 465 + }, + { + "epoch": 0.6500813764240875, + "grad_norm": 2.0487238199374116, + "learning_rate": 9.592926266292305e-06, + "loss": 0.7089, + "step": 466 + }, + { + "epoch": 0.6514764008370146, + "grad_norm": 2.256500523858216, + "learning_rate": 9.589708537741109e-06, + "loss": 0.7346, + "step": 467 + }, + { + "epoch": 0.6528714252499419, + "grad_norm": 2.02187405184828, + "learning_rate": 9.586478685884424e-06, + "loss": 0.6863, + "step": 468 + }, + { + "epoch": 0.6542664496628691, + "grad_norm": 2.079210108446843, + "learning_rate": 9.583236719253611e-06, + "loss": 0.7043, + "step": 469 + }, + { + "epoch": 0.6556614740757963, + "grad_norm": 2.1732934816756595, + "learning_rate": 9.579982646412039e-06, + "loss": 0.7179, + "step": 470 + }, + { + "epoch": 0.6570564984887236, + "grad_norm": 1.9394830539544319, + "learning_rate": 9.576716475955048e-06, + "loss": 0.6399, + "step": 471 + }, + { + "epoch": 0.6584515229016508, + "grad_norm": 2.056824279904441, + "learning_rate": 9.573438216509937e-06, + "loss": 0.6993, + "step": 472 + }, + { + "epoch": 0.659846547314578, + "grad_norm": 2.0143468165401277, + "learning_rate": 9.570147876735937e-06, + "loss": 0.7372, + "step": 473 + }, + { + "epoch": 0.6612415717275052, + "grad_norm": 1.8931355054634165, + "learning_rate": 9.566845465324185e-06, + "loss": 0.6776, + "step": 474 + }, + { + "epoch": 0.6626365961404325, + "grad_norm": 1.9966387912195087, + "learning_rate": 9.563530990997707e-06, + "loss": 0.7671, + "step": 475 + }, + { + "epoch": 0.6640316205533597, + "grad_norm": 2.0817025057585106, + "learning_rate": 9.560204462511392e-06, + "loss": 0.7347, + "step": 476 + }, + { + "epoch": 0.6654266449662869, + "grad_norm": 1.9834426105549026, + "learning_rate": 9.556865888651965e-06, + "loss": 0.7549, + "step": 477 + }, + { + "epoch": 0.6668216693792142, + "grad_norm": 1.9850422235674683, + "learning_rate": 9.553515278237975e-06, + "loss": 0.6782, + "step": 478 + }, + { + "epoch": 0.6682166937921413, + "grad_norm": 2.1734748685895076, + "learning_rate": 9.550152640119757e-06, + "loss": 0.7347, + "step": 479 + }, + { + "epoch": 0.6696117182050686, + "grad_norm": 2.141674611099673, + "learning_rate": 9.546777983179421e-06, + "loss": 0.7581, + "step": 480 + }, + { + "epoch": 0.6710067426179959, + "grad_norm": 1.8999376232525609, + "learning_rate": 9.543391316330822e-06, + "loss": 0.6363, + "step": 481 + }, + { + "epoch": 0.672401767030923, + "grad_norm": 2.0279797772457187, + "learning_rate": 9.539992648519538e-06, + "loss": 0.7463, + "step": 482 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 2.232464103440747, + "learning_rate": 9.536581988722848e-06, + "loss": 0.8279, + "step": 483 + }, + { + "epoch": 0.6751918158567775, + "grad_norm": 2.1498923126332934, + "learning_rate": 9.533159345949704e-06, + "loss": 0.7615, + "step": 484 + }, + { + "epoch": 0.6765868402697047, + "grad_norm": 1.9252842661024974, + "learning_rate": 9.529724729240712e-06, + "loss": 0.6805, + "step": 485 + }, + { + "epoch": 0.677981864682632, + "grad_norm": 2.0678336057421647, + "learning_rate": 9.526278147668104e-06, + "loss": 0.696, + "step": 486 + }, + { + "epoch": 0.6793768890955592, + "grad_norm": 2.075760114316058, + "learning_rate": 9.522819610335721e-06, + "loss": 0.7246, + "step": 487 + }, + { + "epoch": 0.6807719135084864, + "grad_norm": 1.8528598818636521, + "learning_rate": 9.519349126378975e-06, + "loss": 0.6839, + "step": 488 + }, + { + "epoch": 0.6821669379214136, + "grad_norm": 2.200019813823996, + "learning_rate": 9.515866704964846e-06, + "loss": 0.7495, + "step": 489 + }, + { + "epoch": 0.6835619623343409, + "grad_norm": 2.0916960362541683, + "learning_rate": 9.512372355291838e-06, + "loss": 0.7135, + "step": 490 + }, + { + "epoch": 0.6849569867472681, + "grad_norm": 1.8178206174973344, + "learning_rate": 9.50886608658996e-06, + "loss": 0.6405, + "step": 491 + }, + { + "epoch": 0.6863520111601953, + "grad_norm": 2.038956346042495, + "learning_rate": 9.505347908120712e-06, + "loss": 0.7408, + "step": 492 + }, + { + "epoch": 0.6877470355731226, + "grad_norm": 2.0715000507645147, + "learning_rate": 9.501817829177046e-06, + "loss": 0.7166, + "step": 493 + }, + { + "epoch": 0.6891420599860497, + "grad_norm": 2.1401200540716094, + "learning_rate": 9.498275859083353e-06, + "loss": 0.7301, + "step": 494 + }, + { + "epoch": 0.690537084398977, + "grad_norm": 2.042730741273458, + "learning_rate": 9.494722007195427e-06, + "loss": 0.7038, + "step": 495 + }, + { + "epoch": 0.6919321088119043, + "grad_norm": 2.0381664858059594, + "learning_rate": 9.491156282900454e-06, + "loss": 0.7126, + "step": 496 + }, + { + "epoch": 0.6933271332248314, + "grad_norm": 2.1216424421132616, + "learning_rate": 9.487578695616974e-06, + "loss": 0.7825, + "step": 497 + }, + { + "epoch": 0.6947221576377587, + "grad_norm": 2.1488526330119817, + "learning_rate": 9.483989254794865e-06, + "loss": 0.7303, + "step": 498 + }, + { + "epoch": 0.6961171820506858, + "grad_norm": 1.9356761977741135, + "learning_rate": 9.480387969915318e-06, + "loss": 0.6243, + "step": 499 + }, + { + "epoch": 0.6975122064636131, + "grad_norm": 2.0978054484537436, + "learning_rate": 9.476774850490803e-06, + "loss": 0.738, + "step": 500 + }, + { + "epoch": 0.6989072308765404, + "grad_norm": 2.055561455319892, + "learning_rate": 9.47314990606505e-06, + "loss": 0.7204, + "step": 501 + }, + { + "epoch": 0.7003022552894675, + "grad_norm": 2.0680562366639017, + "learning_rate": 9.46951314621303e-06, + "loss": 0.6993, + "step": 502 + }, + { + "epoch": 0.7016972797023948, + "grad_norm": 2.2996197722023473, + "learning_rate": 9.465864580540917e-06, + "loss": 0.8076, + "step": 503 + }, + { + "epoch": 0.703092304115322, + "grad_norm": 2.120383099992889, + "learning_rate": 9.462204218686075e-06, + "loss": 0.703, + "step": 504 + }, + { + "epoch": 0.7044873285282492, + "grad_norm": 2.089709579485426, + "learning_rate": 9.458532070317021e-06, + "loss": 0.727, + "step": 505 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 2.003557653106253, + "learning_rate": 9.454848145133406e-06, + "loss": 0.7068, + "step": 506 + }, + { + "epoch": 0.7072773773541037, + "grad_norm": 2.2817445962593568, + "learning_rate": 9.451152452865991e-06, + "loss": 0.7133, + "step": 507 + }, + { + "epoch": 0.708672401767031, + "grad_norm": 1.9539415691936202, + "learning_rate": 9.447445003276618e-06, + "loss": 0.6586, + "step": 508 + }, + { + "epoch": 0.7100674261799581, + "grad_norm": 2.1970670306323252, + "learning_rate": 9.443725806158182e-06, + "loss": 0.7327, + "step": 509 + }, + { + "epoch": 0.7114624505928854, + "grad_norm": 1.9539434604144215, + "learning_rate": 9.439994871334614e-06, + "loss": 0.716, + "step": 510 + }, + { + "epoch": 0.7128574750058126, + "grad_norm": 2.187186646260326, + "learning_rate": 9.43625220866084e-06, + "loss": 0.7124, + "step": 511 + }, + { + "epoch": 0.7142524994187398, + "grad_norm": 2.1115906329667467, + "learning_rate": 9.432497828022775e-06, + "loss": 0.7073, + "step": 512 + }, + { + "epoch": 0.7156475238316671, + "grad_norm": 2.162561189896867, + "learning_rate": 9.428731739337277e-06, + "loss": 0.8114, + "step": 513 + }, + { + "epoch": 0.7170425482445942, + "grad_norm": 2.229988057904462, + "learning_rate": 9.424953952552134e-06, + "loss": 0.7294, + "step": 514 + }, + { + "epoch": 0.7184375726575215, + "grad_norm": 2.0444286689625777, + "learning_rate": 9.421164477646031e-06, + "loss": 0.7036, + "step": 515 + }, + { + "epoch": 0.7198325970704488, + "grad_norm": 2.234165446135079, + "learning_rate": 9.41736332462853e-06, + "loss": 0.7559, + "step": 516 + }, + { + "epoch": 0.7212276214833759, + "grad_norm": 2.119512330275446, + "learning_rate": 9.413550503540039e-06, + "loss": 0.7381, + "step": 517 + }, + { + "epoch": 0.7226226458963032, + "grad_norm": 2.0076919464556537, + "learning_rate": 9.409726024451781e-06, + "loss": 0.7415, + "step": 518 + }, + { + "epoch": 0.7240176703092304, + "grad_norm": 1.9682427916024752, + "learning_rate": 9.40588989746578e-06, + "loss": 0.689, + "step": 519 + }, + { + "epoch": 0.7254126947221576, + "grad_norm": 2.018101001493541, + "learning_rate": 9.402042132714817e-06, + "loss": 0.701, + "step": 520 + }, + { + "epoch": 0.7268077191350849, + "grad_norm": 2.0465788846719306, + "learning_rate": 9.398182740362424e-06, + "loss": 0.7293, + "step": 521 + }, + { + "epoch": 0.7282027435480121, + "grad_norm": 2.0004289134814948, + "learning_rate": 9.39431173060284e-06, + "loss": 0.7275, + "step": 522 + }, + { + "epoch": 0.7295977679609393, + "grad_norm": 1.9897615420029091, + "learning_rate": 9.390429113660993e-06, + "loss": 0.7067, + "step": 523 + }, + { + "epoch": 0.7309927923738665, + "grad_norm": 2.108334666567339, + "learning_rate": 9.38653489979247e-06, + "loss": 0.7598, + "step": 524 + }, + { + "epoch": 0.7323878167867938, + "grad_norm": 2.0753873908464016, + "learning_rate": 9.382629099283486e-06, + "loss": 0.7073, + "step": 525 + }, + { + "epoch": 0.733782841199721, + "grad_norm": 2.0224236511779643, + "learning_rate": 9.378711722450866e-06, + "loss": 0.7318, + "step": 526 + }, + { + "epoch": 0.7351778656126482, + "grad_norm": 2.030128554628202, + "learning_rate": 9.374782779642013e-06, + "loss": 0.7123, + "step": 527 + }, + { + "epoch": 0.7365728900255755, + "grad_norm": 2.0236490043479325, + "learning_rate": 9.370842281234876e-06, + "loss": 0.7294, + "step": 528 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 1.9068088991344194, + "learning_rate": 9.366890237637932e-06, + "loss": 0.6811, + "step": 529 + }, + { + "epoch": 0.7393629388514299, + "grad_norm": 1.9480660854694714, + "learning_rate": 9.362926659290149e-06, + "loss": 0.6497, + "step": 530 + }, + { + "epoch": 0.7407579632643572, + "grad_norm": 2.0886372769291, + "learning_rate": 9.358951556660968e-06, + "loss": 0.7122, + "step": 531 + }, + { + "epoch": 0.7421529876772843, + "grad_norm": 2.0028547451656307, + "learning_rate": 9.354964940250269e-06, + "loss": 0.6725, + "step": 532 + }, + { + "epoch": 0.7435480120902116, + "grad_norm": 1.9028605868969077, + "learning_rate": 9.35096682058834e-06, + "loss": 0.6919, + "step": 533 + }, + { + "epoch": 0.7449430365031388, + "grad_norm": 2.0144375936051304, + "learning_rate": 9.346957208235857e-06, + "loss": 0.7434, + "step": 534 + }, + { + "epoch": 0.746338060916066, + "grad_norm": 2.0375245422409587, + "learning_rate": 9.342936113783855e-06, + "loss": 0.701, + "step": 535 + }, + { + "epoch": 0.7477330853289933, + "grad_norm": 2.1341077847797685, + "learning_rate": 9.338903547853698e-06, + "loss": 0.727, + "step": 536 + }, + { + "epoch": 0.7491281097419205, + "grad_norm": 2.1262987908622346, + "learning_rate": 9.334859521097046e-06, + "loss": 0.7128, + "step": 537 + }, + { + "epoch": 0.7505231341548477, + "grad_norm": 1.9324820289749198, + "learning_rate": 9.330804044195836e-06, + "loss": 0.6821, + "step": 538 + }, + { + "epoch": 0.7519181585677749, + "grad_norm": 2.1731776165453995, + "learning_rate": 9.326737127862249e-06, + "loss": 0.7122, + "step": 539 + }, + { + "epoch": 0.7533131829807022, + "grad_norm": 2.16764837125525, + "learning_rate": 9.32265878283868e-06, + "loss": 0.6994, + "step": 540 + }, + { + "epoch": 0.7547082073936294, + "grad_norm": 2.1002883470462534, + "learning_rate": 9.318569019897713e-06, + "loss": 0.7543, + "step": 541 + }, + { + "epoch": 0.7561032318065566, + "grad_norm": 1.9503441807929596, + "learning_rate": 9.314467849842093e-06, + "loss": 0.7119, + "step": 542 + }, + { + "epoch": 0.7574982562194839, + "grad_norm": 2.082793001639134, + "learning_rate": 9.310355283504696e-06, + "loss": 0.7421, + "step": 543 + }, + { + "epoch": 0.758893280632411, + "grad_norm": 1.9966160162515905, + "learning_rate": 9.306231331748496e-06, + "loss": 0.7221, + "step": 544 + }, + { + "epoch": 0.7602883050453383, + "grad_norm": 2.0065387403716475, + "learning_rate": 9.302096005466547e-06, + "loss": 0.6442, + "step": 545 + }, + { + "epoch": 0.7616833294582656, + "grad_norm": 2.2265608152032033, + "learning_rate": 9.29794931558194e-06, + "loss": 0.729, + "step": 546 + }, + { + "epoch": 0.7630783538711927, + "grad_norm": 2.1210857543143544, + "learning_rate": 9.29379127304779e-06, + "loss": 0.7336, + "step": 547 + }, + { + "epoch": 0.76447337828412, + "grad_norm": 2.013283139083976, + "learning_rate": 9.289621888847194e-06, + "loss": 0.7628, + "step": 548 + }, + { + "epoch": 0.7658684026970471, + "grad_norm": 1.8491716982292445, + "learning_rate": 9.285441173993207e-06, + "loss": 0.6365, + "step": 549 + }, + { + "epoch": 0.7672634271099744, + "grad_norm": 2.0841548133116783, + "learning_rate": 9.281249139528816e-06, + "loss": 0.7515, + "step": 550 + }, + { + "epoch": 0.7686584515229017, + "grad_norm": 2.0286961204733256, + "learning_rate": 9.277045796526904e-06, + "loss": 0.7287, + "step": 551 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 2.068176459964423, + "learning_rate": 9.272831156090229e-06, + "loss": 0.7283, + "step": 552 + }, + { + "epoch": 0.7714485003487561, + "grad_norm": 1.9633423625558333, + "learning_rate": 9.268605229351387e-06, + "loss": 0.7137, + "step": 553 + }, + { + "epoch": 0.7728435247616833, + "grad_norm": 2.100616722256089, + "learning_rate": 9.264368027472785e-06, + "loss": 0.723, + "step": 554 + }, + { + "epoch": 0.7742385491746105, + "grad_norm": 2.0743653211044357, + "learning_rate": 9.260119561646614e-06, + "loss": 0.6835, + "step": 555 + }, + { + "epoch": 0.7756335735875378, + "grad_norm": 2.0116199805624735, + "learning_rate": 9.255859843094817e-06, + "loss": 0.7277, + "step": 556 + }, + { + "epoch": 0.777028598000465, + "grad_norm": 2.1259690175231927, + "learning_rate": 9.25158888306906e-06, + "loss": 0.6814, + "step": 557 + }, + { + "epoch": 0.7784236224133922, + "grad_norm": 2.0748219863598023, + "learning_rate": 9.247306692850705e-06, + "loss": 0.7092, + "step": 558 + }, + { + "epoch": 0.7798186468263194, + "grad_norm": 2.114742076251193, + "learning_rate": 9.243013283750774e-06, + "loss": 0.7712, + "step": 559 + }, + { + "epoch": 0.7812136712392467, + "grad_norm": 1.9896360099254937, + "learning_rate": 9.238708667109924e-06, + "loss": 0.7242, + "step": 560 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 2.002612155228932, + "learning_rate": 9.234392854298414e-06, + "loss": 0.7389, + "step": 561 + }, + { + "epoch": 0.7840037200651011, + "grad_norm": 1.8844768635672766, + "learning_rate": 9.230065856716081e-06, + "loss": 0.6751, + "step": 562 + }, + { + "epoch": 0.7853987444780284, + "grad_norm": 1.9496677958310997, + "learning_rate": 9.225727685792302e-06, + "loss": 0.7079, + "step": 563 + }, + { + "epoch": 0.7867937688909556, + "grad_norm": 2.209067432956444, + "learning_rate": 9.221378352985967e-06, + "loss": 0.7749, + "step": 564 + }, + { + "epoch": 0.7881887933038828, + "grad_norm": 2.0722492473038674, + "learning_rate": 9.217017869785453e-06, + "loss": 0.7057, + "step": 565 + }, + { + "epoch": 0.7895838177168101, + "grad_norm": 2.089575443137789, + "learning_rate": 9.212646247708585e-06, + "loss": 0.7431, + "step": 566 + }, + { + "epoch": 0.7909788421297372, + "grad_norm": 2.063481302101242, + "learning_rate": 9.208263498302613e-06, + "loss": 0.7566, + "step": 567 + }, + { + "epoch": 0.7923738665426645, + "grad_norm": 1.954690180809161, + "learning_rate": 9.203869633144182e-06, + "loss": 0.7144, + "step": 568 + }, + { + "epoch": 0.7937688909555918, + "grad_norm": 2.0082313404704295, + "learning_rate": 9.19946466383929e-06, + "loss": 0.6873, + "step": 569 + }, + { + "epoch": 0.7951639153685189, + "grad_norm": 1.9452585653387595, + "learning_rate": 9.19504860202327e-06, + "loss": 0.6699, + "step": 570 + }, + { + "epoch": 0.7965589397814462, + "grad_norm": 1.9448395893952017, + "learning_rate": 9.19062145936076e-06, + "loss": 0.6453, + "step": 571 + }, + { + "epoch": 0.7979539641943734, + "grad_norm": 2.195833612126511, + "learning_rate": 9.186183247545657e-06, + "loss": 0.783, + "step": 572 + }, + { + "epoch": 0.7993489886073006, + "grad_norm": 2.094994509102352, + "learning_rate": 9.181733978301103e-06, + "loss": 0.715, + "step": 573 + }, + { + "epoch": 0.8007440130202279, + "grad_norm": 2.0540910795180705, + "learning_rate": 9.177273663379449e-06, + "loss": 0.6741, + "step": 574 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 2.2376315377141442, + "learning_rate": 9.172802314562214e-06, + "loss": 0.7622, + "step": 575 + }, + { + "epoch": 0.8035340618460823, + "grad_norm": 1.9630842252894045, + "learning_rate": 9.16831994366007e-06, + "loss": 0.6517, + "step": 576 + }, + { + "epoch": 0.8049290862590095, + "grad_norm": 2.030945005147177, + "learning_rate": 9.1638265625128e-06, + "loss": 0.6638, + "step": 577 + }, + { + "epoch": 0.8063241106719368, + "grad_norm": 2.2152569686775547, + "learning_rate": 9.159322182989265e-06, + "loss": 0.7079, + "step": 578 + }, + { + "epoch": 0.807719135084864, + "grad_norm": 2.1222069890221342, + "learning_rate": 9.154806816987386e-06, + "loss": 0.6598, + "step": 579 + }, + { + "epoch": 0.8091141594977912, + "grad_norm": 2.039956611874931, + "learning_rate": 9.150280476434098e-06, + "loss": 0.6929, + "step": 580 + }, + { + "epoch": 0.8105091839107185, + "grad_norm": 1.9595741185624305, + "learning_rate": 9.145743173285325e-06, + "loss": 0.6494, + "step": 581 + }, + { + "epoch": 0.8119042083236456, + "grad_norm": 1.914717122402942, + "learning_rate": 9.141194919525949e-06, + "loss": 0.7067, + "step": 582 + }, + { + "epoch": 0.8132992327365729, + "grad_norm": 2.073947853584459, + "learning_rate": 9.136635727169776e-06, + "loss": 0.7273, + "step": 583 + }, + { + "epoch": 0.8146942571495002, + "grad_norm": 2.0532623111872086, + "learning_rate": 9.132065608259505e-06, + "loss": 0.7517, + "step": 584 + }, + { + "epoch": 0.8160892815624273, + "grad_norm": 2.1186320230239937, + "learning_rate": 9.127484574866699e-06, + "loss": 0.7242, + "step": 585 + }, + { + "epoch": 0.8174843059753546, + "grad_norm": 2.0267276329942097, + "learning_rate": 9.122892639091748e-06, + "loss": 0.7265, + "step": 586 + }, + { + "epoch": 0.8188793303882818, + "grad_norm": 1.9639315452093837, + "learning_rate": 9.118289813063842e-06, + "loss": 0.6903, + "step": 587 + }, + { + "epoch": 0.820274354801209, + "grad_norm": 1.9810738468423978, + "learning_rate": 9.11367610894093e-06, + "loss": 0.7175, + "step": 588 + }, + { + "epoch": 0.8216693792141363, + "grad_norm": 2.012247865597074, + "learning_rate": 9.109051538909707e-06, + "loss": 0.6597, + "step": 589 + }, + { + "epoch": 0.8230644036270635, + "grad_norm": 2.075632269020032, + "learning_rate": 9.104416115185557e-06, + "loss": 0.7694, + "step": 590 + }, + { + "epoch": 0.8244594280399907, + "grad_norm": 2.143338879263666, + "learning_rate": 9.099769850012539e-06, + "loss": 0.7563, + "step": 591 + }, + { + "epoch": 0.8258544524529179, + "grad_norm": 1.970372364129679, + "learning_rate": 9.095112755663349e-06, + "loss": 0.6855, + "step": 592 + }, + { + "epoch": 0.8272494768658452, + "grad_norm": 1.9819891434355508, + "learning_rate": 9.090444844439284e-06, + "loss": 0.6982, + "step": 593 + }, + { + "epoch": 0.8286445012787724, + "grad_norm": 1.9951180654869123, + "learning_rate": 9.085766128670218e-06, + "loss": 0.7537, + "step": 594 + }, + { + "epoch": 0.8300395256916996, + "grad_norm": 2.136480645592744, + "learning_rate": 9.08107662071456e-06, + "loss": 0.7076, + "step": 595 + }, + { + "epoch": 0.8314345501046269, + "grad_norm": 2.01008074884727, + "learning_rate": 9.076376332959222e-06, + "loss": 0.724, + "step": 596 + }, + { + "epoch": 0.832829574517554, + "grad_norm": 2.083568256311646, + "learning_rate": 9.071665277819603e-06, + "loss": 0.6798, + "step": 597 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 2.0713110208220917, + "learning_rate": 9.066943467739529e-06, + "loss": 0.7138, + "step": 598 + }, + { + "epoch": 0.8356196233434086, + "grad_norm": 2.1995098528060653, + "learning_rate": 9.06221091519124e-06, + "loss": 0.7579, + "step": 599 + }, + { + "epoch": 0.8370146477563357, + "grad_norm": 2.1177635938489097, + "learning_rate": 9.057467632675357e-06, + "loss": 0.7709, + "step": 600 + }, + { + "epoch": 0.838409672169263, + "grad_norm": 1.9726857051363949, + "learning_rate": 9.05271363272083e-06, + "loss": 0.7283, + "step": 601 + }, + { + "epoch": 0.8398046965821901, + "grad_norm": 2.042816975781451, + "learning_rate": 9.047948927884927e-06, + "loss": 0.7155, + "step": 602 + }, + { + "epoch": 0.8411997209951174, + "grad_norm": 1.9911024298162479, + "learning_rate": 9.043173530753196e-06, + "loss": 0.7073, + "step": 603 + }, + { + "epoch": 0.8425947454080447, + "grad_norm": 2.188550491711223, + "learning_rate": 9.038387453939416e-06, + "loss": 0.7105, + "step": 604 + }, + { + "epoch": 0.8439897698209718, + "grad_norm": 2.1281020722530015, + "learning_rate": 9.033590710085584e-06, + "loss": 0.6798, + "step": 605 + }, + { + "epoch": 0.8453847942338991, + "grad_norm": 1.9560683283165319, + "learning_rate": 9.028783311861874e-06, + "loss": 0.6667, + "step": 606 + }, + { + "epoch": 0.8467798186468263, + "grad_norm": 1.9865019489409481, + "learning_rate": 9.023965271966595e-06, + "loss": 0.673, + "step": 607 + }, + { + "epoch": 0.8481748430597535, + "grad_norm": 1.975291998773914, + "learning_rate": 9.019136603126171e-06, + "loss": 0.7207, + "step": 608 + }, + { + "epoch": 0.8495698674726808, + "grad_norm": 1.9802488936538558, + "learning_rate": 9.0142973180951e-06, + "loss": 0.7083, + "step": 609 + }, + { + "epoch": 0.850964891885608, + "grad_norm": 2.0530113658928224, + "learning_rate": 9.00944742965592e-06, + "loss": 0.7348, + "step": 610 + }, + { + "epoch": 0.8523599162985352, + "grad_norm": 1.891855625943604, + "learning_rate": 9.004586950619182e-06, + "loss": 0.6832, + "step": 611 + }, + { + "epoch": 0.8537549407114624, + "grad_norm": 2.1036407225739033, + "learning_rate": 8.999715893823404e-06, + "loss": 0.7417, + "step": 612 + }, + { + "epoch": 0.8551499651243897, + "grad_norm": 1.9636866345666872, + "learning_rate": 8.994834272135049e-06, + "loss": 0.7189, + "step": 613 + }, + { + "epoch": 0.856544989537317, + "grad_norm": 1.9975444039165857, + "learning_rate": 8.989942098448485e-06, + "loss": 0.7115, + "step": 614 + }, + { + "epoch": 0.8579400139502441, + "grad_norm": 1.9947649682649553, + "learning_rate": 8.985039385685952e-06, + "loss": 0.698, + "step": 615 + }, + { + "epoch": 0.8593350383631714, + "grad_norm": 1.9552201916690322, + "learning_rate": 8.98012614679753e-06, + "loss": 0.6364, + "step": 616 + }, + { + "epoch": 0.8607300627760985, + "grad_norm": 2.2114912964614946, + "learning_rate": 8.975202394761098e-06, + "loss": 0.6814, + "step": 617 + }, + { + "epoch": 0.8621250871890258, + "grad_norm": 2.0505082371309817, + "learning_rate": 8.970268142582312e-06, + "loss": 0.7083, + "step": 618 + }, + { + "epoch": 0.8635201116019531, + "grad_norm": 2.0065112653612003, + "learning_rate": 8.965323403294553e-06, + "loss": 0.671, + "step": 619 + }, + { + "epoch": 0.8649151360148802, + "grad_norm": 2.050198240561233, + "learning_rate": 8.960368189958913e-06, + "loss": 0.7219, + "step": 620 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 2.0821632762041, + "learning_rate": 8.955402515664144e-06, + "loss": 0.7255, + "step": 621 + }, + { + "epoch": 0.8677051848407347, + "grad_norm": 2.2374361424470472, + "learning_rate": 8.950426393526633e-06, + "loss": 0.7127, + "step": 622 + }, + { + "epoch": 0.8691002092536619, + "grad_norm": 1.949975651680759, + "learning_rate": 8.945439836690359e-06, + "loss": 0.703, + "step": 623 + }, + { + "epoch": 0.8704952336665892, + "grad_norm": 2.018953506212755, + "learning_rate": 8.940442858326871e-06, + "loss": 0.6668, + "step": 624 + }, + { + "epoch": 0.8718902580795164, + "grad_norm": 1.936317481354868, + "learning_rate": 8.935435471635238e-06, + "loss": 0.7056, + "step": 625 + }, + { + "epoch": 0.8732852824924436, + "grad_norm": 2.0282916260482526, + "learning_rate": 8.93041768984203e-06, + "loss": 0.6652, + "step": 626 + }, + { + "epoch": 0.8746803069053708, + "grad_norm": 2.121255903905443, + "learning_rate": 8.925389526201264e-06, + "loss": 0.6941, + "step": 627 + }, + { + "epoch": 0.8760753313182981, + "grad_norm": 1.8461831360013177, + "learning_rate": 8.920350993994387e-06, + "loss": 0.6375, + "step": 628 + }, + { + "epoch": 0.8774703557312253, + "grad_norm": 2.0563491350367724, + "learning_rate": 8.915302106530234e-06, + "loss": 0.6932, + "step": 629 + }, + { + "epoch": 0.8788653801441525, + "grad_norm": 2.0350553302487695, + "learning_rate": 8.91024287714499e-06, + "loss": 0.6683, + "step": 630 + }, + { + "epoch": 0.8802604045570798, + "grad_norm": 1.9629287472963255, + "learning_rate": 8.905173319202159e-06, + "loss": 0.6976, + "step": 631 + }, + { + "epoch": 0.8816554289700069, + "grad_norm": 1.9555638782336193, + "learning_rate": 8.900093446092523e-06, + "loss": 0.6904, + "step": 632 + }, + { + "epoch": 0.8830504533829342, + "grad_norm": 2.010183514705677, + "learning_rate": 8.895003271234116e-06, + "loss": 0.7959, + "step": 633 + }, + { + "epoch": 0.8844454777958615, + "grad_norm": 2.038174167643802, + "learning_rate": 8.889902808072178e-06, + "loss": 0.6629, + "step": 634 + }, + { + "epoch": 0.8858405022087886, + "grad_norm": 2.0007565750548957, + "learning_rate": 8.884792070079128e-06, + "loss": 0.6973, + "step": 635 + }, + { + "epoch": 0.8872355266217159, + "grad_norm": 1.9422648258075415, + "learning_rate": 8.879671070754527e-06, + "loss": 0.6891, + "step": 636 + }, + { + "epoch": 0.8886305510346431, + "grad_norm": 2.0732698051235987, + "learning_rate": 8.874539823625037e-06, + "loss": 0.7315, + "step": 637 + }, + { + "epoch": 0.8900255754475703, + "grad_norm": 1.9047096265263974, + "learning_rate": 8.869398342244387e-06, + "loss": 0.6898, + "step": 638 + }, + { + "epoch": 0.8914205998604976, + "grad_norm": 2.367826429998185, + "learning_rate": 8.86424664019334e-06, + "loss": 0.7776, + "step": 639 + }, + { + "epoch": 0.8928156242734248, + "grad_norm": 1.9082787894649413, + "learning_rate": 8.859084731079664e-06, + "loss": 0.6917, + "step": 640 + }, + { + "epoch": 0.894210648686352, + "grad_norm": 2.0179079093688097, + "learning_rate": 8.853912628538072e-06, + "loss": 0.7115, + "step": 641 + }, + { + "epoch": 0.8956056730992792, + "grad_norm": 1.9593411702559214, + "learning_rate": 8.84873034623022e-06, + "loss": 0.6561, + "step": 642 + }, + { + "epoch": 0.8970006975122065, + "grad_norm": 1.9223616018745664, + "learning_rate": 8.84353789784464e-06, + "loss": 0.7232, + "step": 643 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 1.9916894711438682, + "learning_rate": 8.83833529709672e-06, + "loss": 0.7092, + "step": 644 + }, + { + "epoch": 0.8997907463380609, + "grad_norm": 1.8700641744681288, + "learning_rate": 8.833122557728667e-06, + "loss": 0.6773, + "step": 645 + }, + { + "epoch": 0.9011857707509882, + "grad_norm": 2.0854934713518376, + "learning_rate": 8.827899693509467e-06, + "loss": 0.7535, + "step": 646 + }, + { + "epoch": 0.9025807951639153, + "grad_norm": 1.9895333488216704, + "learning_rate": 8.82266671823485e-06, + "loss": 0.7206, + "step": 647 + }, + { + "epoch": 0.9039758195768426, + "grad_norm": 1.8896104921166434, + "learning_rate": 8.817423645727252e-06, + "loss": 0.6875, + "step": 648 + }, + { + "epoch": 0.9053708439897699, + "grad_norm": 2.266703090067289, + "learning_rate": 8.812170489835784e-06, + "loss": 0.7802, + "step": 649 + }, + { + "epoch": 0.906765868402697, + "grad_norm": 2.002845695676102, + "learning_rate": 8.806907264436183e-06, + "loss": 0.6868, + "step": 650 + }, + { + "epoch": 0.9081608928156243, + "grad_norm": 1.9373983885808481, + "learning_rate": 8.801633983430794e-06, + "loss": 0.6593, + "step": 651 + }, + { + "epoch": 0.9095559172285514, + "grad_norm": 1.8743487506726677, + "learning_rate": 8.796350660748516e-06, + "loss": 0.646, + "step": 652 + }, + { + "epoch": 0.9109509416414787, + "grad_norm": 2.0801545860927626, + "learning_rate": 8.791057310344775e-06, + "loss": 0.6965, + "step": 653 + }, + { + "epoch": 0.912345966054406, + "grad_norm": 2.037064963904586, + "learning_rate": 8.785753946201484e-06, + "loss": 0.7919, + "step": 654 + }, + { + "epoch": 0.9137409904673331, + "grad_norm": 1.9757786317526882, + "learning_rate": 8.780440582327005e-06, + "loss": 0.6847, + "step": 655 + }, + { + "epoch": 0.9151360148802604, + "grad_norm": 1.904008821134603, + "learning_rate": 8.775117232756116e-06, + "loss": 0.6505, + "step": 656 + }, + { + "epoch": 0.9165310392931876, + "grad_norm": 1.9990189247235826, + "learning_rate": 8.769783911549968e-06, + "loss": 0.7283, + "step": 657 + }, + { + "epoch": 0.9179260637061148, + "grad_norm": 1.9708975603644308, + "learning_rate": 8.764440632796055e-06, + "loss": 0.7027, + "step": 658 + }, + { + "epoch": 0.9193210881190421, + "grad_norm": 1.8662648197783636, + "learning_rate": 8.75908741060817e-06, + "loss": 0.7245, + "step": 659 + }, + { + "epoch": 0.9207161125319693, + "grad_norm": 1.920194446953376, + "learning_rate": 8.75372425912637e-06, + "loss": 0.7019, + "step": 660 + }, + { + "epoch": 0.9221111369448965, + "grad_norm": 2.2198974167640655, + "learning_rate": 8.748351192516943e-06, + "loss": 0.7775, + "step": 661 + }, + { + "epoch": 0.9235061613578237, + "grad_norm": 2.094121585262976, + "learning_rate": 8.742968224972366e-06, + "loss": 0.6971, + "step": 662 + }, + { + "epoch": 0.924901185770751, + "grad_norm": 1.9084243245926524, + "learning_rate": 8.737575370711265e-06, + "loss": 0.6826, + "step": 663 + }, + { + "epoch": 0.9262962101836782, + "grad_norm": 2.032697852643676, + "learning_rate": 8.732172643978383e-06, + "loss": 0.7119, + "step": 664 + }, + { + "epoch": 0.9276912345966054, + "grad_norm": 2.0461653810471416, + "learning_rate": 8.726760059044542e-06, + "loss": 0.6939, + "step": 665 + }, + { + "epoch": 0.9290862590095327, + "grad_norm": 2.130754432146634, + "learning_rate": 8.721337630206603e-06, + "loss": 0.7491, + "step": 666 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 2.0904014494400265, + "learning_rate": 8.715905371787426e-06, + "loss": 0.7298, + "step": 667 + }, + { + "epoch": 0.9318763078353871, + "grad_norm": 1.9843213685121637, + "learning_rate": 8.710463298135836e-06, + "loss": 0.6429, + "step": 668 + }, + { + "epoch": 0.9332713322483144, + "grad_norm": 1.917630694546447, + "learning_rate": 8.705011423626589e-06, + "loss": 0.7401, + "step": 669 + }, + { + "epoch": 0.9346663566612415, + "grad_norm": 2.13375068139831, + "learning_rate": 8.699549762660318e-06, + "loss": 0.6956, + "step": 670 + }, + { + "epoch": 0.9360613810741688, + "grad_norm": 1.975503491309611, + "learning_rate": 8.69407832966352e-06, + "loss": 0.7035, + "step": 671 + }, + { + "epoch": 0.9374564054870961, + "grad_norm": 2.0791411874973518, + "learning_rate": 8.688597139088494e-06, + "loss": 0.7222, + "step": 672 + }, + { + "epoch": 0.9388514299000232, + "grad_norm": 1.8469076240138975, + "learning_rate": 8.683106205413316e-06, + "loss": 0.6668, + "step": 673 + }, + { + "epoch": 0.9402464543129505, + "grad_norm": 1.7928528011941791, + "learning_rate": 8.677605543141797e-06, + "loss": 0.6437, + "step": 674 + }, + { + "epoch": 0.9416414787258777, + "grad_norm": 2.2286342353729136, + "learning_rate": 8.672095166803445e-06, + "loss": 0.7352, + "step": 675 + }, + { + "epoch": 0.9430365031388049, + "grad_norm": 1.905774060183822, + "learning_rate": 8.666575090953426e-06, + "loss": 0.7106, + "step": 676 + }, + { + "epoch": 0.9444315275517322, + "grad_norm": 2.068546911494722, + "learning_rate": 8.661045330172533e-06, + "loss": 0.7773, + "step": 677 + }, + { + "epoch": 0.9458265519646594, + "grad_norm": 1.9281757004651763, + "learning_rate": 8.65550589906713e-06, + "loss": 0.662, + "step": 678 + }, + { + "epoch": 0.9472215763775866, + "grad_norm": 1.855641783248281, + "learning_rate": 8.649956812269134e-06, + "loss": 0.6778, + "step": 679 + }, + { + "epoch": 0.9486166007905138, + "grad_norm": 1.8442734995129473, + "learning_rate": 8.644398084435959e-06, + "loss": 0.6684, + "step": 680 + }, + { + "epoch": 0.9500116252034411, + "grad_norm": 1.9714376993675073, + "learning_rate": 8.63882973025049e-06, + "loss": 0.6837, + "step": 681 + }, + { + "epoch": 0.9514066496163683, + "grad_norm": 1.8876407415682392, + "learning_rate": 8.63325176442104e-06, + "loss": 0.7157, + "step": 682 + }, + { + "epoch": 0.9528016740292955, + "grad_norm": 1.9056546611112206, + "learning_rate": 8.627664201681305e-06, + "loss": 0.6769, + "step": 683 + }, + { + "epoch": 0.9541966984422228, + "grad_norm": 2.0721004187013614, + "learning_rate": 8.622067056790333e-06, + "loss": 0.7439, + "step": 684 + }, + { + "epoch": 0.9555917228551499, + "grad_norm": 1.8805956857202908, + "learning_rate": 8.616460344532483e-06, + "loss": 0.7115, + "step": 685 + }, + { + "epoch": 0.9569867472680772, + "grad_norm": 1.8952885482730308, + "learning_rate": 8.610844079717387e-06, + "loss": 0.6435, + "step": 686 + }, + { + "epoch": 0.9583817716810045, + "grad_norm": 1.9989059022509683, + "learning_rate": 8.605218277179907e-06, + "loss": 0.7322, + "step": 687 + }, + { + "epoch": 0.9597767960939316, + "grad_norm": 1.7616680538530942, + "learning_rate": 8.599582951780095e-06, + "loss": 0.6235, + "step": 688 + }, + { + "epoch": 0.9611718205068589, + "grad_norm": 1.9662690661857283, + "learning_rate": 8.593938118403164e-06, + "loss": 0.6739, + "step": 689 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 2.0362652708193356, + "learning_rate": 8.588283791959437e-06, + "loss": 0.7047, + "step": 690 + }, + { + "epoch": 0.9639618693327133, + "grad_norm": 2.015021257411785, + "learning_rate": 8.582619987384311e-06, + "loss": 0.6863, + "step": 691 + }, + { + "epoch": 0.9653568937456406, + "grad_norm": 1.8836047793904986, + "learning_rate": 8.57694671963822e-06, + "loss": 0.652, + "step": 692 + }, + { + "epoch": 0.9667519181585678, + "grad_norm": 2.0164471666869357, + "learning_rate": 8.571264003706596e-06, + "loss": 0.7219, + "step": 693 + }, + { + "epoch": 0.968146942571495, + "grad_norm": 1.9205579637723724, + "learning_rate": 8.565571854599825e-06, + "loss": 0.6737, + "step": 694 + }, + { + "epoch": 0.9695419669844222, + "grad_norm": 2.0544942254164766, + "learning_rate": 8.559870287353214e-06, + "loss": 0.7372, + "step": 695 + }, + { + "epoch": 0.9709369913973495, + "grad_norm": 1.966795085808272, + "learning_rate": 8.554159317026939e-06, + "loss": 0.653, + "step": 696 + }, + { + "epoch": 0.9723320158102767, + "grad_norm": 2.021147157715166, + "learning_rate": 8.548438958706022e-06, + "loss": 0.7458, + "step": 697 + }, + { + "epoch": 0.9737270402232039, + "grad_norm": 2.113696018431275, + "learning_rate": 8.542709227500276e-06, + "loss": 0.7645, + "step": 698 + }, + { + "epoch": 0.9751220646361312, + "grad_norm": 1.9206237217188158, + "learning_rate": 8.536970138544278e-06, + "loss": 0.6644, + "step": 699 + }, + { + "epoch": 0.9765170890490583, + "grad_norm": 1.9453633858006116, + "learning_rate": 8.531221706997316e-06, + "loss": 0.6547, + "step": 700 + }, + { + "epoch": 0.9779121134619856, + "grad_norm": 2.0426945048651937, + "learning_rate": 8.525463948043365e-06, + "loss": 0.7077, + "step": 701 + }, + { + "epoch": 0.9793071378749129, + "grad_norm": 1.931889958107307, + "learning_rate": 8.519696876891024e-06, + "loss": 0.6439, + "step": 702 + }, + { + "epoch": 0.98070216228784, + "grad_norm": 1.9159274278923777, + "learning_rate": 8.513920508773499e-06, + "loss": 0.6966, + "step": 703 + }, + { + "epoch": 0.9820971867007673, + "grad_norm": 1.9359622556425784, + "learning_rate": 8.508134858948553e-06, + "loss": 0.6921, + "step": 704 + }, + { + "epoch": 0.9834922111136944, + "grad_norm": 1.9696782797857875, + "learning_rate": 8.502339942698463e-06, + "loss": 0.6635, + "step": 705 + }, + { + "epoch": 0.9848872355266217, + "grad_norm": 1.9677260443909579, + "learning_rate": 8.496535775329982e-06, + "loss": 0.7245, + "step": 706 + }, + { + "epoch": 0.986282259939549, + "grad_norm": 1.9675680167275715, + "learning_rate": 8.4907223721743e-06, + "loss": 0.6777, + "step": 707 + }, + { + "epoch": 0.9876772843524761, + "grad_norm": 1.806078447306801, + "learning_rate": 8.484899748587003e-06, + "loss": 0.6661, + "step": 708 + }, + { + "epoch": 0.9890723087654034, + "grad_norm": 2.2490305816466964, + "learning_rate": 8.479067919948032e-06, + "loss": 0.6972, + "step": 709 + }, + { + "epoch": 0.9904673331783306, + "grad_norm": 2.128480955160666, + "learning_rate": 8.473226901661643e-06, + "loss": 0.786, + "step": 710 + }, + { + "epoch": 0.9918623575912578, + "grad_norm": 2.085090723256057, + "learning_rate": 8.46737670915636e-06, + "loss": 0.6869, + "step": 711 + }, + { + "epoch": 0.9932573820041851, + "grad_norm": 2.0381152772038, + "learning_rate": 8.46151735788495e-06, + "loss": 0.6968, + "step": 712 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 1.8660889839328152, + "learning_rate": 8.455648863324364e-06, + "loss": 0.6421, + "step": 713 + }, + { + "epoch": 0.9960474308300395, + "grad_norm": 1.8767449139152448, + "learning_rate": 8.449771240975707e-06, + "loss": 0.6685, + "step": 714 + }, + { + "epoch": 0.9974424552429667, + "grad_norm": 1.9246273464410195, + "learning_rate": 8.443884506364192e-06, + "loss": 0.6835, + "step": 715 + }, + { + "epoch": 0.998837479655894, + "grad_norm": 1.99350094104789, + "learning_rate": 8.437988675039108e-06, + "loss": 0.6762, + "step": 716 + }, + { + "epoch": 1.0013950244129273, + "grad_norm": 2.7852354658837086, + "learning_rate": 8.432083762573761e-06, + "loss": 1.1719, + "step": 717 + }, + { + "epoch": 1.0027900488258545, + "grad_norm": 1.7585462074635738, + "learning_rate": 8.426169784565452e-06, + "loss": 0.4956, + "step": 718 + }, + { + "epoch": 1.0041850732387816, + "grad_norm": 1.8518154743606603, + "learning_rate": 8.420246756635431e-06, + "loss": 0.4464, + "step": 719 + }, + { + "epoch": 1.0055800976517089, + "grad_norm": 1.8843110874735378, + "learning_rate": 8.414314694428842e-06, + "loss": 0.4761, + "step": 720 + }, + { + "epoch": 1.0069751220646361, + "grad_norm": 1.6047121473200383, + "learning_rate": 8.408373613614699e-06, + "loss": 0.4887, + "step": 721 + }, + { + "epoch": 1.0083701464775634, + "grad_norm": 1.8784731067028053, + "learning_rate": 8.40242352988584e-06, + "loss": 0.4422, + "step": 722 + }, + { + "epoch": 1.0097651708904907, + "grad_norm": 1.8159678540316857, + "learning_rate": 8.396464458958876e-06, + "loss": 0.5311, + "step": 723 + }, + { + "epoch": 1.0111601953034177, + "grad_norm": 2.027238459653953, + "learning_rate": 8.390496416574166e-06, + "loss": 0.4802, + "step": 724 + }, + { + "epoch": 1.012555219716345, + "grad_norm": 2.179302244779955, + "learning_rate": 8.384519418495755e-06, + "loss": 0.5205, + "step": 725 + }, + { + "epoch": 1.0139502441292723, + "grad_norm": 2.175655456291187, + "learning_rate": 8.378533480511355e-06, + "loss": 0.4955, + "step": 726 + }, + { + "epoch": 1.0153452685421995, + "grad_norm": 2.8562208735960826, + "learning_rate": 8.372538618432282e-06, + "loss": 0.5112, + "step": 727 + }, + { + "epoch": 1.0167402929551268, + "grad_norm": 2.2651246091600314, + "learning_rate": 8.366534848093434e-06, + "loss": 0.5169, + "step": 728 + }, + { + "epoch": 1.0181353173680538, + "grad_norm": 2.2772660173459665, + "learning_rate": 8.360522185353234e-06, + "loss": 0.4812, + "step": 729 + }, + { + "epoch": 1.0195303417809811, + "grad_norm": 1.9713967680804616, + "learning_rate": 8.354500646093592e-06, + "loss": 0.4655, + "step": 730 + }, + { + "epoch": 1.0209253661939084, + "grad_norm": 2.22363796857583, + "learning_rate": 8.348470246219872e-06, + "loss": 0.5245, + "step": 731 + }, + { + "epoch": 1.0223203906068357, + "grad_norm": 2.018998971676804, + "learning_rate": 8.342431001660826e-06, + "loss": 0.4552, + "step": 732 + }, + { + "epoch": 1.023715415019763, + "grad_norm": 1.9417878883494086, + "learning_rate": 8.33638292836859e-06, + "loss": 0.4881, + "step": 733 + }, + { + "epoch": 1.02511043943269, + "grad_norm": 1.9746946629683426, + "learning_rate": 8.330326042318605e-06, + "loss": 0.5036, + "step": 734 + }, + { + "epoch": 1.0265054638456172, + "grad_norm": 1.8367303689092216, + "learning_rate": 8.324260359509594e-06, + "loss": 0.4848, + "step": 735 + }, + { + "epoch": 1.0279004882585445, + "grad_norm": 1.9065855234784623, + "learning_rate": 8.31818589596352e-06, + "loss": 0.4561, + "step": 736 + }, + { + "epoch": 1.0292955126714718, + "grad_norm": 2.203951507850641, + "learning_rate": 8.312102667725534e-06, + "loss": 0.5201, + "step": 737 + }, + { + "epoch": 1.030690537084399, + "grad_norm": 1.86322512488284, + "learning_rate": 8.306010690863943e-06, + "loss": 0.4636, + "step": 738 + }, + { + "epoch": 1.032085561497326, + "grad_norm": 2.021085507857238, + "learning_rate": 8.299909981470159e-06, + "loss": 0.4868, + "step": 739 + }, + { + "epoch": 1.0334805859102534, + "grad_norm": 2.0336313733189715, + "learning_rate": 8.29380055565866e-06, + "loss": 0.5011, + "step": 740 + }, + { + "epoch": 1.0348756103231807, + "grad_norm": 1.9700200483932437, + "learning_rate": 8.28768242956695e-06, + "loss": 0.4794, + "step": 741 + }, + { + "epoch": 1.036270634736108, + "grad_norm": 1.9333816097421348, + "learning_rate": 8.281555619355515e-06, + "loss": 0.5558, + "step": 742 + }, + { + "epoch": 1.0376656591490352, + "grad_norm": 2.1097816531163454, + "learning_rate": 8.275420141207775e-06, + "loss": 0.5026, + "step": 743 + }, + { + "epoch": 1.0390606835619622, + "grad_norm": 2.066539214548412, + "learning_rate": 8.269276011330048e-06, + "loss": 0.4581, + "step": 744 + }, + { + "epoch": 1.0404557079748895, + "grad_norm": 1.8956395492111373, + "learning_rate": 8.263123245951504e-06, + "loss": 0.5454, + "step": 745 + }, + { + "epoch": 1.0418507323878168, + "grad_norm": 2.100666149512502, + "learning_rate": 8.256961861324127e-06, + "loss": 0.4801, + "step": 746 + }, + { + "epoch": 1.043245756800744, + "grad_norm": 2.149566004009623, + "learning_rate": 8.250791873722662e-06, + "loss": 0.4692, + "step": 747 + }, + { + "epoch": 1.0446407812136713, + "grad_norm": 2.054274149085442, + "learning_rate": 8.244613299444581e-06, + "loss": 0.5404, + "step": 748 + }, + { + "epoch": 1.0460358056265984, + "grad_norm": 2.009479396810494, + "learning_rate": 8.238426154810035e-06, + "loss": 0.4521, + "step": 749 + }, + { + "epoch": 1.0474308300395256, + "grad_norm": 2.0004470029627464, + "learning_rate": 8.232230456161819e-06, + "loss": 0.4466, + "step": 750 + }, + { + "epoch": 1.048825854452453, + "grad_norm": 2.154526582868792, + "learning_rate": 8.226026219865313e-06, + "loss": 0.5059, + "step": 751 + }, + { + "epoch": 1.0502208788653802, + "grad_norm": 2.088904126217079, + "learning_rate": 8.219813462308458e-06, + "loss": 0.4749, + "step": 752 + }, + { + "epoch": 1.0516159032783075, + "grad_norm": 2.0692774381070125, + "learning_rate": 8.213592199901692e-06, + "loss": 0.4313, + "step": 753 + }, + { + "epoch": 1.0530109276912345, + "grad_norm": 1.9215415829687428, + "learning_rate": 8.207362449077932e-06, + "loss": 0.4707, + "step": 754 + }, + { + "epoch": 1.0544059521041618, + "grad_norm": 2.0945380875354225, + "learning_rate": 8.201124226292505e-06, + "loss": 0.4781, + "step": 755 + }, + { + "epoch": 1.055800976517089, + "grad_norm": 2.000101834591612, + "learning_rate": 8.19487754802312e-06, + "loss": 0.4688, + "step": 756 + }, + { + "epoch": 1.0571960009300163, + "grad_norm": 2.2202484065922876, + "learning_rate": 8.18862243076982e-06, + "loss": 0.4761, + "step": 757 + }, + { + "epoch": 1.0585910253429436, + "grad_norm": 2.0625752659923187, + "learning_rate": 8.18235889105494e-06, + "loss": 0.4315, + "step": 758 + }, + { + "epoch": 1.0599860497558706, + "grad_norm": 2.045140697582116, + "learning_rate": 8.17608694542306e-06, + "loss": 0.5018, + "step": 759 + }, + { + "epoch": 1.061381074168798, + "grad_norm": 2.003649970486099, + "learning_rate": 8.169806610440966e-06, + "loss": 0.4967, + "step": 760 + }, + { + "epoch": 1.0627760985817252, + "grad_norm": 2.1830296634359616, + "learning_rate": 8.163517902697602e-06, + "loss": 0.5049, + "step": 761 + }, + { + "epoch": 1.0641711229946524, + "grad_norm": 2.0816421018361995, + "learning_rate": 8.157220838804026e-06, + "loss": 0.5132, + "step": 762 + }, + { + "epoch": 1.0655661474075797, + "grad_norm": 2.084476758712473, + "learning_rate": 8.150915435393371e-06, + "loss": 0.4771, + "step": 763 + }, + { + "epoch": 1.0669611718205068, + "grad_norm": 1.9998410644432016, + "learning_rate": 8.1446017091208e-06, + "loss": 0.4261, + "step": 764 + }, + { + "epoch": 1.068356196233434, + "grad_norm": 1.9495671767021174, + "learning_rate": 8.138279676663458e-06, + "loss": 0.5264, + "step": 765 + }, + { + "epoch": 1.0697512206463613, + "grad_norm": 2.195184767639989, + "learning_rate": 8.131949354720425e-06, + "loss": 0.4366, + "step": 766 + }, + { + "epoch": 1.0711462450592886, + "grad_norm": 1.9821878881590826, + "learning_rate": 8.125610760012685e-06, + "loss": 0.4736, + "step": 767 + }, + { + "epoch": 1.0725412694722158, + "grad_norm": 1.9988670353991758, + "learning_rate": 8.11926390928307e-06, + "loss": 0.5174, + "step": 768 + }, + { + "epoch": 1.073936293885143, + "grad_norm": 2.0566037377417365, + "learning_rate": 8.112908819296217e-06, + "loss": 0.4676, + "step": 769 + }, + { + "epoch": 1.0753313182980702, + "grad_norm": 1.987767203894708, + "learning_rate": 8.106545506838533e-06, + "loss": 0.4954, + "step": 770 + }, + { + "epoch": 1.0767263427109974, + "grad_norm": 1.9333413400597803, + "learning_rate": 8.100173988718136e-06, + "loss": 0.4687, + "step": 771 + }, + { + "epoch": 1.0781213671239247, + "grad_norm": 2.0208241483857874, + "learning_rate": 8.093794281764824e-06, + "loss": 0.4971, + "step": 772 + }, + { + "epoch": 1.079516391536852, + "grad_norm": 2.1850495091404234, + "learning_rate": 8.087406402830026e-06, + "loss": 0.469, + "step": 773 + }, + { + "epoch": 1.080911415949779, + "grad_norm": 2.0322699994558384, + "learning_rate": 8.081010368786751e-06, + "loss": 0.4778, + "step": 774 + }, + { + "epoch": 1.0823064403627063, + "grad_norm": 1.9091269108666133, + "learning_rate": 8.074606196529554e-06, + "loss": 0.4675, + "step": 775 + }, + { + "epoch": 1.0837014647756336, + "grad_norm": 2.035468646541553, + "learning_rate": 8.068193902974482e-06, + "loss": 0.4726, + "step": 776 + }, + { + "epoch": 1.0850964891885608, + "grad_norm": 2.052286438982202, + "learning_rate": 8.06177350505904e-06, + "loss": 0.437, + "step": 777 + }, + { + "epoch": 1.086491513601488, + "grad_norm": 2.186255322231635, + "learning_rate": 8.055345019742133e-06, + "loss": 0.4578, + "step": 778 + }, + { + "epoch": 1.0878865380144151, + "grad_norm": 2.0869198110476237, + "learning_rate": 8.048908464004032e-06, + "loss": 0.5158, + "step": 779 + }, + { + "epoch": 1.0892815624273424, + "grad_norm": 2.2422828894640374, + "learning_rate": 8.042463854846325e-06, + "loss": 0.4455, + "step": 780 + }, + { + "epoch": 1.0906765868402697, + "grad_norm": 2.020266529668748, + "learning_rate": 8.036011209291872e-06, + "loss": 0.4682, + "step": 781 + }, + { + "epoch": 1.092071611253197, + "grad_norm": 2.1762604344777174, + "learning_rate": 8.029550544384758e-06, + "loss": 0.4544, + "step": 782 + }, + { + "epoch": 1.0934666356661242, + "grad_norm": 2.075854485507843, + "learning_rate": 8.023081877190257e-06, + "loss": 0.5303, + "step": 783 + }, + { + "epoch": 1.0948616600790513, + "grad_norm": 2.1957708758825385, + "learning_rate": 8.016605224794773e-06, + "loss": 0.4799, + "step": 784 + }, + { + "epoch": 1.0962566844919786, + "grad_norm": 2.047398963820383, + "learning_rate": 8.010120604305806e-06, + "loss": 0.465, + "step": 785 + }, + { + "epoch": 1.0976517089049058, + "grad_norm": 2.1167412676353714, + "learning_rate": 8.003628032851904e-06, + "loss": 0.4773, + "step": 786 + }, + { + "epoch": 1.099046733317833, + "grad_norm": 2.05543114145942, + "learning_rate": 7.997127527582613e-06, + "loss": 0.4638, + "step": 787 + }, + { + "epoch": 1.1004417577307604, + "grad_norm": 2.5883330126395183, + "learning_rate": 7.990619105668437e-06, + "loss": 0.4429, + "step": 788 + }, + { + "epoch": 1.1018367821436876, + "grad_norm": 1.7471444573255699, + "learning_rate": 7.984102784300794e-06, + "loss": 0.4823, + "step": 789 + }, + { + "epoch": 1.1032318065566147, + "grad_norm": 2.0341688631816552, + "learning_rate": 7.977578580691963e-06, + "loss": 0.4912, + "step": 790 + }, + { + "epoch": 1.104626830969542, + "grad_norm": 1.9309193576466954, + "learning_rate": 7.971046512075047e-06, + "loss": 0.4465, + "step": 791 + }, + { + "epoch": 1.1060218553824692, + "grad_norm": 2.073755962027876, + "learning_rate": 7.964506595703923e-06, + "loss": 0.456, + "step": 792 + }, + { + "epoch": 1.1074168797953965, + "grad_norm": 1.9513764017619648, + "learning_rate": 7.957958848853193e-06, + "loss": 0.5207, + "step": 793 + }, + { + "epoch": 1.1088119042083235, + "grad_norm": 2.210555465361039, + "learning_rate": 7.95140328881815e-06, + "loss": 0.5029, + "step": 794 + }, + { + "epoch": 1.1102069286212508, + "grad_norm": 2.0145979377685475, + "learning_rate": 7.944839932914718e-06, + "loss": 0.4881, + "step": 795 + }, + { + "epoch": 1.111601953034178, + "grad_norm": 2.1886552216768784, + "learning_rate": 7.938268798479419e-06, + "loss": 0.4801, + "step": 796 + }, + { + "epoch": 1.1129969774471054, + "grad_norm": 1.9143808865966454, + "learning_rate": 7.931689902869314e-06, + "loss": 0.4987, + "step": 797 + }, + { + "epoch": 1.1143920018600326, + "grad_norm": 2.229623106539157, + "learning_rate": 7.925103263461971e-06, + "loss": 0.4978, + "step": 798 + }, + { + "epoch": 1.11578702627296, + "grad_norm": 1.9414732070488792, + "learning_rate": 7.91850889765541e-06, + "loss": 0.458, + "step": 799 + }, + { + "epoch": 1.117182050685887, + "grad_norm": 2.092745745628236, + "learning_rate": 7.91190682286806e-06, + "loss": 0.4389, + "step": 800 + }, + { + "epoch": 1.1185770750988142, + "grad_norm": 2.123397220956053, + "learning_rate": 7.905297056538713e-06, + "loss": 0.4635, + "step": 801 + }, + { + "epoch": 1.1199720995117415, + "grad_norm": 1.968518140063321, + "learning_rate": 7.898679616126474e-06, + "loss": 0.4799, + "step": 802 + }, + { + "epoch": 1.1213671239246688, + "grad_norm": 1.9781979017384825, + "learning_rate": 7.892054519110726e-06, + "loss": 0.4876, + "step": 803 + }, + { + "epoch": 1.1227621483375958, + "grad_norm": 2.1208452802156725, + "learning_rate": 7.885421782991064e-06, + "loss": 0.5135, + "step": 804 + }, + { + "epoch": 1.124157172750523, + "grad_norm": 2.0090102765020954, + "learning_rate": 7.878781425287277e-06, + "loss": 0.4445, + "step": 805 + }, + { + "epoch": 1.1255521971634503, + "grad_norm": 1.9927344582343813, + "learning_rate": 7.872133463539274e-06, + "loss": 0.4357, + "step": 806 + }, + { + "epoch": 1.1269472215763776, + "grad_norm": 2.0716300721802665, + "learning_rate": 7.86547791530705e-06, + "loss": 0.5389, + "step": 807 + }, + { + "epoch": 1.1283422459893049, + "grad_norm": 1.9819888932135203, + "learning_rate": 7.858814798170644e-06, + "loss": 0.492, + "step": 808 + }, + { + "epoch": 1.1297372704022322, + "grad_norm": 2.0463707519984218, + "learning_rate": 7.852144129730087e-06, + "loss": 0.4736, + "step": 809 + }, + { + "epoch": 1.1311322948151592, + "grad_norm": 2.1975286753620877, + "learning_rate": 7.84546592760535e-06, + "loss": 0.4879, + "step": 810 + }, + { + "epoch": 1.1325273192280865, + "grad_norm": 1.911921932483923, + "learning_rate": 7.83878020943631e-06, + "loss": 0.5166, + "step": 811 + }, + { + "epoch": 1.1339223436410137, + "grad_norm": 2.1256753078627764, + "learning_rate": 7.832086992882697e-06, + "loss": 0.4841, + "step": 812 + }, + { + "epoch": 1.135317368053941, + "grad_norm": 1.8043220542453693, + "learning_rate": 7.825386295624043e-06, + "loss": 0.4479, + "step": 813 + }, + { + "epoch": 1.136712392466868, + "grad_norm": 2.027151075919505, + "learning_rate": 7.818678135359641e-06, + "loss": 0.4979, + "step": 814 + }, + { + "epoch": 1.1381074168797953, + "grad_norm": 2.0578677546704607, + "learning_rate": 7.811962529808499e-06, + "loss": 0.4757, + "step": 815 + }, + { + "epoch": 1.1395024412927226, + "grad_norm": 2.0604192529558656, + "learning_rate": 7.805239496709291e-06, + "loss": 0.4531, + "step": 816 + }, + { + "epoch": 1.1408974657056499, + "grad_norm": 1.94855850047, + "learning_rate": 7.798509053820305e-06, + "loss": 0.4417, + "step": 817 + }, + { + "epoch": 1.1422924901185771, + "grad_norm": 2.023024363909115, + "learning_rate": 7.79177121891941e-06, + "loss": 0.4615, + "step": 818 + }, + { + "epoch": 1.1436875145315044, + "grad_norm": 2.106362451480915, + "learning_rate": 7.785026009803993e-06, + "loss": 0.503, + "step": 819 + }, + { + "epoch": 1.1450825389444315, + "grad_norm": 2.270663758773595, + "learning_rate": 7.778273444290921e-06, + "loss": 0.4791, + "step": 820 + }, + { + "epoch": 1.1464775633573587, + "grad_norm": 2.1367829318763647, + "learning_rate": 7.771513540216496e-06, + "loss": 0.4937, + "step": 821 + }, + { + "epoch": 1.147872587770286, + "grad_norm": 2.1680686104730755, + "learning_rate": 7.764746315436399e-06, + "loss": 0.4543, + "step": 822 + }, + { + "epoch": 1.1492676121832133, + "grad_norm": 2.0947572237591454, + "learning_rate": 7.75797178782565e-06, + "loss": 0.4529, + "step": 823 + }, + { + "epoch": 1.1506626365961403, + "grad_norm": 1.7934788611254668, + "learning_rate": 7.751189975278561e-06, + "loss": 0.4068, + "step": 824 + }, + { + "epoch": 1.1520576610090676, + "grad_norm": 2.0307708711760895, + "learning_rate": 7.744400895708683e-06, + "loss": 0.4659, + "step": 825 + }, + { + "epoch": 1.1534526854219949, + "grad_norm": 2.061646896391218, + "learning_rate": 7.737604567048766e-06, + "loss": 0.5493, + "step": 826 + }, + { + "epoch": 1.1548477098349221, + "grad_norm": 2.193247299615853, + "learning_rate": 7.730801007250704e-06, + "loss": 0.4671, + "step": 827 + }, + { + "epoch": 1.1562427342478494, + "grad_norm": 2.033168768238963, + "learning_rate": 7.72399023428549e-06, + "loss": 0.4642, + "step": 828 + }, + { + "epoch": 1.1576377586607767, + "grad_norm": 1.921507053967542, + "learning_rate": 7.717172266143178e-06, + "loss": 0.5365, + "step": 829 + }, + { + "epoch": 1.1590327830737037, + "grad_norm": 2.1786658189049835, + "learning_rate": 7.710347120832821e-06, + "loss": 0.4573, + "step": 830 + }, + { + "epoch": 1.160427807486631, + "grad_norm": 2.04271905830797, + "learning_rate": 7.703514816382432e-06, + "loss": 0.4397, + "step": 831 + }, + { + "epoch": 1.1618228318995583, + "grad_norm": 2.010020828781065, + "learning_rate": 7.696675370838929e-06, + "loss": 0.454, + "step": 832 + }, + { + "epoch": 1.1632178563124855, + "grad_norm": 2.0652609343361292, + "learning_rate": 7.689828802268102e-06, + "loss": 0.4565, + "step": 833 + }, + { + "epoch": 1.1646128807254126, + "grad_norm": 1.8081504823167087, + "learning_rate": 7.682975128754548e-06, + "loss": 0.4496, + "step": 834 + }, + { + "epoch": 1.1660079051383399, + "grad_norm": 2.073580143759261, + "learning_rate": 7.676114368401635e-06, + "loss": 0.5335, + "step": 835 + }, + { + "epoch": 1.1674029295512671, + "grad_norm": 2.146815116325766, + "learning_rate": 7.66924653933145e-06, + "loss": 0.4867, + "step": 836 + }, + { + "epoch": 1.1687979539641944, + "grad_norm": 2.0789485213664145, + "learning_rate": 7.662371659684749e-06, + "loss": 0.5105, + "step": 837 + }, + { + "epoch": 1.1701929783771217, + "grad_norm": 2.0786101027949915, + "learning_rate": 7.655489747620913e-06, + "loss": 0.4726, + "step": 838 + }, + { + "epoch": 1.171588002790049, + "grad_norm": 2.021038432128547, + "learning_rate": 7.648600821317901e-06, + "loss": 0.4993, + "step": 839 + }, + { + "epoch": 1.172983027202976, + "grad_norm": 2.1039185200269848, + "learning_rate": 7.641704898972194e-06, + "loss": 0.4922, + "step": 840 + }, + { + "epoch": 1.1743780516159033, + "grad_norm": 2.1495345339138328, + "learning_rate": 7.634801998798755e-06, + "loss": 0.5102, + "step": 841 + }, + { + "epoch": 1.1757730760288305, + "grad_norm": 2.0719811459725057, + "learning_rate": 7.6278921390309834e-06, + "loss": 0.4355, + "step": 842 + }, + { + "epoch": 1.1771681004417578, + "grad_norm": 1.8630425662831487, + "learning_rate": 7.620975337920653e-06, + "loss": 0.4703, + "step": 843 + }, + { + "epoch": 1.1785631248546848, + "grad_norm": 2.046791765467504, + "learning_rate": 7.6140516137378786e-06, + "loss": 0.4223, + "step": 844 + }, + { + "epoch": 1.179958149267612, + "grad_norm": 1.9132364567899665, + "learning_rate": 7.607120984771058e-06, + "loss": 0.4596, + "step": 845 + }, + { + "epoch": 1.1813531736805394, + "grad_norm": 1.9802117029437554, + "learning_rate": 7.600183469326829e-06, + "loss": 0.4659, + "step": 846 + }, + { + "epoch": 1.1827481980934667, + "grad_norm": 2.025431606531357, + "learning_rate": 7.593239085730022e-06, + "loss": 0.5016, + "step": 847 + }, + { + "epoch": 1.184143222506394, + "grad_norm": 2.368669248836562, + "learning_rate": 7.586287852323605e-06, + "loss": 0.5117, + "step": 848 + }, + { + "epoch": 1.1855382469193212, + "grad_norm": 2.100026020486569, + "learning_rate": 7.579329787468639e-06, + "loss": 0.4535, + "step": 849 + }, + { + "epoch": 1.1869332713322482, + "grad_norm": 2.0663388575312056, + "learning_rate": 7.572364909544235e-06, + "loss": 0.4993, + "step": 850 + }, + { + "epoch": 1.1883282957451755, + "grad_norm": 1.884056547062916, + "learning_rate": 7.565393236947494e-06, + "loss": 0.4346, + "step": 851 + }, + { + "epoch": 1.1897233201581028, + "grad_norm": 2.1776882352748728, + "learning_rate": 7.558414788093467e-06, + "loss": 0.4755, + "step": 852 + }, + { + "epoch": 1.19111834457103, + "grad_norm": 2.061273986301572, + "learning_rate": 7.551429581415104e-06, + "loss": 0.5118, + "step": 853 + }, + { + "epoch": 1.192513368983957, + "grad_norm": 2.1865289893443527, + "learning_rate": 7.5444376353632064e-06, + "loss": 0.4577, + "step": 854 + }, + { + "epoch": 1.1939083933968844, + "grad_norm": 1.8247742290026874, + "learning_rate": 7.537438968406372e-06, + "loss": 0.4904, + "step": 855 + }, + { + "epoch": 1.1953034178098116, + "grad_norm": 2.1078079523273887, + "learning_rate": 7.530433599030962e-06, + "loss": 0.4653, + "step": 856 + }, + { + "epoch": 1.196698442222739, + "grad_norm": 2.1391503263974045, + "learning_rate": 7.5234215457410255e-06, + "loss": 0.5067, + "step": 857 + }, + { + "epoch": 1.1980934666356662, + "grad_norm": 2.2368165670079354, + "learning_rate": 7.516402827058283e-06, + "loss": 0.5353, + "step": 858 + }, + { + "epoch": 1.1994884910485935, + "grad_norm": 1.931059913532765, + "learning_rate": 7.509377461522049e-06, + "loss": 0.4786, + "step": 859 + }, + { + "epoch": 1.2008835154615205, + "grad_norm": 2.330043518759979, + "learning_rate": 7.502345467689202e-06, + "loss": 0.5049, + "step": 860 + }, + { + "epoch": 1.2022785398744478, + "grad_norm": 2.1386669121489437, + "learning_rate": 7.4953068641341255e-06, + "loss": 0.4504, + "step": 861 + }, + { + "epoch": 1.203673564287375, + "grad_norm": 2.1135532365263847, + "learning_rate": 7.488261669448662e-06, + "loss": 0.5204, + "step": 862 + }, + { + "epoch": 1.2050685887003023, + "grad_norm": 2.0651469477865816, + "learning_rate": 7.4812099022420636e-06, + "loss": 0.4627, + "step": 863 + }, + { + "epoch": 1.2064636131132294, + "grad_norm": 2.034159070644421, + "learning_rate": 7.474151581140947e-06, + "loss": 0.4753, + "step": 864 + }, + { + "epoch": 1.2078586375261566, + "grad_norm": 2.145642949505401, + "learning_rate": 7.4670867247892346e-06, + "loss": 0.4714, + "step": 865 + }, + { + "epoch": 1.209253661939084, + "grad_norm": 1.8618981108225734, + "learning_rate": 7.460015351848115e-06, + "loss": 0.4818, + "step": 866 + }, + { + "epoch": 1.2106486863520112, + "grad_norm": 2.0821388794136215, + "learning_rate": 7.4529374809959895e-06, + "loss": 0.4179, + "step": 867 + }, + { + "epoch": 1.2120437107649384, + "grad_norm": 1.944351950216618, + "learning_rate": 7.445853130928422e-06, + "loss": 0.5009, + "step": 868 + }, + { + "epoch": 1.2134387351778657, + "grad_norm": 2.2608731884977824, + "learning_rate": 7.438762320358089e-06, + "loss": 0.5172, + "step": 869 + }, + { + "epoch": 1.2148337595907928, + "grad_norm": 2.046899982267453, + "learning_rate": 7.431665068014737e-06, + "loss": 0.4701, + "step": 870 + }, + { + "epoch": 1.21622878400372, + "grad_norm": 1.9959592229395409, + "learning_rate": 7.424561392645122e-06, + "loss": 0.4717, + "step": 871 + }, + { + "epoch": 1.2176238084166473, + "grad_norm": 2.0838483616097827, + "learning_rate": 7.417451313012971e-06, + "loss": 0.49, + "step": 872 + }, + { + "epoch": 1.2190188328295746, + "grad_norm": 2.19031892792901, + "learning_rate": 7.410334847898921e-06, + "loss": 0.5118, + "step": 873 + }, + { + "epoch": 1.2204138572425016, + "grad_norm": 2.0327900333900923, + "learning_rate": 7.403212016100484e-06, + "loss": 0.4625, + "step": 874 + }, + { + "epoch": 1.221808881655429, + "grad_norm": 2.0196003750349183, + "learning_rate": 7.396082836431981e-06, + "loss": 0.4624, + "step": 875 + }, + { + "epoch": 1.2232039060683562, + "grad_norm": 2.043291413010201, + "learning_rate": 7.388947327724506e-06, + "loss": 0.4511, + "step": 876 + }, + { + "epoch": 1.2245989304812834, + "grad_norm": 2.0166904136766854, + "learning_rate": 7.3818055088258676e-06, + "loss": 0.4653, + "step": 877 + }, + { + "epoch": 1.2259939548942107, + "grad_norm": 2.172035909388715, + "learning_rate": 7.374657398600542e-06, + "loss": 0.4502, + "step": 878 + }, + { + "epoch": 1.227388979307138, + "grad_norm": 2.0085633602200015, + "learning_rate": 7.367503015929627e-06, + "loss": 0.4665, + "step": 879 + }, + { + "epoch": 1.228784003720065, + "grad_norm": 2.0313790988567075, + "learning_rate": 7.3603423797107845e-06, + "loss": 0.4512, + "step": 880 + }, + { + "epoch": 1.2301790281329923, + "grad_norm": 2.1088602216689663, + "learning_rate": 7.353175508858195e-06, + "loss": 0.4963, + "step": 881 + }, + { + "epoch": 1.2315740525459196, + "grad_norm": 2.222624875679751, + "learning_rate": 7.3460024223025095e-06, + "loss": 0.509, + "step": 882 + }, + { + "epoch": 1.2329690769588468, + "grad_norm": 1.678805125515682, + "learning_rate": 7.338823138990796e-06, + "loss": 0.4363, + "step": 883 + }, + { + "epoch": 1.234364101371774, + "grad_norm": 2.0825313015032143, + "learning_rate": 7.33163767788649e-06, + "loss": 0.4606, + "step": 884 + }, + { + "epoch": 1.2357591257847012, + "grad_norm": 2.008078631188548, + "learning_rate": 7.324446057969346e-06, + "loss": 0.5088, + "step": 885 + }, + { + "epoch": 1.2371541501976284, + "grad_norm": 2.120130042894298, + "learning_rate": 7.317248298235387e-06, + "loss": 0.4908, + "step": 886 + }, + { + "epoch": 1.2385491746105557, + "grad_norm": 2.026566890552797, + "learning_rate": 7.3100444176968514e-06, + "loss": 0.4453, + "step": 887 + }, + { + "epoch": 1.239944199023483, + "grad_norm": 2.16601301998483, + "learning_rate": 7.302834435382147e-06, + "loss": 0.5485, + "step": 888 + }, + { + "epoch": 1.2413392234364102, + "grad_norm": 2.140553776744054, + "learning_rate": 7.2956183703358e-06, + "loss": 0.4321, + "step": 889 + }, + { + "epoch": 1.2427342478493373, + "grad_norm": 1.9128745400529434, + "learning_rate": 7.288396241618401e-06, + "loss": 0.4705, + "step": 890 + }, + { + "epoch": 1.2441292722622646, + "grad_norm": 2.1228545685375377, + "learning_rate": 7.281168068306559e-06, + "loss": 0.5109, + "step": 891 + }, + { + "epoch": 1.2455242966751918, + "grad_norm": 1.9958676358218619, + "learning_rate": 7.2739338694928485e-06, + "loss": 0.4264, + "step": 892 + }, + { + "epoch": 1.246919321088119, + "grad_norm": 2.017602542711396, + "learning_rate": 7.266693664285761e-06, + "loss": 0.4858, + "step": 893 + }, + { + "epoch": 1.2483143455010464, + "grad_norm": 2.1655539794588843, + "learning_rate": 7.259447471809651e-06, + "loss": 0.4718, + "step": 894 + }, + { + "epoch": 1.2497093699139734, + "grad_norm": 1.8961156639730357, + "learning_rate": 7.252195311204689e-06, + "loss": 0.4976, + "step": 895 + }, + { + "epoch": 1.2511043943269007, + "grad_norm": 2.2453477317683594, + "learning_rate": 7.244937201626812e-06, + "loss": 0.5222, + "step": 896 + }, + { + "epoch": 1.252499418739828, + "grad_norm": 2.127532832546729, + "learning_rate": 7.237673162247667e-06, + "loss": 0.5229, + "step": 897 + }, + { + "epoch": 1.2538944431527552, + "grad_norm": 2.1894133849734203, + "learning_rate": 7.230403212254566e-06, + "loss": 0.4418, + "step": 898 + }, + { + "epoch": 1.2552894675656825, + "grad_norm": 2.0080709918299755, + "learning_rate": 7.223127370850433e-06, + "loss": 0.4867, + "step": 899 + }, + { + "epoch": 1.2566844919786098, + "grad_norm": 2.08931797506218, + "learning_rate": 7.215845657253755e-06, + "loss": 0.4997, + "step": 900 + }, + { + "epoch": 1.2580795163915368, + "grad_norm": 2.119270394002083, + "learning_rate": 7.208558090698528e-06, + "loss": 0.4672, + "step": 901 + }, + { + "epoch": 1.259474540804464, + "grad_norm": 2.0460383698950912, + "learning_rate": 7.2012646904342065e-06, + "loss": 0.5123, + "step": 902 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 2.1384976321501177, + "learning_rate": 7.193965475725659e-06, + "loss": 0.4686, + "step": 903 + }, + { + "epoch": 1.2622645896303184, + "grad_norm": 1.9220601194054647, + "learning_rate": 7.186660465853111e-06, + "loss": 0.4803, + "step": 904 + }, + { + "epoch": 1.2636596140432457, + "grad_norm": 2.0262630040398917, + "learning_rate": 7.1793496801120885e-06, + "loss": 0.472, + "step": 905 + }, + { + "epoch": 1.265054638456173, + "grad_norm": 2.0585200478781482, + "learning_rate": 7.172033137813387e-06, + "loss": 0.4805, + "step": 906 + }, + { + "epoch": 1.2664496628691002, + "grad_norm": 2.2127511883039315, + "learning_rate": 7.1647108582829924e-06, + "loss": 0.5139, + "step": 907 + }, + { + "epoch": 1.2678446872820275, + "grad_norm": 2.1318837737050584, + "learning_rate": 7.157382860862059e-06, + "loss": 0.4948, + "step": 908 + }, + { + "epoch": 1.2692397116949548, + "grad_norm": 2.0404146831089727, + "learning_rate": 7.1500491649068345e-06, + "loss": 0.4759, + "step": 909 + }, + { + "epoch": 1.270634736107882, + "grad_norm": 2.16885361299071, + "learning_rate": 7.1427097897886225e-06, + "loss": 0.4651, + "step": 910 + }, + { + "epoch": 1.272029760520809, + "grad_norm": 1.8126562233953132, + "learning_rate": 7.135364754893729e-06, + "loss": 0.4827, + "step": 911 + }, + { + "epoch": 1.2734247849337363, + "grad_norm": 2.037530800408793, + "learning_rate": 7.128014079623408e-06, + "loss": 0.471, + "step": 912 + }, + { + "epoch": 1.2748198093466636, + "grad_norm": 2.113312788616271, + "learning_rate": 7.120657783393809e-06, + "loss": 0.4536, + "step": 913 + }, + { + "epoch": 1.2762148337595907, + "grad_norm": 1.9683652942609144, + "learning_rate": 7.113295885635936e-06, + "loss": 0.4603, + "step": 914 + }, + { + "epoch": 1.277609858172518, + "grad_norm": 2.0398678179326266, + "learning_rate": 7.105928405795584e-06, + "loss": 0.53, + "step": 915 + }, + { + "epoch": 1.2790048825854452, + "grad_norm": 2.0550203014189155, + "learning_rate": 7.098555363333289e-06, + "loss": 0.4635, + "step": 916 + }, + { + "epoch": 1.2803999069983725, + "grad_norm": 2.0420956994823833, + "learning_rate": 7.091176777724291e-06, + "loss": 0.4923, + "step": 917 + }, + { + "epoch": 1.2817949314112997, + "grad_norm": 2.122118367593062, + "learning_rate": 7.083792668458463e-06, + "loss": 0.4572, + "step": 918 + }, + { + "epoch": 1.283189955824227, + "grad_norm": 1.9605764589538413, + "learning_rate": 7.076403055040271e-06, + "loss": 0.4613, + "step": 919 + }, + { + "epoch": 1.2845849802371543, + "grad_norm": 1.994014351784954, + "learning_rate": 7.069007956988718e-06, + "loss": 0.4917, + "step": 920 + }, + { + "epoch": 1.2859800046500813, + "grad_norm": 2.0322178140926526, + "learning_rate": 7.061607393837295e-06, + "loss": 0.5037, + "step": 921 + }, + { + "epoch": 1.2873750290630086, + "grad_norm": 2.0501993691106892, + "learning_rate": 7.0542013851339316e-06, + "loss": 0.4111, + "step": 922 + }, + { + "epoch": 1.2887700534759359, + "grad_norm": 1.9055682007118293, + "learning_rate": 7.04678995044094e-06, + "loss": 0.4402, + "step": 923 + }, + { + "epoch": 1.290165077888863, + "grad_norm": 1.9513266676688439, + "learning_rate": 7.039373109334957e-06, + "loss": 0.4794, + "step": 924 + }, + { + "epoch": 1.2915601023017902, + "grad_norm": 2.0144597802151134, + "learning_rate": 7.031950881406913e-06, + "loss": 0.4823, + "step": 925 + }, + { + "epoch": 1.2929551267147175, + "grad_norm": 2.100982200223343, + "learning_rate": 7.024523286261959e-06, + "loss": 0.448, + "step": 926 + }, + { + "epoch": 1.2943501511276447, + "grad_norm": 1.9455540836634895, + "learning_rate": 7.017090343519421e-06, + "loss": 0.4824, + "step": 927 + }, + { + "epoch": 1.295745175540572, + "grad_norm": 2.1126530723502595, + "learning_rate": 7.009652072812758e-06, + "loss": 0.4473, + "step": 928 + }, + { + "epoch": 1.2971401999534993, + "grad_norm": 1.9516374387293445, + "learning_rate": 7.0022084937895e-06, + "loss": 0.4934, + "step": 929 + }, + { + "epoch": 1.2985352243664265, + "grad_norm": 2.160226598724437, + "learning_rate": 6.994759626111189e-06, + "loss": 0.5027, + "step": 930 + }, + { + "epoch": 1.2999302487793536, + "grad_norm": 2.149165455764662, + "learning_rate": 6.987305489453352e-06, + "loss": 0.532, + "step": 931 + }, + { + "epoch": 1.3013252731922809, + "grad_norm": 2.169198580033365, + "learning_rate": 6.979846103505423e-06, + "loss": 0.496, + "step": 932 + }, + { + "epoch": 1.3027202976052081, + "grad_norm": 1.9709006258399464, + "learning_rate": 6.972381487970702e-06, + "loss": 0.4476, + "step": 933 + }, + { + "epoch": 1.3041153220181352, + "grad_norm": 2.080456063544832, + "learning_rate": 6.964911662566309e-06, + "loss": 0.517, + "step": 934 + }, + { + "epoch": 1.3055103464310625, + "grad_norm": 2.1878295005029407, + "learning_rate": 6.957436647023117e-06, + "loss": 0.492, + "step": 935 + }, + { + "epoch": 1.3069053708439897, + "grad_norm": 1.9473985278522523, + "learning_rate": 6.949956461085714e-06, + "loss": 0.4913, + "step": 936 + }, + { + "epoch": 1.308300395256917, + "grad_norm": 2.085237018377905, + "learning_rate": 6.942471124512346e-06, + "loss": 0.4729, + "step": 937 + }, + { + "epoch": 1.3096954196698443, + "grad_norm": 2.0171640171405705, + "learning_rate": 6.934980657074859e-06, + "loss": 0.4186, + "step": 938 + }, + { + "epoch": 1.3110904440827715, + "grad_norm": 1.9628946990758591, + "learning_rate": 6.9274850785586526e-06, + "loss": 0.482, + "step": 939 + }, + { + "epoch": 1.3124854684956988, + "grad_norm": 1.9230193122254409, + "learning_rate": 6.919984408762632e-06, + "loss": 0.4612, + "step": 940 + }, + { + "epoch": 1.3138804929086259, + "grad_norm": 2.009734075109168, + "learning_rate": 6.9124786674991465e-06, + "loss": 0.4674, + "step": 941 + }, + { + "epoch": 1.3152755173215531, + "grad_norm": 1.9403798007263833, + "learning_rate": 6.90496787459394e-06, + "loss": 0.4751, + "step": 942 + }, + { + "epoch": 1.3166705417344804, + "grad_norm": 2.0729815313209063, + "learning_rate": 6.897452049886103e-06, + "loss": 0.4748, + "step": 943 + }, + { + "epoch": 1.3180655661474074, + "grad_norm": 1.8042882335697186, + "learning_rate": 6.889931213228015e-06, + "loss": 0.5374, + "step": 944 + }, + { + "epoch": 1.3194605905603347, + "grad_norm": 2.1582377635460706, + "learning_rate": 6.882405384485294e-06, + "loss": 0.4967, + "step": 945 + }, + { + "epoch": 1.320855614973262, + "grad_norm": 2.0774574203867053, + "learning_rate": 6.874874583536748e-06, + "loss": 0.4928, + "step": 946 + }, + { + "epoch": 1.3222506393861893, + "grad_norm": 2.1126118556498525, + "learning_rate": 6.867338830274312e-06, + "loss": 0.5296, + "step": 947 + }, + { + "epoch": 1.3236456637991165, + "grad_norm": 2.2327182347799956, + "learning_rate": 6.8597981446030095e-06, + "loss": 0.5252, + "step": 948 + }, + { + "epoch": 1.3250406882120438, + "grad_norm": 2.0860141510662964, + "learning_rate": 6.852252546440885e-06, + "loss": 0.4903, + "step": 949 + }, + { + "epoch": 1.326435712624971, + "grad_norm": 2.044140821899386, + "learning_rate": 6.844702055718964e-06, + "loss": 0.4587, + "step": 950 + }, + { + "epoch": 1.3278307370378981, + "grad_norm": 1.9786800376206937, + "learning_rate": 6.837146692381197e-06, + "loss": 0.4675, + "step": 951 + }, + { + "epoch": 1.3292257614508254, + "grad_norm": 1.9825281201744271, + "learning_rate": 6.8295864763843965e-06, + "loss": 0.5127, + "step": 952 + }, + { + "epoch": 1.3306207858637527, + "grad_norm": 2.0986306954363227, + "learning_rate": 6.822021427698201e-06, + "loss": 0.4522, + "step": 953 + }, + { + "epoch": 1.33201581027668, + "grad_norm": 2.1261538900308747, + "learning_rate": 6.814451566305014e-06, + "loss": 0.5263, + "step": 954 + }, + { + "epoch": 1.333410834689607, + "grad_norm": 2.2448289680883855, + "learning_rate": 6.806876912199945e-06, + "loss": 0.4962, + "step": 955 + }, + { + "epoch": 1.3348058591025342, + "grad_norm": 2.14005220926737, + "learning_rate": 6.7992974853907655e-06, + "loss": 0.4844, + "step": 956 + }, + { + "epoch": 1.3362008835154615, + "grad_norm": 2.094501275219526, + "learning_rate": 6.791713305897861e-06, + "loss": 0.4773, + "step": 957 + }, + { + "epoch": 1.3375959079283888, + "grad_norm": 1.8776892871093087, + "learning_rate": 6.78412439375416e-06, + "loss": 0.4769, + "step": 958 + }, + { + "epoch": 1.338990932341316, + "grad_norm": 2.0257109173056476, + "learning_rate": 6.776530769005099e-06, + "loss": 0.4574, + "step": 959 + }, + { + "epoch": 1.3403859567542433, + "grad_norm": 2.011593652147381, + "learning_rate": 6.768932451708557e-06, + "loss": 0.4733, + "step": 960 + }, + { + "epoch": 1.3417809811671704, + "grad_norm": 2.0880918905336867, + "learning_rate": 6.761329461934814e-06, + "loss": 0.4717, + "step": 961 + }, + { + "epoch": 1.3431760055800976, + "grad_norm": 2.0756463467401467, + "learning_rate": 6.753721819766489e-06, + "loss": 0.4956, + "step": 962 + }, + { + "epoch": 1.344571029993025, + "grad_norm": 2.1951630126685346, + "learning_rate": 6.746109545298488e-06, + "loss": 0.4821, + "step": 963 + }, + { + "epoch": 1.3459660544059522, + "grad_norm": 2.076217947734719, + "learning_rate": 6.738492658637957e-06, + "loss": 0.4408, + "step": 964 + }, + { + "epoch": 1.3473610788188792, + "grad_norm": 1.8012492184873017, + "learning_rate": 6.730871179904218e-06, + "loss": 0.5316, + "step": 965 + }, + { + "epoch": 1.3487561032318065, + "grad_norm": 2.0662235372305093, + "learning_rate": 6.723245129228732e-06, + "loss": 0.4425, + "step": 966 + }, + { + "epoch": 1.3501511276447338, + "grad_norm": 2.1510994544097723, + "learning_rate": 6.7156145267550275e-06, + "loss": 0.47, + "step": 967 + }, + { + "epoch": 1.351546152057661, + "grad_norm": 2.06829815044842, + "learning_rate": 6.707979392638663e-06, + "loss": 0.4589, + "step": 968 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 2.0138794871206858, + "learning_rate": 6.700339747047162e-06, + "loss": 0.489, + "step": 969 + }, + { + "epoch": 1.3543362008835156, + "grad_norm": 2.041938633849983, + "learning_rate": 6.692695610159966e-06, + "loss": 0.481, + "step": 970 + }, + { + "epoch": 1.3557312252964426, + "grad_norm": 1.9340098627616258, + "learning_rate": 6.685047002168382e-06, + "loss": 0.4518, + "step": 971 + }, + { + "epoch": 1.35712624970937, + "grad_norm": 2.0444945166428288, + "learning_rate": 6.677393943275525e-06, + "loss": 0.4652, + "step": 972 + }, + { + "epoch": 1.3585212741222972, + "grad_norm": 2.0057051447255527, + "learning_rate": 6.669736453696266e-06, + "loss": 0.4662, + "step": 973 + }, + { + "epoch": 1.3599162985352244, + "grad_norm": 2.099453331432414, + "learning_rate": 6.66207455365718e-06, + "loss": 0.4565, + "step": 974 + }, + { + "epoch": 1.3613113229481515, + "grad_norm": 1.9165876489537292, + "learning_rate": 6.6544082633964955e-06, + "loss": 0.4398, + "step": 975 + }, + { + "epoch": 1.3627063473610788, + "grad_norm": 2.135272106746183, + "learning_rate": 6.646737603164031e-06, + "loss": 0.5094, + "step": 976 + }, + { + "epoch": 1.364101371774006, + "grad_norm": 2.139613224778207, + "learning_rate": 6.639062593221152e-06, + "loss": 0.5341, + "step": 977 + }, + { + "epoch": 1.3654963961869333, + "grad_norm": 2.530233359548065, + "learning_rate": 6.6313832538407106e-06, + "loss": 0.4969, + "step": 978 + }, + { + "epoch": 1.3668914205998606, + "grad_norm": 1.974541177446713, + "learning_rate": 6.623699605306999e-06, + "loss": 0.4917, + "step": 979 + }, + { + "epoch": 1.3682864450127878, + "grad_norm": 2.1328867084448313, + "learning_rate": 6.6160116679156874e-06, + "loss": 0.4667, + "step": 980 + }, + { + "epoch": 1.369681469425715, + "grad_norm": 2.217976729899946, + "learning_rate": 6.608319461973778e-06, + "loss": 0.4602, + "step": 981 + }, + { + "epoch": 1.3710764938386422, + "grad_norm": 2.0033070485546705, + "learning_rate": 6.6006230077995424e-06, + "loss": 0.4614, + "step": 982 + }, + { + "epoch": 1.3724715182515694, + "grad_norm": 1.7949480230161823, + "learning_rate": 6.592922325722483e-06, + "loss": 0.4625, + "step": 983 + }, + { + "epoch": 1.3738665426644967, + "grad_norm": 2.3605371814580987, + "learning_rate": 6.58521743608326e-06, + "loss": 0.5013, + "step": 984 + }, + { + "epoch": 1.3752615670774238, + "grad_norm": 2.079920357785801, + "learning_rate": 6.577508359233653e-06, + "loss": 0.4887, + "step": 985 + }, + { + "epoch": 1.376656591490351, + "grad_norm": 2.168691229695169, + "learning_rate": 6.569795115536502e-06, + "loss": 0.4736, + "step": 986 + }, + { + "epoch": 1.3780516159032783, + "grad_norm": 2.041653674751869, + "learning_rate": 6.562077725365648e-06, + "loss": 0.5144, + "step": 987 + }, + { + "epoch": 1.3794466403162056, + "grad_norm": 2.018405822849198, + "learning_rate": 6.554356209105892e-06, + "loss": 0.4193, + "step": 988 + }, + { + "epoch": 1.3808416647291328, + "grad_norm": 2.0791572967119665, + "learning_rate": 6.54663058715293e-06, + "loss": 0.5241, + "step": 989 + }, + { + "epoch": 1.38223668914206, + "grad_norm": 2.0829602943845797, + "learning_rate": 6.538900879913301e-06, + "loss": 0.4861, + "step": 990 + }, + { + "epoch": 1.3836317135549872, + "grad_norm": 2.1547428115463094, + "learning_rate": 6.531167107804337e-06, + "loss": 0.4959, + "step": 991 + }, + { + "epoch": 1.3850267379679144, + "grad_norm": 2.0439903006152833, + "learning_rate": 6.523429291254109e-06, + "loss": 0.5031, + "step": 992 + }, + { + "epoch": 1.3864217623808417, + "grad_norm": 2.1425187821342435, + "learning_rate": 6.515687450701367e-06, + "loss": 0.476, + "step": 993 + }, + { + "epoch": 1.387816786793769, + "grad_norm": 2.043901736829992, + "learning_rate": 6.507941606595492e-06, + "loss": 0.4368, + "step": 994 + }, + { + "epoch": 1.389211811206696, + "grad_norm": 1.9144888000668951, + "learning_rate": 6.500191779396439e-06, + "loss": 0.4878, + "step": 995 + }, + { + "epoch": 1.3906068356196233, + "grad_norm": 2.136038237292774, + "learning_rate": 6.492437989574689e-06, + "loss": 0.4824, + "step": 996 + }, + { + "epoch": 1.3920018600325506, + "grad_norm": 2.123580451737323, + "learning_rate": 6.48468025761118e-06, + "loss": 0.4832, + "step": 997 + }, + { + "epoch": 1.3933968844454778, + "grad_norm": 2.030731921604436, + "learning_rate": 6.476918603997273e-06, + "loss": 0.4543, + "step": 998 + }, + { + "epoch": 1.394791908858405, + "grad_norm": 2.33185642715308, + "learning_rate": 6.469153049234683e-06, + "loss": 0.5085, + "step": 999 + }, + { + "epoch": 1.3961869332713324, + "grad_norm": 2.1984413784598393, + "learning_rate": 6.461383613835427e-06, + "loss": 0.4935, + "step": 1000 + }, + { + "epoch": 1.3975819576842594, + "grad_norm": 1.9194445089831693, + "learning_rate": 6.453610318321777e-06, + "loss": 0.4486, + "step": 1001 + }, + { + "epoch": 1.3989769820971867, + "grad_norm": 1.9845065956042707, + "learning_rate": 6.445833183226201e-06, + "loss": 0.5038, + "step": 1002 + }, + { + "epoch": 1.400372006510114, + "grad_norm": 2.261131914609956, + "learning_rate": 6.438052229091303e-06, + "loss": 0.4836, + "step": 1003 + }, + { + "epoch": 1.4017670309230412, + "grad_norm": 2.1179514773362595, + "learning_rate": 6.430267476469783e-06, + "loss": 0.4887, + "step": 1004 + }, + { + "epoch": 1.4031620553359683, + "grad_norm": 2.0705617150447955, + "learning_rate": 6.4224789459243705e-06, + "loss": 0.485, + "step": 1005 + }, + { + "epoch": 1.4045570797488955, + "grad_norm": 2.0525161666844114, + "learning_rate": 6.4146866580277686e-06, + "loss": 0.4524, + "step": 1006 + }, + { + "epoch": 1.4059521041618228, + "grad_norm": 2.009720205709189, + "learning_rate": 6.406890633362618e-06, + "loss": 0.4404, + "step": 1007 + }, + { + "epoch": 1.40734712857475, + "grad_norm": 1.9015907571149264, + "learning_rate": 6.3990908925214155e-06, + "loss": 0.4251, + "step": 1008 + }, + { + "epoch": 1.4087421529876774, + "grad_norm": 1.886712184163379, + "learning_rate": 6.391287456106483e-06, + "loss": 0.4772, + "step": 1009 + }, + { + "epoch": 1.4101371774006046, + "grad_norm": 2.1448546108736135, + "learning_rate": 6.383480344729903e-06, + "loss": 0.5044, + "step": 1010 + }, + { + "epoch": 1.4115322018135317, + "grad_norm": 2.1232999504648546, + "learning_rate": 6.375669579013461e-06, + "loss": 0.5154, + "step": 1011 + }, + { + "epoch": 1.412927226226459, + "grad_norm": 2.0320440158568642, + "learning_rate": 6.367855179588597e-06, + "loss": 0.5162, + "step": 1012 + }, + { + "epoch": 1.4143222506393862, + "grad_norm": 2.082682281577738, + "learning_rate": 6.3600371670963525e-06, + "loss": 0.4703, + "step": 1013 + }, + { + "epoch": 1.4157172750523135, + "grad_norm": 2.0549361603179057, + "learning_rate": 6.352215562187307e-06, + "loss": 0.4864, + "step": 1014 + }, + { + "epoch": 1.4171122994652405, + "grad_norm": 2.008308887519786, + "learning_rate": 6.344390385521534e-06, + "loss": 0.4803, + "step": 1015 + }, + { + "epoch": 1.4185073238781678, + "grad_norm": 2.153291637026902, + "learning_rate": 6.33656165776854e-06, + "loss": 0.4873, + "step": 1016 + }, + { + "epoch": 1.419902348291095, + "grad_norm": 2.234058551814876, + "learning_rate": 6.328729399607206e-06, + "loss": 0.4814, + "step": 1017 + }, + { + "epoch": 1.4212973727040223, + "grad_norm": 2.027510476745004, + "learning_rate": 6.320893631725748e-06, + "loss": 0.4638, + "step": 1018 + }, + { + "epoch": 1.4226923971169496, + "grad_norm": 2.032668407885514, + "learning_rate": 6.313054374821647e-06, + "loss": 0.4828, + "step": 1019 + }, + { + "epoch": 1.4240874215298769, + "grad_norm": 2.0022767352637283, + "learning_rate": 6.305211649601595e-06, + "loss": 0.4163, + "step": 1020 + }, + { + "epoch": 1.425482445942804, + "grad_norm": 1.8646715081707912, + "learning_rate": 6.29736547678146e-06, + "loss": 0.4599, + "step": 1021 + }, + { + "epoch": 1.4268774703557312, + "grad_norm": 2.104118263529368, + "learning_rate": 6.289515877086199e-06, + "loss": 0.5469, + "step": 1022 + }, + { + "epoch": 1.4282724947686585, + "grad_norm": 2.0535906765696468, + "learning_rate": 6.2816628712498315e-06, + "loss": 0.4806, + "step": 1023 + }, + { + "epoch": 1.4296675191815857, + "grad_norm": 1.9740833955901596, + "learning_rate": 6.273806480015374e-06, + "loss": 0.4685, + "step": 1024 + }, + { + "epoch": 1.4310625435945128, + "grad_norm": 2.068364016166393, + "learning_rate": 6.265946724134782e-06, + "loss": 0.4175, + "step": 1025 + }, + { + "epoch": 1.43245756800744, + "grad_norm": 1.93323563183723, + "learning_rate": 6.258083624368895e-06, + "loss": 0.4816, + "step": 1026 + }, + { + "epoch": 1.4338525924203673, + "grad_norm": 1.981160775740525, + "learning_rate": 6.250217201487395e-06, + "loss": 0.4893, + "step": 1027 + }, + { + "epoch": 1.4352476168332946, + "grad_norm": 1.99307431242008, + "learning_rate": 6.242347476268733e-06, + "loss": 0.4831, + "step": 1028 + }, + { + "epoch": 1.4366426412462219, + "grad_norm": 2.1199144029755566, + "learning_rate": 6.2344744695000855e-06, + "loss": 0.4926, + "step": 1029 + }, + { + "epoch": 1.4380376656591491, + "grad_norm": 2.1548929965564527, + "learning_rate": 6.226598201977299e-06, + "loss": 0.4912, + "step": 1030 + }, + { + "epoch": 1.4394326900720762, + "grad_norm": 2.138841310759008, + "learning_rate": 6.218718694504831e-06, + "loss": 0.5061, + "step": 1031 + }, + { + "epoch": 1.4408277144850035, + "grad_norm": 2.059139335954622, + "learning_rate": 6.2108359678956954e-06, + "loss": 0.4455, + "step": 1032 + }, + { + "epoch": 1.4422227388979307, + "grad_norm": 2.257526307269232, + "learning_rate": 6.202950042971414e-06, + "loss": 0.5006, + "step": 1033 + }, + { + "epoch": 1.443617763310858, + "grad_norm": 2.0602776029453627, + "learning_rate": 6.19506094056195e-06, + "loss": 0.5123, + "step": 1034 + }, + { + "epoch": 1.445012787723785, + "grad_norm": 2.3502378746003174, + "learning_rate": 6.187168681505666e-06, + "loss": 0.4879, + "step": 1035 + }, + { + "epoch": 1.4464078121367123, + "grad_norm": 2.069457866799051, + "learning_rate": 6.17927328664926e-06, + "loss": 0.4667, + "step": 1036 + }, + { + "epoch": 1.4478028365496396, + "grad_norm": 1.8813160122162929, + "learning_rate": 6.171374776847711e-06, + "loss": 0.4959, + "step": 1037 + }, + { + "epoch": 1.4491978609625669, + "grad_norm": 2.146202652822659, + "learning_rate": 6.163473172964229e-06, + "loss": 0.4592, + "step": 1038 + }, + { + "epoch": 1.4505928853754941, + "grad_norm": 2.088457160035899, + "learning_rate": 6.1555684958701965e-06, + "loss": 0.4867, + "step": 1039 + }, + { + "epoch": 1.4519879097884214, + "grad_norm": 2.0634903279814125, + "learning_rate": 6.1476607664451105e-06, + "loss": 0.4948, + "step": 1040 + }, + { + "epoch": 1.4533829342013485, + "grad_norm": 2.0752059196250565, + "learning_rate": 6.1397500055765345e-06, + "loss": 0.5038, + "step": 1041 + }, + { + "epoch": 1.4547779586142757, + "grad_norm": 2.1416930775369236, + "learning_rate": 6.131836234160036e-06, + "loss": 0.4784, + "step": 1042 + }, + { + "epoch": 1.456172983027203, + "grad_norm": 2.019710198196397, + "learning_rate": 6.123919473099134e-06, + "loss": 0.4724, + "step": 1043 + }, + { + "epoch": 1.4575680074401303, + "grad_norm": 2.096288377676786, + "learning_rate": 6.115999743305252e-06, + "loss": 0.4567, + "step": 1044 + }, + { + "epoch": 1.4589630318530573, + "grad_norm": 2.0619796314944026, + "learning_rate": 6.1080770656976444e-06, + "loss": 0.4204, + "step": 1045 + }, + { + "epoch": 1.4603580562659846, + "grad_norm": 1.9981309556081335, + "learning_rate": 6.100151461203359e-06, + "loss": 0.4727, + "step": 1046 + }, + { + "epoch": 1.4617530806789119, + "grad_norm": 1.9543783730619007, + "learning_rate": 6.0922229507571716e-06, + "loss": 0.4923, + "step": 1047 + }, + { + "epoch": 1.4631481050918391, + "grad_norm": 2.0229372400285293, + "learning_rate": 6.084291555301537e-06, + "loss": 0.5037, + "step": 1048 + }, + { + "epoch": 1.4645431295047664, + "grad_norm": 2.142636652811151, + "learning_rate": 6.076357295786526e-06, + "loss": 0.4535, + "step": 1049 + }, + { + "epoch": 1.4659381539176937, + "grad_norm": 2.003318668809125, + "learning_rate": 6.068420193169779e-06, + "loss": 0.5209, + "step": 1050 + }, + { + "epoch": 1.4673331783306207, + "grad_norm": 2.1145852835254373, + "learning_rate": 6.0604802684164436e-06, + "loss": 0.5086, + "step": 1051 + }, + { + "epoch": 1.468728202743548, + "grad_norm": 2.114398404283612, + "learning_rate": 6.052537542499122e-06, + "loss": 0.455, + "step": 1052 + }, + { + "epoch": 1.4701232271564753, + "grad_norm": 2.005080980781578, + "learning_rate": 6.044592036397816e-06, + "loss": 0.4439, + "step": 1053 + }, + { + "epoch": 1.4715182515694025, + "grad_norm": 1.9455243320200077, + "learning_rate": 6.0366437710998715e-06, + "loss": 0.4855, + "step": 1054 + }, + { + "epoch": 1.4729132759823296, + "grad_norm": 2.252933665609378, + "learning_rate": 6.0286927675999205e-06, + "loss": 0.4724, + "step": 1055 + }, + { + "epoch": 1.4743083003952568, + "grad_norm": 1.919732919513372, + "learning_rate": 6.02073904689983e-06, + "loss": 0.5262, + "step": 1056 + }, + { + "epoch": 1.4757033248081841, + "grad_norm": 2.3597721797516726, + "learning_rate": 6.012782630008646e-06, + "loss": 0.5051, + "step": 1057 + }, + { + "epoch": 1.4770983492211114, + "grad_norm": 2.214881592520836, + "learning_rate": 6.004823537942528e-06, + "loss": 0.4667, + "step": 1058 + }, + { + "epoch": 1.4784933736340387, + "grad_norm": 1.99206372251337, + "learning_rate": 5.996861791724713e-06, + "loss": 0.503, + "step": 1059 + }, + { + "epoch": 1.479888398046966, + "grad_norm": 2.102693460428217, + "learning_rate": 5.98889741238544e-06, + "loss": 0.4772, + "step": 1060 + }, + { + "epoch": 1.481283422459893, + "grad_norm": 2.1595527269970267, + "learning_rate": 5.9809304209619054e-06, + "loss": 0.4647, + "step": 1061 + }, + { + "epoch": 1.4826784468728202, + "grad_norm": 1.8101345913472888, + "learning_rate": 5.9729608384982085e-06, + "loss": 0.4393, + "step": 1062 + }, + { + "epoch": 1.4840734712857475, + "grad_norm": 2.0572441260481837, + "learning_rate": 5.964988686045289e-06, + "loss": 0.507, + "step": 1063 + }, + { + "epoch": 1.4854684956986748, + "grad_norm": 2.1342940374448958, + "learning_rate": 5.957013984660875e-06, + "loss": 0.4949, + "step": 1064 + }, + { + "epoch": 1.4868635201116018, + "grad_norm": 2.141413426865797, + "learning_rate": 5.949036755409432e-06, + "loss": 0.4561, + "step": 1065 + }, + { + "epoch": 1.488258544524529, + "grad_norm": 2.102762243321115, + "learning_rate": 5.941057019362095e-06, + "loss": 0.4462, + "step": 1066 + }, + { + "epoch": 1.4896535689374564, + "grad_norm": 1.96172978194436, + "learning_rate": 5.933074797596627e-06, + "loss": 0.4478, + "step": 1067 + }, + { + "epoch": 1.4910485933503836, + "grad_norm": 2.1140432852002427, + "learning_rate": 5.925090111197355e-06, + "loss": 0.4675, + "step": 1068 + }, + { + "epoch": 1.492443617763311, + "grad_norm": 2.0851675116564743, + "learning_rate": 5.917102981255114e-06, + "loss": 0.4895, + "step": 1069 + }, + { + "epoch": 1.4938386421762382, + "grad_norm": 2.1064824290137913, + "learning_rate": 5.909113428867195e-06, + "loss": 0.5739, + "step": 1070 + }, + { + "epoch": 1.4952336665891652, + "grad_norm": 2.363919632384231, + "learning_rate": 5.901121475137287e-06, + "loss": 0.4841, + "step": 1071 + }, + { + "epoch": 1.4966286910020925, + "grad_norm": 2.1554543460639515, + "learning_rate": 5.893127141175425e-06, + "loss": 0.5, + "step": 1072 + }, + { + "epoch": 1.4980237154150198, + "grad_norm": 2.100025898464971, + "learning_rate": 5.885130448097926e-06, + "loss": 0.5064, + "step": 1073 + }, + { + "epoch": 1.499418739827947, + "grad_norm": 2.1902729566688226, + "learning_rate": 5.877131417027343e-06, + "loss": 0.5269, + "step": 1074 + }, + { + "epoch": 1.500813764240874, + "grad_norm": 2.1036882743228, + "learning_rate": 5.869130069092401e-06, + "loss": 0.5054, + "step": 1075 + }, + { + "epoch": 1.5022087886538014, + "grad_norm": 2.0250157904579584, + "learning_rate": 5.861126425427949e-06, + "loss": 0.4583, + "step": 1076 + }, + { + "epoch": 1.5036038130667286, + "grad_norm": 2.1122885769868294, + "learning_rate": 5.853120507174894e-06, + "loss": 0.518, + "step": 1077 + }, + { + "epoch": 1.504998837479656, + "grad_norm": 2.0922750769271636, + "learning_rate": 5.845112335480159e-06, + "loss": 0.4901, + "step": 1078 + }, + { + "epoch": 1.5063938618925832, + "grad_norm": 2.0533938325641663, + "learning_rate": 5.83710193149661e-06, + "loss": 0.4318, + "step": 1079 + }, + { + "epoch": 1.5077888863055104, + "grad_norm": 2.038905596805902, + "learning_rate": 5.829089316383018e-06, + "loss": 0.4922, + "step": 1080 + }, + { + "epoch": 1.5091839107184377, + "grad_norm": 2.151329977005894, + "learning_rate": 5.821074511303988e-06, + "loss": 0.4657, + "step": 1081 + }, + { + "epoch": 1.5105789351313648, + "grad_norm": 1.92148106773121, + "learning_rate": 5.813057537429915e-06, + "loss": 0.4731, + "step": 1082 + }, + { + "epoch": 1.511973959544292, + "grad_norm": 2.091908495352274, + "learning_rate": 5.805038415936919e-06, + "loss": 0.4274, + "step": 1083 + }, + { + "epoch": 1.5133689839572193, + "grad_norm": 1.965352130042937, + "learning_rate": 5.797017168006791e-06, + "loss": 0.4769, + "step": 1084 + }, + { + "epoch": 1.5147640083701464, + "grad_norm": 2.1649638197222343, + "learning_rate": 5.7889938148269445e-06, + "loss": 0.5076, + "step": 1085 + }, + { + "epoch": 1.5161590327830736, + "grad_norm": 2.014733451344566, + "learning_rate": 5.7809683775903525e-06, + "loss": 0.4598, + "step": 1086 + }, + { + "epoch": 1.517554057196001, + "grad_norm": 2.2758162297619844, + "learning_rate": 5.7729408774954865e-06, + "loss": 0.4766, + "step": 1087 + }, + { + "epoch": 1.5189490816089282, + "grad_norm": 2.1485122128659815, + "learning_rate": 5.764911335746275e-06, + "loss": 0.4787, + "step": 1088 + }, + { + "epoch": 1.5203441060218554, + "grad_norm": 2.1157777785516236, + "learning_rate": 5.756879773552037e-06, + "loss": 0.4864, + "step": 1089 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 2.049165688638103, + "learning_rate": 5.748846212127421e-06, + "loss": 0.5437, + "step": 1090 + }, + { + "epoch": 1.52313415484771, + "grad_norm": 2.1872948943308494, + "learning_rate": 5.74081067269237e-06, + "loss": 0.4513, + "step": 1091 + }, + { + "epoch": 1.524529179260637, + "grad_norm": 1.980706388022509, + "learning_rate": 5.732773176472042e-06, + "loss": 0.4243, + "step": 1092 + }, + { + "epoch": 1.5259242036735643, + "grad_norm": 1.8594637870370903, + "learning_rate": 5.7247337446967625e-06, + "loss": 0.4787, + "step": 1093 + }, + { + "epoch": 1.5273192280864916, + "grad_norm": 2.004778006837601, + "learning_rate": 5.716692398601975e-06, + "loss": 0.4724, + "step": 1094 + }, + { + "epoch": 1.5287142524994186, + "grad_norm": 1.9496047062065673, + "learning_rate": 5.708649159428181e-06, + "loss": 0.5118, + "step": 1095 + }, + { + "epoch": 1.5301092769123459, + "grad_norm": 2.323924983303737, + "learning_rate": 5.700604048420875e-06, + "loss": 0.5089, + "step": 1096 + }, + { + "epoch": 1.5315043013252732, + "grad_norm": 1.8596638029475623, + "learning_rate": 5.692557086830501e-06, + "loss": 0.4974, + "step": 1097 + }, + { + "epoch": 1.5328993257382004, + "grad_norm": 2.1299683325884056, + "learning_rate": 5.68450829591239e-06, + "loss": 0.4651, + "step": 1098 + }, + { + "epoch": 1.5342943501511277, + "grad_norm": 2.076100505238362, + "learning_rate": 5.676457696926703e-06, + "loss": 0.4883, + "step": 1099 + }, + { + "epoch": 1.535689374564055, + "grad_norm": 2.146952548510558, + "learning_rate": 5.668405311138382e-06, + "loss": 0.4918, + "step": 1100 + }, + { + "epoch": 1.5370843989769822, + "grad_norm": 2.0439577341140915, + "learning_rate": 5.660351159817083e-06, + "loss": 0.4709, + "step": 1101 + }, + { + "epoch": 1.5384794233899093, + "grad_norm": 2.188976123724856, + "learning_rate": 5.652295264237128e-06, + "loss": 0.4984, + "step": 1102 + }, + { + "epoch": 1.5398744478028366, + "grad_norm": 2.0659662235602942, + "learning_rate": 5.6442376456774495e-06, + "loss": 0.4316, + "step": 1103 + }, + { + "epoch": 1.5412694722157638, + "grad_norm": 1.9009883802453258, + "learning_rate": 5.636178325421524e-06, + "loss": 0.4689, + "step": 1104 + }, + { + "epoch": 1.5426644966286909, + "grad_norm": 2.072884089336916, + "learning_rate": 5.628117324757326e-06, + "loss": 0.4881, + "step": 1105 + }, + { + "epoch": 1.5440595210416181, + "grad_norm": 2.0809785200505466, + "learning_rate": 5.620054664977275e-06, + "loss": 0.3991, + "step": 1106 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 1.9217918849239009, + "learning_rate": 5.61199036737816e-06, + "loss": 0.4564, + "step": 1107 + }, + { + "epoch": 1.5468495698674727, + "grad_norm": 1.8909553621918078, + "learning_rate": 5.603924453261109e-06, + "loss": 0.4339, + "step": 1108 + }, + { + "epoch": 1.5482445942804, + "grad_norm": 2.128862572565254, + "learning_rate": 5.595856943931512e-06, + "loss": 0.4987, + "step": 1109 + }, + { + "epoch": 1.5496396186933272, + "grad_norm": 2.1719486428343457, + "learning_rate": 5.587787860698975e-06, + "loss": 0.4635, + "step": 1110 + }, + { + "epoch": 1.5510346431062545, + "grad_norm": 1.9555844376182605, + "learning_rate": 5.579717224877261e-06, + "loss": 0.529, + "step": 1111 + }, + { + "epoch": 1.5524296675191815, + "grad_norm": 2.2688788762864927, + "learning_rate": 5.571645057784236e-06, + "loss": 0.5013, + "step": 1112 + }, + { + "epoch": 1.5538246919321088, + "grad_norm": 2.228130536027715, + "learning_rate": 5.5635713807418055e-06, + "loss": 0.4697, + "step": 1113 + }, + { + "epoch": 1.555219716345036, + "grad_norm": 2.0348706803599135, + "learning_rate": 5.55549621507587e-06, + "loss": 0.4896, + "step": 1114 + }, + { + "epoch": 1.5566147407579631, + "grad_norm": 1.985770851471805, + "learning_rate": 5.547419582116259e-06, + "loss": 0.5184, + "step": 1115 + }, + { + "epoch": 1.5580097651708904, + "grad_norm": 2.2260993059821073, + "learning_rate": 5.539341503196674e-06, + "loss": 0.4412, + "step": 1116 + }, + { + "epoch": 1.5594047895838177, + "grad_norm": 2.1526251050811336, + "learning_rate": 5.531261999654646e-06, + "loss": 0.4407, + "step": 1117 + }, + { + "epoch": 1.560799813996745, + "grad_norm": 2.021163422905953, + "learning_rate": 5.5231810928314555e-06, + "loss": 0.4614, + "step": 1118 + }, + { + "epoch": 1.5621948384096722, + "grad_norm": 1.974269573884685, + "learning_rate": 5.5150988040721e-06, + "loss": 0.4581, + "step": 1119 + }, + { + "epoch": 1.5635898628225995, + "grad_norm": 2.075129892774578, + "learning_rate": 5.507015154725226e-06, + "loss": 0.4614, + "step": 1120 + }, + { + "epoch": 1.5649848872355268, + "grad_norm": 1.8323848221545598, + "learning_rate": 5.4989301661430685e-06, + "loss": 0.451, + "step": 1121 + }, + { + "epoch": 1.5663799116484538, + "grad_norm": 2.0259660795350083, + "learning_rate": 5.490843859681404e-06, + "loss": 0.4601, + "step": 1122 + }, + { + "epoch": 1.567774936061381, + "grad_norm": 2.1270056899092076, + "learning_rate": 5.48275625669949e-06, + "loss": 0.458, + "step": 1123 + }, + { + "epoch": 1.5691699604743083, + "grad_norm": 1.9780355578127613, + "learning_rate": 5.474667378560007e-06, + "loss": 0.4608, + "step": 1124 + }, + { + "epoch": 1.5705649848872354, + "grad_norm": 1.9831337590058289, + "learning_rate": 5.466577246629006e-06, + "loss": 0.4808, + "step": 1125 + }, + { + "epoch": 1.5719600093001627, + "grad_norm": 1.964070734016315, + "learning_rate": 5.458485882275848e-06, + "loss": 0.4335, + "step": 1126 + }, + { + "epoch": 1.57335503371309, + "grad_norm": 1.8520479300798307, + "learning_rate": 5.45039330687315e-06, + "loss": 0.4896, + "step": 1127 + }, + { + "epoch": 1.5747500581260172, + "grad_norm": 1.987414988454785, + "learning_rate": 5.442299541796727e-06, + "loss": 0.4483, + "step": 1128 + }, + { + "epoch": 1.5761450825389445, + "grad_norm": 2.022178299234183, + "learning_rate": 5.4342046084255385e-06, + "loss": 0.4316, + "step": 1129 + }, + { + "epoch": 1.5775401069518717, + "grad_norm": 1.9494172415463071, + "learning_rate": 5.426108528141627e-06, + "loss": 0.4712, + "step": 1130 + }, + { + "epoch": 1.578935131364799, + "grad_norm": 1.9126927477747944, + "learning_rate": 5.4180113223300665e-06, + "loss": 0.443, + "step": 1131 + }, + { + "epoch": 1.580330155777726, + "grad_norm": 2.0412089862860614, + "learning_rate": 5.409913012378903e-06, + "loss": 0.4237, + "step": 1132 + }, + { + "epoch": 1.5817251801906533, + "grad_norm": 1.947246022834085, + "learning_rate": 5.401813619679102e-06, + "loss": 0.4951, + "step": 1133 + }, + { + "epoch": 1.5831202046035806, + "grad_norm": 2.0984893727921294, + "learning_rate": 5.3937131656244834e-06, + "loss": 0.4401, + "step": 1134 + }, + { + "epoch": 1.5845152290165077, + "grad_norm": 2.1947448476976437, + "learning_rate": 5.385611671611676e-06, + "loss": 0.4799, + "step": 1135 + }, + { + "epoch": 1.585910253429435, + "grad_norm": 2.026074719354588, + "learning_rate": 5.377509159040051e-06, + "loss": 0.4918, + "step": 1136 + }, + { + "epoch": 1.5873052778423622, + "grad_norm": 2.1061592247651486, + "learning_rate": 5.3694056493116745e-06, + "loss": 0.4472, + "step": 1137 + }, + { + "epoch": 1.5887003022552895, + "grad_norm": 2.119635832241502, + "learning_rate": 5.361301163831242e-06, + "loss": 0.4912, + "step": 1138 + }, + { + "epoch": 1.5900953266682167, + "grad_norm": 2.0514457851634504, + "learning_rate": 5.353195724006031e-06, + "loss": 0.4347, + "step": 1139 + }, + { + "epoch": 1.591490351081144, + "grad_norm": 1.9739425266282826, + "learning_rate": 5.345089351245834e-06, + "loss": 0.489, + "step": 1140 + }, + { + "epoch": 1.5928853754940713, + "grad_norm": 2.097334194705438, + "learning_rate": 5.336982066962915e-06, + "loss": 0.4706, + "step": 1141 + }, + { + "epoch": 1.5942803999069983, + "grad_norm": 2.0889563513652187, + "learning_rate": 5.328873892571941e-06, + "loss": 0.4324, + "step": 1142 + }, + { + "epoch": 1.5956754243199256, + "grad_norm": 2.161914385832255, + "learning_rate": 5.320764849489929e-06, + "loss": 0.4957, + "step": 1143 + }, + { + "epoch": 1.5970704487328529, + "grad_norm": 2.127711288940709, + "learning_rate": 5.312654959136194e-06, + "loss": 0.4044, + "step": 1144 + }, + { + "epoch": 1.59846547314578, + "grad_norm": 1.9955461760081252, + "learning_rate": 5.304544242932288e-06, + "loss": 0.5024, + "step": 1145 + }, + { + "epoch": 1.5998604975587072, + "grad_norm": 2.146359711210145, + "learning_rate": 5.296432722301944e-06, + "loss": 0.5134, + "step": 1146 + }, + { + "epoch": 1.6012555219716345, + "grad_norm": 2.056122492718699, + "learning_rate": 5.288320418671018e-06, + "loss": 0.4519, + "step": 1147 + }, + { + "epoch": 1.6026505463845617, + "grad_norm": 2.26187453297196, + "learning_rate": 5.280207353467438e-06, + "loss": 0.4829, + "step": 1148 + }, + { + "epoch": 1.604045570797489, + "grad_norm": 2.0985463489065426, + "learning_rate": 5.272093548121141e-06, + "loss": 0.4886, + "step": 1149 + }, + { + "epoch": 1.6054405952104163, + "grad_norm": 2.0830677262546002, + "learning_rate": 5.26397902406402e-06, + "loss": 0.4909, + "step": 1150 + }, + { + "epoch": 1.6068356196233435, + "grad_norm": 2.2329507839509923, + "learning_rate": 5.255863802729866e-06, + "loss": 0.4694, + "step": 1151 + }, + { + "epoch": 1.6082306440362706, + "grad_norm": 2.1821069364143146, + "learning_rate": 5.247747905554311e-06, + "loss": 0.4388, + "step": 1152 + }, + { + "epoch": 1.6096256684491979, + "grad_norm": 1.9642708773504776, + "learning_rate": 5.239631353974774e-06, + "loss": 0.5221, + "step": 1153 + }, + { + "epoch": 1.6110206928621251, + "grad_norm": 2.15241370034379, + "learning_rate": 5.231514169430403e-06, + "loss": 0.4742, + "step": 1154 + }, + { + "epoch": 1.6124157172750522, + "grad_norm": 2.1018420489764735, + "learning_rate": 5.223396373362013e-06, + "loss": 0.4391, + "step": 1155 + }, + { + "epoch": 1.6138107416879794, + "grad_norm": 1.8679671585704423, + "learning_rate": 5.215277987212041e-06, + "loss": 0.5281, + "step": 1156 + }, + { + "epoch": 1.6152057661009067, + "grad_norm": 2.1271535673490036, + "learning_rate": 5.207159032424478e-06, + "loss": 0.4999, + "step": 1157 + }, + { + "epoch": 1.616600790513834, + "grad_norm": 2.0550576490650565, + "learning_rate": 5.199039530444819e-06, + "loss": 0.452, + "step": 1158 + }, + { + "epoch": 1.6179958149267613, + "grad_norm": 2.086211359150292, + "learning_rate": 5.1909195027200055e-06, + "loss": 0.4656, + "step": 1159 + }, + { + "epoch": 1.6193908393396885, + "grad_norm": 1.9393017851532006, + "learning_rate": 5.182798970698361e-06, + "loss": 0.4661, + "step": 1160 + }, + { + "epoch": 1.6207858637526158, + "grad_norm": 2.11447885984754, + "learning_rate": 5.174677955829551e-06, + "loss": 0.4784, + "step": 1161 + }, + { + "epoch": 1.6221808881655428, + "grad_norm": 2.021527606784538, + "learning_rate": 5.166556479564511e-06, + "loss": 0.5031, + "step": 1162 + }, + { + "epoch": 1.6235759125784701, + "grad_norm": 1.989052449176325, + "learning_rate": 5.158434563355392e-06, + "loss": 0.5083, + "step": 1163 + }, + { + "epoch": 1.6249709369913974, + "grad_norm": 2.2313083148153625, + "learning_rate": 5.150312228655515e-06, + "loss": 0.4175, + "step": 1164 + }, + { + "epoch": 1.6263659614043244, + "grad_norm": 1.9280232439374705, + "learning_rate": 5.142189496919302e-06, + "loss": 0.4636, + "step": 1165 + }, + { + "epoch": 1.6277609858172517, + "grad_norm": 1.9246863328355193, + "learning_rate": 5.1340663896022206e-06, + "loss": 0.4467, + "step": 1166 + }, + { + "epoch": 1.629156010230179, + "grad_norm": 2.001675021541609, + "learning_rate": 5.125942928160736e-06, + "loss": 0.4713, + "step": 1167 + }, + { + "epoch": 1.6305510346431062, + "grad_norm": 2.1481197676528745, + "learning_rate": 5.117819134052246e-06, + "loss": 0.4849, + "step": 1168 + }, + { + "epoch": 1.6319460590560335, + "grad_norm": 2.022955901936903, + "learning_rate": 5.10969502873503e-06, + "loss": 0.4654, + "step": 1169 + }, + { + "epoch": 1.6333410834689608, + "grad_norm": 2.186736394556812, + "learning_rate": 5.101570633668185e-06, + "loss": 0.4674, + "step": 1170 + }, + { + "epoch": 1.634736107881888, + "grad_norm": 2.19635300793701, + "learning_rate": 5.093445970311576e-06, + "loss": 0.471, + "step": 1171 + }, + { + "epoch": 1.636131132294815, + "grad_norm": 2.0592350084511106, + "learning_rate": 5.085321060125775e-06, + "loss": 0.4522, + "step": 1172 + }, + { + "epoch": 1.6375261567077424, + "grad_norm": 2.0924620802741467, + "learning_rate": 5.07719592457201e-06, + "loss": 0.4646, + "step": 1173 + }, + { + "epoch": 1.6389211811206696, + "grad_norm": 2.1993464412256167, + "learning_rate": 5.069070585112097e-06, + "loss": 0.5289, + "step": 1174 + }, + { + "epoch": 1.6403162055335967, + "grad_norm": 1.9527466803709046, + "learning_rate": 5.060945063208399e-06, + "loss": 0.4692, + "step": 1175 + }, + { + "epoch": 1.641711229946524, + "grad_norm": 2.1525612142674833, + "learning_rate": 5.052819380323757e-06, + "loss": 0.4761, + "step": 1176 + }, + { + "epoch": 1.6431062543594512, + "grad_norm": 2.0655110259638207, + "learning_rate": 5.044693557921434e-06, + "loss": 0.4696, + "step": 1177 + }, + { + "epoch": 1.6445012787723785, + "grad_norm": 2.095404927614161, + "learning_rate": 5.036567617465067e-06, + "loss": 0.478, + "step": 1178 + }, + { + "epoch": 1.6458963031853058, + "grad_norm": 2.2636358073037557, + "learning_rate": 5.0284415804186025e-06, + "loss": 0.4676, + "step": 1179 + }, + { + "epoch": 1.647291327598233, + "grad_norm": 2.016150491128099, + "learning_rate": 5.02031546824624e-06, + "loss": 0.4707, + "step": 1180 + }, + { + "epoch": 1.6486863520111603, + "grad_norm": 2.1200548779498085, + "learning_rate": 5.012189302412383e-06, + "loss": 0.4903, + "step": 1181 + }, + { + "epoch": 1.6500813764240876, + "grad_norm": 2.154591495150212, + "learning_rate": 5.0040631043815715e-06, + "loss": 0.5041, + "step": 1182 + }, + { + "epoch": 1.6514764008370146, + "grad_norm": 2.072675257333494, + "learning_rate": 4.99593689561843e-06, + "loss": 0.4662, + "step": 1183 + }, + { + "epoch": 1.652871425249942, + "grad_norm": 1.9847287863038208, + "learning_rate": 4.987810697587618e-06, + "loss": 0.4988, + "step": 1184 + }, + { + "epoch": 1.654266449662869, + "grad_norm": 2.1882981619139477, + "learning_rate": 4.979684531753761e-06, + "loss": 0.4554, + "step": 1185 + }, + { + "epoch": 1.6556614740757962, + "grad_norm": 2.155215055693557, + "learning_rate": 4.971558419581398e-06, + "loss": 0.4495, + "step": 1186 + }, + { + "epoch": 1.6570564984887235, + "grad_norm": 1.9462286169261411, + "learning_rate": 4.963432382534933e-06, + "loss": 0.5166, + "step": 1187 + }, + { + "epoch": 1.6584515229016508, + "grad_norm": 2.319365094329712, + "learning_rate": 4.955306442078568e-06, + "loss": 0.5077, + "step": 1188 + }, + { + "epoch": 1.659846547314578, + "grad_norm": 2.013268429787615, + "learning_rate": 4.947180619676244e-06, + "loss": 0.4225, + "step": 1189 + }, + { + "epoch": 1.6612415717275053, + "grad_norm": 1.9030332809738684, + "learning_rate": 4.9390549367916004e-06, + "loss": 0.4474, + "step": 1190 + }, + { + "epoch": 1.6626365961404326, + "grad_norm": 1.9091704807756302, + "learning_rate": 4.930929414887904e-06, + "loss": 0.4471, + "step": 1191 + }, + { + "epoch": 1.6640316205533598, + "grad_norm": 2.131054390165848, + "learning_rate": 4.9228040754279915e-06, + "loss": 0.4812, + "step": 1192 + }, + { + "epoch": 1.665426644966287, + "grad_norm": 2.0380709327701862, + "learning_rate": 4.914678939874225e-06, + "loss": 0.5226, + "step": 1193 + }, + { + "epoch": 1.6668216693792142, + "grad_norm": 2.2253463416293786, + "learning_rate": 4.906554029688427e-06, + "loss": 0.4383, + "step": 1194 + }, + { + "epoch": 1.6682166937921412, + "grad_norm": 1.9151466529202, + "learning_rate": 4.898429366331815e-06, + "loss": 0.4818, + "step": 1195 + }, + { + "epoch": 1.6696117182050685, + "grad_norm": 2.103018852231921, + "learning_rate": 4.8903049712649705e-06, + "loss": 0.51, + "step": 1196 + }, + { + "epoch": 1.6710067426179958, + "grad_norm": 2.1336210474522033, + "learning_rate": 4.8821808659477544e-06, + "loss": 0.4482, + "step": 1197 + }, + { + "epoch": 1.672401767030923, + "grad_norm": 2.037967905605442, + "learning_rate": 4.874057071839265e-06, + "loss": 0.4624, + "step": 1198 + }, + { + "epoch": 1.6737967914438503, + "grad_norm": 1.9885465427196427, + "learning_rate": 4.86593361039778e-06, + "loss": 0.4824, + "step": 1199 + }, + { + "epoch": 1.6751918158567776, + "grad_norm": 2.1480503191792093, + "learning_rate": 4.857810503080701e-06, + "loss": 0.4892, + "step": 1200 + }, + { + "epoch": 1.6765868402697048, + "grad_norm": 2.1285265643155182, + "learning_rate": 4.849687771344487e-06, + "loss": 0.518, + "step": 1201 + }, + { + "epoch": 1.677981864682632, + "grad_norm": 2.3375001661766217, + "learning_rate": 4.841565436644609e-06, + "loss": 0.514, + "step": 1202 + }, + { + "epoch": 1.6793768890955592, + "grad_norm": 2.1836516379544113, + "learning_rate": 4.8334435204354915e-06, + "loss": 0.4623, + "step": 1203 + }, + { + "epoch": 1.6807719135084864, + "grad_norm": 2.104463723618796, + "learning_rate": 4.825322044170451e-06, + "loss": 0.5108, + "step": 1204 + }, + { + "epoch": 1.6821669379214135, + "grad_norm": 1.9744889241875467, + "learning_rate": 4.81720102930164e-06, + "loss": 0.4725, + "step": 1205 + }, + { + "epoch": 1.6835619623343407, + "grad_norm": 2.1029141184734357, + "learning_rate": 4.809080497279998e-06, + "loss": 0.4917, + "step": 1206 + }, + { + "epoch": 1.684956986747268, + "grad_norm": 2.1916454273646315, + "learning_rate": 4.800960469555183e-06, + "loss": 0.4912, + "step": 1207 + }, + { + "epoch": 1.6863520111601953, + "grad_norm": 1.955939718054068, + "learning_rate": 4.792840967575523e-06, + "loss": 0.487, + "step": 1208 + }, + { + "epoch": 1.6877470355731226, + "grad_norm": 2.059844165344383, + "learning_rate": 4.784722012787961e-06, + "loss": 0.495, + "step": 1209 + }, + { + "epoch": 1.6891420599860498, + "grad_norm": 2.2676353256111823, + "learning_rate": 4.776603626637988e-06, + "loss": 0.4841, + "step": 1210 + }, + { + "epoch": 1.690537084398977, + "grad_norm": 2.024198676505603, + "learning_rate": 4.768485830569598e-06, + "loss": 0.4484, + "step": 1211 + }, + { + "epoch": 1.6919321088119044, + "grad_norm": 2.0004153672009513, + "learning_rate": 4.7603686460252265e-06, + "loss": 0.4911, + "step": 1212 + }, + { + "epoch": 1.6933271332248314, + "grad_norm": 2.1709204019397563, + "learning_rate": 4.75225209444569e-06, + "loss": 0.4627, + "step": 1213 + }, + { + "epoch": 1.6947221576377587, + "grad_norm": 2.1160225414488654, + "learning_rate": 4.744136197270135e-06, + "loss": 0.5304, + "step": 1214 + }, + { + "epoch": 1.6961171820506857, + "grad_norm": 2.2698535491172525, + "learning_rate": 4.736020975935981e-06, + "loss": 0.4793, + "step": 1215 + }, + { + "epoch": 1.697512206463613, + "grad_norm": 2.0436804954046, + "learning_rate": 4.72790645187886e-06, + "loss": 0.4407, + "step": 1216 + }, + { + "epoch": 1.6989072308765403, + "grad_norm": 1.9979146113097979, + "learning_rate": 4.7197926465325626e-06, + "loss": 0.4564, + "step": 1217 + }, + { + "epoch": 1.7003022552894675, + "grad_norm": 2.0826310908062604, + "learning_rate": 4.711679581328983e-06, + "loss": 0.4837, + "step": 1218 + }, + { + "epoch": 1.7016972797023948, + "grad_norm": 2.221127633913511, + "learning_rate": 4.703567277698058e-06, + "loss": 0.5505, + "step": 1219 + }, + { + "epoch": 1.703092304115322, + "grad_norm": 2.3042281576812664, + "learning_rate": 4.695455757067712e-06, + "loss": 0.4513, + "step": 1220 + }, + { + "epoch": 1.7044873285282494, + "grad_norm": 1.9139161970984668, + "learning_rate": 4.687345040863808e-06, + "loss": 0.4778, + "step": 1221 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 2.1441515179909656, + "learning_rate": 4.679235150510072e-06, + "loss": 0.4342, + "step": 1222 + }, + { + "epoch": 1.7072773773541037, + "grad_norm": 1.9784971632191246, + "learning_rate": 4.671126107428061e-06, + "loss": 0.4749, + "step": 1223 + }, + { + "epoch": 1.708672401767031, + "grad_norm": 1.9425240844023457, + "learning_rate": 4.663017933037087e-06, + "loss": 0.4606, + "step": 1224 + }, + { + "epoch": 1.710067426179958, + "grad_norm": 2.06859895894967, + "learning_rate": 4.6549106487541666e-06, + "loss": 0.491, + "step": 1225 + }, + { + "epoch": 1.7114624505928853, + "grad_norm": 2.1250812807049155, + "learning_rate": 4.646804275993971e-06, + "loss": 0.46, + "step": 1226 + }, + { + "epoch": 1.7128574750058125, + "grad_norm": 2.1870237837592357, + "learning_rate": 4.63869883616876e-06, + "loss": 0.4526, + "step": 1227 + }, + { + "epoch": 1.7142524994187398, + "grad_norm": 1.933173246666357, + "learning_rate": 4.630594350688327e-06, + "loss": 0.4965, + "step": 1228 + }, + { + "epoch": 1.715647523831667, + "grad_norm": 2.1720589382321935, + "learning_rate": 4.62249084095995e-06, + "loss": 0.4633, + "step": 1229 + }, + { + "epoch": 1.7170425482445943, + "grad_norm": 2.03257424247359, + "learning_rate": 4.614388328388327e-06, + "loss": 0.5121, + "step": 1230 + }, + { + "epoch": 1.7184375726575216, + "grad_norm": 2.1561072697752395, + "learning_rate": 4.606286834375517e-06, + "loss": 0.4976, + "step": 1231 + }, + { + "epoch": 1.7198325970704489, + "grad_norm": 2.053680058584616, + "learning_rate": 4.598186380320899e-06, + "loss": 0.433, + "step": 1232 + }, + { + "epoch": 1.721227621483376, + "grad_norm": 2.181642713396842, + "learning_rate": 4.5900869876210986e-06, + "loss": 0.4768, + "step": 1233 + }, + { + "epoch": 1.7226226458963032, + "grad_norm": 2.0522746791202318, + "learning_rate": 4.581988677669935e-06, + "loss": 0.5157, + "step": 1234 + }, + { + "epoch": 1.7240176703092303, + "grad_norm": 2.11368549262154, + "learning_rate": 4.573891471858375e-06, + "loss": 0.4832, + "step": 1235 + }, + { + "epoch": 1.7254126947221575, + "grad_norm": 2.0913027904457127, + "learning_rate": 4.565795391574465e-06, + "loss": 0.4737, + "step": 1236 + }, + { + "epoch": 1.7268077191350848, + "grad_norm": 2.0912014065923636, + "learning_rate": 4.5577004582032745e-06, + "loss": 0.4488, + "step": 1237 + }, + { + "epoch": 1.728202743548012, + "grad_norm": 2.050372387152731, + "learning_rate": 4.549606693126851e-06, + "loss": 0.4636, + "step": 1238 + }, + { + "epoch": 1.7295977679609393, + "grad_norm": 2.1154084542696587, + "learning_rate": 4.541514117724155e-06, + "loss": 0.5255, + "step": 1239 + }, + { + "epoch": 1.7309927923738666, + "grad_norm": 2.0748003125338546, + "learning_rate": 4.533422753370995e-06, + "loss": 0.4257, + "step": 1240 + }, + { + "epoch": 1.7323878167867939, + "grad_norm": 1.9621472789788037, + "learning_rate": 4.525332621439995e-06, + "loss": 0.4846, + "step": 1241 + }, + { + "epoch": 1.7337828411997211, + "grad_norm": 2.103237491557985, + "learning_rate": 4.517243743300513e-06, + "loss": 0.4228, + "step": 1242 + }, + { + "epoch": 1.7351778656126482, + "grad_norm": 2.177553621256369, + "learning_rate": 4.5091561403185976e-06, + "loss": 0.5123, + "step": 1243 + }, + { + "epoch": 1.7365728900255755, + "grad_norm": 2.091109158658429, + "learning_rate": 4.501069833856934e-06, + "loss": 0.4629, + "step": 1244 + }, + { + "epoch": 1.7379679144385025, + "grad_norm": 2.1467197858254408, + "learning_rate": 4.492984845274774e-06, + "loss": 0.4354, + "step": 1245 + }, + { + "epoch": 1.7393629388514298, + "grad_norm": 2.100552590196265, + "learning_rate": 4.484901195927901e-06, + "loss": 0.4618, + "step": 1246 + }, + { + "epoch": 1.740757963264357, + "grad_norm": 1.9713180215115427, + "learning_rate": 4.476818907168545e-06, + "loss": 0.4186, + "step": 1247 + }, + { + "epoch": 1.7421529876772843, + "grad_norm": 2.03751695096277, + "learning_rate": 4.4687380003453555e-06, + "loss": 0.4412, + "step": 1248 + }, + { + "epoch": 1.7435480120902116, + "grad_norm": 2.101186407375219, + "learning_rate": 4.460658496803327e-06, + "loss": 0.4753, + "step": 1249 + }, + { + "epoch": 1.7449430365031389, + "grad_norm": 2.1552915811427247, + "learning_rate": 4.4525804178837425e-06, + "loss": 0.4574, + "step": 1250 + }, + { + "epoch": 1.7463380609160661, + "grad_norm": 2.270007645721411, + "learning_rate": 4.4445037849241305e-06, + "loss": 0.5034, + "step": 1251 + }, + { + "epoch": 1.7477330853289934, + "grad_norm": 2.128789717967715, + "learning_rate": 4.436428619258196e-06, + "loss": 0.4722, + "step": 1252 + }, + { + "epoch": 1.7491281097419205, + "grad_norm": 2.0638528861792365, + "learning_rate": 4.428354942215766e-06, + "loss": 0.4406, + "step": 1253 + }, + { + "epoch": 1.7505231341548477, + "grad_norm": 2.1111439690461493, + "learning_rate": 4.42028277512274e-06, + "loss": 0.448, + "step": 1254 + }, + { + "epoch": 1.7519181585677748, + "grad_norm": 1.9221909816221243, + "learning_rate": 4.412212139301027e-06, + "loss": 0.4915, + "step": 1255 + }, + { + "epoch": 1.753313182980702, + "grad_norm": 2.116085778290425, + "learning_rate": 4.404143056068489e-06, + "loss": 0.4063, + "step": 1256 + }, + { + "epoch": 1.7547082073936293, + "grad_norm": 1.910803308672269, + "learning_rate": 4.3960755467388916e-06, + "loss": 0.4783, + "step": 1257 + }, + { + "epoch": 1.7561032318065566, + "grad_norm": 2.088441580996199, + "learning_rate": 4.388009632621841e-06, + "loss": 0.5135, + "step": 1258 + }, + { + "epoch": 1.7574982562194839, + "grad_norm": 2.0746611561798503, + "learning_rate": 4.379945335022727e-06, + "loss": 0.4721, + "step": 1259 + }, + { + "epoch": 1.7588932806324111, + "grad_norm": 2.2426717188517618, + "learning_rate": 4.371882675242674e-06, + "loss": 0.4971, + "step": 1260 + }, + { + "epoch": 1.7602883050453384, + "grad_norm": 2.2204709536429843, + "learning_rate": 4.363821674578479e-06, + "loss": 0.5132, + "step": 1261 + }, + { + "epoch": 1.7616833294582657, + "grad_norm": 2.0706691763772183, + "learning_rate": 4.355762354322552e-06, + "loss": 0.4508, + "step": 1262 + }, + { + "epoch": 1.7630783538711927, + "grad_norm": 2.0641312165343533, + "learning_rate": 4.347704735762872e-06, + "loss": 0.495, + "step": 1263 + }, + { + "epoch": 1.76447337828412, + "grad_norm": 2.0817958512499604, + "learning_rate": 4.339648840182919e-06, + "loss": 0.4815, + "step": 1264 + }, + { + "epoch": 1.765868402697047, + "grad_norm": 2.082454137764786, + "learning_rate": 4.331594688861619e-06, + "loss": 0.4674, + "step": 1265 + }, + { + "epoch": 1.7672634271099743, + "grad_norm": 2.1167567040967175, + "learning_rate": 4.323542303073297e-06, + "loss": 0.4455, + "step": 1266 + }, + { + "epoch": 1.7686584515229016, + "grad_norm": 2.1312357560431567, + "learning_rate": 4.315491704087613e-06, + "loss": 0.456, + "step": 1267 + }, + { + "epoch": 1.7700534759358288, + "grad_norm": 2.007748405961002, + "learning_rate": 4.3074429131695e-06, + "loss": 0.465, + "step": 1268 + }, + { + "epoch": 1.7714485003487561, + "grad_norm": 2.1649734551805366, + "learning_rate": 4.299395951579126e-06, + "loss": 0.4658, + "step": 1269 + }, + { + "epoch": 1.7728435247616834, + "grad_norm": 2.013706806375797, + "learning_rate": 4.291350840571821e-06, + "loss": 0.4307, + "step": 1270 + }, + { + "epoch": 1.7742385491746107, + "grad_norm": 2.081976239927056, + "learning_rate": 4.283307601398026e-06, + "loss": 0.4848, + "step": 1271 + }, + { + "epoch": 1.775633573587538, + "grad_norm": 2.2533206281427525, + "learning_rate": 4.275266255303238e-06, + "loss": 0.4909, + "step": 1272 + }, + { + "epoch": 1.777028598000465, + "grad_norm": 2.247778078118919, + "learning_rate": 4.2672268235279616e-06, + "loss": 0.4611, + "step": 1273 + }, + { + "epoch": 1.7784236224133922, + "grad_norm": 1.9320941214004883, + "learning_rate": 4.259189327307632e-06, + "loss": 0.4782, + "step": 1274 + }, + { + "epoch": 1.7798186468263193, + "grad_norm": 2.201372285618325, + "learning_rate": 4.251153787872579e-06, + "loss": 0.4793, + "step": 1275 + }, + { + "epoch": 1.7812136712392466, + "grad_norm": 1.952612762150762, + "learning_rate": 4.2431202264479665e-06, + "loss": 0.4583, + "step": 1276 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 2.1107717354597244, + "learning_rate": 4.235088664253726e-06, + "loss": 0.5093, + "step": 1277 + }, + { + "epoch": 1.784003720065101, + "grad_norm": 2.2135031912589853, + "learning_rate": 4.227059122504514e-06, + "loss": 0.485, + "step": 1278 + }, + { + "epoch": 1.7853987444780284, + "grad_norm": 1.8768740585602934, + "learning_rate": 4.21903162240965e-06, + "loss": 0.4444, + "step": 1279 + }, + { + "epoch": 1.7867937688909556, + "grad_norm": 2.083417129855745, + "learning_rate": 4.211006185173056e-06, + "loss": 0.5084, + "step": 1280 + }, + { + "epoch": 1.788188793303883, + "grad_norm": 2.0108554229556574, + "learning_rate": 4.20298283199321e-06, + "loss": 0.4198, + "step": 1281 + }, + { + "epoch": 1.7895838177168102, + "grad_norm": 2.2544475269254916, + "learning_rate": 4.1949615840630845e-06, + "loss": 0.4629, + "step": 1282 + }, + { + "epoch": 1.7909788421297372, + "grad_norm": 2.1051705170049235, + "learning_rate": 4.186942462570087e-06, + "loss": 0.5034, + "step": 1283 + }, + { + "epoch": 1.7923738665426645, + "grad_norm": 2.160525988076939, + "learning_rate": 4.178925488696012e-06, + "loss": 0.4845, + "step": 1284 + }, + { + "epoch": 1.7937688909555918, + "grad_norm": 2.095025653388082, + "learning_rate": 4.170910683616985e-06, + "loss": 0.4656, + "step": 1285 + }, + { + "epoch": 1.7951639153685188, + "grad_norm": 2.277468807868547, + "learning_rate": 4.1628980685033914e-06, + "loss": 0.5152, + "step": 1286 + }, + { + "epoch": 1.796558939781446, + "grad_norm": 2.056705613970272, + "learning_rate": 4.154887664519842e-06, + "loss": 0.4893, + "step": 1287 + }, + { + "epoch": 1.7979539641943734, + "grad_norm": 2.1349968125546437, + "learning_rate": 4.1468794928251064e-06, + "loss": 0.4686, + "step": 1288 + }, + { + "epoch": 1.7993489886073006, + "grad_norm": 2.1952902085733914, + "learning_rate": 4.138873574572053e-06, + "loss": 0.4743, + "step": 1289 + }, + { + "epoch": 1.800744013020228, + "grad_norm": 2.0284310881562826, + "learning_rate": 4.130869930907599e-06, + "loss": 0.4169, + "step": 1290 + }, + { + "epoch": 1.8021390374331552, + "grad_norm": 1.9048197342872124, + "learning_rate": 4.122868582972659e-06, + "loss": 0.4679, + "step": 1291 + }, + { + "epoch": 1.8035340618460824, + "grad_norm": 2.0323882452817896, + "learning_rate": 4.114869551902075e-06, + "loss": 0.4279, + "step": 1292 + }, + { + "epoch": 1.8049290862590095, + "grad_norm": 2.0139222221685618, + "learning_rate": 4.106872858824576e-06, + "loss": 0.4539, + "step": 1293 + }, + { + "epoch": 1.8063241106719368, + "grad_norm": 2.0047655257894874, + "learning_rate": 4.098878524862715e-06, + "loss": 0.4979, + "step": 1294 + }, + { + "epoch": 1.807719135084864, + "grad_norm": 2.2420685216305314, + "learning_rate": 4.090886571132807e-06, + "loss": 0.566, + "step": 1295 + }, + { + "epoch": 1.809114159497791, + "grad_norm": 2.219670568258558, + "learning_rate": 4.082897018744887e-06, + "loss": 0.4951, + "step": 1296 + }, + { + "epoch": 1.8105091839107184, + "grad_norm": 2.339784196893198, + "learning_rate": 4.074909888802648e-06, + "loss": 0.505, + "step": 1297 + }, + { + "epoch": 1.8119042083236456, + "grad_norm": 1.8525635944473475, + "learning_rate": 4.066925202403374e-06, + "loss": 0.418, + "step": 1298 + }, + { + "epoch": 1.813299232736573, + "grad_norm": 2.097308855709139, + "learning_rate": 4.058942980637906e-06, + "loss": 0.4854, + "step": 1299 + }, + { + "epoch": 1.8146942571495002, + "grad_norm": 1.9765348057867853, + "learning_rate": 4.050963244590571e-06, + "loss": 0.4356, + "step": 1300 + }, + { + "epoch": 1.8160892815624274, + "grad_norm": 1.9813369058762067, + "learning_rate": 4.042986015339126e-06, + "loss": 0.4718, + "step": 1301 + }, + { + "epoch": 1.8174843059753547, + "grad_norm": 1.9999920964885178, + "learning_rate": 4.035011313954713e-06, + "loss": 0.4839, + "step": 1302 + }, + { + "epoch": 1.8188793303882818, + "grad_norm": 2.1800679736320103, + "learning_rate": 4.027039161501795e-06, + "loss": 0.4071, + "step": 1303 + }, + { + "epoch": 1.820274354801209, + "grad_norm": 1.9433494165364085, + "learning_rate": 4.019069579038096e-06, + "loss": 0.4594, + "step": 1304 + }, + { + "epoch": 1.8216693792141363, + "grad_norm": 1.9112976678661902, + "learning_rate": 4.011102587614563e-06, + "loss": 0.4272, + "step": 1305 + }, + { + "epoch": 1.8230644036270633, + "grad_norm": 2.0895025081439487, + "learning_rate": 4.00313820827529e-06, + "loss": 0.4914, + "step": 1306 + }, + { + "epoch": 1.8244594280399906, + "grad_norm": 2.2075749414041135, + "learning_rate": 3.995176462057473e-06, + "loss": 0.5239, + "step": 1307 + }, + { + "epoch": 1.8258544524529179, + "grad_norm": 2.2550877735540555, + "learning_rate": 3.987217369991357e-06, + "loss": 0.468, + "step": 1308 + }, + { + "epoch": 1.8272494768658452, + "grad_norm": 2.050163159425238, + "learning_rate": 3.979260953100169e-06, + "loss": 0.4714, + "step": 1309 + }, + { + "epoch": 1.8286445012787724, + "grad_norm": 2.0819553331959315, + "learning_rate": 3.97130723240008e-06, + "loss": 0.4548, + "step": 1310 + }, + { + "epoch": 1.8300395256916997, + "grad_norm": 2.414283571515306, + "learning_rate": 3.96335622890013e-06, + "loss": 0.4951, + "step": 1311 + }, + { + "epoch": 1.831434550104627, + "grad_norm": 1.6184625533605201, + "learning_rate": 3.955407963602184e-06, + "loss": 0.4758, + "step": 1312 + }, + { + "epoch": 1.832829574517554, + "grad_norm": 2.0008120841892967, + "learning_rate": 3.94746245750088e-06, + "loss": 0.4571, + "step": 1313 + }, + { + "epoch": 1.8342245989304813, + "grad_norm": 2.1349958964625815, + "learning_rate": 3.939519731583557e-06, + "loss": 0.4837, + "step": 1314 + }, + { + "epoch": 1.8356196233434086, + "grad_norm": 1.837343945362903, + "learning_rate": 3.9315798068302214e-06, + "loss": 0.438, + "step": 1315 + }, + { + "epoch": 1.8370146477563356, + "grad_norm": 1.8766035582552343, + "learning_rate": 3.923642704213475e-06, + "loss": 0.4936, + "step": 1316 + }, + { + "epoch": 1.8384096721692629, + "grad_norm": 2.0632480061696117, + "learning_rate": 3.915708444698465e-06, + "loss": 0.4904, + "step": 1317 + }, + { + "epoch": 1.8398046965821901, + "grad_norm": 2.0966458414991576, + "learning_rate": 3.907777049242828e-06, + "loss": 0.4587, + "step": 1318 + }, + { + "epoch": 1.8411997209951174, + "grad_norm": 2.0322570194266056, + "learning_rate": 3.899848538796643e-06, + "loss": 0.4788, + "step": 1319 + }, + { + "epoch": 1.8425947454080447, + "grad_norm": 2.000161074661116, + "learning_rate": 3.891922934302356e-06, + "loss": 0.4924, + "step": 1320 + }, + { + "epoch": 1.843989769820972, + "grad_norm": 2.1998938474565213, + "learning_rate": 3.884000256694749e-06, + "loss": 0.4836, + "step": 1321 + }, + { + "epoch": 1.8453847942338992, + "grad_norm": 2.1050792021780027, + "learning_rate": 3.876080526900867e-06, + "loss": 0.5069, + "step": 1322 + }, + { + "epoch": 1.8467798186468263, + "grad_norm": 2.1401256807615727, + "learning_rate": 3.868163765839966e-06, + "loss": 0.4742, + "step": 1323 + }, + { + "epoch": 1.8481748430597535, + "grad_norm": 1.9999712616159633, + "learning_rate": 3.860249994423467e-06, + "loss": 0.4724, + "step": 1324 + }, + { + "epoch": 1.8495698674726808, + "grad_norm": 2.212420210483622, + "learning_rate": 3.852339233554891e-06, + "loss": 0.4594, + "step": 1325 + }, + { + "epoch": 1.8509648918856079, + "grad_norm": 2.1246908494803236, + "learning_rate": 3.844431504129804e-06, + "loss": 0.5114, + "step": 1326 + }, + { + "epoch": 1.8523599162985351, + "grad_norm": 2.092568223347476, + "learning_rate": 3.8365268270357715e-06, + "loss": 0.4775, + "step": 1327 + }, + { + "epoch": 1.8537549407114624, + "grad_norm": 1.9869337488454075, + "learning_rate": 3.828625223152291e-06, + "loss": 0.4637, + "step": 1328 + }, + { + "epoch": 1.8551499651243897, + "grad_norm": 2.055635422327693, + "learning_rate": 3.820726713350742e-06, + "loss": 0.4614, + "step": 1329 + }, + { + "epoch": 1.856544989537317, + "grad_norm": 2.1873731783351396, + "learning_rate": 3.812831318494335e-06, + "loss": 0.4888, + "step": 1330 + }, + { + "epoch": 1.8579400139502442, + "grad_norm": 2.0003013010135255, + "learning_rate": 3.804939059438052e-06, + "loss": 0.4689, + "step": 1331 + }, + { + "epoch": 1.8593350383631715, + "grad_norm": 2.071605299747796, + "learning_rate": 3.797049957028588e-06, + "loss": 0.476, + "step": 1332 + }, + { + "epoch": 1.8607300627760985, + "grad_norm": 2.0816926086751155, + "learning_rate": 3.7891640321043054e-06, + "loss": 0.4216, + "step": 1333 + }, + { + "epoch": 1.8621250871890258, + "grad_norm": 2.1529782419951857, + "learning_rate": 3.781281305495171e-06, + "loss": 0.421, + "step": 1334 + }, + { + "epoch": 1.863520111601953, + "grad_norm": 1.9862903369732532, + "learning_rate": 3.773401798022701e-06, + "loss": 0.4328, + "step": 1335 + }, + { + "epoch": 1.8649151360148801, + "grad_norm": 2.1563537251967455, + "learning_rate": 3.765525530499915e-06, + "loss": 0.4637, + "step": 1336 + }, + { + "epoch": 1.8663101604278074, + "grad_norm": 2.091076414635607, + "learning_rate": 3.757652523731269e-06, + "loss": 0.4169, + "step": 1337 + }, + { + "epoch": 1.8677051848407347, + "grad_norm": 1.9448368258697788, + "learning_rate": 3.7497827985126054e-06, + "loss": 0.4475, + "step": 1338 + }, + { + "epoch": 1.869100209253662, + "grad_norm": 2.024573917452205, + "learning_rate": 3.741916375631105e-06, + "loss": 0.4323, + "step": 1339 + }, + { + "epoch": 1.8704952336665892, + "grad_norm": 2.314125789311409, + "learning_rate": 3.7340532758652217e-06, + "loss": 0.5298, + "step": 1340 + }, + { + "epoch": 1.8718902580795165, + "grad_norm": 2.1177367118528614, + "learning_rate": 3.7261935199846266e-06, + "loss": 0.4449, + "step": 1341 + }, + { + "epoch": 1.8732852824924437, + "grad_norm": 2.262864442565798, + "learning_rate": 3.7183371287501684e-06, + "loss": 0.4593, + "step": 1342 + }, + { + "epoch": 1.8746803069053708, + "grad_norm": 1.971116485194465, + "learning_rate": 3.7104841229138034e-06, + "loss": 0.4833, + "step": 1343 + }, + { + "epoch": 1.876075331318298, + "grad_norm": 2.10454369475845, + "learning_rate": 3.7026345232185416e-06, + "loss": 0.438, + "step": 1344 + }, + { + "epoch": 1.8774703557312253, + "grad_norm": 1.9595998318710819, + "learning_rate": 3.6947883503984037e-06, + "loss": 0.4438, + "step": 1345 + }, + { + "epoch": 1.8788653801441524, + "grad_norm": 1.9520125987169128, + "learning_rate": 3.686945625178356e-06, + "loss": 0.4469, + "step": 1346 + }, + { + "epoch": 1.8802604045570797, + "grad_norm": 2.026901639450632, + "learning_rate": 3.6791063682742535e-06, + "loss": 0.4895, + "step": 1347 + }, + { + "epoch": 1.881655428970007, + "grad_norm": 2.048549645209667, + "learning_rate": 3.6712706003927937e-06, + "loss": 0.4631, + "step": 1348 + }, + { + "epoch": 1.8830504533829342, + "grad_norm": 2.0279995779597932, + "learning_rate": 3.6634383422314622e-06, + "loss": 0.442, + "step": 1349 + }, + { + "epoch": 1.8844454777958615, + "grad_norm": 2.1434290748836, + "learning_rate": 3.655609614478467e-06, + "loss": 0.5072, + "step": 1350 + }, + { + "epoch": 1.8858405022087887, + "grad_norm": 2.222887686762255, + "learning_rate": 3.647784437812693e-06, + "loss": 0.49, + "step": 1351 + }, + { + "epoch": 1.887235526621716, + "grad_norm": 2.2009445876073253, + "learning_rate": 3.6399628329036496e-06, + "loss": 0.4928, + "step": 1352 + }, + { + "epoch": 1.888630551034643, + "grad_norm": 2.174116284837636, + "learning_rate": 3.632144820411405e-06, + "loss": 0.3996, + "step": 1353 + }, + { + "epoch": 1.8900255754475703, + "grad_norm": 1.8585093779164292, + "learning_rate": 3.624330420986541e-06, + "loss": 0.4241, + "step": 1354 + }, + { + "epoch": 1.8914205998604976, + "grad_norm": 1.9457784843844574, + "learning_rate": 3.6165196552701e-06, + "loss": 0.4694, + "step": 1355 + }, + { + "epoch": 1.8928156242734246, + "grad_norm": 2.00270102931314, + "learning_rate": 3.6087125438935187e-06, + "loss": 0.4603, + "step": 1356 + }, + { + "epoch": 1.894210648686352, + "grad_norm": 2.0737253073991853, + "learning_rate": 3.6009091074785853e-06, + "loss": 0.461, + "step": 1357 + }, + { + "epoch": 1.8956056730992792, + "grad_norm": 2.064571146406589, + "learning_rate": 3.5931093666373845e-06, + "loss": 0.5147, + "step": 1358 + }, + { + "epoch": 1.8970006975122065, + "grad_norm": 2.1201036087247283, + "learning_rate": 3.585313341972232e-06, + "loss": 0.4543, + "step": 1359 + }, + { + "epoch": 1.8983957219251337, + "grad_norm": 2.080966829655726, + "learning_rate": 3.577521054075631e-06, + "loss": 0.4277, + "step": 1360 + }, + { + "epoch": 1.899790746338061, + "grad_norm": 2.0981462870140097, + "learning_rate": 3.5697325235302183e-06, + "loss": 0.4193, + "step": 1361 + }, + { + "epoch": 1.9011857707509883, + "grad_norm": 2.0868066804557257, + "learning_rate": 3.5619477709086982e-06, + "loss": 0.4302, + "step": 1362 + }, + { + "epoch": 1.9025807951639153, + "grad_norm": 2.1039057194297492, + "learning_rate": 3.5541668167738003e-06, + "loss": 0.5433, + "step": 1363 + }, + { + "epoch": 1.9039758195768426, + "grad_norm": 2.4297797676999, + "learning_rate": 3.546389681678224e-06, + "loss": 0.4555, + "step": 1364 + }, + { + "epoch": 1.9053708439897699, + "grad_norm": 1.9533632396939948, + "learning_rate": 3.538616386164575e-06, + "loss": 0.4645, + "step": 1365 + }, + { + "epoch": 1.906765868402697, + "grad_norm": 2.050877909307051, + "learning_rate": 3.530846950765318e-06, + "loss": 0.4768, + "step": 1366 + }, + { + "epoch": 1.9081608928156242, + "grad_norm": 2.1111581143101943, + "learning_rate": 3.5230813960027275e-06, + "loss": 0.4791, + "step": 1367 + }, + { + "epoch": 1.9095559172285514, + "grad_norm": 2.266748524269278, + "learning_rate": 3.5153197423888206e-06, + "loss": 0.4986, + "step": 1368 + }, + { + "epoch": 1.9109509416414787, + "grad_norm": 1.950535419944206, + "learning_rate": 3.5075620104253123e-06, + "loss": 0.4389, + "step": 1369 + }, + { + "epoch": 1.912345966054406, + "grad_norm": 2.0576812430591804, + "learning_rate": 3.4998082206035606e-06, + "loss": 0.4473, + "step": 1370 + }, + { + "epoch": 1.9137409904673333, + "grad_norm": 1.825998507257751, + "learning_rate": 3.492058393404509e-06, + "loss": 0.51, + "step": 1371 + }, + { + "epoch": 1.9151360148802605, + "grad_norm": 2.085578235101986, + "learning_rate": 3.4843125492986345e-06, + "loss": 0.4212, + "step": 1372 + }, + { + "epoch": 1.9165310392931876, + "grad_norm": 1.9189758160127117, + "learning_rate": 3.4765707087458912e-06, + "loss": 0.4944, + "step": 1373 + }, + { + "epoch": 1.9179260637061148, + "grad_norm": 2.113280326915886, + "learning_rate": 3.468832892195664e-06, + "loss": 0.4991, + "step": 1374 + }, + { + "epoch": 1.9193210881190421, + "grad_norm": 2.2202722381166025, + "learning_rate": 3.4610991200867006e-06, + "loss": 0.4734, + "step": 1375 + }, + { + "epoch": 1.9207161125319692, + "grad_norm": 1.9939976073920789, + "learning_rate": 3.453369412847071e-06, + "loss": 0.4053, + "step": 1376 + }, + { + "epoch": 1.9221111369448964, + "grad_norm": 2.0479827879447057, + "learning_rate": 3.445643790894109e-06, + "loss": 0.4436, + "step": 1377 + }, + { + "epoch": 1.9235061613578237, + "grad_norm": 2.1023765087199338, + "learning_rate": 3.4379222746343534e-06, + "loss": 0.4684, + "step": 1378 + }, + { + "epoch": 1.924901185770751, + "grad_norm": 2.0722694658703236, + "learning_rate": 3.4302048844634995e-06, + "loss": 0.4759, + "step": 1379 + }, + { + "epoch": 1.9262962101836782, + "grad_norm": 2.0161034662991693, + "learning_rate": 3.4224916407663484e-06, + "loss": 0.4283, + "step": 1380 + }, + { + "epoch": 1.9276912345966055, + "grad_norm": 2.114718900925972, + "learning_rate": 3.414782563916742e-06, + "loss": 0.5011, + "step": 1381 + }, + { + "epoch": 1.9290862590095328, + "grad_norm": 2.1968385042774283, + "learning_rate": 3.407077674277518e-06, + "loss": 0.4709, + "step": 1382 + }, + { + "epoch": 1.93048128342246, + "grad_norm": 2.2136573594525535, + "learning_rate": 3.3993769922004584e-06, + "loss": 0.5332, + "step": 1383 + }, + { + "epoch": 1.931876307835387, + "grad_norm": 2.143714300983868, + "learning_rate": 3.391680538026224e-06, + "loss": 0.4267, + "step": 1384 + }, + { + "epoch": 1.9332713322483144, + "grad_norm": 2.069489588319107, + "learning_rate": 3.3839883320843125e-06, + "loss": 0.4491, + "step": 1385 + }, + { + "epoch": 1.9346663566612414, + "grad_norm": 1.9452039632794036, + "learning_rate": 3.3763003946930023e-06, + "loss": 0.4664, + "step": 1386 + }, + { + "epoch": 1.9360613810741687, + "grad_norm": 2.189405225206946, + "learning_rate": 3.36861674615929e-06, + "loss": 0.4569, + "step": 1387 + }, + { + "epoch": 1.937456405487096, + "grad_norm": 2.220840979892466, + "learning_rate": 3.360937406778849e-06, + "loss": 0.5108, + "step": 1388 + }, + { + "epoch": 1.9388514299000232, + "grad_norm": 2.1295737957858423, + "learning_rate": 3.35326239683597e-06, + "loss": 0.4236, + "step": 1389 + }, + { + "epoch": 1.9402464543129505, + "grad_norm": 2.048196278694426, + "learning_rate": 3.3455917366035058e-06, + "loss": 0.4695, + "step": 1390 + }, + { + "epoch": 1.9416414787258778, + "grad_norm": 2.082360128476303, + "learning_rate": 3.337925446342819e-06, + "loss": 0.4389, + "step": 1391 + }, + { + "epoch": 1.943036503138805, + "grad_norm": 1.8747651704705666, + "learning_rate": 3.3302635463037352e-06, + "loss": 0.3972, + "step": 1392 + }, + { + "epoch": 1.9444315275517323, + "grad_norm": 2.048555484081868, + "learning_rate": 3.3226060567244767e-06, + "loss": 0.4803, + "step": 1393 + }, + { + "epoch": 1.9458265519646594, + "grad_norm": 2.136883881979085, + "learning_rate": 3.314952997831618e-06, + "loss": 0.4223, + "step": 1394 + }, + { + "epoch": 1.9472215763775866, + "grad_norm": 1.7874295552797317, + "learning_rate": 3.307304389840036e-06, + "loss": 0.4322, + "step": 1395 + }, + { + "epoch": 1.9486166007905137, + "grad_norm": 2.0438982119199465, + "learning_rate": 3.29966025295284e-06, + "loss": 0.4468, + "step": 1396 + }, + { + "epoch": 1.950011625203441, + "grad_norm": 2.1099373836087105, + "learning_rate": 3.292020607361337e-06, + "loss": 0.5667, + "step": 1397 + }, + { + "epoch": 1.9514066496163682, + "grad_norm": 2.1250544635117516, + "learning_rate": 3.284385473244974e-06, + "loss": 0.4201, + "step": 1398 + }, + { + "epoch": 1.9528016740292955, + "grad_norm": 2.0986308513899163, + "learning_rate": 3.2767548707712693e-06, + "loss": 0.4825, + "step": 1399 + }, + { + "epoch": 1.9541966984422228, + "grad_norm": 2.1463740035501977, + "learning_rate": 3.2691288200957826e-06, + "loss": 0.4821, + "step": 1400 + }, + { + "epoch": 1.95559172285515, + "grad_norm": 2.208153989617607, + "learning_rate": 3.2615073413620467e-06, + "loss": 0.5005, + "step": 1401 + }, + { + "epoch": 1.9569867472680773, + "grad_norm": 2.0747031690202524, + "learning_rate": 3.2538904547015137e-06, + "loss": 0.4718, + "step": 1402 + }, + { + "epoch": 1.9583817716810046, + "grad_norm": 2.058110546374091, + "learning_rate": 3.2462781802335124e-06, + "loss": 0.4699, + "step": 1403 + }, + { + "epoch": 1.9597767960939316, + "grad_norm": 2.187911351474957, + "learning_rate": 3.2386705380651877e-06, + "loss": 0.4686, + "step": 1404 + }, + { + "epoch": 1.961171820506859, + "grad_norm": 1.8656819381405738, + "learning_rate": 3.2310675482914444e-06, + "loss": 0.4688, + "step": 1405 + }, + { + "epoch": 1.962566844919786, + "grad_norm": 2.0876329165242047, + "learning_rate": 3.2234692309949034e-06, + "loss": 0.4658, + "step": 1406 + }, + { + "epoch": 1.9639618693327132, + "grad_norm": 2.0606274699365943, + "learning_rate": 3.2158756062458422e-06, + "loss": 0.4721, + "step": 1407 + }, + { + "epoch": 1.9653568937456405, + "grad_norm": 2.030107454192887, + "learning_rate": 3.208286694102141e-06, + "loss": 0.502, + "step": 1408 + }, + { + "epoch": 1.9667519181585678, + "grad_norm": 2.16972444150237, + "learning_rate": 3.2007025146092345e-06, + "loss": 0.4383, + "step": 1409 + }, + { + "epoch": 1.968146942571495, + "grad_norm": 1.8504090641131754, + "learning_rate": 3.1931230878000586e-06, + "loss": 0.4498, + "step": 1410 + }, + { + "epoch": 1.9695419669844223, + "grad_norm": 2.0621064144305143, + "learning_rate": 3.1855484336949876e-06, + "loss": 0.4547, + "step": 1411 + }, + { + "epoch": 1.9709369913973496, + "grad_norm": 1.9346127805447864, + "learning_rate": 3.1779785723017988e-06, + "loss": 0.5044, + "step": 1412 + }, + { + "epoch": 1.9723320158102768, + "grad_norm": 2.093246509824468, + "learning_rate": 3.170413523615605e-06, + "loss": 0.4427, + "step": 1413 + }, + { + "epoch": 1.9737270402232039, + "grad_norm": 2.1928383212100453, + "learning_rate": 3.162853307618805e-06, + "loss": 0.3999, + "step": 1414 + }, + { + "epoch": 1.9751220646361312, + "grad_norm": 1.8107419503015638, + "learning_rate": 3.155297944281036e-06, + "loss": 0.4503, + "step": 1415 + }, + { + "epoch": 1.9765170890490582, + "grad_norm": 2.08654083770065, + "learning_rate": 3.1477474535591167e-06, + "loss": 0.4615, + "step": 1416 + }, + { + "epoch": 1.9779121134619855, + "grad_norm": 2.0894026454200834, + "learning_rate": 3.1402018553969917e-06, + "loss": 0.5014, + "step": 1417 + }, + { + "epoch": 1.9793071378749127, + "grad_norm": 2.0296124204690487, + "learning_rate": 3.132661169725688e-06, + "loss": 0.4736, + "step": 1418 + }, + { + "epoch": 1.98070216228784, + "grad_norm": 1.9519281154170998, + "learning_rate": 3.125125416463254e-06, + "loss": 0.4329, + "step": 1419 + }, + { + "epoch": 1.9820971867007673, + "grad_norm": 2.1011005867616337, + "learning_rate": 3.1175946155147064e-06, + "loss": 0.4609, + "step": 1420 + }, + { + "epoch": 1.9834922111136946, + "grad_norm": 2.3096209237137524, + "learning_rate": 3.110068786771987e-06, + "loss": 0.4718, + "step": 1421 + }, + { + "epoch": 1.9848872355266218, + "grad_norm": 1.7175842413795206, + "learning_rate": 3.1025479501139e-06, + "loss": 0.4926, + "step": 1422 + }, + { + "epoch": 1.986282259939549, + "grad_norm": 2.206831702580605, + "learning_rate": 3.095032125406062e-06, + "loss": 0.4488, + "step": 1423 + }, + { + "epoch": 1.9876772843524761, + "grad_norm": 2.0869072404586597, + "learning_rate": 3.0875213325008548e-06, + "loss": 0.5062, + "step": 1424 + }, + { + "epoch": 1.9890723087654034, + "grad_norm": 2.1927222257464085, + "learning_rate": 3.0800155912373696e-06, + "loss": 0.4717, + "step": 1425 + }, + { + "epoch": 1.9904673331783305, + "grad_norm": 2.175734397835776, + "learning_rate": 3.0725149214413487e-06, + "loss": 0.4203, + "step": 1426 + }, + { + "epoch": 1.9918623575912577, + "grad_norm": 1.9894712393139962, + "learning_rate": 3.065019342925143e-06, + "loss": 0.4895, + "step": 1427 + }, + { + "epoch": 1.993257382004185, + "grad_norm": 2.2695006617555653, + "learning_rate": 3.0575288754876565e-06, + "loss": 0.478, + "step": 1428 + }, + { + "epoch": 1.9946524064171123, + "grad_norm": 1.8870689048595921, + "learning_rate": 3.0500435389142867e-06, + "loss": 0.4129, + "step": 1429 + }, + { + "epoch": 1.9960474308300395, + "grad_norm": 2.1967720177098946, + "learning_rate": 3.042563352976884e-06, + "loss": 0.456, + "step": 1430 + }, + { + "epoch": 1.9974424552429668, + "grad_norm": 2.0420373038544866, + "learning_rate": 3.035088337433694e-06, + "loss": 0.4766, + "step": 1431 + }, + { + "epoch": 1.998837479655894, + "grad_norm": 2.112347205511411, + "learning_rate": 3.0276185120292996e-06, + "loss": 0.4888, + "step": 1432 + }, + { + "epoch": 2.0013950244129273, + "grad_norm": 2.1052085745081897, + "learning_rate": 3.0201538964945787e-06, + "loss": 0.7158, + "step": 1433 + }, + { + "epoch": 2.0027900488258545, + "grad_norm": 1.9402636424171937, + "learning_rate": 3.0126945105466486e-06, + "loss": 0.2674, + "step": 1434 + }, + { + "epoch": 2.004185073238782, + "grad_norm": 1.8085458724472494, + "learning_rate": 3.005240373888812e-06, + "loss": 0.2916, + "step": 1435 + }, + { + "epoch": 2.005580097651709, + "grad_norm": 1.9029597036365433, + "learning_rate": 2.9977915062105023e-06, + "loss": 0.2669, + "step": 1436 + }, + { + "epoch": 2.0069751220646364, + "grad_norm": 1.7044125918783548, + "learning_rate": 2.9903479271872416e-06, + "loss": 0.2359, + "step": 1437 + }, + { + "epoch": 2.008370146477563, + "grad_norm": 1.9190122425840221, + "learning_rate": 2.9829096564805804e-06, + "loss": 0.2301, + "step": 1438 + }, + { + "epoch": 2.0097651708904904, + "grad_norm": 1.7303744404988668, + "learning_rate": 2.975476713738043e-06, + "loss": 0.2489, + "step": 1439 + }, + { + "epoch": 2.0111601953034177, + "grad_norm": 1.721857219696397, + "learning_rate": 2.9680491185930877e-06, + "loss": 0.2564, + "step": 1440 + }, + { + "epoch": 2.012555219716345, + "grad_norm": 1.894684087795828, + "learning_rate": 2.960626890665044e-06, + "loss": 0.2444, + "step": 1441 + }, + { + "epoch": 2.0139502441292723, + "grad_norm": 1.839486201760737, + "learning_rate": 2.953210049559062e-06, + "loss": 0.2157, + "step": 1442 + }, + { + "epoch": 2.0153452685421995, + "grad_norm": 2.4897922879317425, + "learning_rate": 2.945798614866068e-06, + "loss": 0.2716, + "step": 1443 + }, + { + "epoch": 2.016740292955127, + "grad_norm": 2.344531912366505, + "learning_rate": 2.9383926061627055e-06, + "loss": 0.239, + "step": 1444 + }, + { + "epoch": 2.018135317368054, + "grad_norm": 2.914955661673377, + "learning_rate": 2.9309920430112825e-06, + "loss": 0.2717, + "step": 1445 + }, + { + "epoch": 2.0195303417809813, + "grad_norm": 2.498308889226278, + "learning_rate": 2.92359694495973e-06, + "loss": 0.2606, + "step": 1446 + }, + { + "epoch": 2.0209253661939086, + "grad_norm": 2.1050902414844015, + "learning_rate": 2.9162073315415384e-06, + "loss": 0.2375, + "step": 1447 + }, + { + "epoch": 2.0223203906068354, + "grad_norm": 2.456539411273798, + "learning_rate": 2.9088232222757085e-06, + "loss": 0.2524, + "step": 1448 + }, + { + "epoch": 2.0237154150197627, + "grad_norm": 2.1469646592078653, + "learning_rate": 2.9014446366667115e-06, + "loss": 0.2684, + "step": 1449 + }, + { + "epoch": 2.02511043943269, + "grad_norm": 2.0918990363899552, + "learning_rate": 2.8940715942044204e-06, + "loss": 0.2578, + "step": 1450 + }, + { + "epoch": 2.0265054638456172, + "grad_norm": 1.9929495750968211, + "learning_rate": 2.8867041143640663e-06, + "loss": 0.2332, + "step": 1451 + }, + { + "epoch": 2.0279004882585445, + "grad_norm": 2.139820882014351, + "learning_rate": 2.8793422166061918e-06, + "loss": 0.2801, + "step": 1452 + }, + { + "epoch": 2.029295512671472, + "grad_norm": 1.9416468277806427, + "learning_rate": 2.8719859203765955e-06, + "loss": 0.227, + "step": 1453 + }, + { + "epoch": 2.030690537084399, + "grad_norm": 1.9592248880362728, + "learning_rate": 2.864635245106272e-06, + "loss": 0.2398, + "step": 1454 + }, + { + "epoch": 2.0320855614973263, + "grad_norm": 1.942022515983665, + "learning_rate": 2.8572902102113788e-06, + "loss": 0.2338, + "step": 1455 + }, + { + "epoch": 2.0334805859102536, + "grad_norm": 2.0314568308212535, + "learning_rate": 2.849950835093168e-06, + "loss": 0.2413, + "step": 1456 + }, + { + "epoch": 2.034875610323181, + "grad_norm": 1.8345231386774408, + "learning_rate": 2.8426171391379433e-06, + "loss": 0.2498, + "step": 1457 + }, + { + "epoch": 2.0362706347361077, + "grad_norm": 1.73045801685932, + "learning_rate": 2.835289141717008e-06, + "loss": 0.2173, + "step": 1458 + }, + { + "epoch": 2.037665659149035, + "grad_norm": 1.9676717354052715, + "learning_rate": 2.827966862186616e-06, + "loss": 0.2358, + "step": 1459 + }, + { + "epoch": 2.0390606835619622, + "grad_norm": 1.9796197987643156, + "learning_rate": 2.820650319887911e-06, + "loss": 0.2414, + "step": 1460 + }, + { + "epoch": 2.0404557079748895, + "grad_norm": 2.0928848682799384, + "learning_rate": 2.8133395341468915e-06, + "loss": 0.2527, + "step": 1461 + }, + { + "epoch": 2.041850732387817, + "grad_norm": 2.044176118101689, + "learning_rate": 2.8060345242743427e-06, + "loss": 0.2319, + "step": 1462 + }, + { + "epoch": 2.043245756800744, + "grad_norm": 1.8085526333410318, + "learning_rate": 2.7987353095657944e-06, + "loss": 0.2663, + "step": 1463 + }, + { + "epoch": 2.0446407812136713, + "grad_norm": 2.1890698281590772, + "learning_rate": 2.7914419093014734e-06, + "loss": 0.2424, + "step": 1464 + }, + { + "epoch": 2.0460358056265986, + "grad_norm": 1.8839951262325991, + "learning_rate": 2.784154342746246e-06, + "loss": 0.2453, + "step": 1465 + }, + { + "epoch": 2.047430830039526, + "grad_norm": 2.1446428182352024, + "learning_rate": 2.7768726291495667e-06, + "loss": 0.2616, + "step": 1466 + }, + { + "epoch": 2.048825854452453, + "grad_norm": 2.12053100151656, + "learning_rate": 2.7695967877454356e-06, + "loss": 0.2628, + "step": 1467 + }, + { + "epoch": 2.05022087886538, + "grad_norm": 2.010891601213425, + "learning_rate": 2.7623268377523356e-06, + "loss": 0.2103, + "step": 1468 + }, + { + "epoch": 2.0516159032783072, + "grad_norm": 1.9005154237356725, + "learning_rate": 2.755062798373189e-06, + "loss": 0.2456, + "step": 1469 + }, + { + "epoch": 2.0530109276912345, + "grad_norm": 2.119569304257289, + "learning_rate": 2.747804688795311e-06, + "loss": 0.2576, + "step": 1470 + }, + { + "epoch": 2.0544059521041618, + "grad_norm": 1.9846330923110018, + "learning_rate": 2.7405525281903506e-06, + "loss": 0.2108, + "step": 1471 + }, + { + "epoch": 2.055800976517089, + "grad_norm": 2.0242598137098358, + "learning_rate": 2.7333063357142414e-06, + "loss": 0.2376, + "step": 1472 + }, + { + "epoch": 2.0571960009300163, + "grad_norm": 2.065760746797269, + "learning_rate": 2.7260661305071523e-06, + "loss": 0.2202, + "step": 1473 + }, + { + "epoch": 2.0585910253429436, + "grad_norm": 1.9897327351088137, + "learning_rate": 2.718831931693443e-06, + "loss": 0.2508, + "step": 1474 + }, + { + "epoch": 2.059986049755871, + "grad_norm": 1.9011348442504956, + "learning_rate": 2.7116037583816e-06, + "loss": 0.2504, + "step": 1475 + }, + { + "epoch": 2.061381074168798, + "grad_norm": 2.054795671413816, + "learning_rate": 2.7043816296642005e-06, + "loss": 0.2314, + "step": 1476 + }, + { + "epoch": 2.0627760985817254, + "grad_norm": 2.1098702197687156, + "learning_rate": 2.6971655646178544e-06, + "loss": 0.2603, + "step": 1477 + }, + { + "epoch": 2.064171122994652, + "grad_norm": 1.88931900817361, + "learning_rate": 2.689955582303152e-06, + "loss": 0.2164, + "step": 1478 + }, + { + "epoch": 2.0655661474075795, + "grad_norm": 1.8274028969126306, + "learning_rate": 2.6827517017646154e-06, + "loss": 0.2246, + "step": 1479 + }, + { + "epoch": 2.0669611718205068, + "grad_norm": 2.097344575244166, + "learning_rate": 2.6755539420306565e-06, + "loss": 0.2263, + "step": 1480 + }, + { + "epoch": 2.068356196233434, + "grad_norm": 1.6940391063154205, + "learning_rate": 2.668362322113512e-06, + "loss": 0.2429, + "step": 1481 + }, + { + "epoch": 2.0697512206463613, + "grad_norm": 1.8672184430662262, + "learning_rate": 2.661176861009205e-06, + "loss": 0.2409, + "step": 1482 + }, + { + "epoch": 2.0711462450592886, + "grad_norm": 2.0268800259192523, + "learning_rate": 2.6539975776974926e-06, + "loss": 0.2332, + "step": 1483 + }, + { + "epoch": 2.072541269472216, + "grad_norm": 1.9571704604901023, + "learning_rate": 2.646824491141807e-06, + "loss": 0.2227, + "step": 1484 + }, + { + "epoch": 2.073936293885143, + "grad_norm": 1.9668513577356352, + "learning_rate": 2.6396576202892176e-06, + "loss": 0.2221, + "step": 1485 + }, + { + "epoch": 2.0753313182980704, + "grad_norm": 1.8267652821156959, + "learning_rate": 2.632496984070375e-06, + "loss": 0.2181, + "step": 1486 + }, + { + "epoch": 2.0767263427109977, + "grad_norm": 2.0560708153439533, + "learning_rate": 2.6253426013994586e-06, + "loss": 0.2557, + "step": 1487 + }, + { + "epoch": 2.0781213671239245, + "grad_norm": 2.17511504619303, + "learning_rate": 2.6181944911741333e-06, + "loss": 0.2532, + "step": 1488 + }, + { + "epoch": 2.0795163915368517, + "grad_norm": 2.094401377084581, + "learning_rate": 2.6110526722754955e-06, + "loss": 0.2683, + "step": 1489 + }, + { + "epoch": 2.080911415949779, + "grad_norm": 2.175178881238892, + "learning_rate": 2.603917163568021e-06, + "loss": 0.2466, + "step": 1490 + }, + { + "epoch": 2.0823064403627063, + "grad_norm": 1.882430762960756, + "learning_rate": 2.5967879838995176e-06, + "loss": 0.2261, + "step": 1491 + }, + { + "epoch": 2.0837014647756336, + "grad_norm": 2.138876118948161, + "learning_rate": 2.589665152101081e-06, + "loss": 0.2428, + "step": 1492 + }, + { + "epoch": 2.085096489188561, + "grad_norm": 1.9993127285312577, + "learning_rate": 2.582548686987031e-06, + "loss": 0.2349, + "step": 1493 + }, + { + "epoch": 2.086491513601488, + "grad_norm": 1.9705015030533575, + "learning_rate": 2.5754386073548775e-06, + "loss": 0.2198, + "step": 1494 + }, + { + "epoch": 2.0878865380144154, + "grad_norm": 2.032583776330532, + "learning_rate": 2.5683349319852647e-06, + "loss": 0.2423, + "step": 1495 + }, + { + "epoch": 2.0892815624273426, + "grad_norm": 1.9542879134817772, + "learning_rate": 2.5612376796419126e-06, + "loss": 0.242, + "step": 1496 + }, + { + "epoch": 2.09067658684027, + "grad_norm": 2.098429761287188, + "learning_rate": 2.5541468690715797e-06, + "loss": 0.2491, + "step": 1497 + }, + { + "epoch": 2.0920716112531967, + "grad_norm": 2.0326652718730287, + "learning_rate": 2.5470625190040105e-06, + "loss": 0.2499, + "step": 1498 + }, + { + "epoch": 2.093466635666124, + "grad_norm": 2.105130932551734, + "learning_rate": 2.5399846481518857e-06, + "loss": 0.2501, + "step": 1499 + }, + { + "epoch": 2.0948616600790513, + "grad_norm": 1.9278156061938734, + "learning_rate": 2.5329132752107675e-06, + "loss": 0.2606, + "step": 1500 + }, + { + "epoch": 2.0962566844919786, + "grad_norm": 2.0731891083929472, + "learning_rate": 2.525848418859055e-06, + "loss": 0.2497, + "step": 1501 + }, + { + "epoch": 2.097651708904906, + "grad_norm": 1.7753623857882452, + "learning_rate": 2.518790097757938e-06, + "loss": 0.2183, + "step": 1502 + }, + { + "epoch": 2.099046733317833, + "grad_norm": 1.8816805999317845, + "learning_rate": 2.51173833055134e-06, + "loss": 0.223, + "step": 1503 + }, + { + "epoch": 2.1004417577307604, + "grad_norm": 2.050127188663919, + "learning_rate": 2.504693135865875e-06, + "loss": 0.2604, + "step": 1504 + }, + { + "epoch": 2.1018367821436876, + "grad_norm": 2.068754384900015, + "learning_rate": 2.497654532310799e-06, + "loss": 0.2444, + "step": 1505 + }, + { + "epoch": 2.103231806556615, + "grad_norm": 1.8837169365636035, + "learning_rate": 2.490622538477952e-06, + "loss": 0.2445, + "step": 1506 + }, + { + "epoch": 2.104626830969542, + "grad_norm": 2.1180481981544097, + "learning_rate": 2.483597172941718e-06, + "loss": 0.2346, + "step": 1507 + }, + { + "epoch": 2.106021855382469, + "grad_norm": 2.0748715550357946, + "learning_rate": 2.4765784542589754e-06, + "loss": 0.2497, + "step": 1508 + }, + { + "epoch": 2.1074168797953963, + "grad_norm": 2.0079446467193995, + "learning_rate": 2.46956640096904e-06, + "loss": 0.2032, + "step": 1509 + }, + { + "epoch": 2.1088119042083235, + "grad_norm": 1.8790214788210842, + "learning_rate": 2.4625610315936267e-06, + "loss": 0.245, + "step": 1510 + }, + { + "epoch": 2.110206928621251, + "grad_norm": 2.1128877451193864, + "learning_rate": 2.4555623646367952e-06, + "loss": 0.2778, + "step": 1511 + }, + { + "epoch": 2.111601953034178, + "grad_norm": 2.1529575803923353, + "learning_rate": 2.448570418584898e-06, + "loss": 0.2477, + "step": 1512 + }, + { + "epoch": 2.1129969774471054, + "grad_norm": 1.9471977071121356, + "learning_rate": 2.4415852119065343e-06, + "loss": 0.2454, + "step": 1513 + }, + { + "epoch": 2.1143920018600326, + "grad_norm": 1.9889979101530386, + "learning_rate": 2.4346067630525084e-06, + "loss": 0.2442, + "step": 1514 + }, + { + "epoch": 2.11578702627296, + "grad_norm": 2.139859675456495, + "learning_rate": 2.427635090455766e-06, + "loss": 0.2289, + "step": 1515 + }, + { + "epoch": 2.117182050685887, + "grad_norm": 2.013707423934541, + "learning_rate": 2.42067021253136e-06, + "loss": 0.232, + "step": 1516 + }, + { + "epoch": 2.1185770750988144, + "grad_norm": 1.89760209157116, + "learning_rate": 2.4137121476763965e-06, + "loss": 0.2077, + "step": 1517 + }, + { + "epoch": 2.1199720995117413, + "grad_norm": 2.0816857703833143, + "learning_rate": 2.4067609142699798e-06, + "loss": 0.2193, + "step": 1518 + }, + { + "epoch": 2.1213671239246685, + "grad_norm": 2.1617449428356705, + "learning_rate": 2.3998165306731713e-06, + "loss": 0.2411, + "step": 1519 + }, + { + "epoch": 2.122762148337596, + "grad_norm": 1.985051182731335, + "learning_rate": 2.3928790152289443e-06, + "loss": 0.2066, + "step": 1520 + }, + { + "epoch": 2.124157172750523, + "grad_norm": 2.0332885897822917, + "learning_rate": 2.385948386262123e-06, + "loss": 0.2589, + "step": 1521 + }, + { + "epoch": 2.1255521971634503, + "grad_norm": 2.1609643142255477, + "learning_rate": 2.3790246620793466e-06, + "loss": 0.2453, + "step": 1522 + }, + { + "epoch": 2.1269472215763776, + "grad_norm": 2.1104388284929074, + "learning_rate": 2.372107860969019e-06, + "loss": 0.2327, + "step": 1523 + }, + { + "epoch": 2.128342245989305, + "grad_norm": 2.0050701731009664, + "learning_rate": 2.3651980012012454e-06, + "loss": 0.2183, + "step": 1524 + }, + { + "epoch": 2.129737270402232, + "grad_norm": 1.8374702594979928, + "learning_rate": 2.358295101027807e-06, + "loss": 0.2271, + "step": 1525 + }, + { + "epoch": 2.1311322948151594, + "grad_norm": 1.7744485016961058, + "learning_rate": 2.351399178682101e-06, + "loss": 0.2318, + "step": 1526 + }, + { + "epoch": 2.1325273192280867, + "grad_norm": 2.1707657393472557, + "learning_rate": 2.3445102523790876e-06, + "loss": 0.2192, + "step": 1527 + }, + { + "epoch": 2.1339223436410135, + "grad_norm": 1.9212023430718843, + "learning_rate": 2.3376283403152527e-06, + "loss": 0.2274, + "step": 1528 + }, + { + "epoch": 2.135317368053941, + "grad_norm": 1.9431330763254702, + "learning_rate": 2.330753460668553e-06, + "loss": 0.2363, + "step": 1529 + }, + { + "epoch": 2.136712392466868, + "grad_norm": 1.850992735000709, + "learning_rate": 2.323885631598366e-06, + "loss": 0.2398, + "step": 1530 + }, + { + "epoch": 2.1381074168797953, + "grad_norm": 1.9617578235530817, + "learning_rate": 2.3170248712454525e-06, + "loss": 0.247, + "step": 1531 + }, + { + "epoch": 2.1395024412927226, + "grad_norm": 2.0582404649051385, + "learning_rate": 2.3101711977318995e-06, + "loss": 0.2654, + "step": 1532 + }, + { + "epoch": 2.14089746570565, + "grad_norm": 2.0530723360930927, + "learning_rate": 2.3033246291610717e-06, + "loss": 0.2297, + "step": 1533 + }, + { + "epoch": 2.142292490118577, + "grad_norm": 2.0428682175339183, + "learning_rate": 2.2964851836175705e-06, + "loss": 0.2479, + "step": 1534 + }, + { + "epoch": 2.1436875145315044, + "grad_norm": 2.033140247859454, + "learning_rate": 2.2896528791671807e-06, + "loss": 0.2287, + "step": 1535 + }, + { + "epoch": 2.1450825389444317, + "grad_norm": 1.9706953955373776, + "learning_rate": 2.2828277338568226e-06, + "loss": 0.2309, + "step": 1536 + }, + { + "epoch": 2.146477563357359, + "grad_norm": 2.0657013312891763, + "learning_rate": 2.2760097657145096e-06, + "loss": 0.2376, + "step": 1537 + }, + { + "epoch": 2.147872587770286, + "grad_norm": 2.010341964625042, + "learning_rate": 2.2691989927492984e-06, + "loss": 0.2577, + "step": 1538 + }, + { + "epoch": 2.149267612183213, + "grad_norm": 2.2001973373784334, + "learning_rate": 2.262395432951235e-06, + "loss": 0.2596, + "step": 1539 + }, + { + "epoch": 2.1506626365961403, + "grad_norm": 1.9469948847454945, + "learning_rate": 2.2555991042913177e-06, + "loss": 0.2278, + "step": 1540 + }, + { + "epoch": 2.1520576610090676, + "grad_norm": 2.108148021681362, + "learning_rate": 2.248810024721441e-06, + "loss": 0.2251, + "step": 1541 + }, + { + "epoch": 2.153452685421995, + "grad_norm": 1.9836437529062159, + "learning_rate": 2.2420282121743513e-06, + "loss": 0.2367, + "step": 1542 + }, + { + "epoch": 2.154847709834922, + "grad_norm": 2.024544254750702, + "learning_rate": 2.235253684563602e-06, + "loss": 0.2259, + "step": 1543 + }, + { + "epoch": 2.1562427342478494, + "grad_norm": 2.016705862662843, + "learning_rate": 2.228486459783506e-06, + "loss": 0.2176, + "step": 1544 + }, + { + "epoch": 2.1576377586607767, + "grad_norm": 1.8277595229039698, + "learning_rate": 2.221726555709079e-06, + "loss": 0.2328, + "step": 1545 + }, + { + "epoch": 2.159032783073704, + "grad_norm": 2.0853945150495137, + "learning_rate": 2.2149739901960088e-06, + "loss": 0.2496, + "step": 1546 + }, + { + "epoch": 2.160427807486631, + "grad_norm": 1.933919574208661, + "learning_rate": 2.208228781080592e-06, + "loss": 0.2276, + "step": 1547 + }, + { + "epoch": 2.161822831899558, + "grad_norm": 1.9957596951759287, + "learning_rate": 2.201490946179696e-06, + "loss": 0.2202, + "step": 1548 + }, + { + "epoch": 2.1632178563124853, + "grad_norm": 1.8822509043654059, + "learning_rate": 2.19476050329071e-06, + "loss": 0.2122, + "step": 1549 + }, + { + "epoch": 2.1646128807254126, + "grad_norm": 1.984553964575256, + "learning_rate": 2.188037470191502e-06, + "loss": 0.2258, + "step": 1550 + }, + { + "epoch": 2.16600790513834, + "grad_norm": 1.7587934213104526, + "learning_rate": 2.181321864640362e-06, + "loss": 0.2286, + "step": 1551 + }, + { + "epoch": 2.167402929551267, + "grad_norm": 2.0834079951630495, + "learning_rate": 2.1746137043759594e-06, + "loss": 0.2099, + "step": 1552 + }, + { + "epoch": 2.1687979539641944, + "grad_norm": 1.7944200796734386, + "learning_rate": 2.167913007117306e-06, + "loss": 0.2262, + "step": 1553 + }, + { + "epoch": 2.1701929783771217, + "grad_norm": 2.0717320889653177, + "learning_rate": 2.1612197905636913e-06, + "loss": 0.2554, + "step": 1554 + }, + { + "epoch": 2.171588002790049, + "grad_norm": 1.9794269773744197, + "learning_rate": 2.154534072394651e-06, + "loss": 0.2332, + "step": 1555 + }, + { + "epoch": 2.172983027202976, + "grad_norm": 2.0786474656189524, + "learning_rate": 2.147855870269916e-06, + "loss": 0.2512, + "step": 1556 + }, + { + "epoch": 2.1743780516159035, + "grad_norm": 1.905587487947106, + "learning_rate": 2.1411852018293583e-06, + "loss": 0.2725, + "step": 1557 + }, + { + "epoch": 2.1757730760288303, + "grad_norm": 1.9686301802786672, + "learning_rate": 2.1345220846929514e-06, + "loss": 0.2291, + "step": 1558 + }, + { + "epoch": 2.1771681004417576, + "grad_norm": 2.0061029510599266, + "learning_rate": 2.127866536460727e-06, + "loss": 0.2323, + "step": 1559 + }, + { + "epoch": 2.178563124854685, + "grad_norm": 1.8392842837936532, + "learning_rate": 2.1212185747127235e-06, + "loss": 0.2392, + "step": 1560 + }, + { + "epoch": 2.179958149267612, + "grad_norm": 1.8963609241375565, + "learning_rate": 2.1145782170089346e-06, + "loss": 0.2241, + "step": 1561 + }, + { + "epoch": 2.1813531736805394, + "grad_norm": 2.020691216176965, + "learning_rate": 2.107945480889276e-06, + "loss": 0.2202, + "step": 1562 + }, + { + "epoch": 2.1827481980934667, + "grad_norm": 2.1222330186555034, + "learning_rate": 2.1013203838735273e-06, + "loss": 0.2338, + "step": 1563 + }, + { + "epoch": 2.184143222506394, + "grad_norm": 1.8451126874497825, + "learning_rate": 2.094702943461289e-06, + "loss": 0.2271, + "step": 1564 + }, + { + "epoch": 2.185538246919321, + "grad_norm": 2.0925263169112243, + "learning_rate": 2.0880931771319395e-06, + "loss": 0.261, + "step": 1565 + }, + { + "epoch": 2.1869332713322485, + "grad_norm": 2.0486486674355566, + "learning_rate": 2.0814911023445904e-06, + "loss": 0.2056, + "step": 1566 + }, + { + "epoch": 2.1883282957451757, + "grad_norm": 1.8973623339063204, + "learning_rate": 2.0748967365380292e-06, + "loss": 0.2288, + "step": 1567 + }, + { + "epoch": 2.1897233201581026, + "grad_norm": 1.9264105513759306, + "learning_rate": 2.0683100971306873e-06, + "loss": 0.2458, + "step": 1568 + }, + { + "epoch": 2.19111834457103, + "grad_norm": 2.1351016674119396, + "learning_rate": 2.0617312015205844e-06, + "loss": 0.2618, + "step": 1569 + }, + { + "epoch": 2.192513368983957, + "grad_norm": 2.0301845360513444, + "learning_rate": 2.055160067085283e-06, + "loss": 0.222, + "step": 1570 + }, + { + "epoch": 2.1939083933968844, + "grad_norm": 1.9366582361580502, + "learning_rate": 2.0485967111818506e-06, + "loss": 0.2464, + "step": 1571 + }, + { + "epoch": 2.1953034178098116, + "grad_norm": 1.8379955335152875, + "learning_rate": 2.0420411511468086e-06, + "loss": 0.2218, + "step": 1572 + }, + { + "epoch": 2.196698442222739, + "grad_norm": 1.924135371652443, + "learning_rate": 2.0354934042960804e-06, + "loss": 0.2684, + "step": 1573 + }, + { + "epoch": 2.198093466635666, + "grad_norm": 1.9427660648400258, + "learning_rate": 2.0289534879249544e-06, + "loss": 0.2195, + "step": 1574 + }, + { + "epoch": 2.1994884910485935, + "grad_norm": 2.117053729594471, + "learning_rate": 2.0224214193080394e-06, + "loss": 0.2363, + "step": 1575 + }, + { + "epoch": 2.2008835154615207, + "grad_norm": 2.0212314391425323, + "learning_rate": 2.015897215699208e-06, + "loss": 0.2407, + "step": 1576 + }, + { + "epoch": 2.202278539874448, + "grad_norm": 1.9744514677082492, + "learning_rate": 2.0093808943315636e-06, + "loss": 0.213, + "step": 1577 + }, + { + "epoch": 2.2036735642873753, + "grad_norm": 2.0360684189888976, + "learning_rate": 2.0028724724173886e-06, + "loss": 0.218, + "step": 1578 + }, + { + "epoch": 2.205068588700302, + "grad_norm": 1.990797315085629, + "learning_rate": 1.996371967148098e-06, + "loss": 0.2252, + "step": 1579 + }, + { + "epoch": 2.2064636131132294, + "grad_norm": 2.1560737572043225, + "learning_rate": 1.989879395694194e-06, + "loss": 0.2704, + "step": 1580 + }, + { + "epoch": 2.2078586375261566, + "grad_norm": 2.017853645994684, + "learning_rate": 1.9833947752052286e-06, + "loss": 0.2195, + "step": 1581 + }, + { + "epoch": 2.209253661939084, + "grad_norm": 2.041977727763059, + "learning_rate": 1.976918122809744e-06, + "loss": 0.2405, + "step": 1582 + }, + { + "epoch": 2.210648686352011, + "grad_norm": 1.9608446629611556, + "learning_rate": 1.9704494556152413e-06, + "loss": 0.2365, + "step": 1583 + }, + { + "epoch": 2.2120437107649384, + "grad_norm": 1.8806591549300393, + "learning_rate": 1.9639887907081297e-06, + "loss": 0.2084, + "step": 1584 + }, + { + "epoch": 2.2134387351778657, + "grad_norm": 2.1068982589953307, + "learning_rate": 1.9575361451536772e-06, + "loss": 0.2541, + "step": 1585 + }, + { + "epoch": 2.214833759590793, + "grad_norm": 1.9806962496339104, + "learning_rate": 1.9510915359959694e-06, + "loss": 0.2053, + "step": 1586 + }, + { + "epoch": 2.2162287840037203, + "grad_norm": 1.7041895265128661, + "learning_rate": 1.944654980257869e-06, + "loss": 0.2245, + "step": 1587 + }, + { + "epoch": 2.217623808416647, + "grad_norm": 1.9984605463110598, + "learning_rate": 1.9382264949409614e-06, + "loss": 0.2348, + "step": 1588 + }, + { + "epoch": 2.2190188328295744, + "grad_norm": 2.0330867230795286, + "learning_rate": 1.931806097025517e-06, + "loss": 0.2491, + "step": 1589 + }, + { + "epoch": 2.2204138572425016, + "grad_norm": 2.0364218516052133, + "learning_rate": 1.925393803470447e-06, + "loss": 0.2109, + "step": 1590 + }, + { + "epoch": 2.221808881655429, + "grad_norm": 2.452548285422924, + "learning_rate": 1.9189896312132506e-06, + "loss": 0.2388, + "step": 1591 + }, + { + "epoch": 2.223203906068356, + "grad_norm": 2.028391891671147, + "learning_rate": 1.912593597169975e-06, + "loss": 0.2078, + "step": 1592 + }, + { + "epoch": 2.2245989304812834, + "grad_norm": 2.0694649414957476, + "learning_rate": 1.9062057182351768e-06, + "loss": 0.2378, + "step": 1593 + }, + { + "epoch": 2.2259939548942107, + "grad_norm": 1.8977475386258975, + "learning_rate": 1.899826011281865e-06, + "loss": 0.2551, + "step": 1594 + }, + { + "epoch": 2.227388979307138, + "grad_norm": 1.8794144304149887, + "learning_rate": 1.893454493161468e-06, + "loss": 0.222, + "step": 1595 + }, + { + "epoch": 2.2287840037200652, + "grad_norm": 1.8421851383635428, + "learning_rate": 1.8870911807037856e-06, + "loss": 0.2292, + "step": 1596 + }, + { + "epoch": 2.2301790281329925, + "grad_norm": 2.1294035926487216, + "learning_rate": 1.8807360907169326e-06, + "loss": 0.2573, + "step": 1597 + }, + { + "epoch": 2.23157405254592, + "grad_norm": 1.9710536419619946, + "learning_rate": 1.8743892399873154e-06, + "loss": 0.2262, + "step": 1598 + }, + { + "epoch": 2.2329690769588466, + "grad_norm": 2.054320626994414, + "learning_rate": 1.868050645279576e-06, + "loss": 0.2363, + "step": 1599 + }, + { + "epoch": 2.234364101371774, + "grad_norm": 1.9376735935313658, + "learning_rate": 1.8617203233365427e-06, + "loss": 0.2277, + "step": 1600 + }, + { + "epoch": 2.235759125784701, + "grad_norm": 1.96875678147811, + "learning_rate": 1.8553982908792e-06, + "loss": 0.2387, + "step": 1601 + }, + { + "epoch": 2.2371541501976284, + "grad_norm": 2.183861599239592, + "learning_rate": 1.8490845646066303e-06, + "loss": 0.2772, + "step": 1602 + }, + { + "epoch": 2.2385491746105557, + "grad_norm": 2.3569650294971227, + "learning_rate": 1.8427791611959762e-06, + "loss": 0.2558, + "step": 1603 + }, + { + "epoch": 2.239944199023483, + "grad_norm": 1.907449594297111, + "learning_rate": 1.8364820973024e-06, + "loss": 0.2141, + "step": 1604 + }, + { + "epoch": 2.2413392234364102, + "grad_norm": 2.1933972049660477, + "learning_rate": 1.8301933895590362e-06, + "loss": 0.253, + "step": 1605 + }, + { + "epoch": 2.2427342478493375, + "grad_norm": 1.9857324925925934, + "learning_rate": 1.8239130545769408e-06, + "loss": 0.2216, + "step": 1606 + }, + { + "epoch": 2.2441292722622648, + "grad_norm": 2.199122492257547, + "learning_rate": 1.8176411089450618e-06, + "loss": 0.2547, + "step": 1607 + }, + { + "epoch": 2.2455242966751916, + "grad_norm": 2.007748115189075, + "learning_rate": 1.8113775692301822e-06, + "loss": 0.2354, + "step": 1608 + }, + { + "epoch": 2.246919321088119, + "grad_norm": 2.144681332284037, + "learning_rate": 1.8051224519768817e-06, + "loss": 0.2368, + "step": 1609 + }, + { + "epoch": 2.248314345501046, + "grad_norm": 1.9606369218220936, + "learning_rate": 1.7988757737074959e-06, + "loss": 0.216, + "step": 1610 + }, + { + "epoch": 2.2497093699139734, + "grad_norm": 2.0077910044386553, + "learning_rate": 1.7926375509220695e-06, + "loss": 0.2246, + "step": 1611 + }, + { + "epoch": 2.2511043943269007, + "grad_norm": 2.0150546849238355, + "learning_rate": 1.7864078000983076e-06, + "loss": 0.2102, + "step": 1612 + }, + { + "epoch": 2.252499418739828, + "grad_norm": 2.048720247263012, + "learning_rate": 1.7801865376915451e-06, + "loss": 0.2207, + "step": 1613 + }, + { + "epoch": 2.2538944431527552, + "grad_norm": 2.051461913853357, + "learning_rate": 1.7739737801346895e-06, + "loss": 0.2479, + "step": 1614 + }, + { + "epoch": 2.2552894675656825, + "grad_norm": 1.8388149137964327, + "learning_rate": 1.7677695438381831e-06, + "loss": 0.2531, + "step": 1615 + }, + { + "epoch": 2.2566844919786098, + "grad_norm": 2.0495512505404823, + "learning_rate": 1.761573845189965e-06, + "loss": 0.2349, + "step": 1616 + }, + { + "epoch": 2.258079516391537, + "grad_norm": 1.8415416726079457, + "learning_rate": 1.7553867005554215e-06, + "loss": 0.2042, + "step": 1617 + }, + { + "epoch": 2.2594745408044643, + "grad_norm": 1.9768703900158413, + "learning_rate": 1.7492081262773397e-06, + "loss": 0.2323, + "step": 1618 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 2.1606595752623923, + "learning_rate": 1.7430381386758748e-06, + "loss": 0.2465, + "step": 1619 + }, + { + "epoch": 2.2622645896303184, + "grad_norm": 1.9661673368498434, + "learning_rate": 1.7368767540484965e-06, + "loss": 0.2219, + "step": 1620 + }, + { + "epoch": 2.2636596140432457, + "grad_norm": 1.8878002265817848, + "learning_rate": 1.7307239886699546e-06, + "loss": 0.2537, + "step": 1621 + }, + { + "epoch": 2.265054638456173, + "grad_norm": 2.1599816560012632, + "learning_rate": 1.7245798587922263e-06, + "loss": 0.2409, + "step": 1622 + }, + { + "epoch": 2.2664496628691, + "grad_norm": 1.877718357363733, + "learning_rate": 1.7184443806444851e-06, + "loss": 0.2273, + "step": 1623 + }, + { + "epoch": 2.2678446872820275, + "grad_norm": 1.7823955384474417, + "learning_rate": 1.7123175704330514e-06, + "loss": 0.2189, + "step": 1624 + }, + { + "epoch": 2.2692397116949548, + "grad_norm": 1.9192171774637246, + "learning_rate": 1.706199444341341e-06, + "loss": 0.2403, + "step": 1625 + }, + { + "epoch": 2.270634736107882, + "grad_norm": 2.0764997514013617, + "learning_rate": 1.7000900185298418e-06, + "loss": 0.273, + "step": 1626 + }, + { + "epoch": 2.2720297605208093, + "grad_norm": 2.124933803249738, + "learning_rate": 1.6939893091360577e-06, + "loss": 0.2631, + "step": 1627 + }, + { + "epoch": 2.273424784933736, + "grad_norm": 2.0537090747806257, + "learning_rate": 1.6878973322744658e-06, + "loss": 0.2259, + "step": 1628 + }, + { + "epoch": 2.2748198093466634, + "grad_norm": 1.9496310576864384, + "learning_rate": 1.6818141040364816e-06, + "loss": 0.2286, + "step": 1629 + }, + { + "epoch": 2.2762148337595907, + "grad_norm": 1.9684283583462694, + "learning_rate": 1.6757396404904087e-06, + "loss": 0.2257, + "step": 1630 + }, + { + "epoch": 2.277609858172518, + "grad_norm": 1.9211403090843973, + "learning_rate": 1.6696739576813981e-06, + "loss": 0.24, + "step": 1631 + }, + { + "epoch": 2.279004882585445, + "grad_norm": 1.9601889627957176, + "learning_rate": 1.6636170716314114e-06, + "loss": 0.2641, + "step": 1632 + }, + { + "epoch": 2.2803999069983725, + "grad_norm": 2.2106724133570697, + "learning_rate": 1.657568998339175e-06, + "loss": 0.2624, + "step": 1633 + }, + { + "epoch": 2.2817949314112997, + "grad_norm": 2.3843544770866556, + "learning_rate": 1.6515297537801305e-06, + "loss": 0.2485, + "step": 1634 + }, + { + "epoch": 2.283189955824227, + "grad_norm": 1.9719774663966787, + "learning_rate": 1.6454993539064075e-06, + "loss": 0.2178, + "step": 1635 + }, + { + "epoch": 2.2845849802371543, + "grad_norm": 1.9373964616111852, + "learning_rate": 1.6394778146467672e-06, + "loss": 0.2687, + "step": 1636 + }, + { + "epoch": 2.2859800046500816, + "grad_norm": 2.184356785161183, + "learning_rate": 1.6334651519065658e-06, + "loss": 0.2558, + "step": 1637 + }, + { + "epoch": 2.287375029063009, + "grad_norm": 2.09734743364332, + "learning_rate": 1.6274613815677176e-06, + "loss": 0.2441, + "step": 1638 + }, + { + "epoch": 2.2887700534759357, + "grad_norm": 2.1459547086066872, + "learning_rate": 1.6214665194886474e-06, + "loss": 0.1963, + "step": 1639 + }, + { + "epoch": 2.290165077888863, + "grad_norm": 1.8154559511599928, + "learning_rate": 1.6154805815042457e-06, + "loss": 0.2092, + "step": 1640 + }, + { + "epoch": 2.29156010230179, + "grad_norm": 2.1323386307866543, + "learning_rate": 1.6095035834258365e-06, + "loss": 0.2485, + "step": 1641 + }, + { + "epoch": 2.2929551267147175, + "grad_norm": 1.8872892475373095, + "learning_rate": 1.6035355410411252e-06, + "loss": 0.1996, + "step": 1642 + }, + { + "epoch": 2.2943501511276447, + "grad_norm": 2.0382534164232684, + "learning_rate": 1.5975764701141611e-06, + "loss": 0.2359, + "step": 1643 + }, + { + "epoch": 2.295745175540572, + "grad_norm": 1.9572911333723713, + "learning_rate": 1.5916263863853e-06, + "loss": 0.2416, + "step": 1644 + }, + { + "epoch": 2.2971401999534993, + "grad_norm": 2.150341017189422, + "learning_rate": 1.585685305571159e-06, + "loss": 0.2625, + "step": 1645 + }, + { + "epoch": 2.2985352243664265, + "grad_norm": 2.1453971015429016, + "learning_rate": 1.5797532433645696e-06, + "loss": 0.2271, + "step": 1646 + }, + { + "epoch": 2.299930248779354, + "grad_norm": 2.096925731594485, + "learning_rate": 1.5738302154345475e-06, + "loss": 0.2363, + "step": 1647 + }, + { + "epoch": 2.3013252731922806, + "grad_norm": 2.0826325764632365, + "learning_rate": 1.5679162374262414e-06, + "loss": 0.2319, + "step": 1648 + }, + { + "epoch": 2.302720297605208, + "grad_norm": 1.9053150555031908, + "learning_rate": 1.5620113249608943e-06, + "loss": 0.2536, + "step": 1649 + }, + { + "epoch": 2.304115322018135, + "grad_norm": 2.013788051403245, + "learning_rate": 1.5561154936358069e-06, + "loss": 0.2379, + "step": 1650 + }, + { + "epoch": 2.3055103464310625, + "grad_norm": 2.0929701003948313, + "learning_rate": 1.5502287590242942e-06, + "loss": 0.2285, + "step": 1651 + }, + { + "epoch": 2.3069053708439897, + "grad_norm": 1.8618570907267726, + "learning_rate": 1.5443511366756375e-06, + "loss": 0.2168, + "step": 1652 + }, + { + "epoch": 2.308300395256917, + "grad_norm": 1.9443015801429164, + "learning_rate": 1.53848264211505e-06, + "loss": 0.2428, + "step": 1653 + }, + { + "epoch": 2.3096954196698443, + "grad_norm": 1.730889234878071, + "learning_rate": 1.5326232908436405e-06, + "loss": 0.2132, + "step": 1654 + }, + { + "epoch": 2.3110904440827715, + "grad_norm": 2.033663509452582, + "learning_rate": 1.526773098338359e-06, + "loss": 0.2326, + "step": 1655 + }, + { + "epoch": 2.312485468495699, + "grad_norm": 2.0086739819616684, + "learning_rate": 1.5209320800519683e-06, + "loss": 0.2299, + "step": 1656 + }, + { + "epoch": 2.313880492908626, + "grad_norm": 2.1150179443276693, + "learning_rate": 1.515100251412998e-06, + "loss": 0.2514, + "step": 1657 + }, + { + "epoch": 2.3152755173215533, + "grad_norm": 2.0580293221490744, + "learning_rate": 1.5092776278257027e-06, + "loss": 0.2269, + "step": 1658 + }, + { + "epoch": 2.31667054173448, + "grad_norm": 1.9664146763176193, + "learning_rate": 1.5034642246700203e-06, + "loss": 0.239, + "step": 1659 + }, + { + "epoch": 2.3180655661474074, + "grad_norm": 2.127726449836482, + "learning_rate": 1.4976600573015398e-06, + "loss": 0.2503, + "step": 1660 + }, + { + "epoch": 2.3194605905603347, + "grad_norm": 2.1049092968450216, + "learning_rate": 1.4918651410514479e-06, + "loss": 0.2497, + "step": 1661 + }, + { + "epoch": 2.320855614973262, + "grad_norm": 2.066851659126085, + "learning_rate": 1.486079491226501e-06, + "loss": 0.2209, + "step": 1662 + }, + { + "epoch": 2.3222506393861893, + "grad_norm": 2.0102872039801536, + "learning_rate": 1.4803031231089782e-06, + "loss": 0.232, + "step": 1663 + }, + { + "epoch": 2.3236456637991165, + "grad_norm": 1.8240849797174543, + "learning_rate": 1.4745360519566382e-06, + "loss": 0.2228, + "step": 1664 + }, + { + "epoch": 2.325040688212044, + "grad_norm": 2.0260207328677646, + "learning_rate": 1.4687782930026833e-06, + "loss": 0.2503, + "step": 1665 + }, + { + "epoch": 2.326435712624971, + "grad_norm": 1.9370440208183426, + "learning_rate": 1.4630298614557236e-06, + "loss": 0.2293, + "step": 1666 + }, + { + "epoch": 2.3278307370378983, + "grad_norm": 2.0408731656057126, + "learning_rate": 1.4572907724997249e-06, + "loss": 0.2745, + "step": 1667 + }, + { + "epoch": 2.329225761450825, + "grad_norm": 1.9501445042320373, + "learning_rate": 1.4515610412939791e-06, + "loss": 0.22, + "step": 1668 + }, + { + "epoch": 2.3306207858637524, + "grad_norm": 1.8239379510751645, + "learning_rate": 1.445840682973062e-06, + "loss": 0.2416, + "step": 1669 + }, + { + "epoch": 2.3320158102766797, + "grad_norm": 2.008062176566863, + "learning_rate": 1.4401297126467884e-06, + "loss": 0.2087, + "step": 1670 + }, + { + "epoch": 2.333410834689607, + "grad_norm": 1.9572423516680113, + "learning_rate": 1.4344281454001751e-06, + "loss": 0.2045, + "step": 1671 + }, + { + "epoch": 2.3348058591025342, + "grad_norm": 2.1525787216094128, + "learning_rate": 1.4287359962934055e-06, + "loss": 0.2466, + "step": 1672 + }, + { + "epoch": 2.3362008835154615, + "grad_norm": 2.1085528054352722, + "learning_rate": 1.4230532803617814e-06, + "loss": 0.249, + "step": 1673 + }, + { + "epoch": 2.337595907928389, + "grad_norm": 1.900064207744204, + "learning_rate": 1.4173800126156916e-06, + "loss": 0.2427, + "step": 1674 + }, + { + "epoch": 2.338990932341316, + "grad_norm": 1.8330769763213726, + "learning_rate": 1.411716208040566e-06, + "loss": 0.224, + "step": 1675 + }, + { + "epoch": 2.3403859567542433, + "grad_norm": 2.1005430164406387, + "learning_rate": 1.4060618815968375e-06, + "loss": 0.2487, + "step": 1676 + }, + { + "epoch": 2.3417809811671706, + "grad_norm": 2.0233119703276197, + "learning_rate": 1.4004170482199054e-06, + "loss": 0.2616, + "step": 1677 + }, + { + "epoch": 2.343176005580098, + "grad_norm": 1.9503887562468631, + "learning_rate": 1.3947817228200956e-06, + "loss": 0.2101, + "step": 1678 + }, + { + "epoch": 2.3445710299930247, + "grad_norm": 2.0266811706036796, + "learning_rate": 1.3891559202826133e-06, + "loss": 0.2634, + "step": 1679 + }, + { + "epoch": 2.345966054405952, + "grad_norm": 2.313818281195364, + "learning_rate": 1.3835396554675179e-06, + "loss": 0.2358, + "step": 1680 + }, + { + "epoch": 2.3473610788188792, + "grad_norm": 2.0086635288310677, + "learning_rate": 1.37793294320967e-06, + "loss": 0.261, + "step": 1681 + }, + { + "epoch": 2.3487561032318065, + "grad_norm": 1.9456694874174518, + "learning_rate": 1.3723357983186974e-06, + "loss": 0.2166, + "step": 1682 + }, + { + "epoch": 2.3501511276447338, + "grad_norm": 2.0159574470737236, + "learning_rate": 1.3667482355789607e-06, + "loss": 0.2259, + "step": 1683 + }, + { + "epoch": 2.351546152057661, + "grad_norm": 2.0882553918627504, + "learning_rate": 1.3611702697495088e-06, + "loss": 0.2127, + "step": 1684 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 1.9536375676927595, + "learning_rate": 1.3556019155640416e-06, + "loss": 0.2327, + "step": 1685 + }, + { + "epoch": 2.3543362008835156, + "grad_norm": 1.8321246562049889, + "learning_rate": 1.350043187730868e-06, + "loss": 0.2382, + "step": 1686 + }, + { + "epoch": 2.355731225296443, + "grad_norm": 2.4564159839079625, + "learning_rate": 1.34449410093287e-06, + "loss": 0.234, + "step": 1687 + }, + { + "epoch": 2.3571262497093697, + "grad_norm": 1.8735083700021802, + "learning_rate": 1.3389546698274686e-06, + "loss": 0.2153, + "step": 1688 + }, + { + "epoch": 2.358521274122297, + "grad_norm": 2.0889076665454063, + "learning_rate": 1.333424909046574e-06, + "loss": 0.2486, + "step": 1689 + }, + { + "epoch": 2.359916298535224, + "grad_norm": 2.152373586743707, + "learning_rate": 1.327904833196556e-06, + "loss": 0.2581, + "step": 1690 + }, + { + "epoch": 2.3613113229481515, + "grad_norm": 1.9192239036423904, + "learning_rate": 1.3223944568582047e-06, + "loss": 0.2151, + "step": 1691 + }, + { + "epoch": 2.3627063473610788, + "grad_norm": 1.9426489688983704, + "learning_rate": 1.3168937945866861e-06, + "loss": 0.2289, + "step": 1692 + }, + { + "epoch": 2.364101371774006, + "grad_norm": 2.0708618477896703, + "learning_rate": 1.311402860911507e-06, + "loss": 0.2416, + "step": 1693 + }, + { + "epoch": 2.3654963961869333, + "grad_norm": 1.9852614398809703, + "learning_rate": 1.3059216703364814e-06, + "loss": 0.2189, + "step": 1694 + }, + { + "epoch": 2.3668914205998606, + "grad_norm": 3.1422525655244855, + "learning_rate": 1.3004502373396821e-06, + "loss": 0.2364, + "step": 1695 + }, + { + "epoch": 2.368286445012788, + "grad_norm": 2.002483705304144, + "learning_rate": 1.2949885763734127e-06, + "loss": 0.2428, + "step": 1696 + }, + { + "epoch": 2.369681469425715, + "grad_norm": 1.9611827941041333, + "learning_rate": 1.2895367018641658e-06, + "loss": 0.2219, + "step": 1697 + }, + { + "epoch": 2.3710764938386424, + "grad_norm": 1.9929935498243903, + "learning_rate": 1.284094628212576e-06, + "loss": 0.2582, + "step": 1698 + }, + { + "epoch": 2.372471518251569, + "grad_norm": 1.9163017033696872, + "learning_rate": 1.278662369793398e-06, + "loss": 0.2196, + "step": 1699 + }, + { + "epoch": 2.3738665426644965, + "grad_norm": 2.1264033667590874, + "learning_rate": 1.273239940955459e-06, + "loss": 0.2552, + "step": 1700 + }, + { + "epoch": 2.3752615670774238, + "grad_norm": 2.029148427130633, + "learning_rate": 1.267827356021618e-06, + "loss": 0.2621, + "step": 1701 + }, + { + "epoch": 2.376656591490351, + "grad_norm": 2.141536132697765, + "learning_rate": 1.2624246292887377e-06, + "loss": 0.2347, + "step": 1702 + }, + { + "epoch": 2.3780516159032783, + "grad_norm": 2.0105615317777, + "learning_rate": 1.2570317750276374e-06, + "loss": 0.2582, + "step": 1703 + }, + { + "epoch": 2.3794466403162056, + "grad_norm": 2.062581019953467, + "learning_rate": 1.2516488074830586e-06, + "loss": 0.2473, + "step": 1704 + }, + { + "epoch": 2.380841664729133, + "grad_norm": 1.80393769257362, + "learning_rate": 1.246275740873631e-06, + "loss": 0.2301, + "step": 1705 + }, + { + "epoch": 2.38223668914206, + "grad_norm": 2.231699482955148, + "learning_rate": 1.2409125893918329e-06, + "loss": 0.2533, + "step": 1706 + }, + { + "epoch": 2.3836317135549874, + "grad_norm": 2.028453535194021, + "learning_rate": 1.2355593672039462e-06, + "loss": 0.1988, + "step": 1707 + }, + { + "epoch": 2.385026737967914, + "grad_norm": 1.917076340098864, + "learning_rate": 1.2302160884500337e-06, + "loss": 0.2623, + "step": 1708 + }, + { + "epoch": 2.3864217623808415, + "grad_norm": 2.0507149118696932, + "learning_rate": 1.2248827672438868e-06, + "loss": 0.2035, + "step": 1709 + }, + { + "epoch": 2.3878167867937687, + "grad_norm": 2.010253697172704, + "learning_rate": 1.2195594176729963e-06, + "loss": 0.236, + "step": 1710 + }, + { + "epoch": 2.389211811206696, + "grad_norm": 2.11324341731085, + "learning_rate": 1.2142460537985168e-06, + "loss": 0.2325, + "step": 1711 + }, + { + "epoch": 2.3906068356196233, + "grad_norm": 1.9697450740429523, + "learning_rate": 1.2089426896552265e-06, + "loss": 0.2245, + "step": 1712 + }, + { + "epoch": 2.3920018600325506, + "grad_norm": 2.0186158624900177, + "learning_rate": 1.2036493392514847e-06, + "loss": 0.2654, + "step": 1713 + }, + { + "epoch": 2.393396884445478, + "grad_norm": 2.086842619934469, + "learning_rate": 1.1983660165692078e-06, + "loss": 0.2071, + "step": 1714 + }, + { + "epoch": 2.394791908858405, + "grad_norm": 2.280522318427615, + "learning_rate": 1.1930927355638189e-06, + "loss": 0.2438, + "step": 1715 + }, + { + "epoch": 2.3961869332713324, + "grad_norm": 1.850134137584549, + "learning_rate": 1.1878295101642185e-06, + "loss": 0.2314, + "step": 1716 + }, + { + "epoch": 2.3975819576842596, + "grad_norm": 2.0987488388488744, + "learning_rate": 1.182576354272748e-06, + "loss": 0.2304, + "step": 1717 + }, + { + "epoch": 2.398976982097187, + "grad_norm": 1.9418839835561703, + "learning_rate": 1.1773332817651512e-06, + "loss": 0.226, + "step": 1718 + }, + { + "epoch": 2.400372006510114, + "grad_norm": 1.9467470286378656, + "learning_rate": 1.1721003064905329e-06, + "loss": 0.2132, + "step": 1719 + }, + { + "epoch": 2.401767030923041, + "grad_norm": 1.9135163938253588, + "learning_rate": 1.1668774422713336e-06, + "loss": 0.2419, + "step": 1720 + }, + { + "epoch": 2.4031620553359683, + "grad_norm": 1.9477074001689785, + "learning_rate": 1.1616647029032818e-06, + "loss": 0.24, + "step": 1721 + }, + { + "epoch": 2.4045570797488955, + "grad_norm": 2.0907606430864023, + "learning_rate": 1.1564621021553617e-06, + "loss": 0.2309, + "step": 1722 + }, + { + "epoch": 2.405952104161823, + "grad_norm": 2.0151485407481213, + "learning_rate": 1.1512696537697804e-06, + "loss": 0.2641, + "step": 1723 + }, + { + "epoch": 2.40734712857475, + "grad_norm": 2.3088467147809233, + "learning_rate": 1.1460873714619275e-06, + "loss": 0.2649, + "step": 1724 + }, + { + "epoch": 2.4087421529876774, + "grad_norm": 2.072519389501527, + "learning_rate": 1.140915268920339e-06, + "loss": 0.2409, + "step": 1725 + }, + { + "epoch": 2.4101371774006046, + "grad_norm": 1.9307910533793868, + "learning_rate": 1.13575335980666e-06, + "loss": 0.2245, + "step": 1726 + }, + { + "epoch": 2.411532201813532, + "grad_norm": 2.1765573168660355, + "learning_rate": 1.130601657755616e-06, + "loss": 0.2241, + "step": 1727 + }, + { + "epoch": 2.4129272262264587, + "grad_norm": 1.994483581997574, + "learning_rate": 1.125460176374965e-06, + "loss": 0.2287, + "step": 1728 + }, + { + "epoch": 2.414322250639386, + "grad_norm": 1.9387870571187331, + "learning_rate": 1.1203289292454728e-06, + "loss": 0.2544, + "step": 1729 + }, + { + "epoch": 2.4157172750523133, + "grad_norm": 2.1825510130358188, + "learning_rate": 1.1152079299208724e-06, + "loss": 0.2478, + "step": 1730 + }, + { + "epoch": 2.4171122994652405, + "grad_norm": 2.119670431535453, + "learning_rate": 1.1100971919278247e-06, + "loss": 0.2302, + "step": 1731 + }, + { + "epoch": 2.418507323878168, + "grad_norm": 1.9383266869156575, + "learning_rate": 1.104996728765887e-06, + "loss": 0.2428, + "step": 1732 + }, + { + "epoch": 2.419902348291095, + "grad_norm": 2.110592186403715, + "learning_rate": 1.0999065539074793e-06, + "loss": 0.2413, + "step": 1733 + }, + { + "epoch": 2.4212973727040223, + "grad_norm": 2.067613222890333, + "learning_rate": 1.094826680797843e-06, + "loss": 0.231, + "step": 1734 + }, + { + "epoch": 2.4226923971169496, + "grad_norm": 1.984326692206471, + "learning_rate": 1.0897571228550097e-06, + "loss": 0.2467, + "step": 1735 + }, + { + "epoch": 2.424087421529877, + "grad_norm": 2.0167088904753783, + "learning_rate": 1.0846978934697666e-06, + "loss": 0.2423, + "step": 1736 + }, + { + "epoch": 2.425482445942804, + "grad_norm": 1.8768002950793947, + "learning_rate": 1.0796490060056142e-06, + "loss": 0.2341, + "step": 1737 + }, + { + "epoch": 2.4268774703557314, + "grad_norm": 2.00793087438797, + "learning_rate": 1.074610473798738e-06, + "loss": 0.2293, + "step": 1738 + }, + { + "epoch": 2.4282724947686587, + "grad_norm": 2.2281107440159906, + "learning_rate": 1.0695823101579728e-06, + "loss": 0.2223, + "step": 1739 + }, + { + "epoch": 2.4296675191815855, + "grad_norm": 1.971731384171419, + "learning_rate": 1.0645645283647616e-06, + "loss": 0.2377, + "step": 1740 + }, + { + "epoch": 2.431062543594513, + "grad_norm": 2.0683911759403846, + "learning_rate": 1.0595571416731293e-06, + "loss": 0.2311, + "step": 1741 + }, + { + "epoch": 2.43245756800744, + "grad_norm": 1.9440966369219763, + "learning_rate": 1.0545601633096414e-06, + "loss": 0.2199, + "step": 1742 + }, + { + "epoch": 2.4338525924203673, + "grad_norm": 2.0991571579373662, + "learning_rate": 1.049573606473369e-06, + "loss": 0.2502, + "step": 1743 + }, + { + "epoch": 2.4352476168332946, + "grad_norm": 1.9757218408408208, + "learning_rate": 1.0445974843358563e-06, + "loss": 0.2072, + "step": 1744 + }, + { + "epoch": 2.436642641246222, + "grad_norm": 1.9940009794376974, + "learning_rate": 1.0396318100410868e-06, + "loss": 0.2224, + "step": 1745 + }, + { + "epoch": 2.438037665659149, + "grad_norm": 2.0684497758647638, + "learning_rate": 1.0346765967054472e-06, + "loss": 0.2581, + "step": 1746 + }, + { + "epoch": 2.4394326900720764, + "grad_norm": 2.277181118110738, + "learning_rate": 1.029731857417689e-06, + "loss": 0.2598, + "step": 1747 + }, + { + "epoch": 2.4408277144850032, + "grad_norm": 1.9937563449803872, + "learning_rate": 1.0247976052389018e-06, + "loss": 0.2369, + "step": 1748 + }, + { + "epoch": 2.4422227388979305, + "grad_norm": 1.9892653023975233, + "learning_rate": 1.0198738532024715e-06, + "loss": 0.2181, + "step": 1749 + }, + { + "epoch": 2.443617763310858, + "grad_norm": 1.9123071022144358, + "learning_rate": 1.0149606143140484e-06, + "loss": 0.2466, + "step": 1750 + }, + { + "epoch": 2.445012787723785, + "grad_norm": 1.9682942803900485, + "learning_rate": 1.0100579015515156e-06, + "loss": 0.2392, + "step": 1751 + }, + { + "epoch": 2.4464078121367123, + "grad_norm": 2.0981882014591786, + "learning_rate": 1.005165727864953e-06, + "loss": 0.2393, + "step": 1752 + }, + { + "epoch": 2.4478028365496396, + "grad_norm": 2.0362345779504105, + "learning_rate": 1.0002841061765989e-06, + "loss": 0.2249, + "step": 1753 + }, + { + "epoch": 2.449197860962567, + "grad_norm": 1.8793934193224626, + "learning_rate": 9.954130493808201e-07, + "loss": 0.2337, + "step": 1754 + }, + { + "epoch": 2.450592885375494, + "grad_norm": 2.165450490603712, + "learning_rate": 9.905525703440815e-07, + "loss": 0.2291, + "step": 1755 + }, + { + "epoch": 2.4519879097884214, + "grad_norm": 2.142970644431852, + "learning_rate": 9.85702681904902e-07, + "loss": 0.2133, + "step": 1756 + }, + { + "epoch": 2.4533829342013487, + "grad_norm": 2.0095221047598595, + "learning_rate": 9.808633968738297e-07, + "loss": 0.2371, + "step": 1757 + }, + { + "epoch": 2.454777958614276, + "grad_norm": 2.026353074856489, + "learning_rate": 9.760347280334064e-07, + "loss": 0.2537, + "step": 1758 + }, + { + "epoch": 2.456172983027203, + "grad_norm": 1.9325330175950954, + "learning_rate": 9.712166881381279e-07, + "loss": 0.2427, + "step": 1759 + }, + { + "epoch": 2.45756800744013, + "grad_norm": 2.0937852825424224, + "learning_rate": 9.664092899144156e-07, + "loss": 0.2497, + "step": 1760 + }, + { + "epoch": 2.4589630318530573, + "grad_norm": 1.8882699979737334, + "learning_rate": 9.616125460605857e-07, + "loss": 0.2189, + "step": 1761 + }, + { + "epoch": 2.4603580562659846, + "grad_norm": 1.9641098748743273, + "learning_rate": 9.56826469246806e-07, + "loss": 0.2163, + "step": 1762 + }, + { + "epoch": 2.461753080678912, + "grad_norm": 1.9865195042240242, + "learning_rate": 9.520510721150722e-07, + "loss": 0.2188, + "step": 1763 + }, + { + "epoch": 2.463148105091839, + "grad_norm": 2.1443424582013026, + "learning_rate": 9.472863672791721e-07, + "loss": 0.2784, + "step": 1764 + }, + { + "epoch": 2.4645431295047664, + "grad_norm": 2.1137717827169915, + "learning_rate": 9.425323673246461e-07, + "loss": 0.2322, + "step": 1765 + }, + { + "epoch": 2.4659381539176937, + "grad_norm": 2.131351078053459, + "learning_rate": 9.377890848087595e-07, + "loss": 0.2427, + "step": 1766 + }, + { + "epoch": 2.467333178330621, + "grad_norm": 2.140790053123637, + "learning_rate": 9.330565322604729e-07, + "loss": 0.2544, + "step": 1767 + }, + { + "epoch": 2.468728202743548, + "grad_norm": 2.152964895533497, + "learning_rate": 9.283347221803985e-07, + "loss": 0.2343, + "step": 1768 + }, + { + "epoch": 2.470123227156475, + "grad_norm": 2.0861415574061484, + "learning_rate": 9.236236670407772e-07, + "loss": 0.261, + "step": 1769 + }, + { + "epoch": 2.4715182515694023, + "grad_norm": 2.364724254514781, + "learning_rate": 9.189233792854424e-07, + "loss": 0.2511, + "step": 1770 + }, + { + "epoch": 2.4729132759823296, + "grad_norm": 2.0717700877473035, + "learning_rate": 9.142338713297838e-07, + "loss": 0.2462, + "step": 1771 + }, + { + "epoch": 2.474308300395257, + "grad_norm": 1.9399398996755268, + "learning_rate": 9.095551555607169e-07, + "loss": 0.2422, + "step": 1772 + }, + { + "epoch": 2.475703324808184, + "grad_norm": 2.097444199435732, + "learning_rate": 9.048872443366529e-07, + "loss": 0.2473, + "step": 1773 + }, + { + "epoch": 2.4770983492211114, + "grad_norm": 1.9711078649095015, + "learning_rate": 9.002301499874622e-07, + "loss": 0.2346, + "step": 1774 + }, + { + "epoch": 2.4784933736340387, + "grad_norm": 1.992416087567518, + "learning_rate": 8.955838848144449e-07, + "loss": 0.2225, + "step": 1775 + }, + { + "epoch": 2.479888398046966, + "grad_norm": 2.2371561835066065, + "learning_rate": 8.909484610902958e-07, + "loss": 0.2554, + "step": 1776 + }, + { + "epoch": 2.481283422459893, + "grad_norm": 1.9975667952863843, + "learning_rate": 8.863238910590704e-07, + "loss": 0.2093, + "step": 1777 + }, + { + "epoch": 2.4826784468728205, + "grad_norm": 1.986118930839998, + "learning_rate": 8.817101869361599e-07, + "loss": 0.2481, + "step": 1778 + }, + { + "epoch": 2.4840734712857477, + "grad_norm": 2.142304590119852, + "learning_rate": 8.77107360908253e-07, + "loss": 0.264, + "step": 1779 + }, + { + "epoch": 2.4854684956986746, + "grad_norm": 2.1182415329232014, + "learning_rate": 8.725154251333012e-07, + "loss": 0.2328, + "step": 1780 + }, + { + "epoch": 2.486863520111602, + "grad_norm": 2.2640669840530507, + "learning_rate": 8.679343917404959e-07, + "loss": 0.2952, + "step": 1781 + }, + { + "epoch": 2.488258544524529, + "grad_norm": 1.913135749354688, + "learning_rate": 8.633642728302266e-07, + "loss": 0.239, + "step": 1782 + }, + { + "epoch": 2.4896535689374564, + "grad_norm": 2.1042684303606634, + "learning_rate": 8.588050804740527e-07, + "loss": 0.2282, + "step": 1783 + }, + { + "epoch": 2.4910485933503836, + "grad_norm": 2.0527372099519385, + "learning_rate": 8.542568267146761e-07, + "loss": 0.2184, + "step": 1784 + }, + { + "epoch": 2.492443617763311, + "grad_norm": 1.8290280740715956, + "learning_rate": 8.49719523565904e-07, + "loss": 0.2139, + "step": 1785 + }, + { + "epoch": 2.493838642176238, + "grad_norm": 2.142700588699554, + "learning_rate": 8.451931830126148e-07, + "loss": 0.2395, + "step": 1786 + }, + { + "epoch": 2.4952336665891655, + "grad_norm": 1.9887460560020362, + "learning_rate": 8.40677817010736e-07, + "loss": 0.2276, + "step": 1787 + }, + { + "epoch": 2.4966286910020927, + "grad_norm": 2.149857725209497, + "learning_rate": 8.361734374872032e-07, + "loss": 0.2396, + "step": 1788 + }, + { + "epoch": 2.4980237154150196, + "grad_norm": 1.8153829921701001, + "learning_rate": 8.316800563399307e-07, + "loss": 0.2243, + "step": 1789 + }, + { + "epoch": 2.499418739827947, + "grad_norm": 2.0873477016865003, + "learning_rate": 8.271976854377861e-07, + "loss": 0.2184, + "step": 1790 + }, + { + "epoch": 2.500813764240874, + "grad_norm": 1.9058032255841029, + "learning_rate": 8.227263366205523e-07, + "loss": 0.2617, + "step": 1791 + }, + { + "epoch": 2.5022087886538014, + "grad_norm": 2.03985002067712, + "learning_rate": 8.182660216988964e-07, + "loss": 0.3254, + "step": 1792 + }, + { + "epoch": 2.5036038130667286, + "grad_norm": 2.4462443382032104, + "learning_rate": 8.138167524543445e-07, + "loss": 0.2494, + "step": 1793 + }, + { + "epoch": 2.504998837479656, + "grad_norm": 2.1759864601852534, + "learning_rate": 8.09378540639243e-07, + "loss": 0.2518, + "step": 1794 + }, + { + "epoch": 2.506393861892583, + "grad_norm": 2.0809827238097336, + "learning_rate": 8.049513979767304e-07, + "loss": 0.2461, + "step": 1795 + }, + { + "epoch": 2.5077888863055104, + "grad_norm": 1.9241245111870922, + "learning_rate": 8.00535336160711e-07, + "loss": 0.1999, + "step": 1796 + }, + { + "epoch": 2.5091839107184377, + "grad_norm": 2.1413620625370475, + "learning_rate": 7.96130366855819e-07, + "loss": 0.2423, + "step": 1797 + }, + { + "epoch": 2.510578935131365, + "grad_norm": 2.1005332347594354, + "learning_rate": 7.917365016973866e-07, + "loss": 0.2203, + "step": 1798 + }, + { + "epoch": 2.5119739595442923, + "grad_norm": 2.146855579080123, + "learning_rate": 7.873537522914155e-07, + "loss": 0.2518, + "step": 1799 + }, + { + "epoch": 2.5133689839572195, + "grad_norm": 2.084590198938967, + "learning_rate": 7.829821302145485e-07, + "loss": 0.2359, + "step": 1800 + }, + { + "epoch": 2.5147640083701464, + "grad_norm": 1.9631135365144416, + "learning_rate": 7.786216470140334e-07, + "loss": 0.2333, + "step": 1801 + }, + { + "epoch": 2.5161590327830736, + "grad_norm": 2.0533634112072474, + "learning_rate": 7.742723142076991e-07, + "loss": 0.2298, + "step": 1802 + }, + { + "epoch": 2.517554057196001, + "grad_norm": 1.9899219985356666, + "learning_rate": 7.699341432839203e-07, + "loss": 0.245, + "step": 1803 + }, + { + "epoch": 2.518949081608928, + "grad_norm": 1.9831540992143863, + "learning_rate": 7.656071457015879e-07, + "loss": 0.2082, + "step": 1804 + }, + { + "epoch": 2.5203441060218554, + "grad_norm": 2.0248350609157786, + "learning_rate": 7.612913328900784e-07, + "loss": 0.2297, + "step": 1805 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 2.0684064839827028, + "learning_rate": 7.569867162492283e-07, + "loss": 0.2006, + "step": 1806 + }, + { + "epoch": 2.52313415484771, + "grad_norm": 1.9978028339758431, + "learning_rate": 7.526933071492959e-07, + "loss": 0.2649, + "step": 1807 + }, + { + "epoch": 2.524529179260637, + "grad_norm": 2.2154732847751744, + "learning_rate": 7.484111169309399e-07, + "loss": 0.2274, + "step": 1808 + }, + { + "epoch": 2.525924203673564, + "grad_norm": 2.035660314013249, + "learning_rate": 7.441401569051848e-07, + "loss": 0.2304, + "step": 1809 + }, + { + "epoch": 2.5273192280864913, + "grad_norm": 2.1081064276269754, + "learning_rate": 7.398804383533886e-07, + "loss": 0.2153, + "step": 1810 + }, + { + "epoch": 2.5287142524994186, + "grad_norm": 2.103371622818518, + "learning_rate": 7.356319725272165e-07, + "loss": 0.2349, + "step": 1811 + }, + { + "epoch": 2.530109276912346, + "grad_norm": 1.9604309959647959, + "learning_rate": 7.313947706486136e-07, + "loss": 0.2438, + "step": 1812 + }, + { + "epoch": 2.531504301325273, + "grad_norm": 2.178669534413003, + "learning_rate": 7.271688439097713e-07, + "loss": 0.2594, + "step": 1813 + }, + { + "epoch": 2.5328993257382004, + "grad_norm": 1.9811957969660663, + "learning_rate": 7.229542034730952e-07, + "loss": 0.1927, + "step": 1814 + }, + { + "epoch": 2.5342943501511277, + "grad_norm": 2.202042500358503, + "learning_rate": 7.187508604711851e-07, + "loss": 0.2412, + "step": 1815 + }, + { + "epoch": 2.535689374564055, + "grad_norm": 2.027289212836733, + "learning_rate": 7.145588260067943e-07, + "loss": 0.2534, + "step": 1816 + }, + { + "epoch": 2.5370843989769822, + "grad_norm": 2.000708288343631, + "learning_rate": 7.103781111528074e-07, + "loss": 0.239, + "step": 1817 + }, + { + "epoch": 2.5384794233899095, + "grad_norm": 2.1008206829818907, + "learning_rate": 7.062087269522105e-07, + "loss": 0.208, + "step": 1818 + }, + { + "epoch": 2.5398744478028368, + "grad_norm": 1.7956015898367612, + "learning_rate": 7.020506844180608e-07, + "loss": 0.2397, + "step": 1819 + }, + { + "epoch": 2.541269472215764, + "grad_norm": 2.3670874992202298, + "learning_rate": 6.979039945334543e-07, + "loss": 0.245, + "step": 1820 + }, + { + "epoch": 2.542664496628691, + "grad_norm": 2.0068429751335835, + "learning_rate": 6.937686682515044e-07, + "loss": 0.2358, + "step": 1821 + }, + { + "epoch": 2.544059521041618, + "grad_norm": 2.1746011102665554, + "learning_rate": 6.896447164953057e-07, + "loss": 0.2563, + "step": 1822 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 2.0644119660815936, + "learning_rate": 6.855321501579077e-07, + "loss": 0.2371, + "step": 1823 + }, + { + "epoch": 2.5468495698674727, + "grad_norm": 2.116923220111308, + "learning_rate": 6.814309801022873e-07, + "loss": 0.2388, + "step": 1824 + }, + { + "epoch": 2.5482445942804, + "grad_norm": 1.983451872015146, + "learning_rate": 6.77341217161322e-07, + "loss": 0.2625, + "step": 1825 + }, + { + "epoch": 2.5496396186933272, + "grad_norm": 2.0390077022283326, + "learning_rate": 6.732628721377533e-07, + "loss": 0.2172, + "step": 1826 + }, + { + "epoch": 2.5510346431062545, + "grad_norm": 2.0429464708845404, + "learning_rate": 6.69195955804165e-07, + "loss": 0.289, + "step": 1827 + }, + { + "epoch": 2.5524296675191813, + "grad_norm": 2.1879865095437, + "learning_rate": 6.651404789029553e-07, + "loss": 0.2345, + "step": 1828 + }, + { + "epoch": 2.5538246919321086, + "grad_norm": 1.772535273422064, + "learning_rate": 6.610964521463032e-07, + "loss": 0.2353, + "step": 1829 + }, + { + "epoch": 2.555219716345036, + "grad_norm": 2.041075828824093, + "learning_rate": 6.570638862161449e-07, + "loss": 0.2114, + "step": 1830 + }, + { + "epoch": 2.556614740757963, + "grad_norm": 1.945830106436581, + "learning_rate": 6.530427917641447e-07, + "loss": 0.2164, + "step": 1831 + }, + { + "epoch": 2.5580097651708904, + "grad_norm": 2.0753411827360915, + "learning_rate": 6.490331794116633e-07, + "loss": 0.2533, + "step": 1832 + }, + { + "epoch": 2.5594047895838177, + "grad_norm": 1.7446405117787227, + "learning_rate": 6.450350597497335e-07, + "loss": 0.2094, + "step": 1833 + }, + { + "epoch": 2.560799813996745, + "grad_norm": 2.0417418372007456, + "learning_rate": 6.410484433390335e-07, + "loss": 0.2552, + "step": 1834 + }, + { + "epoch": 2.562194838409672, + "grad_norm": 1.9407852162679993, + "learning_rate": 6.370733407098517e-07, + "loss": 0.2034, + "step": 1835 + }, + { + "epoch": 2.5635898628225995, + "grad_norm": 2.0514788706278275, + "learning_rate": 6.331097623620697e-07, + "loss": 0.2379, + "step": 1836 + }, + { + "epoch": 2.5649848872355268, + "grad_norm": 1.8667937445523195, + "learning_rate": 6.291577187651255e-07, + "loss": 0.223, + "step": 1837 + }, + { + "epoch": 2.566379911648454, + "grad_norm": 2.0783967507217445, + "learning_rate": 6.252172203579892e-07, + "loss": 0.2411, + "step": 1838 + }, + { + "epoch": 2.5677749360613813, + "grad_norm": 2.0226917795020602, + "learning_rate": 6.212882775491352e-07, + "loss": 0.269, + "step": 1839 + }, + { + "epoch": 2.5691699604743086, + "grad_norm": 1.977576382502622, + "learning_rate": 6.173709007165158e-07, + "loss": 0.2432, + "step": 1840 + }, + { + "epoch": 2.5705649848872354, + "grad_norm": 2.113865208840359, + "learning_rate": 6.134651002075315e-07, + "loss": 0.2335, + "step": 1841 + }, + { + "epoch": 2.5719600093001627, + "grad_norm": 2.111195344110925, + "learning_rate": 6.095708863390065e-07, + "loss": 0.2428, + "step": 1842 + }, + { + "epoch": 2.57335503371309, + "grad_norm": 1.8766206380320936, + "learning_rate": 6.056882693971605e-07, + "loss": 0.2201, + "step": 1843 + }, + { + "epoch": 2.574750058126017, + "grad_norm": 2.09593489201861, + "learning_rate": 6.018172596375776e-07, + "loss": 0.2652, + "step": 1844 + }, + { + "epoch": 2.5761450825389445, + "grad_norm": 1.971523019642963, + "learning_rate": 5.979578672851843e-07, + "loss": 0.2481, + "step": 1845 + }, + { + "epoch": 2.5775401069518717, + "grad_norm": 2.0922975506755646, + "learning_rate": 5.941101025342239e-07, + "loss": 0.2221, + "step": 1846 + }, + { + "epoch": 2.578935131364799, + "grad_norm": 2.2444677637837653, + "learning_rate": 5.902739755482201e-07, + "loss": 0.29, + "step": 1847 + }, + { + "epoch": 2.580330155777726, + "grad_norm": 1.963684451073659, + "learning_rate": 5.864494964599615e-07, + "loss": 0.2135, + "step": 1848 + }, + { + "epoch": 2.581725180190653, + "grad_norm": 2.008525153651005, + "learning_rate": 5.826366753714707e-07, + "loss": 0.2359, + "step": 1849 + }, + { + "epoch": 2.5831202046035804, + "grad_norm": 1.8327259871668735, + "learning_rate": 5.788355223539698e-07, + "loss": 0.2244, + "step": 1850 + }, + { + "epoch": 2.5845152290165077, + "grad_norm": 2.1182315396159876, + "learning_rate": 5.750460474478675e-07, + "loss": 0.2228, + "step": 1851 + }, + { + "epoch": 2.585910253429435, + "grad_norm": 1.8049971008688241, + "learning_rate": 5.712682606627251e-07, + "loss": 0.2294, + "step": 1852 + }, + { + "epoch": 2.587305277842362, + "grad_norm": 2.23181809180191, + "learning_rate": 5.675021719772262e-07, + "loss": 0.2463, + "step": 1853 + }, + { + "epoch": 2.5887003022552895, + "grad_norm": 1.9925138661937265, + "learning_rate": 5.637477913391604e-07, + "loss": 0.2242, + "step": 1854 + }, + { + "epoch": 2.5900953266682167, + "grad_norm": 2.010379835853784, + "learning_rate": 5.600051286653884e-07, + "loss": 0.2568, + "step": 1855 + }, + { + "epoch": 2.591490351081144, + "grad_norm": 2.1668861502665884, + "learning_rate": 5.562741938418187e-07, + "loss": 0.2281, + "step": 1856 + }, + { + "epoch": 2.5928853754940713, + "grad_norm": 2.0561471815296843, + "learning_rate": 5.525549967233829e-07, + "loss": 0.2368, + "step": 1857 + }, + { + "epoch": 2.5942803999069985, + "grad_norm": 2.0978158108861567, + "learning_rate": 5.488475471340099e-07, + "loss": 0.2341, + "step": 1858 + }, + { + "epoch": 2.595675424319926, + "grad_norm": 2.085462389890947, + "learning_rate": 5.451518548665946e-07, + "loss": 0.2429, + "step": 1859 + }, + { + "epoch": 2.597070448732853, + "grad_norm": 1.806559468372376, + "learning_rate": 5.414679296829806e-07, + "loss": 0.2287, + "step": 1860 + }, + { + "epoch": 2.59846547314578, + "grad_norm": 1.7938452572055765, + "learning_rate": 5.377957813139262e-07, + "loss": 0.1991, + "step": 1861 + }, + { + "epoch": 2.599860497558707, + "grad_norm": 2.1163670152268446, + "learning_rate": 5.341354194590831e-07, + "loss": 0.3004, + "step": 1862 + }, + { + "epoch": 2.6012555219716345, + "grad_norm": 2.15912048187847, + "learning_rate": 5.304868537869706e-07, + "loss": 0.2219, + "step": 1863 + }, + { + "epoch": 2.6026505463845617, + "grad_norm": 1.843409231643337, + "learning_rate": 5.268500939349514e-07, + "loss": 0.2283, + "step": 1864 + }, + { + "epoch": 2.604045570797489, + "grad_norm": 1.9210220245156822, + "learning_rate": 5.232251495091989e-07, + "loss": 0.2133, + "step": 1865 + }, + { + "epoch": 2.6054405952104163, + "grad_norm": 1.8978056842019908, + "learning_rate": 5.196120300846835e-07, + "loss": 0.221, + "step": 1866 + }, + { + "epoch": 2.6068356196233435, + "grad_norm": 1.9173007014525827, + "learning_rate": 5.160107452051361e-07, + "loss": 0.2478, + "step": 1867 + }, + { + "epoch": 2.6082306440362704, + "grad_norm": 2.063458389194393, + "learning_rate": 5.124213043830278e-07, + "loss": 0.2033, + "step": 1868 + }, + { + "epoch": 2.6096256684491976, + "grad_norm": 1.964758365774162, + "learning_rate": 5.088437170995481e-07, + "loss": 0.2132, + "step": 1869 + }, + { + "epoch": 2.611020692862125, + "grad_norm": 1.8757381329933205, + "learning_rate": 5.052779928045737e-07, + "loss": 0.2257, + "step": 1870 + }, + { + "epoch": 2.612415717275052, + "grad_norm": 1.9938237138566777, + "learning_rate": 5.01724140916649e-07, + "loss": 0.2365, + "step": 1871 + }, + { + "epoch": 2.6138107416879794, + "grad_norm": 2.057328367380359, + "learning_rate": 4.981821708229545e-07, + "loss": 0.2523, + "step": 1872 + }, + { + "epoch": 2.6152057661009067, + "grad_norm": 2.093011615403081, + "learning_rate": 4.946520918792886e-07, + "loss": 0.22, + "step": 1873 + }, + { + "epoch": 2.616600790513834, + "grad_norm": 2.0856658826067878, + "learning_rate": 4.911339134100401e-07, + "loss": 0.2504, + "step": 1874 + }, + { + "epoch": 2.6179958149267613, + "grad_norm": 2.2370140372218805, + "learning_rate": 4.87627644708163e-07, + "loss": 0.2391, + "step": 1875 + }, + { + "epoch": 2.6193908393396885, + "grad_norm": 1.992843597428801, + "learning_rate": 4.841332950351535e-07, + "loss": 0.2452, + "step": 1876 + }, + { + "epoch": 2.620785863752616, + "grad_norm": 2.036241454180426, + "learning_rate": 4.806508736210253e-07, + "loss": 0.2378, + "step": 1877 + }, + { + "epoch": 2.622180888165543, + "grad_norm": 1.8430704939397233, + "learning_rate": 4.771803896642812e-07, + "loss": 0.218, + "step": 1878 + }, + { + "epoch": 2.6235759125784703, + "grad_norm": 1.9168731440964641, + "learning_rate": 4.737218523318965e-07, + "loss": 0.2575, + "step": 1879 + }, + { + "epoch": 2.6249709369913976, + "grad_norm": 1.9668663396982646, + "learning_rate": 4.7027527075929e-07, + "loss": 0.2195, + "step": 1880 + }, + { + "epoch": 2.6263659614043244, + "grad_norm": 2.1482072780264247, + "learning_rate": 4.6684065405029677e-07, + "loss": 0.2384, + "step": 1881 + }, + { + "epoch": 2.6277609858172517, + "grad_norm": 1.936941683232777, + "learning_rate": 4.6341801127715303e-07, + "loss": 0.2247, + "step": 1882 + }, + { + "epoch": 2.629156010230179, + "grad_norm": 2.010168782321282, + "learning_rate": 4.6000735148046316e-07, + "loss": 0.2536, + "step": 1883 + }, + { + "epoch": 2.6305510346431062, + "grad_norm": 2.0449854159937937, + "learning_rate": 4.566086836691791e-07, + "loss": 0.2277, + "step": 1884 + }, + { + "epoch": 2.6319460590560335, + "grad_norm": 1.9233425119697793, + "learning_rate": 4.532220168205798e-07, + "loss": 0.2421, + "step": 1885 + }, + { + "epoch": 2.633341083468961, + "grad_norm": 2.0816496032711003, + "learning_rate": 4.498473598802444e-07, + "loss": 0.2053, + "step": 1886 + }, + { + "epoch": 2.634736107881888, + "grad_norm": 2.013648412197882, + "learning_rate": 4.464847217620266e-07, + "loss": 0.2315, + "step": 1887 + }, + { + "epoch": 2.636131132294815, + "grad_norm": 1.9879522597479942, + "learning_rate": 4.4313411134803584e-07, + "loss": 0.2463, + "step": 1888 + }, + { + "epoch": 2.637526156707742, + "grad_norm": 2.0669142465872024, + "learning_rate": 4.397955374886104e-07, + "loss": 0.2174, + "step": 1889 + }, + { + "epoch": 2.6389211811206694, + "grad_norm": 2.048055292937162, + "learning_rate": 4.364690090022938e-07, + "loss": 0.2393, + "step": 1890 + }, + { + "epoch": 2.6403162055335967, + "grad_norm": 1.9212520880953143, + "learning_rate": 4.331545346758159e-07, + "loss": 0.2322, + "step": 1891 + }, + { + "epoch": 2.641711229946524, + "grad_norm": 1.9897143304092708, + "learning_rate": 4.2985212326406456e-07, + "loss": 0.2112, + "step": 1892 + }, + { + "epoch": 2.6431062543594512, + "grad_norm": 1.994917321745064, + "learning_rate": 4.265617834900637e-07, + "loss": 0.2139, + "step": 1893 + }, + { + "epoch": 2.6445012787723785, + "grad_norm": 1.9275707655087257, + "learning_rate": 4.2328352404495346e-07, + "loss": 0.2031, + "step": 1894 + }, + { + "epoch": 2.6458963031853058, + "grad_norm": 2.114603415128938, + "learning_rate": 4.2001735358796316e-07, + "loss": 0.2053, + "step": 1895 + }, + { + "epoch": 2.647291327598233, + "grad_norm": 1.7540496628185678, + "learning_rate": 4.167632807463895e-07, + "loss": 0.2058, + "step": 1896 + }, + { + "epoch": 2.6486863520111603, + "grad_norm": 1.9772733263814442, + "learning_rate": 4.135213141155769e-07, + "loss": 0.227, + "step": 1897 + }, + { + "epoch": 2.6500813764240876, + "grad_norm": 2.122744145357744, + "learning_rate": 4.1029146225889103e-07, + "loss": 0.2367, + "step": 1898 + }, + { + "epoch": 2.651476400837015, + "grad_norm": 2.0606193799135797, + "learning_rate": 4.0707373370769634e-07, + "loss": 0.2345, + "step": 1899 + }, + { + "epoch": 2.652871425249942, + "grad_norm": 2.037156489409722, + "learning_rate": 4.0386813696133564e-07, + "loss": 0.235, + "step": 1900 + }, + { + "epoch": 2.654266449662869, + "grad_norm": 2.025359743039822, + "learning_rate": 4.0067468048710756e-07, + "loss": 0.2194, + "step": 1901 + }, + { + "epoch": 2.6556614740757962, + "grad_norm": 1.9308904352486622, + "learning_rate": 3.974933727202412e-07, + "loss": 0.228, + "step": 1902 + }, + { + "epoch": 2.6570564984887235, + "grad_norm": 1.8104587056449903, + "learning_rate": 3.943242220638777e-07, + "loss": 0.2135, + "step": 1903 + }, + { + "epoch": 2.6584515229016508, + "grad_norm": 1.991585390255207, + "learning_rate": 3.911672368890462e-07, + "loss": 0.2403, + "step": 1904 + }, + { + "epoch": 2.659846547314578, + "grad_norm": 2.1170487598745242, + "learning_rate": 3.8802242553464096e-07, + "loss": 0.2307, + "step": 1905 + }, + { + "epoch": 2.6612415717275053, + "grad_norm": 2.0282853922616075, + "learning_rate": 3.8488979630739996e-07, + "loss": 0.2272, + "step": 1906 + }, + { + "epoch": 2.6626365961404326, + "grad_norm": 2.0265308519331855, + "learning_rate": 3.8176935748188425e-07, + "loss": 0.1872, + "step": 1907 + }, + { + "epoch": 2.66403162055336, + "grad_norm": 1.7643254590081145, + "learning_rate": 3.78661117300454e-07, + "loss": 0.2069, + "step": 1908 + }, + { + "epoch": 2.6654266449662867, + "grad_norm": 1.986084570077434, + "learning_rate": 3.755650839732489e-07, + "loss": 0.2139, + "step": 1909 + }, + { + "epoch": 2.666821669379214, + "grad_norm": 1.8960562363541258, + "learning_rate": 3.7248126567816454e-07, + "loss": 0.2557, + "step": 1910 + }, + { + "epoch": 2.668216693792141, + "grad_norm": 2.0653357761405995, + "learning_rate": 3.694096705608319e-07, + "loss": 0.2375, + "step": 1911 + }, + { + "epoch": 2.6696117182050685, + "grad_norm": 2.1077724754663407, + "learning_rate": 3.6635030673459413e-07, + "loss": 0.2215, + "step": 1912 + }, + { + "epoch": 2.6710067426179958, + "grad_norm": 2.0513233906590247, + "learning_rate": 3.6330318228049e-07, + "loss": 0.248, + "step": 1913 + }, + { + "epoch": 2.672401767030923, + "grad_norm": 1.9265363937989877, + "learning_rate": 3.6026830524722443e-07, + "loss": 0.2407, + "step": 1914 + }, + { + "epoch": 2.6737967914438503, + "grad_norm": 1.9526513264278522, + "learning_rate": 3.572456836511551e-07, + "loss": 0.2443, + "step": 1915 + }, + { + "epoch": 2.6751918158567776, + "grad_norm": 1.9303284847693432, + "learning_rate": 3.5423532547626816e-07, + "loss": 0.2205, + "step": 1916 + }, + { + "epoch": 2.676586840269705, + "grad_norm": 1.8811607733432787, + "learning_rate": 3.5123723867415527e-07, + "loss": 0.1948, + "step": 1917 + }, + { + "epoch": 2.677981864682632, + "grad_norm": 2.03022918053246, + "learning_rate": 3.4825143116399454e-07, + "loss": 0.2197, + "step": 1918 + }, + { + "epoch": 2.6793768890955594, + "grad_norm": 2.2929035930250397, + "learning_rate": 3.452779108325316e-07, + "loss": 0.2173, + "step": 1919 + }, + { + "epoch": 2.6807719135084866, + "grad_norm": 2.001488086541109, + "learning_rate": 3.4231668553405316e-07, + "loss": 0.2216, + "step": 1920 + }, + { + "epoch": 2.6821669379214135, + "grad_norm": 2.0989052899619187, + "learning_rate": 3.39367763090373e-07, + "loss": 0.2404, + "step": 1921 + }, + { + "epoch": 2.6835619623343407, + "grad_norm": 2.053071087875118, + "learning_rate": 3.3643115129080695e-07, + "loss": 0.2371, + "step": 1922 + }, + { + "epoch": 2.684956986747268, + "grad_norm": 2.003778312547755, + "learning_rate": 3.3350685789215133e-07, + "loss": 0.2249, + "step": 1923 + }, + { + "epoch": 2.6863520111601953, + "grad_norm": 2.091480355902784, + "learning_rate": 3.3059489061866625e-07, + "loss": 0.2453, + "step": 1924 + }, + { + "epoch": 2.6877470355731226, + "grad_norm": 2.1456977629987772, + "learning_rate": 3.276952571620556e-07, + "loss": 0.2521, + "step": 1925 + }, + { + "epoch": 2.68914205998605, + "grad_norm": 2.075417810454784, + "learning_rate": 3.248079651814395e-07, + "loss": 0.231, + "step": 1926 + }, + { + "epoch": 2.690537084398977, + "grad_norm": 1.8386109201307006, + "learning_rate": 3.2193302230334455e-07, + "loss": 0.196, + "step": 1927 + }, + { + "epoch": 2.6919321088119044, + "grad_norm": 1.8455305977705043, + "learning_rate": 3.190704361216751e-07, + "loss": 0.2119, + "step": 1928 + }, + { + "epoch": 2.693327133224831, + "grad_norm": 2.139309615920741, + "learning_rate": 3.162202141976956e-07, + "loss": 0.2647, + "step": 1929 + }, + { + "epoch": 2.6947221576377585, + "grad_norm": 2.186763765790992, + "learning_rate": 3.133823640600137e-07, + "loss": 0.2589, + "step": 1930 + }, + { + "epoch": 2.6961171820506857, + "grad_norm": 1.9436673585389133, + "learning_rate": 3.105568932045577e-07, + "loss": 0.2499, + "step": 1931 + }, + { + "epoch": 2.697512206463613, + "grad_norm": 2.048763078121567, + "learning_rate": 3.077438090945573e-07, + "loss": 0.2188, + "step": 1932 + }, + { + "epoch": 2.6989072308765403, + "grad_norm": 2.1053994973981487, + "learning_rate": 3.0494311916052234e-07, + "loss": 0.2464, + "step": 1933 + }, + { + "epoch": 2.7003022552894675, + "grad_norm": 2.0192397420316506, + "learning_rate": 3.021548308002248e-07, + "loss": 0.2486, + "step": 1934 + }, + { + "epoch": 2.701697279702395, + "grad_norm": 1.9945367846333557, + "learning_rate": 2.9937895137868046e-07, + "loss": 0.2329, + "step": 1935 + }, + { + "epoch": 2.703092304115322, + "grad_norm": 2.155115974239163, + "learning_rate": 2.9661548822812636e-07, + "loss": 0.2162, + "step": 1936 + }, + { + "epoch": 2.7044873285282494, + "grad_norm": 1.844769199810483, + "learning_rate": 2.9386444864800355e-07, + "loss": 0.2554, + "step": 1937 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 1.9952354992173393, + "learning_rate": 2.911258399049394e-07, + "loss": 0.2677, + "step": 1938 + }, + { + "epoch": 2.707277377354104, + "grad_norm": 2.136313468913573, + "learning_rate": 2.8839966923272286e-07, + "loss": 0.2281, + "step": 1939 + }, + { + "epoch": 2.708672401767031, + "grad_norm": 2.037309497070798, + "learning_rate": 2.8568594383229067e-07, + "loss": 0.2527, + "step": 1940 + }, + { + "epoch": 2.710067426179958, + "grad_norm": 2.177748542840591, + "learning_rate": 2.8298467087170655e-07, + "loss": 0.2567, + "step": 1941 + }, + { + "epoch": 2.7114624505928853, + "grad_norm": 2.0205312029912563, + "learning_rate": 2.8029585748614196e-07, + "loss": 0.2419, + "step": 1942 + }, + { + "epoch": 2.7128574750058125, + "grad_norm": 2.0495681104433046, + "learning_rate": 2.7761951077785676e-07, + "loss": 0.2418, + "step": 1943 + }, + { + "epoch": 2.71425249941874, + "grad_norm": 2.0238763267167275, + "learning_rate": 2.749556378161833e-07, + "loss": 0.2247, + "step": 1944 + }, + { + "epoch": 2.715647523831667, + "grad_norm": 2.0799212973766887, + "learning_rate": 2.723042456375036e-07, + "loss": 0.2214, + "step": 1945 + }, + { + "epoch": 2.7170425482445943, + "grad_norm": 1.9828707876406386, + "learning_rate": 2.696653412452327e-07, + "loss": 0.2254, + "step": 1946 + }, + { + "epoch": 2.7184375726575216, + "grad_norm": 1.9574822824704652, + "learning_rate": 2.6703893160980266e-07, + "loss": 0.2315, + "step": 1947 + }, + { + "epoch": 2.719832597070449, + "grad_norm": 2.00140036288395, + "learning_rate": 2.6442502366863854e-07, + "loss": 0.2433, + "step": 1948 + }, + { + "epoch": 2.7212276214833757, + "grad_norm": 2.0476945336094827, + "learning_rate": 2.618236243261452e-07, + "loss": 0.2334, + "step": 1949 + }, + { + "epoch": 2.722622645896303, + "grad_norm": 2.1346972742135892, + "learning_rate": 2.592347404536888e-07, + "loss": 0.2349, + "step": 1950 + }, + { + "epoch": 2.7240176703092303, + "grad_norm": 2.1642854854702773, + "learning_rate": 2.566583788895721e-07, + "loss": 0.2348, + "step": 1951 + }, + { + "epoch": 2.7254126947221575, + "grad_norm": 1.9708668257935886, + "learning_rate": 2.5409454643902543e-07, + "loss": 0.2381, + "step": 1952 + }, + { + "epoch": 2.726807719135085, + "grad_norm": 2.059735784249374, + "learning_rate": 2.5154324987418434e-07, + "loss": 0.2414, + "step": 1953 + }, + { + "epoch": 2.728202743548012, + "grad_norm": 2.004488455511273, + "learning_rate": 2.4900449593406984e-07, + "loss": 0.2245, + "step": 1954 + }, + { + "epoch": 2.7295977679609393, + "grad_norm": 1.9994468174505557, + "learning_rate": 2.4647829132457446e-07, + "loss": 0.2259, + "step": 1955 + }, + { + "epoch": 2.7309927923738666, + "grad_norm": 1.843524154811651, + "learning_rate": 2.439646427184428e-07, + "loss": 0.2207, + "step": 1956 + }, + { + "epoch": 2.732387816786794, + "grad_norm": 2.1830527721245594, + "learning_rate": 2.4146355675525145e-07, + "loss": 0.2292, + "step": 1957 + }, + { + "epoch": 2.733782841199721, + "grad_norm": 1.9550636161481592, + "learning_rate": 2.389750400413965e-07, + "loss": 0.2481, + "step": 1958 + }, + { + "epoch": 2.7351778656126484, + "grad_norm": 2.009516811101446, + "learning_rate": 2.364990991500743e-07, + "loss": 0.2264, + "step": 1959 + }, + { + "epoch": 2.7365728900255757, + "grad_norm": 2.185037327848579, + "learning_rate": 2.340357406212601e-07, + "loss": 0.2055, + "step": 1960 + }, + { + "epoch": 2.7379679144385025, + "grad_norm": 1.8805293460013788, + "learning_rate": 2.315849709616963e-07, + "loss": 0.1983, + "step": 1961 + }, + { + "epoch": 2.73936293885143, + "grad_norm": 1.8960709072653619, + "learning_rate": 2.2914679664487237e-07, + "loss": 0.1951, + "step": 1962 + }, + { + "epoch": 2.740757963264357, + "grad_norm": 2.0602378849111247, + "learning_rate": 2.2672122411100727e-07, + "loss": 0.2368, + "step": 1963 + }, + { + "epoch": 2.7421529876772843, + "grad_norm": 1.960524247241766, + "learning_rate": 2.2430825976703485e-07, + "loss": 0.221, + "step": 1964 + }, + { + "epoch": 2.7435480120902116, + "grad_norm": 2.1133291760383375, + "learning_rate": 2.2190790998658561e-07, + "loss": 0.2667, + "step": 1965 + }, + { + "epoch": 2.744943036503139, + "grad_norm": 2.0557807010861917, + "learning_rate": 2.1952018110996843e-07, + "loss": 0.2194, + "step": 1966 + }, + { + "epoch": 2.746338060916066, + "grad_norm": 2.10649236905764, + "learning_rate": 2.1714507944415708e-07, + "loss": 0.2541, + "step": 1967 + }, + { + "epoch": 2.7477330853289934, + "grad_norm": 1.9442564367344377, + "learning_rate": 2.1478261126276989e-07, + "loss": 0.2181, + "step": 1968 + }, + { + "epoch": 2.7491281097419202, + "grad_norm": 2.0582235498172055, + "learning_rate": 2.1243278280605517e-07, + "loss": 0.2414, + "step": 1969 + }, + { + "epoch": 2.7505231341548475, + "grad_norm": 2.0367916482967803, + "learning_rate": 2.1009560028087627e-07, + "loss": 0.2198, + "step": 1970 + }, + { + "epoch": 2.7519181585677748, + "grad_norm": 2.0136363532537978, + "learning_rate": 2.0777106986069162e-07, + "loss": 0.2335, + "step": 1971 + }, + { + "epoch": 2.753313182980702, + "grad_norm": 1.9593075020307442, + "learning_rate": 2.0545919768554078e-07, + "loss": 0.2151, + "step": 1972 + }, + { + "epoch": 2.7547082073936293, + "grad_norm": 2.068783498395829, + "learning_rate": 2.0315998986202902e-07, + "loss": 0.2484, + "step": 1973 + }, + { + "epoch": 2.7561032318065566, + "grad_norm": 2.0417988017428095, + "learning_rate": 2.0087345246330714e-07, + "loss": 0.229, + "step": 1974 + }, + { + "epoch": 2.757498256219484, + "grad_norm": 2.107556273533938, + "learning_rate": 1.985995915290595e-07, + "loss": 0.2346, + "step": 1975 + }, + { + "epoch": 2.758893280632411, + "grad_norm": 2.0145221028032694, + "learning_rate": 1.9633841306548717e-07, + "loss": 0.213, + "step": 1976 + }, + { + "epoch": 2.7602883050453384, + "grad_norm": 1.995502113277562, + "learning_rate": 1.9408992304529252e-07, + "loss": 0.2505, + "step": 1977 + }, + { + "epoch": 2.7616833294582657, + "grad_norm": 1.974801471687715, + "learning_rate": 1.9185412740765962e-07, + "loss": 0.1984, + "step": 1978 + }, + { + "epoch": 2.763078353871193, + "grad_norm": 1.9678999879577588, + "learning_rate": 1.8963103205824397e-07, + "loss": 0.2222, + "step": 1979 + }, + { + "epoch": 2.76447337828412, + "grad_norm": 2.044892899381473, + "learning_rate": 1.8742064286915329e-07, + "loss": 0.233, + "step": 1980 + }, + { + "epoch": 2.765868402697047, + "grad_norm": 1.930009643778441, + "learning_rate": 1.8522296567893282e-07, + "loss": 0.25, + "step": 1981 + }, + { + "epoch": 2.7672634271099743, + "grad_norm": 2.189950482821562, + "learning_rate": 1.830380062925513e-07, + "loss": 0.223, + "step": 1982 + }, + { + "epoch": 2.7686584515229016, + "grad_norm": 2.1618985560864883, + "learning_rate": 1.8086577048138432e-07, + "loss": 0.2372, + "step": 1983 + }, + { + "epoch": 2.770053475935829, + "grad_norm": 1.8935735279008907, + "learning_rate": 1.787062639831988e-07, + "loss": 0.2201, + "step": 1984 + }, + { + "epoch": 2.771448500348756, + "grad_norm": 2.1258001318663116, + "learning_rate": 1.7655949250213743e-07, + "loss": 0.2152, + "step": 1985 + }, + { + "epoch": 2.7728435247616834, + "grad_norm": 1.9586861209264248, + "learning_rate": 1.7442546170870654e-07, + "loss": 0.2247, + "step": 1986 + }, + { + "epoch": 2.7742385491746107, + "grad_norm": 2.042832703188186, + "learning_rate": 1.7230417723975766e-07, + "loss": 0.2408, + "step": 1987 + }, + { + "epoch": 2.775633573587538, + "grad_norm": 1.9728731477852715, + "learning_rate": 1.7019564469847372e-07, + "loss": 0.2128, + "step": 1988 + }, + { + "epoch": 2.7770285980004648, + "grad_norm": 1.915287465847493, + "learning_rate": 1.6809986965435675e-07, + "loss": 0.205, + "step": 1989 + }, + { + "epoch": 2.778423622413392, + "grad_norm": 2.0763097266218726, + "learning_rate": 1.660168576432092e-07, + "loss": 0.2432, + "step": 1990 + }, + { + "epoch": 2.7798186468263193, + "grad_norm": 2.0818890704572643, + "learning_rate": 1.6394661416711977e-07, + "loss": 0.2215, + "step": 1991 + }, + { + "epoch": 2.7812136712392466, + "grad_norm": 1.9996995437677372, + "learning_rate": 1.6188914469445372e-07, + "loss": 0.2231, + "step": 1992 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 2.0338022002237546, + "learning_rate": 1.5984445465983156e-07, + "loss": 0.2332, + "step": 1993 + }, + { + "epoch": 2.784003720065101, + "grad_norm": 2.1000818079003896, + "learning_rate": 1.5781254946412029e-07, + "loss": 0.2169, + "step": 1994 + }, + { + "epoch": 2.7853987444780284, + "grad_norm": 1.9706118972006013, + "learning_rate": 1.5579343447441663e-07, + "loss": 0.2156, + "step": 1995 + }, + { + "epoch": 2.7867937688909556, + "grad_norm": 1.825179731518908, + "learning_rate": 1.5378711502403164e-07, + "loss": 0.1899, + "step": 1996 + }, + { + "epoch": 2.788188793303883, + "grad_norm": 1.9856171305408068, + "learning_rate": 1.5179359641247948e-07, + "loss": 0.2378, + "step": 1997 + }, + { + "epoch": 2.78958381771681, + "grad_norm": 2.0723262091623758, + "learning_rate": 1.4981288390546188e-07, + "loss": 0.2406, + "step": 1998 + }, + { + "epoch": 2.7909788421297375, + "grad_norm": 2.0926030580473984, + "learning_rate": 1.4784498273485436e-07, + "loss": 0.2271, + "step": 1999 + }, + { + "epoch": 2.7923738665426647, + "grad_norm": 2.0940586610006826, + "learning_rate": 1.458898980986917e-07, + "loss": 0.2356, + "step": 2000 + }, + { + "epoch": 2.793768890955592, + "grad_norm": 2.136395647396599, + "learning_rate": 1.4394763516115573e-07, + "loss": 0.2287, + "step": 2001 + }, + { + "epoch": 2.795163915368519, + "grad_norm": 2.0999404737694722, + "learning_rate": 1.4201819905256043e-07, + "loss": 0.2258, + "step": 2002 + }, + { + "epoch": 2.796558939781446, + "grad_norm": 2.0712201828778847, + "learning_rate": 1.4010159486933906e-07, + "loss": 0.2514, + "step": 2003 + }, + { + "epoch": 2.7979539641943734, + "grad_norm": 2.053549791837711, + "learning_rate": 1.3819782767403034e-07, + "loss": 0.2193, + "step": 2004 + }, + { + "epoch": 2.7993489886073006, + "grad_norm": 1.8855245314887703, + "learning_rate": 1.363069024952668e-07, + "loss": 0.2356, + "step": 2005 + }, + { + "epoch": 2.800744013020228, + "grad_norm": 1.923053478031501, + "learning_rate": 1.344288243277575e-07, + "loss": 0.245, + "step": 2006 + }, + { + "epoch": 2.802139037433155, + "grad_norm": 2.013116779776721, + "learning_rate": 1.3256359813227758e-07, + "loss": 0.2242, + "step": 2007 + }, + { + "epoch": 2.8035340618460824, + "grad_norm": 1.8577386165706877, + "learning_rate": 1.3071122883565657e-07, + "loss": 0.246, + "step": 2008 + }, + { + "epoch": 2.8049290862590093, + "grad_norm": 2.073816182656257, + "learning_rate": 1.288717213307622e-07, + "loss": 0.2448, + "step": 2009 + }, + { + "epoch": 2.8063241106719365, + "grad_norm": 2.1025633652970894, + "learning_rate": 1.2704508047649e-07, + "loss": 0.2524, + "step": 2010 + }, + { + "epoch": 2.807719135084864, + "grad_norm": 1.9930940045825634, + "learning_rate": 1.2523131109774822e-07, + "loss": 0.2007, + "step": 2011 + }, + { + "epoch": 2.809114159497791, + "grad_norm": 1.7898062762710325, + "learning_rate": 1.234304179854473e-07, + "loss": 0.2162, + "step": 2012 + }, + { + "epoch": 2.8105091839107184, + "grad_norm": 2.0747699633734653, + "learning_rate": 1.2164240589648436e-07, + "loss": 0.2266, + "step": 2013 + }, + { + "epoch": 2.8119042083236456, + "grad_norm": 1.9999733625821126, + "learning_rate": 1.1986727955373588e-07, + "loss": 0.2281, + "step": 2014 + }, + { + "epoch": 2.813299232736573, + "grad_norm": 2.0375106273671855, + "learning_rate": 1.1810504364603737e-07, + "loss": 0.24, + "step": 2015 + }, + { + "epoch": 2.8146942571495, + "grad_norm": 2.0844670518449693, + "learning_rate": 1.163557028281792e-07, + "loss": 0.2058, + "step": 2016 + }, + { + "epoch": 2.8160892815624274, + "grad_norm": 1.9962373585540047, + "learning_rate": 1.146192617208891e-07, + "loss": 0.1974, + "step": 2017 + }, + { + "epoch": 2.8174843059753547, + "grad_norm": 1.8386401423765901, + "learning_rate": 1.128957249108209e-07, + "loss": 0.2076, + "step": 2018 + }, + { + "epoch": 2.818879330388282, + "grad_norm": 2.0468415910931874, + "learning_rate": 1.1118509695054236e-07, + "loss": 0.2085, + "step": 2019 + }, + { + "epoch": 2.8202743548012092, + "grad_norm": 1.9826632032740978, + "learning_rate": 1.094873823585263e-07, + "loss": 0.2078, + "step": 2020 + }, + { + "epoch": 2.8216693792141365, + "grad_norm": 1.8783716162476043, + "learning_rate": 1.0780258561913281e-07, + "loss": 0.2367, + "step": 2021 + }, + { + "epoch": 2.8230644036270633, + "grad_norm": 2.217419255609164, + "learning_rate": 1.0613071118260321e-07, + "loss": 0.2332, + "step": 2022 + }, + { + "epoch": 2.8244594280399906, + "grad_norm": 2.200154723898179, + "learning_rate": 1.0447176346504439e-07, + "loss": 0.2497, + "step": 2023 + }, + { + "epoch": 2.825854452452918, + "grad_norm": 2.0172069553573193, + "learning_rate": 1.0282574684841784e-07, + "loss": 0.2507, + "step": 2024 + }, + { + "epoch": 2.827249476865845, + "grad_norm": 2.0893341853420506, + "learning_rate": 1.011926656805301e-07, + "loss": 0.213, + "step": 2025 + }, + { + "epoch": 2.8286445012787724, + "grad_norm": 1.814949208944079, + "learning_rate": 9.957252427501951e-08, + "loss": 0.2115, + "step": 2026 + }, + { + "epoch": 2.8300395256916997, + "grad_norm": 1.9078100366735615, + "learning_rate": 9.796532691134453e-08, + "loss": 0.2388, + "step": 2027 + }, + { + "epoch": 2.831434550104627, + "grad_norm": 1.9158681074801742, + "learning_rate": 9.637107783477484e-08, + "loss": 0.2171, + "step": 2028 + }, + { + "epoch": 2.832829574517554, + "grad_norm": 2.034413782195638, + "learning_rate": 9.478978125637583e-08, + "loss": 0.2061, + "step": 2029 + }, + { + "epoch": 2.834224598930481, + "grad_norm": 2.0640999913344595, + "learning_rate": 9.322144135300137e-08, + "loss": 0.2329, + "step": 2030 + }, + { + "epoch": 2.8356196233434083, + "grad_norm": 2.0272822842665774, + "learning_rate": 9.166606226728103e-08, + "loss": 0.2074, + "step": 2031 + }, + { + "epoch": 2.8370146477563356, + "grad_norm": 1.7729057129292922, + "learning_rate": 9.012364810761121e-08, + "loss": 0.2384, + "step": 2032 + }, + { + "epoch": 2.838409672169263, + "grad_norm": 2.336513429393473, + "learning_rate": 8.859420294814014e-08, + "loss": 0.2352, + "step": 2033 + }, + { + "epoch": 2.83980469658219, + "grad_norm": 1.9596699673808875, + "learning_rate": 8.70777308287618e-08, + "loss": 0.253, + "step": 2034 + }, + { + "epoch": 2.8411997209951174, + "grad_norm": 2.216580635688953, + "learning_rate": 8.557423575510037e-08, + "loss": 0.2535, + "step": 2035 + }, + { + "epoch": 2.8425947454080447, + "grad_norm": 2.169592549290016, + "learning_rate": 8.408372169850521e-08, + "loss": 0.2498, + "step": 2036 + }, + { + "epoch": 2.843989769820972, + "grad_norm": 2.1695073881811306, + "learning_rate": 8.26061925960353e-08, + "loss": 0.2648, + "step": 2037 + }, + { + "epoch": 2.8453847942338992, + "grad_norm": 2.197398628568355, + "learning_rate": 8.114165235045268e-08, + "loss": 0.2425, + "step": 2038 + }, + { + "epoch": 2.8467798186468265, + "grad_norm": 2.0971862433821427, + "learning_rate": 7.969010483020845e-08, + "loss": 0.2298, + "step": 2039 + }, + { + "epoch": 2.8481748430597538, + "grad_norm": 1.8349596523090903, + "learning_rate": 7.825155386943784e-08, + "loss": 0.2355, + "step": 2040 + }, + { + "epoch": 2.849569867472681, + "grad_norm": 2.0915909568177797, + "learning_rate": 7.682600326794353e-08, + "loss": 0.2419, + "step": 2041 + }, + { + "epoch": 2.850964891885608, + "grad_norm": 1.9667085866119112, + "learning_rate": 7.541345679118961e-08, + "loss": 0.2325, + "step": 2042 + }, + { + "epoch": 2.852359916298535, + "grad_norm": 2.144176674881882, + "learning_rate": 7.401391817029257e-08, + "loss": 0.2259, + "step": 2043 + }, + { + "epoch": 2.8537549407114624, + "grad_norm": 2.0336094691866, + "learning_rate": 7.262739110200923e-08, + "loss": 0.2272, + "step": 2044 + }, + { + "epoch": 2.8551499651243897, + "grad_norm": 2.132962040695552, + "learning_rate": 7.125387924872552e-08, + "loss": 0.2544, + "step": 2045 + }, + { + "epoch": 2.856544989537317, + "grad_norm": 1.955162224372117, + "learning_rate": 6.98933862384521e-08, + "loss": 0.2132, + "step": 2046 + }, + { + "epoch": 2.857940013950244, + "grad_norm": 2.095095213879681, + "learning_rate": 6.854591566480884e-08, + "loss": 0.1864, + "step": 2047 + }, + { + "epoch": 2.8593350383631715, + "grad_norm": 1.9264801577988864, + "learning_rate": 6.721147108701864e-08, + "loss": 0.2338, + "step": 2048 + }, + { + "epoch": 2.8607300627760983, + "grad_norm": 2.030106096946145, + "learning_rate": 6.589005602989862e-08, + "loss": 0.2299, + "step": 2049 + }, + { + "epoch": 2.8621250871890256, + "grad_norm": 2.312046798351557, + "learning_rate": 6.458167398384896e-08, + "loss": 0.2375, + "step": 2050 + }, + { + "epoch": 2.863520111601953, + "grad_norm": 2.0184696688819206, + "learning_rate": 6.328632840484294e-08, + "loss": 0.1987, + "step": 2051 + }, + { + "epoch": 2.86491513601488, + "grad_norm": 1.8251619065621412, + "learning_rate": 6.200402271442085e-08, + "loss": 0.2252, + "step": 2052 + }, + { + "epoch": 2.8663101604278074, + "grad_norm": 2.0497067870912575, + "learning_rate": 6.073476029967884e-08, + "loss": 0.2308, + "step": 2053 + }, + { + "epoch": 2.8677051848407347, + "grad_norm": 2.0486210575713097, + "learning_rate": 5.947854451326007e-08, + "loss": 0.2437, + "step": 2054 + }, + { + "epoch": 2.869100209253662, + "grad_norm": 1.9864897653983566, + "learning_rate": 5.823537867334694e-08, + "loss": 0.2112, + "step": 2055 + }, + { + "epoch": 2.870495233666589, + "grad_norm": 1.9230197033304526, + "learning_rate": 5.7005266063650534e-08, + "loss": 0.2251, + "step": 2056 + }, + { + "epoch": 2.8718902580795165, + "grad_norm": 2.151992679895416, + "learning_rate": 5.5788209933403944e-08, + "loss": 0.2792, + "step": 2057 + }, + { + "epoch": 2.8732852824924437, + "grad_norm": 1.920997012465794, + "learning_rate": 5.4584213497351766e-08, + "loss": 0.2251, + "step": 2058 + }, + { + "epoch": 2.874680306905371, + "grad_norm": 1.9992012597394688, + "learning_rate": 5.339327993574339e-08, + "loss": 0.2103, + "step": 2059 + }, + { + "epoch": 2.8760753313182983, + "grad_norm": 1.9393989081162242, + "learning_rate": 5.221541239432415e-08, + "loss": 0.1968, + "step": 2060 + }, + { + "epoch": 2.8774703557312256, + "grad_norm": 2.063840366940395, + "learning_rate": 5.1050613984324756e-08, + "loss": 0.2034, + "step": 2061 + }, + { + "epoch": 2.8788653801441524, + "grad_norm": 2.2590391916977346, + "learning_rate": 4.989888778245744e-08, + "loss": 0.2559, + "step": 2062 + }, + { + "epoch": 2.8802604045570797, + "grad_norm": 1.9797057677091427, + "learning_rate": 4.8760236830903697e-08, + "loss": 0.2391, + "step": 2063 + }, + { + "epoch": 2.881655428970007, + "grad_norm": 2.0780206968147117, + "learning_rate": 4.763466413730822e-08, + "loss": 0.2426, + "step": 2064 + }, + { + "epoch": 2.883050453382934, + "grad_norm": 2.036645937963387, + "learning_rate": 4.65221726747711e-08, + "loss": 0.237, + "step": 2065 + }, + { + "epoch": 2.8844454777958615, + "grad_norm": 2.1721874584487995, + "learning_rate": 4.542276538183954e-08, + "loss": 0.2231, + "step": 2066 + }, + { + "epoch": 2.8858405022087887, + "grad_norm": 2.042288111806101, + "learning_rate": 4.433644516249891e-08, + "loss": 0.2465, + "step": 2067 + }, + { + "epoch": 2.887235526621716, + "grad_norm": 2.142452821109536, + "learning_rate": 4.326321488616836e-08, + "loss": 0.251, + "step": 2068 + }, + { + "epoch": 2.888630551034643, + "grad_norm": 2.0700122700221915, + "learning_rate": 4.220307738768859e-08, + "loss": 0.2239, + "step": 2069 + }, + { + "epoch": 2.89002557544757, + "grad_norm": 1.9911765497561993, + "learning_rate": 4.11560354673185e-08, + "loss": 0.2451, + "step": 2070 + }, + { + "epoch": 2.8914205998604974, + "grad_norm": 1.8703876336869296, + "learning_rate": 4.0122091890726354e-08, + "loss": 0.2296, + "step": 2071 + }, + { + "epoch": 2.8928156242734246, + "grad_norm": 1.9321327299338205, + "learning_rate": 3.9101249388981965e-08, + "loss": 0.2019, + "step": 2072 + }, + { + "epoch": 2.894210648686352, + "grad_norm": 2.012664815357327, + "learning_rate": 3.809351065854894e-08, + "loss": 0.2576, + "step": 2073 + }, + { + "epoch": 2.895605673099279, + "grad_norm": 2.087285982253065, + "learning_rate": 3.709887836128023e-08, + "loss": 0.2611, + "step": 2074 + }, + { + "epoch": 2.8970006975122065, + "grad_norm": 2.0487586659165093, + "learning_rate": 3.611735512440706e-08, + "loss": 0.251, + "step": 2075 + }, + { + "epoch": 2.8983957219251337, + "grad_norm": 2.114982811705642, + "learning_rate": 3.5148943540536105e-08, + "loss": 0.2257, + "step": 2076 + }, + { + "epoch": 2.899790746338061, + "grad_norm": 1.9590167118799493, + "learning_rate": 3.4193646167640646e-08, + "loss": 0.253, + "step": 2077 + }, + { + "epoch": 2.9011857707509883, + "grad_norm": 2.1822450790273953, + "learning_rate": 3.325146552905223e-08, + "loss": 0.2262, + "step": 2078 + }, + { + "epoch": 2.9025807951639155, + "grad_norm": 1.9201480800951032, + "learning_rate": 3.2322404113457886e-08, + "loss": 0.2547, + "step": 2079 + }, + { + "epoch": 2.903975819576843, + "grad_norm": 2.0972666441542107, + "learning_rate": 3.1406464374890144e-08, + "loss": 0.2294, + "step": 2080 + }, + { + "epoch": 2.90537084398977, + "grad_norm": 1.991224686802119, + "learning_rate": 3.0503648732722046e-08, + "loss": 0.25, + "step": 2081 + }, + { + "epoch": 2.906765868402697, + "grad_norm": 2.0853791989901547, + "learning_rate": 2.9613959571660468e-08, + "loss": 0.2189, + "step": 2082 + }, + { + "epoch": 2.908160892815624, + "grad_norm": 2.0370360728725916, + "learning_rate": 2.8737399241740016e-08, + "loss": 0.2187, + "step": 2083 + }, + { + "epoch": 2.9095559172285514, + "grad_norm": 2.014096101500848, + "learning_rate": 2.7873970058316934e-08, + "loss": 0.224, + "step": 2084 + }, + { + "epoch": 2.9109509416414787, + "grad_norm": 1.986275091522161, + "learning_rate": 2.7023674302061875e-08, + "loss": 0.225, + "step": 2085 + }, + { + "epoch": 2.912345966054406, + "grad_norm": 1.9278901568730784, + "learning_rate": 2.6186514218954905e-08, + "loss": 0.2088, + "step": 2086 + }, + { + "epoch": 2.9137409904673333, + "grad_norm": 2.0213138891500924, + "learning_rate": 2.5362492020280517e-08, + "loss": 0.216, + "step": 2087 + }, + { + "epoch": 2.9151360148802605, + "grad_norm": 1.9502452930485683, + "learning_rate": 2.4551609882619288e-08, + "loss": 0.2513, + "step": 2088 + }, + { + "epoch": 2.9165310392931874, + "grad_norm": 2.2370632409827684, + "learning_rate": 2.3753869947843457e-08, + "loss": 0.2172, + "step": 2089 + }, + { + "epoch": 2.9179260637061146, + "grad_norm": 2.015550383922713, + "learning_rate": 2.296927432311358e-08, + "loss": 0.2448, + "step": 2090 + }, + { + "epoch": 2.919321088119042, + "grad_norm": 2.03055889711366, + "learning_rate": 2.2197825080867432e-08, + "loss": 0.2196, + "step": 2091 + }, + { + "epoch": 2.920716112531969, + "grad_norm": 2.1961513763780176, + "learning_rate": 2.1439524258819456e-08, + "loss": 0.2498, + "step": 2092 + }, + { + "epoch": 2.9221111369448964, + "grad_norm": 1.9256101469230513, + "learning_rate": 2.0694373859954653e-08, + "loss": 0.22, + "step": 2093 + }, + { + "epoch": 2.9235061613578237, + "grad_norm": 1.9815134324395363, + "learning_rate": 1.99623758525197e-08, + "loss": 0.237, + "step": 2094 + }, + { + "epoch": 2.924901185770751, + "grad_norm": 1.907106053723731, + "learning_rate": 1.9243532170023504e-08, + "loss": 0.2211, + "step": 2095 + }, + { + "epoch": 2.9262962101836782, + "grad_norm": 1.9455456724863225, + "learning_rate": 1.8537844711227215e-08, + "loss": 0.2026, + "step": 2096 + }, + { + "epoch": 2.9276912345966055, + "grad_norm": 1.9285325866093366, + "learning_rate": 1.7845315340140334e-08, + "loss": 0.2107, + "step": 2097 + }, + { + "epoch": 2.929086259009533, + "grad_norm": 1.7732551833753323, + "learning_rate": 1.7165945886018498e-08, + "loss": 0.1909, + "step": 2098 + }, + { + "epoch": 2.93048128342246, + "grad_norm": 2.0531915086165236, + "learning_rate": 1.6499738143354594e-08, + "loss": 0.2141, + "step": 2099 + }, + { + "epoch": 2.9318763078353873, + "grad_norm": 2.1691009679826094, + "learning_rate": 1.584669387187765e-08, + "loss": 0.2288, + "step": 2100 + }, + { + "epoch": 2.9332713322483146, + "grad_norm": 1.879402148888189, + "learning_rate": 1.520681479654562e-08, + "loss": 0.2178, + "step": 2101 + }, + { + "epoch": 2.9346663566612414, + "grad_norm": 1.953785706612728, + "learning_rate": 1.4580102607541502e-08, + "loss": 0.2275, + "step": 2102 + }, + { + "epoch": 2.9360613810741687, + "grad_norm": 1.9696383581667096, + "learning_rate": 1.3966558960269994e-08, + "loss": 0.2084, + "step": 2103 + }, + { + "epoch": 2.937456405487096, + "grad_norm": 1.6864007194487332, + "learning_rate": 1.3366185475351957e-08, + "loss": 0.2093, + "step": 2104 + }, + { + "epoch": 2.9388514299000232, + "grad_norm": 1.7996177568725729, + "learning_rate": 1.2778983738620521e-08, + "loss": 0.2305, + "step": 2105 + }, + { + "epoch": 2.9402464543129505, + "grad_norm": 2.088014390966735, + "learning_rate": 1.2204955301116095e-08, + "loss": 0.2105, + "step": 2106 + }, + { + "epoch": 2.941641478725878, + "grad_norm": 1.884717455818504, + "learning_rate": 1.164410167908414e-08, + "loss": 0.2199, + "step": 2107 + }, + { + "epoch": 2.943036503138805, + "grad_norm": 2.105817440939439, + "learning_rate": 1.109642435396907e-08, + "loss": 0.2152, + "step": 2108 + }, + { + "epoch": 2.9444315275517323, + "grad_norm": 1.9437942158112311, + "learning_rate": 1.0561924772412024e-08, + "loss": 0.2228, + "step": 2109 + }, + { + "epoch": 2.945826551964659, + "grad_norm": 2.376506125566622, + "learning_rate": 1.0040604346245319e-08, + "loss": 0.27, + "step": 2110 + }, + { + "epoch": 2.9472215763775864, + "grad_norm": 1.7465087784159485, + "learning_rate": 9.532464452491341e-09, + "loss": 0.2221, + "step": 2111 + }, + { + "epoch": 2.9486166007905137, + "grad_norm": 2.105100907970145, + "learning_rate": 9.037506433355325e-09, + "loss": 0.228, + "step": 2112 + }, + { + "epoch": 2.950011625203441, + "grad_norm": 2.057081162802249, + "learning_rate": 8.555731596224803e-09, + "loss": 0.2096, + "step": 2113 + }, + { + "epoch": 2.9514066496163682, + "grad_norm": 1.9893991376538172, + "learning_rate": 8.087141213665717e-09, + "loss": 0.2394, + "step": 2114 + }, + { + "epoch": 2.9528016740292955, + "grad_norm": 1.953608082819364, + "learning_rate": 7.631736523416867e-09, + "loss": 0.2302, + "step": 2115 + }, + { + "epoch": 2.9541966984422228, + "grad_norm": 1.9960133893896974, + "learning_rate": 7.1895187283899104e-09, + "loss": 0.1825, + "step": 2116 + }, + { + "epoch": 2.95559172285515, + "grad_norm": 2.0829277277844622, + "learning_rate": 6.760488996662706e-09, + "loss": 0.2352, + "step": 2117 + }, + { + "epoch": 2.9569867472680773, + "grad_norm": 1.9399119855696576, + "learning_rate": 6.3446484614798635e-09, + "loss": 0.1935, + "step": 2118 + }, + { + "epoch": 2.9583817716810046, + "grad_norm": 1.9161326288025806, + "learning_rate": 5.941998221247192e-09, + "loss": 0.236, + "step": 2119 + }, + { + "epoch": 2.959776796093932, + "grad_norm": 2.089159927313854, + "learning_rate": 5.552539339528373e-09, + "loss": 0.2755, + "step": 2120 + }, + { + "epoch": 2.961171820506859, + "grad_norm": 1.8493546378825017, + "learning_rate": 5.176272845045516e-09, + "loss": 0.2264, + "step": 2121 + }, + { + "epoch": 2.962566844919786, + "grad_norm": 2.079176415927749, + "learning_rate": 4.813199731671381e-09, + "loss": 0.2042, + "step": 2122 + }, + { + "epoch": 2.963961869332713, + "grad_norm": 1.885503099661438, + "learning_rate": 4.463320958432716e-09, + "loss": 0.1929, + "step": 2123 + }, + { + "epoch": 2.9653568937456405, + "grad_norm": 1.8483044364688408, + "learning_rate": 4.1266374495024795e-09, + "loss": 0.2185, + "step": 2124 + }, + { + "epoch": 2.9667519181585678, + "grad_norm": 1.9885490703185613, + "learning_rate": 3.803150094200403e-09, + "loss": 0.2181, + "step": 2125 + }, + { + "epoch": 2.968146942571495, + "grad_norm": 1.9818060311208294, + "learning_rate": 3.4928597469885416e-09, + "loss": 0.2124, + "step": 2126 + }, + { + "epoch": 2.9695419669844223, + "grad_norm": 1.970519772764202, + "learning_rate": 3.1957672274723907e-09, + "loss": 0.2311, + "step": 2127 + }, + { + "epoch": 2.9709369913973496, + "grad_norm": 2.3609131575161273, + "learning_rate": 2.9118733203942207e-09, + "loss": 0.2369, + "step": 2128 + }, + { + "epoch": 2.972332015810277, + "grad_norm": 2.091411714396494, + "learning_rate": 2.6411787756353e-09, + "loss": 0.237, + "step": 2129 + }, + { + "epoch": 2.9737270402232037, + "grad_norm": 1.9145913000006876, + "learning_rate": 2.3836843082108987e-09, + "loss": 0.2468, + "step": 2130 + }, + { + "epoch": 2.975122064636131, + "grad_norm": 1.9045923373896427, + "learning_rate": 2.1393905982691752e-09, + "loss": 0.2164, + "step": 2131 + }, + { + "epoch": 2.976517089049058, + "grad_norm": 2.0044107224748284, + "learning_rate": 1.9082982910911817e-09, + "loss": 0.2264, + "step": 2132 + }, + { + "epoch": 2.9779121134619855, + "grad_norm": 1.961477665662354, + "learning_rate": 1.6904079970853083e-09, + "loss": 0.2187, + "step": 2133 + }, + { + "epoch": 2.9793071378749127, + "grad_norm": 1.8835567782670801, + "learning_rate": 1.4857202917900604e-09, + "loss": 0.2224, + "step": 2134 + }, + { + "epoch": 2.98070216228784, + "grad_norm": 2.1519901375116066, + "learning_rate": 1.2942357158701734e-09, + "loss": 0.2136, + "step": 2135 + }, + { + "epoch": 2.9820971867007673, + "grad_norm": 2.0606473023148135, + "learning_rate": 1.1159547751143918e-09, + "loss": 0.2397, + "step": 2136 + }, + { + "epoch": 2.9834922111136946, + "grad_norm": 1.9019884303178762, + "learning_rate": 9.508779404360235e-10, + "loss": 0.2472, + "step": 2137 + }, + { + "epoch": 2.984887235526622, + "grad_norm": 2.083765199600487, + "learning_rate": 7.990056478707209e-10, + "loss": 0.222, + "step": 2138 + }, + { + "epoch": 2.986282259939549, + "grad_norm": 1.9020968587306906, + "learning_rate": 6.603382985759244e-10, + "loss": 0.225, + "step": 2139 + }, + { + "epoch": 2.9876772843524764, + "grad_norm": 2.2043377624661127, + "learning_rate": 5.348762588286427e-10, + "loss": 0.2621, + "step": 2140 + }, + { + "epoch": 2.9890723087654036, + "grad_norm": 2.1382878635243983, + "learning_rate": 4.2261986002600783e-10, + "loss": 0.251, + "step": 2141 + }, + { + "epoch": 2.9904673331783305, + "grad_norm": 2.144632398744335, + "learning_rate": 3.235693986830546e-10, + "loss": 0.2618, + "step": 2142 + }, + { + "epoch": 2.9918623575912577, + "grad_norm": 2.136193399423466, + "learning_rate": 2.3772513643327555e-10, + "loss": 0.2406, + "step": 2143 + }, + { + "epoch": 2.993257382004185, + "grad_norm": 1.9404996403940398, + "learning_rate": 1.650873000258457e-10, + "loss": 0.2249, + "step": 2144 + }, + { + "epoch": 2.9946524064171123, + "grad_norm": 2.0262928151798407, + "learning_rate": 1.0565608132728778e-10, + "loss": 0.2308, + "step": 2145 + }, + { + "epoch": 2.9960474308300395, + "grad_norm": 1.9808655214657436, + "learning_rate": 5.943163732036183e-11, + "loss": 0.2503, + "step": 2146 + }, + { + "epoch": 2.997442455242967, + "grad_norm": 2.1179871403188932, + "learning_rate": 2.6414090102400147e-11, + "loss": 0.2435, + "step": 2147 + }, + { + "epoch": 2.998837479655894, + "grad_norm": 2.124758681637207, + "learning_rate": 6.6035268864173e-12, + "loss": 0.2391, + "step": 2148 + }, + { + "epoch": 2.998837479655894, + "step": 2148, + "total_flos": 3.265949006466908e+17, + "train_loss": 0.48496823534508215, + "train_runtime": 18125.988, + "train_samples_per_second": 2.847, + "train_steps_per_second": 0.119 + } + ], + "logging_steps": 1, + "max_steps": 2148, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.265949006466908e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}