diff --git "a/checkpoint-12328/trainer_state.json" "b/checkpoint-12328/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-12328/trainer_state.json" @@ -0,0 +1,86329 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 12328, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016223231667748214, + "grad_norm": 3.5011032612251896, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.879, + "step": 1 + }, + { + "epoch": 0.0003244646333549643, + "grad_norm": 3.6882450473006885, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.9205, + "step": 2 + }, + { + "epoch": 0.0004866969500324465, + "grad_norm": 3.738800422337598, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.9059, + "step": 3 + }, + { + "epoch": 0.0006489292667099286, + "grad_norm": 3.4348526081358552, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.844, + "step": 4 + }, + { + "epoch": 0.0008111615833874108, + "grad_norm": 3.5656195214360453, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.8983, + "step": 5 + }, + { + "epoch": 0.000973393900064893, + "grad_norm": 3.6436670716459276, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.9177, + "step": 6 + }, + { + "epoch": 0.0011356262167423752, + "grad_norm": 3.5270102591719357, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.9155, + "step": 7 + }, + { + "epoch": 0.0012978585334198572, + "grad_norm": 3.128258841418744, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.8991, + "step": 8 + }, + { + "epoch": 0.0014600908500973394, + "grad_norm": 3.1184728074687267, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.916, + "step": 9 + }, + { + "epoch": 0.0016223231667748216, + "grad_norm": 3.2714520411786565, + "learning_rate": 5.000000000000001e-07, + "loss": 0.8371, + "step": 10 + }, + { + "epoch": 0.0017845554834523037, + "grad_norm": 3.1603622485942404, + "learning_rate": 5.5e-07, + "loss": 0.9177, + "step": 11 + }, + { + "epoch": 0.001946787800129786, + "grad_norm": 2.8404536530214672, + "learning_rate": 6.000000000000001e-07, + "loss": 0.8697, + "step": 12 + }, + { + "epoch": 0.002109020116807268, + "grad_norm": 2.6947561681348287, + "learning_rate": 6.5e-07, + "loss": 0.9065, + "step": 13 + }, + { + "epoch": 0.0022712524334847503, + "grad_norm": 2.405162319647659, + "learning_rate": 7.000000000000001e-07, + "loss": 0.9028, + "step": 14 + }, + { + "epoch": 0.0024334847501622323, + "grad_norm": 2.3187493782035706, + "learning_rate": 7.5e-07, + "loss": 0.8805, + "step": 15 + }, + { + "epoch": 0.0025957170668397143, + "grad_norm": 1.9971224888043058, + "learning_rate": 8.000000000000001e-07, + "loss": 0.8562, + "step": 16 + }, + { + "epoch": 0.0027579493835171967, + "grad_norm": 1.8218262078898388, + "learning_rate": 8.500000000000001e-07, + "loss": 0.8808, + "step": 17 + }, + { + "epoch": 0.0029201817001946787, + "grad_norm": 1.6783142597492973, + "learning_rate": 9.000000000000001e-07, + "loss": 0.8896, + "step": 18 + }, + { + "epoch": 0.003082414016872161, + "grad_norm": 1.574213816484913, + "learning_rate": 9.500000000000001e-07, + "loss": 0.8849, + "step": 19 + }, + { + "epoch": 0.003244646333549643, + "grad_norm": 1.358848517199633, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.8805, + "step": 20 + }, + { + "epoch": 0.003406878650227125, + "grad_norm": 1.3488069538699246, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.8963, + "step": 21 + }, + { + "epoch": 0.0035691109669046075, + "grad_norm": 1.238178771518931, + "learning_rate": 1.1e-06, + "loss": 0.8895, + "step": 22 + }, + { + "epoch": 0.0037313432835820895, + "grad_norm": 1.1872183406811823, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.8264, + "step": 23 + }, + { + "epoch": 0.003893575600259572, + "grad_norm": 1.1151010867088509, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8257, + "step": 24 + }, + { + "epoch": 0.004055807916937054, + "grad_norm": 1.1250173976076996, + "learning_rate": 1.25e-06, + "loss": 0.8601, + "step": 25 + }, + { + "epoch": 0.004218040233614536, + "grad_norm": 1.0790090469750315, + "learning_rate": 1.3e-06, + "loss": 0.8334, + "step": 26 + }, + { + "epoch": 0.004380272550292018, + "grad_norm": 1.0869154277507271, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.8796, + "step": 27 + }, + { + "epoch": 0.004542504866969501, + "grad_norm": 0.9751455723032995, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.8373, + "step": 28 + }, + { + "epoch": 0.004704737183646983, + "grad_norm": 0.9388878326990321, + "learning_rate": 1.45e-06, + "loss": 0.761, + "step": 29 + }, + { + "epoch": 0.004866969500324465, + "grad_norm": 0.9729709202742933, + "learning_rate": 1.5e-06, + "loss": 0.8637, + "step": 30 + }, + { + "epoch": 0.005029201817001947, + "grad_norm": 0.8746220176927548, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.8075, + "step": 31 + }, + { + "epoch": 0.005191434133679429, + "grad_norm": 0.8852670818394883, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.8194, + "step": 32 + }, + { + "epoch": 0.0053536664503569115, + "grad_norm": 0.8877615528202797, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.8067, + "step": 33 + }, + { + "epoch": 0.0055158987670343934, + "grad_norm": 0.8675537652854793, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.7826, + "step": 34 + }, + { + "epoch": 0.005678131083711875, + "grad_norm": 0.8131856478572171, + "learning_rate": 1.75e-06, + "loss": 0.7958, + "step": 35 + }, + { + "epoch": 0.005840363400389357, + "grad_norm": 0.7764800954919194, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.7655, + "step": 36 + }, + { + "epoch": 0.006002595717066839, + "grad_norm": 0.8339862981055256, + "learning_rate": 1.85e-06, + "loss": 0.8037, + "step": 37 + }, + { + "epoch": 0.006164828033744322, + "grad_norm": 0.7767007026828351, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.7807, + "step": 38 + }, + { + "epoch": 0.006327060350421804, + "grad_norm": 0.7799835078506225, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.7375, + "step": 39 + }, + { + "epoch": 0.006489292667099286, + "grad_norm": 0.8183997927172201, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7876, + "step": 40 + }, + { + "epoch": 0.006651524983776768, + "grad_norm": 0.7477378048982426, + "learning_rate": 2.05e-06, + "loss": 0.7582, + "step": 41 + }, + { + "epoch": 0.00681375730045425, + "grad_norm": 0.6976430091245774, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.7521, + "step": 42 + }, + { + "epoch": 0.006975989617131733, + "grad_norm": 0.6933127057916861, + "learning_rate": 2.15e-06, + "loss": 0.7579, + "step": 43 + }, + { + "epoch": 0.007138221933809215, + "grad_norm": 0.722132968762779, + "learning_rate": 2.2e-06, + "loss": 0.7656, + "step": 44 + }, + { + "epoch": 0.007300454250486697, + "grad_norm": 0.7362871329888476, + "learning_rate": 2.25e-06, + "loss": 0.8483, + "step": 45 + }, + { + "epoch": 0.007462686567164179, + "grad_norm": 0.7176750890167213, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.7798, + "step": 46 + }, + { + "epoch": 0.007624918883841661, + "grad_norm": 0.7143823607675764, + "learning_rate": 2.35e-06, + "loss": 0.7744, + "step": 47 + }, + { + "epoch": 0.007787151200519144, + "grad_norm": 0.6807544838023724, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.7501, + "step": 48 + }, + { + "epoch": 0.007949383517196625, + "grad_norm": 0.7286538602859893, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.8185, + "step": 49 + }, + { + "epoch": 0.008111615833874108, + "grad_norm": 0.6959526365840605, + "learning_rate": 2.5e-06, + "loss": 0.8136, + "step": 50 + }, + { + "epoch": 0.00827384815055159, + "grad_norm": 0.6406257632890853, + "learning_rate": 2.55e-06, + "loss": 0.7179, + "step": 51 + }, + { + "epoch": 0.008436080467229072, + "grad_norm": 0.6905274554182129, + "learning_rate": 2.6e-06, + "loss": 0.7391, + "step": 52 + }, + { + "epoch": 0.008598312783906555, + "grad_norm": 0.6370329079766539, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.766, + "step": 53 + }, + { + "epoch": 0.008760545100584036, + "grad_norm": 0.6740857388871624, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.7523, + "step": 54 + }, + { + "epoch": 0.008922777417261519, + "grad_norm": 0.6195464673105623, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.732, + "step": 55 + }, + { + "epoch": 0.009085009733939001, + "grad_norm": 0.6445293584055014, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7261, + "step": 56 + }, + { + "epoch": 0.009247242050616482, + "grad_norm": 0.5853988900366073, + "learning_rate": 2.85e-06, + "loss": 0.7209, + "step": 57 + }, + { + "epoch": 0.009409474367293965, + "grad_norm": 0.5851202109648997, + "learning_rate": 2.9e-06, + "loss": 0.7024, + "step": 58 + }, + { + "epoch": 0.009571706683971446, + "grad_norm": 0.6227471248377136, + "learning_rate": 2.95e-06, + "loss": 0.7344, + "step": 59 + }, + { + "epoch": 0.00973393900064893, + "grad_norm": 0.6154164945335879, + "learning_rate": 3e-06, + "loss": 0.7203, + "step": 60 + }, + { + "epoch": 0.009896171317326412, + "grad_norm": 0.6212765674602251, + "learning_rate": 3.05e-06, + "loss": 0.7349, + "step": 61 + }, + { + "epoch": 0.010058403634003893, + "grad_norm": 0.6335774967487576, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.6951, + "step": 62 + }, + { + "epoch": 0.010220635950681376, + "grad_norm": 0.5965950018280336, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.7597, + "step": 63 + }, + { + "epoch": 0.010382868267358857, + "grad_norm": 0.5998041818896832, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.7292, + "step": 64 + }, + { + "epoch": 0.01054510058403634, + "grad_norm": 0.565874071514447, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7132, + "step": 65 + }, + { + "epoch": 0.010707332900713823, + "grad_norm": 0.6209725916740495, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.7246, + "step": 66 + }, + { + "epoch": 0.010869565217391304, + "grad_norm": 0.568426830012614, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.7275, + "step": 67 + }, + { + "epoch": 0.011031797534068787, + "grad_norm": 0.5718811580230821, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.6916, + "step": 68 + }, + { + "epoch": 0.011194029850746268, + "grad_norm": 0.5964717868492152, + "learning_rate": 3.45e-06, + "loss": 0.7212, + "step": 69 + }, + { + "epoch": 0.01135626216742375, + "grad_norm": 0.5834103617998909, + "learning_rate": 3.5e-06, + "loss": 0.7445, + "step": 70 + }, + { + "epoch": 0.011518494484101234, + "grad_norm": 0.5513609112227259, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.7078, + "step": 71 + }, + { + "epoch": 0.011680726800778715, + "grad_norm": 0.6048731221654371, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7374, + "step": 72 + }, + { + "epoch": 0.011842959117456198, + "grad_norm": 0.5705807741635119, + "learning_rate": 3.65e-06, + "loss": 0.7194, + "step": 73 + }, + { + "epoch": 0.012005191434133679, + "grad_norm": 0.5901345114452039, + "learning_rate": 3.7e-06, + "loss": 0.7238, + "step": 74 + }, + { + "epoch": 0.012167423750811162, + "grad_norm": 0.5563411035578699, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.709, + "step": 75 + }, + { + "epoch": 0.012329656067488644, + "grad_norm": 0.5928369363107909, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7089, + "step": 76 + }, + { + "epoch": 0.012491888384166126, + "grad_norm": 0.5884687897338163, + "learning_rate": 3.85e-06, + "loss": 0.6935, + "step": 77 + }, + { + "epoch": 0.012654120700843608, + "grad_norm": 0.5718495838281857, + "learning_rate": 3.900000000000001e-06, + "loss": 0.6918, + "step": 78 + }, + { + "epoch": 0.01281635301752109, + "grad_norm": 0.5724122676372986, + "learning_rate": 3.95e-06, + "loss": 0.7195, + "step": 79 + }, + { + "epoch": 0.012978585334198572, + "grad_norm": 0.590269200393233, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7565, + "step": 80 + }, + { + "epoch": 0.013140817650876055, + "grad_norm": 0.5624859017350167, + "learning_rate": 4.05e-06, + "loss": 0.7052, + "step": 81 + }, + { + "epoch": 0.013303049967553536, + "grad_norm": 0.5577834650777775, + "learning_rate": 4.1e-06, + "loss": 0.7231, + "step": 82 + }, + { + "epoch": 0.01346528228423102, + "grad_norm": 0.605120741061099, + "learning_rate": 4.15e-06, + "loss": 0.7134, + "step": 83 + }, + { + "epoch": 0.0136275146009085, + "grad_norm": 0.6271846183987227, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.7516, + "step": 84 + }, + { + "epoch": 0.013789746917585983, + "grad_norm": 0.5581467313154133, + "learning_rate": 4.25e-06, + "loss": 0.7232, + "step": 85 + }, + { + "epoch": 0.013951979234263466, + "grad_norm": 0.5755051807550504, + "learning_rate": 4.3e-06, + "loss": 0.7302, + "step": 86 + }, + { + "epoch": 0.014114211550940947, + "grad_norm": 0.5970418360975179, + "learning_rate": 4.350000000000001e-06, + "loss": 0.6343, + "step": 87 + }, + { + "epoch": 0.01427644386761843, + "grad_norm": 0.5801543860573787, + "learning_rate": 4.4e-06, + "loss": 0.7327, + "step": 88 + }, + { + "epoch": 0.014438676184295911, + "grad_norm": 0.5870638632095596, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7038, + "step": 89 + }, + { + "epoch": 0.014600908500973394, + "grad_norm": 0.5420265101592575, + "learning_rate": 4.5e-06, + "loss": 0.6946, + "step": 90 + }, + { + "epoch": 0.014763140817650877, + "grad_norm": 0.5788811399901236, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7272, + "step": 91 + }, + { + "epoch": 0.014925373134328358, + "grad_norm": 0.5808645166797506, + "learning_rate": 4.600000000000001e-06, + "loss": 0.706, + "step": 92 + }, + { + "epoch": 0.01508760545100584, + "grad_norm": 0.596178682453679, + "learning_rate": 4.65e-06, + "loss": 0.7244, + "step": 93 + }, + { + "epoch": 0.015249837767683322, + "grad_norm": 0.5590414713413869, + "learning_rate": 4.7e-06, + "loss": 0.6944, + "step": 94 + }, + { + "epoch": 0.015412070084360805, + "grad_norm": 0.5834743764084958, + "learning_rate": 4.75e-06, + "loss": 0.7236, + "step": 95 + }, + { + "epoch": 0.015574302401038288, + "grad_norm": 0.5855717960346297, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7231, + "step": 96 + }, + { + "epoch": 0.01573653471771577, + "grad_norm": 0.5320053857878654, + "learning_rate": 4.85e-06, + "loss": 0.6889, + "step": 97 + }, + { + "epoch": 0.01589876703439325, + "grad_norm": 0.6659337704630155, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.729, + "step": 98 + }, + { + "epoch": 0.016060999351070734, + "grad_norm": 0.5758855882841524, + "learning_rate": 4.95e-06, + "loss": 0.7459, + "step": 99 + }, + { + "epoch": 0.016223231667748216, + "grad_norm": 0.5630886711228157, + "learning_rate": 5e-06, + "loss": 0.6918, + "step": 100 + }, + { + "epoch": 0.016385463984425697, + "grad_norm": 0.5505339009165565, + "learning_rate": 4.99999996352865e-06, + "loss": 0.7046, + "step": 101 + }, + { + "epoch": 0.01654769630110318, + "grad_norm": 0.5625755852148216, + "learning_rate": 4.999999854114601e-06, + "loss": 0.7215, + "step": 102 + }, + { + "epoch": 0.016709928617780662, + "grad_norm": 0.5561035078798409, + "learning_rate": 4.999999671757856e-06, + "loss": 0.6933, + "step": 103 + }, + { + "epoch": 0.016872160934458143, + "grad_norm": 0.5787304936112931, + "learning_rate": 4.99999941645842e-06, + "loss": 0.6864, + "step": 104 + }, + { + "epoch": 0.017034393251135625, + "grad_norm": 0.5674698925226783, + "learning_rate": 4.9999990882163005e-06, + "loss": 0.6699, + "step": 105 + }, + { + "epoch": 0.01719662556781311, + "grad_norm": 0.6123565380332773, + "learning_rate": 4.9999986870315075e-06, + "loss": 0.7008, + "step": 106 + }, + { + "epoch": 0.01735885788449059, + "grad_norm": 0.5878907303222654, + "learning_rate": 4.999998212904053e-06, + "loss": 0.6933, + "step": 107 + }, + { + "epoch": 0.01752109020116807, + "grad_norm": 0.5811085327379852, + "learning_rate": 4.999997665833949e-06, + "loss": 0.6824, + "step": 108 + }, + { + "epoch": 0.017683322517845556, + "grad_norm": 0.5846386638894923, + "learning_rate": 4.9999970458212134e-06, + "loss": 0.718, + "step": 109 + }, + { + "epoch": 0.017845554834523037, + "grad_norm": 0.567542379323991, + "learning_rate": 4.999996352865865e-06, + "loss": 0.7068, + "step": 110 + }, + { + "epoch": 0.018007787151200518, + "grad_norm": 0.6034869851708228, + "learning_rate": 4.999995586967921e-06, + "loss": 0.6947, + "step": 111 + }, + { + "epoch": 0.018170019467878003, + "grad_norm": 0.5568899449044614, + "learning_rate": 4.999994748127407e-06, + "loss": 0.6863, + "step": 112 + }, + { + "epoch": 0.018332251784555484, + "grad_norm": 0.5559063736433502, + "learning_rate": 4.999993836344345e-06, + "loss": 0.6917, + "step": 113 + }, + { + "epoch": 0.018494484101232965, + "grad_norm": 0.5977851573029138, + "learning_rate": 4.999992851618763e-06, + "loss": 0.696, + "step": 114 + }, + { + "epoch": 0.018656716417910446, + "grad_norm": 0.5860583438575667, + "learning_rate": 4.999991793950688e-06, + "loss": 0.6888, + "step": 115 + }, + { + "epoch": 0.01881894873458793, + "grad_norm": 0.5529316475159397, + "learning_rate": 4.999990663340154e-06, + "loss": 0.7238, + "step": 116 + }, + { + "epoch": 0.018981181051265412, + "grad_norm": 0.5712819813141639, + "learning_rate": 4.999989459787191e-06, + "loss": 0.672, + "step": 117 + }, + { + "epoch": 0.019143413367942893, + "grad_norm": 0.5839299467221445, + "learning_rate": 4.999988183291836e-06, + "loss": 0.6553, + "step": 118 + }, + { + "epoch": 0.019305645684620378, + "grad_norm": 0.5477594445009771, + "learning_rate": 4.9999868338541256e-06, + "loss": 0.731, + "step": 119 + }, + { + "epoch": 0.01946787800129786, + "grad_norm": 0.6117839239179852, + "learning_rate": 4.999985411474098e-06, + "loss": 0.7165, + "step": 120 + }, + { + "epoch": 0.01963011031797534, + "grad_norm": 0.5518083540922223, + "learning_rate": 4.999983916151797e-06, + "loss": 0.6737, + "step": 121 + }, + { + "epoch": 0.019792342634652824, + "grad_norm": 0.591572875303357, + "learning_rate": 4.999982347887264e-06, + "loss": 0.6748, + "step": 122 + }, + { + "epoch": 0.019954574951330305, + "grad_norm": 0.5912717384711073, + "learning_rate": 4.999980706680546e-06, + "loss": 0.7197, + "step": 123 + }, + { + "epoch": 0.020116807268007787, + "grad_norm": 0.548323267154995, + "learning_rate": 4.999978992531691e-06, + "loss": 0.676, + "step": 124 + }, + { + "epoch": 0.020279039584685268, + "grad_norm": 0.5378004819082913, + "learning_rate": 4.999977205440748e-06, + "loss": 0.709, + "step": 125 + }, + { + "epoch": 0.020441271901362752, + "grad_norm": 0.5735907217642654, + "learning_rate": 4.99997534540777e-06, + "loss": 0.7033, + "step": 126 + }, + { + "epoch": 0.020603504218040233, + "grad_norm": 0.611432103416515, + "learning_rate": 4.999973412432812e-06, + "loss": 0.7331, + "step": 127 + }, + { + "epoch": 0.020765736534717714, + "grad_norm": 0.5801846673685215, + "learning_rate": 4.999971406515929e-06, + "loss": 0.6836, + "step": 128 + }, + { + "epoch": 0.0209279688513952, + "grad_norm": 0.5395790140564014, + "learning_rate": 4.99996932765718e-06, + "loss": 0.679, + "step": 129 + }, + { + "epoch": 0.02109020116807268, + "grad_norm": 0.5748635824280245, + "learning_rate": 4.999967175856625e-06, + "loss": 0.6998, + "step": 130 + }, + { + "epoch": 0.02125243348475016, + "grad_norm": 0.6241210928745985, + "learning_rate": 4.9999649511143274e-06, + "loss": 0.6983, + "step": 131 + }, + { + "epoch": 0.021414665801427646, + "grad_norm": 0.5921169345900037, + "learning_rate": 4.999962653430353e-06, + "loss": 0.699, + "step": 132 + }, + { + "epoch": 0.021576898118105127, + "grad_norm": 0.5785822691395817, + "learning_rate": 4.999960282804768e-06, + "loss": 0.7217, + "step": 133 + }, + { + "epoch": 0.021739130434782608, + "grad_norm": 0.5918014584277733, + "learning_rate": 4.9999578392376415e-06, + "loss": 0.6925, + "step": 134 + }, + { + "epoch": 0.02190136275146009, + "grad_norm": 0.5472479998171474, + "learning_rate": 4.9999553227290446e-06, + "loss": 0.696, + "step": 135 + }, + { + "epoch": 0.022063595068137574, + "grad_norm": 0.5561108718538762, + "learning_rate": 4.999952733279051e-06, + "loss": 0.7, + "step": 136 + }, + { + "epoch": 0.022225827384815055, + "grad_norm": 0.5581600486157452, + "learning_rate": 4.999950070887735e-06, + "loss": 0.6404, + "step": 137 + }, + { + "epoch": 0.022388059701492536, + "grad_norm": 0.5661396959109326, + "learning_rate": 4.999947335555177e-06, + "loss": 0.7101, + "step": 138 + }, + { + "epoch": 0.02255029201817002, + "grad_norm": 0.5592933636229189, + "learning_rate": 4.999944527281455e-06, + "loss": 0.6834, + "step": 139 + }, + { + "epoch": 0.0227125243348475, + "grad_norm": 0.5861422208831669, + "learning_rate": 4.999941646066652e-06, + "loss": 0.6774, + "step": 140 + }, + { + "epoch": 0.022874756651524983, + "grad_norm": 0.5971214029307094, + "learning_rate": 4.99993869191085e-06, + "loss": 0.7163, + "step": 141 + }, + { + "epoch": 0.023036988968202467, + "grad_norm": 0.5902683673342579, + "learning_rate": 4.9999356648141375e-06, + "loss": 0.7157, + "step": 142 + }, + { + "epoch": 0.02319922128487995, + "grad_norm": 0.5890067654662353, + "learning_rate": 4.999932564776601e-06, + "loss": 0.7624, + "step": 143 + }, + { + "epoch": 0.02336145360155743, + "grad_norm": 0.6047626304350823, + "learning_rate": 4.9999293917983325e-06, + "loss": 0.6745, + "step": 144 + }, + { + "epoch": 0.02352368591823491, + "grad_norm": 0.5635730412694969, + "learning_rate": 4.999926145879423e-06, + "loss": 0.6999, + "step": 145 + }, + { + "epoch": 0.023685918234912395, + "grad_norm": 0.5406918743846494, + "learning_rate": 4.999922827019969e-06, + "loss": 0.7153, + "step": 146 + }, + { + "epoch": 0.023848150551589876, + "grad_norm": 0.5938292166191902, + "learning_rate": 4.999919435220066e-06, + "loss": 0.7191, + "step": 147 + }, + { + "epoch": 0.024010382868267358, + "grad_norm": 0.5594016727722229, + "learning_rate": 4.999915970479813e-06, + "loss": 0.6863, + "step": 148 + }, + { + "epoch": 0.024172615184944842, + "grad_norm": 0.5773371756766511, + "learning_rate": 4.999912432799312e-06, + "loss": 0.6425, + "step": 149 + }, + { + "epoch": 0.024334847501622323, + "grad_norm": 0.5794372901024266, + "learning_rate": 4.999908822178666e-06, + "loss": 0.6866, + "step": 150 + }, + { + "epoch": 0.024497079818299804, + "grad_norm": 0.6349264045296137, + "learning_rate": 4.999905138617979e-06, + "loss": 0.703, + "step": 151 + }, + { + "epoch": 0.02465931213497729, + "grad_norm": 0.6136090830455821, + "learning_rate": 4.99990138211736e-06, + "loss": 0.6866, + "step": 152 + }, + { + "epoch": 0.02482154445165477, + "grad_norm": 0.5492433268197677, + "learning_rate": 4.999897552676919e-06, + "loss": 0.6703, + "step": 153 + }, + { + "epoch": 0.02498377676833225, + "grad_norm": 0.5877988692448967, + "learning_rate": 4.9998936502967654e-06, + "loss": 0.6956, + "step": 154 + }, + { + "epoch": 0.025146009085009732, + "grad_norm": 0.5779945501850478, + "learning_rate": 4.999889674977015e-06, + "loss": 0.7087, + "step": 155 + }, + { + "epoch": 0.025308241401687217, + "grad_norm": 0.5901163795336117, + "learning_rate": 4.999885626717784e-06, + "loss": 0.6594, + "step": 156 + }, + { + "epoch": 0.025470473718364698, + "grad_norm": 0.7027833233132346, + "learning_rate": 4.999881505519189e-06, + "loss": 0.67, + "step": 157 + }, + { + "epoch": 0.02563270603504218, + "grad_norm": 0.5801571854294091, + "learning_rate": 4.999877311381352e-06, + "loss": 0.6718, + "step": 158 + }, + { + "epoch": 0.025794938351719664, + "grad_norm": 0.5796311295916904, + "learning_rate": 4.999873044304393e-06, + "loss": 0.6781, + "step": 159 + }, + { + "epoch": 0.025957170668397145, + "grad_norm": 0.5818621213946525, + "learning_rate": 4.999868704288439e-06, + "loss": 0.676, + "step": 160 + }, + { + "epoch": 0.026119402985074626, + "grad_norm": 0.5636269657081767, + "learning_rate": 4.9998642913336144e-06, + "loss": 0.6636, + "step": 161 + }, + { + "epoch": 0.02628163530175211, + "grad_norm": 0.5802813278334794, + "learning_rate": 4.9998598054400505e-06, + "loss": 0.6874, + "step": 162 + }, + { + "epoch": 0.02644386761842959, + "grad_norm": 0.6189621341530747, + "learning_rate": 4.9998552466078756e-06, + "loss": 0.6772, + "step": 163 + }, + { + "epoch": 0.026606099935107073, + "grad_norm": 0.5928156749550382, + "learning_rate": 4.999850614837225e-06, + "loss": 0.686, + "step": 164 + }, + { + "epoch": 0.026768332251784554, + "grad_norm": 0.612918308721807, + "learning_rate": 4.999845910128231e-06, + "loss": 0.6862, + "step": 165 + }, + { + "epoch": 0.02693056456846204, + "grad_norm": 0.558288714173246, + "learning_rate": 4.999841132481035e-06, + "loss": 0.6552, + "step": 166 + }, + { + "epoch": 0.02709279688513952, + "grad_norm": 0.5933782671603824, + "learning_rate": 4.999836281895773e-06, + "loss": 0.6722, + "step": 167 + }, + { + "epoch": 0.027255029201817, + "grad_norm": 0.5734378603749863, + "learning_rate": 4.9998313583725865e-06, + "loss": 0.7019, + "step": 168 + }, + { + "epoch": 0.027417261518494485, + "grad_norm": 0.612594653162177, + "learning_rate": 4.999826361911622e-06, + "loss": 0.692, + "step": 169 + }, + { + "epoch": 0.027579493835171966, + "grad_norm": 0.5286746752103332, + "learning_rate": 4.999821292513022e-06, + "loss": 0.6405, + "step": 170 + }, + { + "epoch": 0.027741726151849447, + "grad_norm": 0.5569891569102852, + "learning_rate": 4.999816150176936e-06, + "loss": 0.7048, + "step": 171 + }, + { + "epoch": 0.027903958468526932, + "grad_norm": 0.5298759950818331, + "learning_rate": 4.999810934903515e-06, + "loss": 0.6982, + "step": 172 + }, + { + "epoch": 0.028066190785204413, + "grad_norm": 0.5414589770452752, + "learning_rate": 4.999805646692909e-06, + "loss": 0.684, + "step": 173 + }, + { + "epoch": 0.028228423101881894, + "grad_norm": 0.5995711540594314, + "learning_rate": 4.999800285545274e-06, + "loss": 0.6956, + "step": 174 + }, + { + "epoch": 0.028390655418559375, + "grad_norm": 0.5835199106493375, + "learning_rate": 4.999794851460766e-06, + "loss": 0.6909, + "step": 175 + }, + { + "epoch": 0.02855288773523686, + "grad_norm": 0.5809155459681484, + "learning_rate": 4.999789344439543e-06, + "loss": 0.6693, + "step": 176 + }, + { + "epoch": 0.02871512005191434, + "grad_norm": 0.595748485931571, + "learning_rate": 4.999783764481766e-06, + "loss": 0.6751, + "step": 177 + }, + { + "epoch": 0.028877352368591822, + "grad_norm": 0.5704216078964018, + "learning_rate": 4.999778111587598e-06, + "loss": 0.7057, + "step": 178 + }, + { + "epoch": 0.029039584685269307, + "grad_norm": 0.5590631795628255, + "learning_rate": 4.9997723857572035e-06, + "loss": 0.7188, + "step": 179 + }, + { + "epoch": 0.029201817001946788, + "grad_norm": 0.621168260036352, + "learning_rate": 4.9997665869907495e-06, + "loss": 0.7328, + "step": 180 + }, + { + "epoch": 0.02936404931862427, + "grad_norm": 0.618087294142451, + "learning_rate": 4.999760715288406e-06, + "loss": 0.6584, + "step": 181 + }, + { + "epoch": 0.029526281635301754, + "grad_norm": 0.5652873499717097, + "learning_rate": 4.999754770650344e-06, + "loss": 0.6719, + "step": 182 + }, + { + "epoch": 0.029688513951979235, + "grad_norm": 0.5726203252789515, + "learning_rate": 4.999748753076737e-06, + "loss": 0.6776, + "step": 183 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 0.6187838287866723, + "learning_rate": 4.99974266256776e-06, + "loss": 0.6478, + "step": 184 + }, + { + "epoch": 0.030012978585334197, + "grad_norm": 0.5791025899707688, + "learning_rate": 4.999736499123591e-06, + "loss": 0.6525, + "step": 185 + }, + { + "epoch": 0.03017521090201168, + "grad_norm": 0.5911132718120969, + "learning_rate": 4.99973026274441e-06, + "loss": 0.6886, + "step": 186 + }, + { + "epoch": 0.030337443218689163, + "grad_norm": 0.5600270771332668, + "learning_rate": 4.999723953430398e-06, + "loss": 0.6584, + "step": 187 + }, + { + "epoch": 0.030499675535366644, + "grad_norm": 0.5392528179396543, + "learning_rate": 4.999717571181742e-06, + "loss": 0.6484, + "step": 188 + }, + { + "epoch": 0.03066190785204413, + "grad_norm": 0.5550973040074854, + "learning_rate": 4.999711115998625e-06, + "loss": 0.667, + "step": 189 + }, + { + "epoch": 0.03082414016872161, + "grad_norm": 0.5590037545175758, + "learning_rate": 4.9997045878812365e-06, + "loss": 0.6578, + "step": 190 + }, + { + "epoch": 0.03098637248539909, + "grad_norm": 0.5851697324656132, + "learning_rate": 4.999697986829767e-06, + "loss": 0.6771, + "step": 191 + }, + { + "epoch": 0.031148604802076575, + "grad_norm": 0.5609807745756724, + "learning_rate": 4.999691312844409e-06, + "loss": 0.645, + "step": 192 + }, + { + "epoch": 0.031310837118754056, + "grad_norm": 0.5970370537707993, + "learning_rate": 4.999684565925358e-06, + "loss": 0.6538, + "step": 193 + }, + { + "epoch": 0.03147306943543154, + "grad_norm": 0.5682851462006359, + "learning_rate": 4.999677746072809e-06, + "loss": 0.6583, + "step": 194 + }, + { + "epoch": 0.03163530175210902, + "grad_norm": 0.5847817907368655, + "learning_rate": 4.999670853286963e-06, + "loss": 0.706, + "step": 195 + }, + { + "epoch": 0.0317975340687865, + "grad_norm": 0.6080760641349195, + "learning_rate": 4.99966388756802e-06, + "loss": 0.6949, + "step": 196 + }, + { + "epoch": 0.03195976638546399, + "grad_norm": 0.5951097751504074, + "learning_rate": 4.999656848916184e-06, + "loss": 0.6648, + "step": 197 + }, + { + "epoch": 0.03212199870214147, + "grad_norm": 0.5666413686198986, + "learning_rate": 4.999649737331659e-06, + "loss": 0.6723, + "step": 198 + }, + { + "epoch": 0.03228423101881895, + "grad_norm": 0.5967419864582026, + "learning_rate": 4.999642552814654e-06, + "loss": 0.6662, + "step": 199 + }, + { + "epoch": 0.03244646333549643, + "grad_norm": 0.5652919761753173, + "learning_rate": 4.999635295365378e-06, + "loss": 0.6358, + "step": 200 + }, + { + "epoch": 0.03260869565217391, + "grad_norm": 0.5888649614605591, + "learning_rate": 4.999627964984042e-06, + "loss": 0.6552, + "step": 201 + }, + { + "epoch": 0.03277092796885139, + "grad_norm": 0.5846289377996231, + "learning_rate": 4.999620561670862e-06, + "loss": 0.6465, + "step": 202 + }, + { + "epoch": 0.032933160285528874, + "grad_norm": 0.6163196687299176, + "learning_rate": 4.999613085426052e-06, + "loss": 0.6823, + "step": 203 + }, + { + "epoch": 0.03309539260220636, + "grad_norm": 0.5713494879718587, + "learning_rate": 4.99960553624983e-06, + "loss": 0.6912, + "step": 204 + }, + { + "epoch": 0.033257624918883844, + "grad_norm": 0.5996811795213359, + "learning_rate": 4.999597914142418e-06, + "loss": 0.6756, + "step": 205 + }, + { + "epoch": 0.033419857235561325, + "grad_norm": 0.5942308152126138, + "learning_rate": 4.9995902191040366e-06, + "loss": 0.6779, + "step": 206 + }, + { + "epoch": 0.033582089552238806, + "grad_norm": 0.6200015694748698, + "learning_rate": 4.9995824511349115e-06, + "loss": 0.6513, + "step": 207 + }, + { + "epoch": 0.03374432186891629, + "grad_norm": 0.5658539162000348, + "learning_rate": 4.999574610235269e-06, + "loss": 0.6556, + "step": 208 + }, + { + "epoch": 0.03390655418559377, + "grad_norm": 0.5868853947690869, + "learning_rate": 4.999566696405338e-06, + "loss": 0.6826, + "step": 209 + }, + { + "epoch": 0.03406878650227125, + "grad_norm": 0.55601049843555, + "learning_rate": 4.999558709645349e-06, + "loss": 0.6708, + "step": 210 + }, + { + "epoch": 0.03423101881894874, + "grad_norm": 0.6467528831060202, + "learning_rate": 4.999550649955535e-06, + "loss": 0.6578, + "step": 211 + }, + { + "epoch": 0.03439325113562622, + "grad_norm": 0.6195389743066652, + "learning_rate": 4.9995425173361324e-06, + "loss": 0.645, + "step": 212 + }, + { + "epoch": 0.0345554834523037, + "grad_norm": 0.5994170985316613, + "learning_rate": 4.999534311787376e-06, + "loss": 0.6742, + "step": 213 + }, + { + "epoch": 0.03471771576898118, + "grad_norm": 0.5507958306808273, + "learning_rate": 4.9995260333095084e-06, + "loss": 0.6435, + "step": 214 + }, + { + "epoch": 0.03487994808565866, + "grad_norm": 0.5936225756454899, + "learning_rate": 4.999517681902769e-06, + "loss": 0.6392, + "step": 215 + }, + { + "epoch": 0.03504218040233614, + "grad_norm": 0.5849856367196633, + "learning_rate": 4.999509257567401e-06, + "loss": 0.6651, + "step": 216 + }, + { + "epoch": 0.03520441271901363, + "grad_norm": 0.5610739061625474, + "learning_rate": 4.999500760303652e-06, + "loss": 0.6489, + "step": 217 + }, + { + "epoch": 0.03536664503569111, + "grad_norm": 0.5959550278451634, + "learning_rate": 4.999492190111769e-06, + "loss": 0.6809, + "step": 218 + }, + { + "epoch": 0.03552887735236859, + "grad_norm": 0.5728881464257662, + "learning_rate": 4.999483546992002e-06, + "loss": 0.6601, + "step": 219 + }, + { + "epoch": 0.035691109669046074, + "grad_norm": 0.6280667379506931, + "learning_rate": 4.999474830944604e-06, + "loss": 0.7114, + "step": 220 + }, + { + "epoch": 0.035853341985723555, + "grad_norm": 0.6303972746119869, + "learning_rate": 4.999466041969828e-06, + "loss": 0.6659, + "step": 221 + }, + { + "epoch": 0.036015574302401036, + "grad_norm": 0.5984027035089835, + "learning_rate": 4.9994571800679315e-06, + "loss": 0.6656, + "step": 222 + }, + { + "epoch": 0.03617780661907852, + "grad_norm": 0.5903061656436334, + "learning_rate": 4.999448245239172e-06, + "loss": 0.7018, + "step": 223 + }, + { + "epoch": 0.036340038935756006, + "grad_norm": 0.5707126530721176, + "learning_rate": 4.999439237483811e-06, + "loss": 0.6987, + "step": 224 + }, + { + "epoch": 0.03650227125243349, + "grad_norm": 0.5866170121951949, + "learning_rate": 4.999430156802111e-06, + "loss": 0.6678, + "step": 225 + }, + { + "epoch": 0.03666450356911097, + "grad_norm": 0.5888281952599297, + "learning_rate": 4.999421003194337e-06, + "loss": 0.683, + "step": 226 + }, + { + "epoch": 0.03682673588578845, + "grad_norm": 0.5754943971290996, + "learning_rate": 4.999411776660757e-06, + "loss": 0.6983, + "step": 227 + }, + { + "epoch": 0.03698896820246593, + "grad_norm": 0.5917848702881242, + "learning_rate": 4.999402477201638e-06, + "loss": 0.6795, + "step": 228 + }, + { + "epoch": 0.03715120051914341, + "grad_norm": 0.5541860093239458, + "learning_rate": 4.999393104817254e-06, + "loss": 0.6763, + "step": 229 + }, + { + "epoch": 0.03731343283582089, + "grad_norm": 0.5645621744383095, + "learning_rate": 4.999383659507877e-06, + "loss": 0.6791, + "step": 230 + }, + { + "epoch": 0.03747566515249838, + "grad_norm": 0.6207140342610485, + "learning_rate": 4.999374141273782e-06, + "loss": 0.6797, + "step": 231 + }, + { + "epoch": 0.03763789746917586, + "grad_norm": 0.5763761855959595, + "learning_rate": 4.9993645501152485e-06, + "loss": 0.6638, + "step": 232 + }, + { + "epoch": 0.03780012978585334, + "grad_norm": 0.6151301544283374, + "learning_rate": 4.999354886032555e-06, + "loss": 0.6477, + "step": 233 + }, + { + "epoch": 0.037962362102530824, + "grad_norm": 0.696692760605654, + "learning_rate": 4.999345149025983e-06, + "loss": 0.6623, + "step": 234 + }, + { + "epoch": 0.038124594419208305, + "grad_norm": 0.6090762853488806, + "learning_rate": 4.999335339095817e-06, + "loss": 0.6736, + "step": 235 + }, + { + "epoch": 0.038286826735885786, + "grad_norm": 0.5606275396636897, + "learning_rate": 4.999325456242345e-06, + "loss": 0.6963, + "step": 236 + }, + { + "epoch": 0.038449059052563274, + "grad_norm": 0.5949342791896147, + "learning_rate": 4.999315500465853e-06, + "loss": 0.6487, + "step": 237 + }, + { + "epoch": 0.038611291369240755, + "grad_norm": 0.5702342695653565, + "learning_rate": 4.999305471766633e-06, + "loss": 0.6694, + "step": 238 + }, + { + "epoch": 0.038773523685918236, + "grad_norm": 0.5925346376669332, + "learning_rate": 4.999295370144976e-06, + "loss": 0.6818, + "step": 239 + }, + { + "epoch": 0.03893575600259572, + "grad_norm": 0.5989533995015026, + "learning_rate": 4.999285195601179e-06, + "loss": 0.6909, + "step": 240 + }, + { + "epoch": 0.0390979883192732, + "grad_norm": 0.606666831253811, + "learning_rate": 4.999274948135537e-06, + "loss": 0.6392, + "step": 241 + }, + { + "epoch": 0.03926022063595068, + "grad_norm": 0.5861074076409163, + "learning_rate": 4.9992646277483505e-06, + "loss": 0.6622, + "step": 242 + }, + { + "epoch": 0.03942245295262816, + "grad_norm": 0.5714981772404752, + "learning_rate": 4.999254234439919e-06, + "loss": 0.6776, + "step": 243 + }, + { + "epoch": 0.03958468526930565, + "grad_norm": 0.5620263027791118, + "learning_rate": 4.999243768210547e-06, + "loss": 0.6603, + "step": 244 + }, + { + "epoch": 0.03974691758598313, + "grad_norm": 0.5927370863453431, + "learning_rate": 4.999233229060538e-06, + "loss": 0.653, + "step": 245 + }, + { + "epoch": 0.03990914990266061, + "grad_norm": 0.5756938845924514, + "learning_rate": 4.999222616990202e-06, + "loss": 0.6847, + "step": 246 + }, + { + "epoch": 0.04007138221933809, + "grad_norm": 0.5565494302668496, + "learning_rate": 4.9992119319998475e-06, + "loss": 0.6489, + "step": 247 + }, + { + "epoch": 0.04023361453601557, + "grad_norm": 0.6011007907012187, + "learning_rate": 4.999201174089785e-06, + "loss": 0.6431, + "step": 248 + }, + { + "epoch": 0.040395846852693054, + "grad_norm": 0.5924405097204769, + "learning_rate": 4.999190343260331e-06, + "loss": 0.6848, + "step": 249 + }, + { + "epoch": 0.040558079169370535, + "grad_norm": 0.5660200560990039, + "learning_rate": 4.9991794395118e-06, + "loss": 0.6755, + "step": 250 + }, + { + "epoch": 0.04072031148604802, + "grad_norm": 0.5587948439530892, + "learning_rate": 4.99916846284451e-06, + "loss": 0.6611, + "step": 251 + }, + { + "epoch": 0.040882543802725504, + "grad_norm": 0.6000314240479667, + "learning_rate": 4.999157413258782e-06, + "loss": 0.6546, + "step": 252 + }, + { + "epoch": 0.041044776119402986, + "grad_norm": 0.5727312776041602, + "learning_rate": 4.999146290754937e-06, + "loss": 0.6475, + "step": 253 + }, + { + "epoch": 0.04120700843608047, + "grad_norm": 0.6233362667588161, + "learning_rate": 4.999135095333301e-06, + "loss": 0.6569, + "step": 254 + }, + { + "epoch": 0.04136924075275795, + "grad_norm": 0.6475267310934011, + "learning_rate": 4.9991238269942e-06, + "loss": 0.6789, + "step": 255 + }, + { + "epoch": 0.04153147306943543, + "grad_norm": 0.6038355630903628, + "learning_rate": 4.999112485737963e-06, + "loss": 0.6719, + "step": 256 + }, + { + "epoch": 0.04169370538611292, + "grad_norm": 0.6195437238399718, + "learning_rate": 4.999101071564921e-06, + "loss": 0.6611, + "step": 257 + }, + { + "epoch": 0.0418559377027904, + "grad_norm": 0.5897937659943908, + "learning_rate": 4.999089584475407e-06, + "loss": 0.6634, + "step": 258 + }, + { + "epoch": 0.04201817001946788, + "grad_norm": 0.5820353598444059, + "learning_rate": 4.999078024469755e-06, + "loss": 0.6795, + "step": 259 + }, + { + "epoch": 0.04218040233614536, + "grad_norm": 0.571945660195972, + "learning_rate": 4.999066391548304e-06, + "loss": 0.6751, + "step": 260 + }, + { + "epoch": 0.04234263465282284, + "grad_norm": 0.5741094038760809, + "learning_rate": 4.999054685711393e-06, + "loss": 0.6542, + "step": 261 + }, + { + "epoch": 0.04250486696950032, + "grad_norm": 0.5828587506302216, + "learning_rate": 4.9990429069593626e-06, + "loss": 0.6259, + "step": 262 + }, + { + "epoch": 0.042667099286177804, + "grad_norm": 0.6113251985544236, + "learning_rate": 4.999031055292557e-06, + "loss": 0.663, + "step": 263 + }, + { + "epoch": 0.04282933160285529, + "grad_norm": 0.6219279361086933, + "learning_rate": 4.999019130711323e-06, + "loss": 0.6673, + "step": 264 + }, + { + "epoch": 0.04299156391953277, + "grad_norm": 0.5976643710652798, + "learning_rate": 4.999007133216006e-06, + "loss": 0.663, + "step": 265 + }, + { + "epoch": 0.043153796236210254, + "grad_norm": 0.5620767347849874, + "learning_rate": 4.99899506280696e-06, + "loss": 0.6953, + "step": 266 + }, + { + "epoch": 0.043316028552887735, + "grad_norm": 0.5614718413852148, + "learning_rate": 4.998982919484533e-06, + "loss": 0.6561, + "step": 267 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 0.6028647966687669, + "learning_rate": 4.998970703249082e-06, + "loss": 0.6574, + "step": 268 + }, + { + "epoch": 0.0436404931862427, + "grad_norm": 0.6286758297841375, + "learning_rate": 4.998958414100962e-06, + "loss": 0.6349, + "step": 269 + }, + { + "epoch": 0.04380272550292018, + "grad_norm": 0.6501295141808092, + "learning_rate": 4.998946052040533e-06, + "loss": 0.6735, + "step": 270 + }, + { + "epoch": 0.043964957819597666, + "grad_norm": 0.5715094141597182, + "learning_rate": 4.998933617068154e-06, + "loss": 0.7012, + "step": 271 + }, + { + "epoch": 0.04412719013627515, + "grad_norm": 0.5804245644132112, + "learning_rate": 4.99892110918419e-06, + "loss": 0.6833, + "step": 272 + }, + { + "epoch": 0.04428942245295263, + "grad_norm": 0.5746736291864678, + "learning_rate": 4.9989085283890026e-06, + "loss": 0.6549, + "step": 273 + }, + { + "epoch": 0.04445165476963011, + "grad_norm": 0.6002281665600498, + "learning_rate": 4.998895874682962e-06, + "loss": 0.6323, + "step": 274 + }, + { + "epoch": 0.04461388708630759, + "grad_norm": 0.5805687217561825, + "learning_rate": 4.998883148066436e-06, + "loss": 0.6595, + "step": 275 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 0.6016313258011189, + "learning_rate": 4.998870348539797e-06, + "loss": 0.6684, + "step": 276 + }, + { + "epoch": 0.04493835171966256, + "grad_norm": 0.5895799676125637, + "learning_rate": 4.998857476103417e-06, + "loss": 0.6701, + "step": 277 + }, + { + "epoch": 0.04510058403634004, + "grad_norm": 0.561719673552275, + "learning_rate": 4.9988445307576725e-06, + "loss": 0.689, + "step": 278 + }, + { + "epoch": 0.04526281635301752, + "grad_norm": 0.58759796144376, + "learning_rate": 4.9988315125029405e-06, + "loss": 0.6583, + "step": 279 + }, + { + "epoch": 0.045425048669695, + "grad_norm": 0.6001122570595592, + "learning_rate": 4.998818421339601e-06, + "loss": 0.6341, + "step": 280 + }, + { + "epoch": 0.045587280986372485, + "grad_norm": 0.6042268411424719, + "learning_rate": 4.998805257268037e-06, + "loss": 0.6718, + "step": 281 + }, + { + "epoch": 0.045749513303049966, + "grad_norm": 0.5867540672148255, + "learning_rate": 4.998792020288632e-06, + "loss": 0.6769, + "step": 282 + }, + { + "epoch": 0.04591174561972745, + "grad_norm": 0.5872867324817627, + "learning_rate": 4.998778710401772e-06, + "loss": 0.675, + "step": 283 + }, + { + "epoch": 0.046073977936404935, + "grad_norm": 0.5949142076845801, + "learning_rate": 4.998765327607844e-06, + "loss": 0.6305, + "step": 284 + }, + { + "epoch": 0.046236210253082416, + "grad_norm": 0.5740593101792001, + "learning_rate": 4.998751871907242e-06, + "loss": 0.6495, + "step": 285 + }, + { + "epoch": 0.0463984425697599, + "grad_norm": 0.6177305334619504, + "learning_rate": 4.9987383433003545e-06, + "loss": 0.6825, + "step": 286 + }, + { + "epoch": 0.04656067488643738, + "grad_norm": 0.5724295265057461, + "learning_rate": 4.998724741787579e-06, + "loss": 0.6286, + "step": 287 + }, + { + "epoch": 0.04672290720311486, + "grad_norm": 0.5890270580302933, + "learning_rate": 4.998711067369312e-06, + "loss": 0.6326, + "step": 288 + }, + { + "epoch": 0.04688513951979234, + "grad_norm": 0.5840005409173303, + "learning_rate": 4.998697320045951e-06, + "loss": 0.6983, + "step": 289 + }, + { + "epoch": 0.04704737183646982, + "grad_norm": 0.5822392154475063, + "learning_rate": 4.998683499817899e-06, + "loss": 0.6392, + "step": 290 + }, + { + "epoch": 0.04720960415314731, + "grad_norm": 0.5939263366169416, + "learning_rate": 4.998669606685558e-06, + "loss": 0.6685, + "step": 291 + }, + { + "epoch": 0.04737183646982479, + "grad_norm": 0.6052380819940486, + "learning_rate": 4.998655640649334e-06, + "loss": 0.6794, + "step": 292 + }, + { + "epoch": 0.04753406878650227, + "grad_norm": 0.6164588261294419, + "learning_rate": 4.9986416017096335e-06, + "loss": 0.6468, + "step": 293 + }, + { + "epoch": 0.04769630110317975, + "grad_norm": 0.5858028256635247, + "learning_rate": 4.998627489866867e-06, + "loss": 0.6341, + "step": 294 + }, + { + "epoch": 0.047858533419857234, + "grad_norm": 0.627969950828264, + "learning_rate": 4.998613305121447e-06, + "loss": 0.6223, + "step": 295 + }, + { + "epoch": 0.048020765736534715, + "grad_norm": 0.5989954654252588, + "learning_rate": 4.998599047473786e-06, + "loss": 0.6345, + "step": 296 + }, + { + "epoch": 0.0481829980532122, + "grad_norm": 0.5781459508715842, + "learning_rate": 4.998584716924299e-06, + "loss": 0.6633, + "step": 297 + }, + { + "epoch": 0.048345230369889684, + "grad_norm": 0.6183537929424545, + "learning_rate": 4.998570313473408e-06, + "loss": 0.6134, + "step": 298 + }, + { + "epoch": 0.048507462686567165, + "grad_norm": 0.5991583748399698, + "learning_rate": 4.998555837121529e-06, + "loss": 0.6345, + "step": 299 + }, + { + "epoch": 0.04866969500324465, + "grad_norm": 0.6018706909305259, + "learning_rate": 4.998541287869086e-06, + "loss": 0.6863, + "step": 300 + }, + { + "epoch": 0.04883192731992213, + "grad_norm": 0.553460714093285, + "learning_rate": 4.998526665716505e-06, + "loss": 0.6061, + "step": 301 + }, + { + "epoch": 0.04899415963659961, + "grad_norm": 0.5917194568155869, + "learning_rate": 4.998511970664211e-06, + "loss": 0.6391, + "step": 302 + }, + { + "epoch": 0.04915639195327709, + "grad_norm": 0.5890562142447306, + "learning_rate": 4.998497202712633e-06, + "loss": 0.6536, + "step": 303 + }, + { + "epoch": 0.04931862426995458, + "grad_norm": 0.6276265410557944, + "learning_rate": 4.998482361862201e-06, + "loss": 0.6453, + "step": 304 + }, + { + "epoch": 0.04948085658663206, + "grad_norm": 0.5756406234538802, + "learning_rate": 4.99846744811335e-06, + "loss": 0.6548, + "step": 305 + }, + { + "epoch": 0.04964308890330954, + "grad_norm": 0.5991002803881119, + "learning_rate": 4.998452461466514e-06, + "loss": 0.6405, + "step": 306 + }, + { + "epoch": 0.04980532121998702, + "grad_norm": 0.5886567929071607, + "learning_rate": 4.998437401922131e-06, + "loss": 0.6296, + "step": 307 + }, + { + "epoch": 0.0499675535366645, + "grad_norm": 0.6136633999928645, + "learning_rate": 4.998422269480638e-06, + "loss": 0.6703, + "step": 308 + }, + { + "epoch": 0.050129785853341984, + "grad_norm": 0.5995158631370342, + "learning_rate": 4.99840706414248e-06, + "loss": 0.6781, + "step": 309 + }, + { + "epoch": 0.050292018170019465, + "grad_norm": 0.5839845500808024, + "learning_rate": 4.998391785908098e-06, + "loss": 0.6886, + "step": 310 + }, + { + "epoch": 0.05045425048669695, + "grad_norm": 0.640215048600306, + "learning_rate": 4.998376434777939e-06, + "loss": 0.6379, + "step": 311 + }, + { + "epoch": 0.050616482803374434, + "grad_norm": 0.5887999283634753, + "learning_rate": 4.998361010752451e-06, + "loss": 0.6931, + "step": 312 + }, + { + "epoch": 0.050778715120051915, + "grad_norm": 0.5614909940611267, + "learning_rate": 4.998345513832082e-06, + "loss": 0.695, + "step": 313 + }, + { + "epoch": 0.050940947436729396, + "grad_norm": 0.6085605750166405, + "learning_rate": 4.998329944017287e-06, + "loss": 0.6519, + "step": 314 + }, + { + "epoch": 0.05110317975340688, + "grad_norm": 0.5755425121487607, + "learning_rate": 4.99831430130852e-06, + "loss": 0.639, + "step": 315 + }, + { + "epoch": 0.05126541207008436, + "grad_norm": 0.5893179464155225, + "learning_rate": 4.998298585706235e-06, + "loss": 0.6594, + "step": 316 + }, + { + "epoch": 0.051427644386761846, + "grad_norm": 0.6268439497846823, + "learning_rate": 4.998282797210893e-06, + "loss": 0.678, + "step": 317 + }, + { + "epoch": 0.05158987670343933, + "grad_norm": 0.5606369734733292, + "learning_rate": 4.998266935822953e-06, + "loss": 0.6591, + "step": 318 + }, + { + "epoch": 0.05175210902011681, + "grad_norm": 0.5790347530278213, + "learning_rate": 4.998251001542879e-06, + "loss": 0.6572, + "step": 319 + }, + { + "epoch": 0.05191434133679429, + "grad_norm": 0.5922923033219585, + "learning_rate": 4.998234994371135e-06, + "loss": 0.6837, + "step": 320 + }, + { + "epoch": 0.05207657365347177, + "grad_norm": 0.6188607696686074, + "learning_rate": 4.998218914308189e-06, + "loss": 0.6597, + "step": 321 + }, + { + "epoch": 0.05223880597014925, + "grad_norm": 0.6056594210349503, + "learning_rate": 4.9982027613545096e-06, + "loss": 0.6258, + "step": 322 + }, + { + "epoch": 0.05240103828682673, + "grad_norm": 0.5878629210304103, + "learning_rate": 4.9981865355105676e-06, + "loss": 0.6547, + "step": 323 + }, + { + "epoch": 0.05256327060350422, + "grad_norm": 0.5594156835005117, + "learning_rate": 4.998170236776837e-06, + "loss": 0.5977, + "step": 324 + }, + { + "epoch": 0.0527255029201817, + "grad_norm": 0.5995323398765728, + "learning_rate": 4.998153865153793e-06, + "loss": 0.651, + "step": 325 + }, + { + "epoch": 0.05288773523685918, + "grad_norm": 0.5939283386467773, + "learning_rate": 4.998137420641914e-06, + "loss": 0.6495, + "step": 326 + }, + { + "epoch": 0.053049967553536664, + "grad_norm": 0.5933800315475746, + "learning_rate": 4.998120903241679e-06, + "loss": 0.6258, + "step": 327 + }, + { + "epoch": 0.053212199870214145, + "grad_norm": 0.5686060589036954, + "learning_rate": 4.99810431295357e-06, + "loss": 0.629, + "step": 328 + }, + { + "epoch": 0.05337443218689163, + "grad_norm": 0.6109020758896844, + "learning_rate": 4.998087649778073e-06, + "loss": 0.6732, + "step": 329 + }, + { + "epoch": 0.05353666450356911, + "grad_norm": 0.5900711637835718, + "learning_rate": 4.998070913715671e-06, + "loss": 0.6493, + "step": 330 + }, + { + "epoch": 0.053698896820246596, + "grad_norm": 0.57898416585338, + "learning_rate": 4.998054104766854e-06, + "loss": 0.6164, + "step": 331 + }, + { + "epoch": 0.05386112913692408, + "grad_norm": 0.6296171514493286, + "learning_rate": 4.998037222932113e-06, + "loss": 0.6427, + "step": 332 + }, + { + "epoch": 0.05402336145360156, + "grad_norm": 0.5848764536643476, + "learning_rate": 4.998020268211939e-06, + "loss": 0.6894, + "step": 333 + }, + { + "epoch": 0.05418559377027904, + "grad_norm": 0.5804998916465657, + "learning_rate": 4.998003240606827e-06, + "loss": 0.6651, + "step": 334 + }, + { + "epoch": 0.05434782608695652, + "grad_norm": 0.5843078020035609, + "learning_rate": 4.9979861401172755e-06, + "loss": 0.6511, + "step": 335 + }, + { + "epoch": 0.054510058403634, + "grad_norm": 0.5540167017532299, + "learning_rate": 4.997968966743782e-06, + "loss": 0.6598, + "step": 336 + }, + { + "epoch": 0.05467229072031149, + "grad_norm": 0.5801808719504319, + "learning_rate": 4.997951720486848e-06, + "loss": 0.6196, + "step": 337 + }, + { + "epoch": 0.05483452303698897, + "grad_norm": 0.567209887292168, + "learning_rate": 4.997934401346976e-06, + "loss": 0.6485, + "step": 338 + }, + { + "epoch": 0.05499675535366645, + "grad_norm": 0.5810479276852726, + "learning_rate": 4.997917009324672e-06, + "loss": 0.646, + "step": 339 + }, + { + "epoch": 0.05515898767034393, + "grad_norm": 0.5760647681039582, + "learning_rate": 4.9978995444204435e-06, + "loss": 0.6515, + "step": 340 + }, + { + "epoch": 0.055321219987021414, + "grad_norm": 0.6121584664296077, + "learning_rate": 4.9978820066348e-06, + "loss": 0.6754, + "step": 341 + }, + { + "epoch": 0.055483452303698895, + "grad_norm": 0.5897581687944927, + "learning_rate": 4.997864395968252e-06, + "loss": 0.6531, + "step": 342 + }, + { + "epoch": 0.055645684620376376, + "grad_norm": 0.5844894521218872, + "learning_rate": 4.997846712421316e-06, + "loss": 0.6461, + "step": 343 + }, + { + "epoch": 0.055807916937053864, + "grad_norm": 0.6107171883329388, + "learning_rate": 4.9978289559945055e-06, + "loss": 0.6595, + "step": 344 + }, + { + "epoch": 0.055970149253731345, + "grad_norm": 0.631075239250163, + "learning_rate": 4.99781112668834e-06, + "loss": 0.6486, + "step": 345 + }, + { + "epoch": 0.056132381570408826, + "grad_norm": 0.6216775810686019, + "learning_rate": 4.997793224503339e-06, + "loss": 0.658, + "step": 346 + }, + { + "epoch": 0.05629461388708631, + "grad_norm": 0.5803065059287532, + "learning_rate": 4.9977752494400244e-06, + "loss": 0.6488, + "step": 347 + }, + { + "epoch": 0.05645684620376379, + "grad_norm": 0.6605433490154199, + "learning_rate": 4.997757201498922e-06, + "loss": 0.6668, + "step": 348 + }, + { + "epoch": 0.05661907852044127, + "grad_norm": 0.5757984088539525, + "learning_rate": 4.997739080680557e-06, + "loss": 0.6327, + "step": 349 + }, + { + "epoch": 0.05678131083711875, + "grad_norm": 0.6672447815064362, + "learning_rate": 4.9977208869854595e-06, + "loss": 0.6294, + "step": 350 + }, + { + "epoch": 0.05694354315379624, + "grad_norm": 0.5900087270141204, + "learning_rate": 4.997702620414159e-06, + "loss": 0.6414, + "step": 351 + }, + { + "epoch": 0.05710577547047372, + "grad_norm": 0.605960339167227, + "learning_rate": 4.997684280967189e-06, + "loss": 0.6216, + "step": 352 + }, + { + "epoch": 0.0572680077871512, + "grad_norm": 0.5893806246870459, + "learning_rate": 4.997665868645085e-06, + "loss": 0.6973, + "step": 353 + }, + { + "epoch": 0.05743024010382868, + "grad_norm": 0.5987093787320164, + "learning_rate": 4.997647383448384e-06, + "loss": 0.6633, + "step": 354 + }, + { + "epoch": 0.05759247242050616, + "grad_norm": 0.6071670066587801, + "learning_rate": 4.997628825377624e-06, + "loss": 0.6703, + "step": 355 + }, + { + "epoch": 0.057754704737183644, + "grad_norm": 0.5911984830109991, + "learning_rate": 4.9976101944333486e-06, + "loss": 0.6224, + "step": 356 + }, + { + "epoch": 0.05791693705386113, + "grad_norm": 0.5620803500893938, + "learning_rate": 4.997591490616101e-06, + "loss": 0.6037, + "step": 357 + }, + { + "epoch": 0.058079169370538614, + "grad_norm": 0.654598264454661, + "learning_rate": 4.997572713926426e-06, + "loss": 0.6689, + "step": 358 + }, + { + "epoch": 0.058241401687216095, + "grad_norm": 0.5906396916761918, + "learning_rate": 4.997553864364871e-06, + "loss": 0.6619, + "step": 359 + }, + { + "epoch": 0.058403634003893576, + "grad_norm": 0.5733218593139213, + "learning_rate": 4.997534941931988e-06, + "loss": 0.6481, + "step": 360 + }, + { + "epoch": 0.05856586632057106, + "grad_norm": 0.5706351481625366, + "learning_rate": 4.997515946628327e-06, + "loss": 0.6745, + "step": 361 + }, + { + "epoch": 0.05872809863724854, + "grad_norm": 0.5724634320655483, + "learning_rate": 4.997496878454444e-06, + "loss": 0.6411, + "step": 362 + }, + { + "epoch": 0.05889033095392602, + "grad_norm": 0.6001239079641917, + "learning_rate": 4.997477737410894e-06, + "loss": 0.6561, + "step": 363 + }, + { + "epoch": 0.05905256327060351, + "grad_norm": 0.6348334375380142, + "learning_rate": 4.997458523498236e-06, + "loss": 0.6489, + "step": 364 + }, + { + "epoch": 0.05921479558728099, + "grad_norm": 0.5632655160012718, + "learning_rate": 4.997439236717031e-06, + "loss": 0.6607, + "step": 365 + }, + { + "epoch": 0.05937702790395847, + "grad_norm": 0.5787839762226872, + "learning_rate": 4.99741987706784e-06, + "loss": 0.661, + "step": 366 + }, + { + "epoch": 0.05953926022063595, + "grad_norm": 0.5997057135046351, + "learning_rate": 4.9974004445512305e-06, + "loss": 0.6466, + "step": 367 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 0.6042408948456212, + "learning_rate": 4.997380939167769e-06, + "loss": 0.6261, + "step": 368 + }, + { + "epoch": 0.05986372485399091, + "grad_norm": 0.6320171125474818, + "learning_rate": 4.997361360918021e-06, + "loss": 0.6419, + "step": 369 + }, + { + "epoch": 0.060025957170668394, + "grad_norm": 0.5667969627398597, + "learning_rate": 4.997341709802563e-06, + "loss": 0.6129, + "step": 370 + }, + { + "epoch": 0.06018818948734588, + "grad_norm": 0.619169618059234, + "learning_rate": 4.997321985821964e-06, + "loss": 0.6418, + "step": 371 + }, + { + "epoch": 0.06035042180402336, + "grad_norm": 0.5960479630200557, + "learning_rate": 4.997302188976803e-06, + "loss": 0.6362, + "step": 372 + }, + { + "epoch": 0.060512654120700844, + "grad_norm": 0.6184973640965882, + "learning_rate": 4.997282319267656e-06, + "loss": 0.6655, + "step": 373 + }, + { + "epoch": 0.060674886437378325, + "grad_norm": 0.5721471056059229, + "learning_rate": 4.997262376695101e-06, + "loss": 0.6571, + "step": 374 + }, + { + "epoch": 0.060837118754055806, + "grad_norm": 0.6668803197468989, + "learning_rate": 4.997242361259723e-06, + "loss": 0.6551, + "step": 375 + }, + { + "epoch": 0.06099935107073329, + "grad_norm": 0.6139824497098213, + "learning_rate": 4.997222272962105e-06, + "loss": 0.6753, + "step": 376 + }, + { + "epoch": 0.061161583387410776, + "grad_norm": 0.5632227107604026, + "learning_rate": 4.997202111802831e-06, + "loss": 0.6223, + "step": 377 + }, + { + "epoch": 0.06132381570408826, + "grad_norm": 0.5853627195112485, + "learning_rate": 4.997181877782492e-06, + "loss": 0.6636, + "step": 378 + }, + { + "epoch": 0.06148604802076574, + "grad_norm": 0.5768533033788393, + "learning_rate": 4.997161570901677e-06, + "loss": 0.6615, + "step": 379 + }, + { + "epoch": 0.06164828033744322, + "grad_norm": 0.5752706883776355, + "learning_rate": 4.9971411911609795e-06, + "loss": 0.6188, + "step": 380 + }, + { + "epoch": 0.0618105126541207, + "grad_norm": 0.6254591782609953, + "learning_rate": 4.997120738560992e-06, + "loss": 0.614, + "step": 381 + }, + { + "epoch": 0.06197274497079818, + "grad_norm": 0.6085851903083848, + "learning_rate": 4.997100213102313e-06, + "loss": 0.6444, + "step": 382 + }, + { + "epoch": 0.06213497728747566, + "grad_norm": 0.5798341859517654, + "learning_rate": 4.997079614785542e-06, + "loss": 0.6343, + "step": 383 + }, + { + "epoch": 0.06229720960415315, + "grad_norm": 0.5963862512335352, + "learning_rate": 4.997058943611278e-06, + "loss": 0.6449, + "step": 384 + }, + { + "epoch": 0.06245944192083063, + "grad_norm": 0.5911378720208574, + "learning_rate": 4.997038199580127e-06, + "loss": 0.6603, + "step": 385 + }, + { + "epoch": 0.06262167423750811, + "grad_norm": 0.5865878211661638, + "learning_rate": 4.99701738269269e-06, + "loss": 0.6436, + "step": 386 + }, + { + "epoch": 0.0627839065541856, + "grad_norm": 0.6111936876848418, + "learning_rate": 4.996996492949578e-06, + "loss": 0.6635, + "step": 387 + }, + { + "epoch": 0.06294613887086307, + "grad_norm": 0.6133113030511214, + "learning_rate": 4.996975530351399e-06, + "loss": 0.6536, + "step": 388 + }, + { + "epoch": 0.06310837118754056, + "grad_norm": 0.6021850735868766, + "learning_rate": 4.9969544948987656e-06, + "loss": 0.6482, + "step": 389 + }, + { + "epoch": 0.06327060350421804, + "grad_norm": 0.5959099565427739, + "learning_rate": 4.99693338659229e-06, + "loss": 0.6383, + "step": 390 + }, + { + "epoch": 0.06343283582089553, + "grad_norm": 0.583039753453075, + "learning_rate": 4.9969122054325896e-06, + "loss": 0.6755, + "step": 391 + }, + { + "epoch": 0.063595068137573, + "grad_norm": 0.6000598859468498, + "learning_rate": 4.996890951420281e-06, + "loss": 0.6461, + "step": 392 + }, + { + "epoch": 0.06375730045425049, + "grad_norm": 0.5937729880962906, + "learning_rate": 4.996869624555986e-06, + "loss": 0.6819, + "step": 393 + }, + { + "epoch": 0.06391953277092798, + "grad_norm": 0.6122120409870828, + "learning_rate": 4.996848224840326e-06, + "loss": 0.6539, + "step": 394 + }, + { + "epoch": 0.06408176508760545, + "grad_norm": 0.5910626874326982, + "learning_rate": 4.996826752273924e-06, + "loss": 0.6488, + "step": 395 + }, + { + "epoch": 0.06424399740428294, + "grad_norm": 0.6223684426607474, + "learning_rate": 4.996805206857409e-06, + "loss": 0.6428, + "step": 396 + }, + { + "epoch": 0.06440622972096041, + "grad_norm": 0.6209154966872374, + "learning_rate": 4.996783588591408e-06, + "loss": 0.6595, + "step": 397 + }, + { + "epoch": 0.0645684620376379, + "grad_norm": 0.5751891254775585, + "learning_rate": 4.996761897476552e-06, + "loss": 0.6206, + "step": 398 + }, + { + "epoch": 0.06473069435431537, + "grad_norm": 0.6068944080359707, + "learning_rate": 4.9967401335134745e-06, + "loss": 0.6475, + "step": 399 + }, + { + "epoch": 0.06489292667099286, + "grad_norm": 0.583409581367954, + "learning_rate": 4.996718296702811e-06, + "loss": 0.6349, + "step": 400 + }, + { + "epoch": 0.06505515898767035, + "grad_norm": 0.5956838877048176, + "learning_rate": 4.996696387045196e-06, + "loss": 0.6076, + "step": 401 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 0.6005726752830017, + "learning_rate": 4.996674404541272e-06, + "loss": 0.6355, + "step": 402 + }, + { + "epoch": 0.06537962362102531, + "grad_norm": 0.5823843511075448, + "learning_rate": 4.996652349191677e-06, + "loss": 0.6092, + "step": 403 + }, + { + "epoch": 0.06554185593770279, + "grad_norm": 0.6278811539045925, + "learning_rate": 4.996630220997058e-06, + "loss": 0.6339, + "step": 404 + }, + { + "epoch": 0.06570408825438027, + "grad_norm": 0.6153168731160129, + "learning_rate": 4.996608019958059e-06, + "loss": 0.6401, + "step": 405 + }, + { + "epoch": 0.06586632057105775, + "grad_norm": 0.5911236574380695, + "learning_rate": 4.996585746075327e-06, + "loss": 0.649, + "step": 406 + }, + { + "epoch": 0.06602855288773524, + "grad_norm": 0.6354214477507844, + "learning_rate": 4.996563399349513e-06, + "loss": 0.6478, + "step": 407 + }, + { + "epoch": 0.06619078520441272, + "grad_norm": 0.5941042991763457, + "learning_rate": 4.996540979781269e-06, + "loss": 0.6299, + "step": 408 + }, + { + "epoch": 0.0663530175210902, + "grad_norm": 0.5783590702766122, + "learning_rate": 4.996518487371249e-06, + "loss": 0.6285, + "step": 409 + }, + { + "epoch": 0.06651524983776769, + "grad_norm": 0.5865745555603104, + "learning_rate": 4.996495922120108e-06, + "loss": 0.6453, + "step": 410 + }, + { + "epoch": 0.06667748215444516, + "grad_norm": 0.6453197856229403, + "learning_rate": 4.9964732840285065e-06, + "loss": 0.6286, + "step": 411 + }, + { + "epoch": 0.06683971447112265, + "grad_norm": 0.5644459608614979, + "learning_rate": 4.9964505730971035e-06, + "loss": 0.5992, + "step": 412 + }, + { + "epoch": 0.06700194678780012, + "grad_norm": 0.668509909319712, + "learning_rate": 4.996427789326563e-06, + "loss": 0.6467, + "step": 413 + }, + { + "epoch": 0.06716417910447761, + "grad_norm": 0.5699008478037986, + "learning_rate": 4.996404932717548e-06, + "loss": 0.6419, + "step": 414 + }, + { + "epoch": 0.0673264114211551, + "grad_norm": 0.5896569330581716, + "learning_rate": 4.996382003270726e-06, + "loss": 0.6155, + "step": 415 + }, + { + "epoch": 0.06748864373783257, + "grad_norm": 0.5803577767099269, + "learning_rate": 4.996359000986767e-06, + "loss": 0.6282, + "step": 416 + }, + { + "epoch": 0.06765087605451006, + "grad_norm": 0.6132679714469595, + "learning_rate": 4.996335925866342e-06, + "loss": 0.6227, + "step": 417 + }, + { + "epoch": 0.06781310837118754, + "grad_norm": 0.6231153584909727, + "learning_rate": 4.9963127779101225e-06, + "loss": 0.6535, + "step": 418 + }, + { + "epoch": 0.06797534068786502, + "grad_norm": 0.5840631927011721, + "learning_rate": 4.996289557118785e-06, + "loss": 0.6381, + "step": 419 + }, + { + "epoch": 0.0681375730045425, + "grad_norm": 0.6196677692051755, + "learning_rate": 4.996266263493008e-06, + "loss": 0.6199, + "step": 420 + }, + { + "epoch": 0.06829980532121999, + "grad_norm": 0.6036446434582748, + "learning_rate": 4.9962428970334694e-06, + "loss": 0.6066, + "step": 421 + }, + { + "epoch": 0.06846203763789747, + "grad_norm": 0.6440694674874666, + "learning_rate": 4.996219457740853e-06, + "loss": 0.6336, + "step": 422 + }, + { + "epoch": 0.06862426995457495, + "grad_norm": 0.590597717478532, + "learning_rate": 4.996195945615841e-06, + "loss": 0.6439, + "step": 423 + }, + { + "epoch": 0.06878650227125244, + "grad_norm": 0.6355741980593647, + "learning_rate": 4.99617236065912e-06, + "loss": 0.6567, + "step": 424 + }, + { + "epoch": 0.06894873458792991, + "grad_norm": 0.6374106782168613, + "learning_rate": 4.996148702871378e-06, + "loss": 0.6567, + "step": 425 + }, + { + "epoch": 0.0691109669046074, + "grad_norm": 0.6154107023872831, + "learning_rate": 4.996124972253304e-06, + "loss": 0.6319, + "step": 426 + }, + { + "epoch": 0.06927319922128489, + "grad_norm": 0.623590824314094, + "learning_rate": 4.996101168805594e-06, + "loss": 0.6617, + "step": 427 + }, + { + "epoch": 0.06943543153796236, + "grad_norm": 0.5638396598603861, + "learning_rate": 4.996077292528939e-06, + "loss": 0.6354, + "step": 428 + }, + { + "epoch": 0.06959766385463985, + "grad_norm": 0.5986133188587661, + "learning_rate": 4.996053343424037e-06, + "loss": 0.624, + "step": 429 + }, + { + "epoch": 0.06975989617131732, + "grad_norm": 0.6133588301416035, + "learning_rate": 4.996029321491587e-06, + "loss": 0.6349, + "step": 430 + }, + { + "epoch": 0.06992212848799481, + "grad_norm": 0.5885790520032105, + "learning_rate": 4.996005226732289e-06, + "loss": 0.5997, + "step": 431 + }, + { + "epoch": 0.07008436080467229, + "grad_norm": 0.6394781376936449, + "learning_rate": 4.995981059146846e-06, + "loss": 0.6316, + "step": 432 + }, + { + "epoch": 0.07024659312134977, + "grad_norm": 0.5793279186007222, + "learning_rate": 4.995956818735965e-06, + "loss": 0.6252, + "step": 433 + }, + { + "epoch": 0.07040882543802726, + "grad_norm": 0.5962791301460946, + "learning_rate": 4.995932505500351e-06, + "loss": 0.6406, + "step": 434 + }, + { + "epoch": 0.07057105775470474, + "grad_norm": 0.6174552436435143, + "learning_rate": 4.995908119440715e-06, + "loss": 0.6197, + "step": 435 + }, + { + "epoch": 0.07073329007138222, + "grad_norm": 0.622575486665724, + "learning_rate": 4.995883660557767e-06, + "loss": 0.6654, + "step": 436 + }, + { + "epoch": 0.0708955223880597, + "grad_norm": 0.5845685849062082, + "learning_rate": 4.9958591288522226e-06, + "loss": 0.6203, + "step": 437 + }, + { + "epoch": 0.07105775470473719, + "grad_norm": 0.5824581208184169, + "learning_rate": 4.995834524324796e-06, + "loss": 0.5821, + "step": 438 + }, + { + "epoch": 0.07121998702141466, + "grad_norm": 0.576380760107768, + "learning_rate": 4.9958098469762055e-06, + "loss": 0.6493, + "step": 439 + }, + { + "epoch": 0.07138221933809215, + "grad_norm": 0.5827312324095199, + "learning_rate": 4.9957850968071705e-06, + "loss": 0.6554, + "step": 440 + }, + { + "epoch": 0.07154445165476964, + "grad_norm": 0.631708447917682, + "learning_rate": 4.995760273818415e-06, + "loss": 0.6366, + "step": 441 + }, + { + "epoch": 0.07170668397144711, + "grad_norm": 0.5718551853712053, + "learning_rate": 4.995735378010661e-06, + "loss": 0.6877, + "step": 442 + }, + { + "epoch": 0.0718689162881246, + "grad_norm": 0.594932588303127, + "learning_rate": 4.995710409384636e-06, + "loss": 0.6162, + "step": 443 + }, + { + "epoch": 0.07203114860480207, + "grad_norm": 0.5835674454955797, + "learning_rate": 4.995685367941069e-06, + "loss": 0.6407, + "step": 444 + }, + { + "epoch": 0.07219338092147956, + "grad_norm": 0.6007764088630868, + "learning_rate": 4.99566025368069e-06, + "loss": 0.66, + "step": 445 + }, + { + "epoch": 0.07235561323815703, + "grad_norm": 0.5767240644115915, + "learning_rate": 4.995635066604232e-06, + "loss": 0.6342, + "step": 446 + }, + { + "epoch": 0.07251784555483452, + "grad_norm": 0.6279249944757507, + "learning_rate": 4.9956098067124295e-06, + "loss": 0.6135, + "step": 447 + }, + { + "epoch": 0.07268007787151201, + "grad_norm": 0.6334264160219664, + "learning_rate": 4.99558447400602e-06, + "loss": 0.6416, + "step": 448 + }, + { + "epoch": 0.07284231018818949, + "grad_norm": 0.6424315934308126, + "learning_rate": 4.995559068485742e-06, + "loss": 0.6842, + "step": 449 + }, + { + "epoch": 0.07300454250486697, + "grad_norm": 0.5979955927227775, + "learning_rate": 4.995533590152337e-06, + "loss": 0.6444, + "step": 450 + }, + { + "epoch": 0.07316677482154445, + "grad_norm": 0.6103893045217688, + "learning_rate": 4.995508039006549e-06, + "loss": 0.6308, + "step": 451 + }, + { + "epoch": 0.07332900713822194, + "grad_norm": 0.5812000344589178, + "learning_rate": 4.995482415049123e-06, + "loss": 0.5738, + "step": 452 + }, + { + "epoch": 0.07349123945489941, + "grad_norm": 0.5969599971139341, + "learning_rate": 4.9954567182808064e-06, + "loss": 0.6464, + "step": 453 + }, + { + "epoch": 0.0736534717715769, + "grad_norm": 0.6268008850268878, + "learning_rate": 4.995430948702349e-06, + "loss": 0.6772, + "step": 454 + }, + { + "epoch": 0.07381570408825439, + "grad_norm": 0.5871697124257135, + "learning_rate": 4.9954051063145035e-06, + "loss": 0.6198, + "step": 455 + }, + { + "epoch": 0.07397793640493186, + "grad_norm": 0.5821458449601368, + "learning_rate": 4.995379191118023e-06, + "loss": 0.6143, + "step": 456 + }, + { + "epoch": 0.07414016872160935, + "grad_norm": 0.6060390035882478, + "learning_rate": 4.9953532031136644e-06, + "loss": 0.6858, + "step": 457 + }, + { + "epoch": 0.07430240103828682, + "grad_norm": 0.5826379742475425, + "learning_rate": 4.995327142302184e-06, + "loss": 0.6436, + "step": 458 + }, + { + "epoch": 0.07446463335496431, + "grad_norm": 0.5526187051761822, + "learning_rate": 4.9953010086843456e-06, + "loss": 0.6344, + "step": 459 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.606412503475564, + "learning_rate": 4.995274802260909e-06, + "loss": 0.6385, + "step": 460 + }, + { + "epoch": 0.07478909798831927, + "grad_norm": 0.5838931403878603, + "learning_rate": 4.9952485230326396e-06, + "loss": 0.6209, + "step": 461 + }, + { + "epoch": 0.07495133030499676, + "grad_norm": 0.5755354397581428, + "learning_rate": 4.995222171000303e-06, + "loss": 0.6436, + "step": 462 + }, + { + "epoch": 0.07511356262167423, + "grad_norm": 0.642473502150312, + "learning_rate": 4.9951957461646705e-06, + "loss": 0.6478, + "step": 463 + }, + { + "epoch": 0.07527579493835172, + "grad_norm": 0.6098523957056526, + "learning_rate": 4.995169248526512e-06, + "loss": 0.6396, + "step": 464 + }, + { + "epoch": 0.0754380272550292, + "grad_norm": 0.5904669507810371, + "learning_rate": 4.9951426780866e-06, + "loss": 0.6239, + "step": 465 + }, + { + "epoch": 0.07560025957170668, + "grad_norm": 0.6086998143670996, + "learning_rate": 4.99511603484571e-06, + "loss": 0.6458, + "step": 466 + }, + { + "epoch": 0.07576249188838417, + "grad_norm": 0.5743974221346773, + "learning_rate": 4.99508931880462e-06, + "loss": 0.6391, + "step": 467 + }, + { + "epoch": 0.07592472420506165, + "grad_norm": 0.7202024051585072, + "learning_rate": 4.995062529964109e-06, + "loss": 0.6085, + "step": 468 + }, + { + "epoch": 0.07608695652173914, + "grad_norm": 0.5929938074363782, + "learning_rate": 4.995035668324958e-06, + "loss": 0.6297, + "step": 469 + }, + { + "epoch": 0.07624918883841661, + "grad_norm": 0.583238265067669, + "learning_rate": 4.995008733887952e-06, + "loss": 0.5993, + "step": 470 + }, + { + "epoch": 0.0764114211550941, + "grad_norm": 0.6070563371364781, + "learning_rate": 4.994981726653876e-06, + "loss": 0.604, + "step": 471 + }, + { + "epoch": 0.07657365347177157, + "grad_norm": 0.6150898305772197, + "learning_rate": 4.994954646623519e-06, + "loss": 0.6266, + "step": 472 + }, + { + "epoch": 0.07673588578844906, + "grad_norm": 0.606007691456278, + "learning_rate": 4.99492749379767e-06, + "loss": 0.6282, + "step": 473 + }, + { + "epoch": 0.07689811810512655, + "grad_norm": 0.6255974306761828, + "learning_rate": 4.994900268177121e-06, + "loss": 0.6898, + "step": 474 + }, + { + "epoch": 0.07706035042180402, + "grad_norm": 0.602569562342543, + "learning_rate": 4.994872969762668e-06, + "loss": 0.6346, + "step": 475 + }, + { + "epoch": 0.07722258273848151, + "grad_norm": 0.5880333121038817, + "learning_rate": 4.994845598555106e-06, + "loss": 0.6555, + "step": 476 + }, + { + "epoch": 0.07738481505515898, + "grad_norm": 0.5951985492294325, + "learning_rate": 4.994818154555234e-06, + "loss": 0.6096, + "step": 477 + }, + { + "epoch": 0.07754704737183647, + "grad_norm": 0.5853305340133965, + "learning_rate": 4.994790637763853e-06, + "loss": 0.6235, + "step": 478 + }, + { + "epoch": 0.07770927968851395, + "grad_norm": 0.6124342255740163, + "learning_rate": 4.994763048181766e-06, + "loss": 0.6053, + "step": 479 + }, + { + "epoch": 0.07787151200519143, + "grad_norm": 0.5666523373118533, + "learning_rate": 4.994735385809778e-06, + "loss": 0.6282, + "step": 480 + }, + { + "epoch": 0.07803374432186892, + "grad_norm": 0.6492967872484566, + "learning_rate": 4.9947076506486945e-06, + "loss": 0.6488, + "step": 481 + }, + { + "epoch": 0.0781959766385464, + "grad_norm": 0.5958335690727303, + "learning_rate": 4.994679842699327e-06, + "loss": 0.607, + "step": 482 + }, + { + "epoch": 0.07835820895522388, + "grad_norm": 0.6107006037735838, + "learning_rate": 4.9946519619624856e-06, + "loss": 0.625, + "step": 483 + }, + { + "epoch": 0.07852044127190136, + "grad_norm": 0.7336093296977924, + "learning_rate": 4.994624008438984e-06, + "loss": 0.6275, + "step": 484 + }, + { + "epoch": 0.07868267358857885, + "grad_norm": 0.6009803460968492, + "learning_rate": 4.994595982129638e-06, + "loss": 0.6484, + "step": 485 + }, + { + "epoch": 0.07884490590525632, + "grad_norm": 0.6110749824625288, + "learning_rate": 4.9945678830352654e-06, + "loss": 0.6289, + "step": 486 + }, + { + "epoch": 0.07900713822193381, + "grad_norm": 0.5967851418870954, + "learning_rate": 4.994539711156685e-06, + "loss": 0.6185, + "step": 487 + }, + { + "epoch": 0.0791693705386113, + "grad_norm": 0.5960454245757091, + "learning_rate": 4.994511466494721e-06, + "loss": 0.6143, + "step": 488 + }, + { + "epoch": 0.07933160285528877, + "grad_norm": 0.5876751742114098, + "learning_rate": 4.994483149050194e-06, + "loss": 0.6433, + "step": 489 + }, + { + "epoch": 0.07949383517196626, + "grad_norm": 0.6243126232024806, + "learning_rate": 4.994454758823934e-06, + "loss": 0.6329, + "step": 490 + }, + { + "epoch": 0.07965606748864373, + "grad_norm": 0.6299601397464878, + "learning_rate": 4.994426295816768e-06, + "loss": 0.6226, + "step": 491 + }, + { + "epoch": 0.07981829980532122, + "grad_norm": 0.631981328629059, + "learning_rate": 4.994397760029525e-06, + "loss": 0.6367, + "step": 492 + }, + { + "epoch": 0.0799805321219987, + "grad_norm": 0.5953806157085131, + "learning_rate": 4.9943691514630395e-06, + "loss": 0.6234, + "step": 493 + }, + { + "epoch": 0.08014276443867618, + "grad_norm": 0.6264792452543687, + "learning_rate": 4.994340470118144e-06, + "loss": 0.6295, + "step": 494 + }, + { + "epoch": 0.08030499675535367, + "grad_norm": 0.6549708852279147, + "learning_rate": 4.994311715995679e-06, + "loss": 0.648, + "step": 495 + }, + { + "epoch": 0.08046722907203115, + "grad_norm": 0.6403076713056697, + "learning_rate": 4.99428288909648e-06, + "loss": 0.6349, + "step": 496 + }, + { + "epoch": 0.08062946138870863, + "grad_norm": 0.5945174175280595, + "learning_rate": 4.994253989421389e-06, + "loss": 0.6548, + "step": 497 + }, + { + "epoch": 0.08079169370538611, + "grad_norm": 0.6492005914380137, + "learning_rate": 4.99422501697125e-06, + "loss": 0.6646, + "step": 498 + }, + { + "epoch": 0.0809539260220636, + "grad_norm": 0.6216062569011008, + "learning_rate": 4.994195971746908e-06, + "loss": 0.6576, + "step": 499 + }, + { + "epoch": 0.08111615833874107, + "grad_norm": 0.6089020224047016, + "learning_rate": 4.99416685374921e-06, + "loss": 0.6145, + "step": 500 + }, + { + "epoch": 0.08127839065541856, + "grad_norm": 0.6389199170534066, + "learning_rate": 4.994137662979006e-06, + "loss": 0.6368, + "step": 501 + }, + { + "epoch": 0.08144062297209605, + "grad_norm": 0.5879934899025114, + "learning_rate": 4.9941083994371485e-06, + "loss": 0.6312, + "step": 502 + }, + { + "epoch": 0.08160285528877352, + "grad_norm": 0.5803923000178862, + "learning_rate": 4.9940790631244886e-06, + "loss": 0.664, + "step": 503 + }, + { + "epoch": 0.08176508760545101, + "grad_norm": 0.6247630131855508, + "learning_rate": 4.994049654041886e-06, + "loss": 0.6231, + "step": 504 + }, + { + "epoch": 0.08192731992212848, + "grad_norm": 0.5978905720098615, + "learning_rate": 4.994020172190195e-06, + "loss": 0.6487, + "step": 505 + }, + { + "epoch": 0.08208955223880597, + "grad_norm": 0.589589172032832, + "learning_rate": 4.9939906175702795e-06, + "loss": 0.6086, + "step": 506 + }, + { + "epoch": 0.08225178455548346, + "grad_norm": 0.6164443264137444, + "learning_rate": 4.993960990182999e-06, + "loss": 0.6278, + "step": 507 + }, + { + "epoch": 0.08241401687216093, + "grad_norm": 0.5871035936016513, + "learning_rate": 4.993931290029218e-06, + "loss": 0.6116, + "step": 508 + }, + { + "epoch": 0.08257624918883842, + "grad_norm": 0.5815972109729038, + "learning_rate": 4.993901517109805e-06, + "loss": 0.6032, + "step": 509 + }, + { + "epoch": 0.0827384815055159, + "grad_norm": 0.6071696505669538, + "learning_rate": 4.993871671425627e-06, + "loss": 0.6423, + "step": 510 + }, + { + "epoch": 0.08290071382219338, + "grad_norm": 0.6085361117662732, + "learning_rate": 4.993841752977557e-06, + "loss": 0.6297, + "step": 511 + }, + { + "epoch": 0.08306294613887086, + "grad_norm": 0.5896186848381382, + "learning_rate": 4.993811761766465e-06, + "loss": 0.6339, + "step": 512 + }, + { + "epoch": 0.08322517845554835, + "grad_norm": 0.5933587607796016, + "learning_rate": 4.993781697793228e-06, + "loss": 0.6129, + "step": 513 + }, + { + "epoch": 0.08338741077222583, + "grad_norm": 0.5923375438339517, + "learning_rate": 4.9937515610587226e-06, + "loss": 0.6281, + "step": 514 + }, + { + "epoch": 0.08354964308890331, + "grad_norm": 0.5995483544947847, + "learning_rate": 4.993721351563828e-06, + "loss": 0.619, + "step": 515 + }, + { + "epoch": 0.0837118754055808, + "grad_norm": 0.6094111463248554, + "learning_rate": 4.993691069309425e-06, + "loss": 0.6341, + "step": 516 + }, + { + "epoch": 0.08387410772225827, + "grad_norm": 0.6205554601949723, + "learning_rate": 4.993660714296399e-06, + "loss": 0.6751, + "step": 517 + }, + { + "epoch": 0.08403634003893576, + "grad_norm": 0.6160888794502284, + "learning_rate": 4.993630286525634e-06, + "loss": 0.6526, + "step": 518 + }, + { + "epoch": 0.08419857235561323, + "grad_norm": 0.6122837274890087, + "learning_rate": 4.993599785998019e-06, + "loss": 0.628, + "step": 519 + }, + { + "epoch": 0.08436080467229072, + "grad_norm": 0.5909137319776687, + "learning_rate": 4.993569212714443e-06, + "loss": 0.6105, + "step": 520 + }, + { + "epoch": 0.08452303698896821, + "grad_norm": 0.606520098973889, + "learning_rate": 4.993538566675798e-06, + "loss": 0.6282, + "step": 521 + }, + { + "epoch": 0.08468526930564568, + "grad_norm": 0.6190197023514359, + "learning_rate": 4.993507847882978e-06, + "loss": 0.6153, + "step": 522 + }, + { + "epoch": 0.08484750162232317, + "grad_norm": 0.6108997788629494, + "learning_rate": 4.993477056336881e-06, + "loss": 0.6616, + "step": 523 + }, + { + "epoch": 0.08500973393900065, + "grad_norm": 0.577632752892976, + "learning_rate": 4.993446192038404e-06, + "loss": 0.6519, + "step": 524 + }, + { + "epoch": 0.08517196625567813, + "grad_norm": 0.5882677590983554, + "learning_rate": 4.993415254988447e-06, + "loss": 0.6279, + "step": 525 + }, + { + "epoch": 0.08533419857235561, + "grad_norm": 0.5898549464017769, + "learning_rate": 4.993384245187913e-06, + "loss": 0.6481, + "step": 526 + }, + { + "epoch": 0.0854964308890331, + "grad_norm": 0.6206331511013465, + "learning_rate": 4.993353162637708e-06, + "loss": 0.6034, + "step": 527 + }, + { + "epoch": 0.08565866320571058, + "grad_norm": 0.6152514399622503, + "learning_rate": 4.993322007338738e-06, + "loss": 0.6314, + "step": 528 + }, + { + "epoch": 0.08582089552238806, + "grad_norm": 0.6371307068557504, + "learning_rate": 4.993290779291912e-06, + "loss": 0.6441, + "step": 529 + }, + { + "epoch": 0.08598312783906555, + "grad_norm": 0.6147219318433024, + "learning_rate": 4.993259478498141e-06, + "loss": 0.6327, + "step": 530 + }, + { + "epoch": 0.08614536015574302, + "grad_norm": 0.6070370434768888, + "learning_rate": 4.993228104958338e-06, + "loss": 0.611, + "step": 531 + }, + { + "epoch": 0.08630759247242051, + "grad_norm": 0.5770169196495215, + "learning_rate": 4.993196658673419e-06, + "loss": 0.6321, + "step": 532 + }, + { + "epoch": 0.08646982478909798, + "grad_norm": 0.616486154671567, + "learning_rate": 4.993165139644303e-06, + "loss": 0.6112, + "step": 533 + }, + { + "epoch": 0.08663205710577547, + "grad_norm": 0.6369673436669653, + "learning_rate": 4.993133547871906e-06, + "loss": 0.6238, + "step": 534 + }, + { + "epoch": 0.08679428942245296, + "grad_norm": 0.668101933640755, + "learning_rate": 4.993101883357153e-06, + "loss": 0.65, + "step": 535 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.5764115817166745, + "learning_rate": 4.993070146100966e-06, + "loss": 0.6406, + "step": 536 + }, + { + "epoch": 0.08711875405580792, + "grad_norm": 0.6099061630064554, + "learning_rate": 4.993038336104271e-06, + "loss": 0.6378, + "step": 537 + }, + { + "epoch": 0.0872809863724854, + "grad_norm": 0.6161205642121776, + "learning_rate": 4.993006453367998e-06, + "loss": 0.6205, + "step": 538 + }, + { + "epoch": 0.08744321868916288, + "grad_norm": 0.6123879689999023, + "learning_rate": 4.992974497893075e-06, + "loss": 0.641, + "step": 539 + }, + { + "epoch": 0.08760545100584036, + "grad_norm": 0.5925523099580071, + "learning_rate": 4.992942469680437e-06, + "loss": 0.6192, + "step": 540 + }, + { + "epoch": 0.08776768332251784, + "grad_norm": 0.656977154200654, + "learning_rate": 4.9929103687310144e-06, + "loss": 0.6402, + "step": 541 + }, + { + "epoch": 0.08792991563919533, + "grad_norm": 0.6267069837897024, + "learning_rate": 4.992878195045748e-06, + "loss": 0.6138, + "step": 542 + }, + { + "epoch": 0.08809214795587281, + "grad_norm": 0.6159782700484219, + "learning_rate": 4.992845948625575e-06, + "loss": 0.6324, + "step": 543 + }, + { + "epoch": 0.0882543802725503, + "grad_norm": 0.5929146148476617, + "learning_rate": 4.992813629471435e-06, + "loss": 0.6313, + "step": 544 + }, + { + "epoch": 0.08841661258922777, + "grad_norm": 0.6226292056901949, + "learning_rate": 4.992781237584272e-06, + "loss": 0.6245, + "step": 545 + }, + { + "epoch": 0.08857884490590526, + "grad_norm": 0.6164691508789464, + "learning_rate": 4.992748772965031e-06, + "loss": 0.6132, + "step": 546 + }, + { + "epoch": 0.08874107722258273, + "grad_norm": 0.5876218845253977, + "learning_rate": 4.992716235614661e-06, + "loss": 0.6101, + "step": 547 + }, + { + "epoch": 0.08890330953926022, + "grad_norm": 0.5795623120976889, + "learning_rate": 4.992683625534108e-06, + "loss": 0.6212, + "step": 548 + }, + { + "epoch": 0.08906554185593771, + "grad_norm": 0.6130066894909375, + "learning_rate": 4.992650942724326e-06, + "loss": 0.5987, + "step": 549 + }, + { + "epoch": 0.08922777417261518, + "grad_norm": 0.6245170629093179, + "learning_rate": 4.992618187186267e-06, + "loss": 0.6283, + "step": 550 + }, + { + "epoch": 0.08939000648929267, + "grad_norm": 0.5796544184873896, + "learning_rate": 4.992585358920888e-06, + "loss": 0.6497, + "step": 551 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 0.6035676749991913, + "learning_rate": 4.992552457929146e-06, + "loss": 0.6574, + "step": 552 + }, + { + "epoch": 0.08971447112264763, + "grad_norm": 0.6257315188007382, + "learning_rate": 4.992519484212001e-06, + "loss": 0.6476, + "step": 553 + }, + { + "epoch": 0.08987670343932512, + "grad_norm": 0.6479212584333242, + "learning_rate": 4.992486437770417e-06, + "loss": 0.6431, + "step": 554 + }, + { + "epoch": 0.0900389357560026, + "grad_norm": 0.5871680039154932, + "learning_rate": 4.992453318605355e-06, + "loss": 0.6213, + "step": 555 + }, + { + "epoch": 0.09020116807268008, + "grad_norm": 0.6044058857784029, + "learning_rate": 4.992420126717784e-06, + "loss": 0.6478, + "step": 556 + }, + { + "epoch": 0.09036340038935756, + "grad_norm": 0.6046537546580749, + "learning_rate": 4.99238686210867e-06, + "loss": 0.6163, + "step": 557 + }, + { + "epoch": 0.09052563270603504, + "grad_norm": 0.6681779741408403, + "learning_rate": 4.992353524778986e-06, + "loss": 0.6346, + "step": 558 + }, + { + "epoch": 0.09068786502271252, + "grad_norm": 0.6308843501094485, + "learning_rate": 4.992320114729703e-06, + "loss": 0.6204, + "step": 559 + }, + { + "epoch": 0.09085009733939, + "grad_norm": 0.6094442496591895, + "learning_rate": 4.992286631961796e-06, + "loss": 0.6597, + "step": 560 + }, + { + "epoch": 0.0910123296560675, + "grad_norm": 0.5856086143741038, + "learning_rate": 4.992253076476243e-06, + "loss": 0.6285, + "step": 561 + }, + { + "epoch": 0.09117456197274497, + "grad_norm": 0.6055253541186265, + "learning_rate": 4.992219448274022e-06, + "loss": 0.6337, + "step": 562 + }, + { + "epoch": 0.09133679428942246, + "grad_norm": 0.5969487490492622, + "learning_rate": 4.992185747356114e-06, + "loss": 0.594, + "step": 563 + }, + { + "epoch": 0.09149902660609993, + "grad_norm": 0.5899118555850554, + "learning_rate": 4.992151973723504e-06, + "loss": 0.655, + "step": 564 + }, + { + "epoch": 0.09166125892277742, + "grad_norm": 0.6042924909800401, + "learning_rate": 4.992118127377175e-06, + "loss": 0.6085, + "step": 565 + }, + { + "epoch": 0.0918234912394549, + "grad_norm": 0.652045144782332, + "learning_rate": 4.992084208318116e-06, + "loss": 0.5607, + "step": 566 + }, + { + "epoch": 0.09198572355613238, + "grad_norm": 0.6299497971578631, + "learning_rate": 4.992050216547316e-06, + "loss": 0.6239, + "step": 567 + }, + { + "epoch": 0.09214795587280987, + "grad_norm": 0.5802772591919475, + "learning_rate": 4.992016152065769e-06, + "loss": 0.6575, + "step": 568 + }, + { + "epoch": 0.09231018818948734, + "grad_norm": 0.5963841089223342, + "learning_rate": 4.991982014874465e-06, + "loss": 0.6684, + "step": 569 + }, + { + "epoch": 0.09247242050616483, + "grad_norm": 0.5907031889185995, + "learning_rate": 4.991947804974403e-06, + "loss": 0.6496, + "step": 570 + }, + { + "epoch": 0.0926346528228423, + "grad_norm": 0.621948789234854, + "learning_rate": 4.991913522366581e-06, + "loss": 0.6372, + "step": 571 + }, + { + "epoch": 0.0927968851395198, + "grad_norm": 0.6017983955008491, + "learning_rate": 4.991879167051998e-06, + "loss": 0.6107, + "step": 572 + }, + { + "epoch": 0.09295911745619727, + "grad_norm": 0.5829690177602398, + "learning_rate": 4.991844739031656e-06, + "loss": 0.6306, + "step": 573 + }, + { + "epoch": 0.09312134977287476, + "grad_norm": 0.6005524466738742, + "learning_rate": 4.991810238306561e-06, + "loss": 0.5934, + "step": 574 + }, + { + "epoch": 0.09328358208955224, + "grad_norm": 0.5790883296804037, + "learning_rate": 4.991775664877719e-06, + "loss": 0.5856, + "step": 575 + }, + { + "epoch": 0.09344581440622972, + "grad_norm": 0.6449475561182155, + "learning_rate": 4.99174101874614e-06, + "loss": 0.625, + "step": 576 + }, + { + "epoch": 0.0936080467229072, + "grad_norm": 0.6478904803677074, + "learning_rate": 4.991706299912832e-06, + "loss": 0.6239, + "step": 577 + }, + { + "epoch": 0.09377027903958468, + "grad_norm": 0.6129203522361282, + "learning_rate": 4.9916715083788105e-06, + "loss": 0.6388, + "step": 578 + }, + { + "epoch": 0.09393251135626217, + "grad_norm": 0.6254578702823209, + "learning_rate": 4.99163664414509e-06, + "loss": 0.6382, + "step": 579 + }, + { + "epoch": 0.09409474367293964, + "grad_norm": 0.6277375114621876, + "learning_rate": 4.991601707212687e-06, + "loss": 0.6305, + "step": 580 + }, + { + "epoch": 0.09425697598961713, + "grad_norm": 0.5702993486428753, + "learning_rate": 4.991566697582622e-06, + "loss": 0.6142, + "step": 581 + }, + { + "epoch": 0.09441920830629462, + "grad_norm": 0.6022371992042378, + "learning_rate": 4.991531615255915e-06, + "loss": 0.6311, + "step": 582 + }, + { + "epoch": 0.0945814406229721, + "grad_norm": 0.6062306996348007, + "learning_rate": 4.991496460233591e-06, + "loss": 0.648, + "step": 583 + }, + { + "epoch": 0.09474367293964958, + "grad_norm": 0.6227046287054222, + "learning_rate": 4.991461232516675e-06, + "loss": 0.6475, + "step": 584 + }, + { + "epoch": 0.09490590525632706, + "grad_norm": 0.6140414143862716, + "learning_rate": 4.991425932106196e-06, + "loss": 0.6224, + "step": 585 + }, + { + "epoch": 0.09506813757300454, + "grad_norm": 0.599903021515881, + "learning_rate": 4.991390559003181e-06, + "loss": 0.5974, + "step": 586 + }, + { + "epoch": 0.09523036988968202, + "grad_norm": 0.5818708538448811, + "learning_rate": 4.991355113208666e-06, + "loss": 0.6117, + "step": 587 + }, + { + "epoch": 0.0953926022063595, + "grad_norm": 0.592851353145954, + "learning_rate": 4.991319594723682e-06, + "loss": 0.6341, + "step": 588 + }, + { + "epoch": 0.095554834523037, + "grad_norm": 0.6255261015029627, + "learning_rate": 4.9912840035492675e-06, + "loss": 0.6337, + "step": 589 + }, + { + "epoch": 0.09571706683971447, + "grad_norm": 0.5710621432150333, + "learning_rate": 4.991248339686459e-06, + "loss": 0.6245, + "step": 590 + }, + { + "epoch": 0.09587929915639196, + "grad_norm": 0.6058281590388059, + "learning_rate": 4.991212603136299e-06, + "loss": 0.6145, + "step": 591 + }, + { + "epoch": 0.09604153147306943, + "grad_norm": 0.5918425022642091, + "learning_rate": 4.991176793899829e-06, + "loss": 0.5992, + "step": 592 + }, + { + "epoch": 0.09620376378974692, + "grad_norm": 0.5850041548827732, + "learning_rate": 4.991140911978094e-06, + "loss": 0.6276, + "step": 593 + }, + { + "epoch": 0.0963659961064244, + "grad_norm": 0.5940853286925323, + "learning_rate": 4.991104957372142e-06, + "loss": 0.6043, + "step": 594 + }, + { + "epoch": 0.09652822842310188, + "grad_norm": 0.6103362060380703, + "learning_rate": 4.9910689300830195e-06, + "loss": 0.6487, + "step": 595 + }, + { + "epoch": 0.09669046073977937, + "grad_norm": 0.6189400903208687, + "learning_rate": 4.9910328301117804e-06, + "loss": 0.615, + "step": 596 + }, + { + "epoch": 0.09685269305645684, + "grad_norm": 0.627354141994798, + "learning_rate": 4.990996657459477e-06, + "loss": 0.641, + "step": 597 + }, + { + "epoch": 0.09701492537313433, + "grad_norm": 0.5927784116564634, + "learning_rate": 4.9909604121271635e-06, + "loss": 0.6486, + "step": 598 + }, + { + "epoch": 0.0971771576898118, + "grad_norm": 0.5787257912334588, + "learning_rate": 4.990924094115899e-06, + "loss": 0.5798, + "step": 599 + }, + { + "epoch": 0.0973393900064893, + "grad_norm": 0.6086356538070183, + "learning_rate": 4.990887703426743e-06, + "loss": 0.6402, + "step": 600 + }, + { + "epoch": 0.09750162232316678, + "grad_norm": 0.6062001367156161, + "learning_rate": 4.990851240060757e-06, + "loss": 0.6147, + "step": 601 + }, + { + "epoch": 0.09766385463984426, + "grad_norm": 0.5966320275599226, + "learning_rate": 4.990814704019004e-06, + "loss": 0.6201, + "step": 602 + }, + { + "epoch": 0.09782608695652174, + "grad_norm": 0.5752218511964444, + "learning_rate": 4.990778095302551e-06, + "loss": 0.6439, + "step": 603 + }, + { + "epoch": 0.09798831927319922, + "grad_norm": 0.6527478798159436, + "learning_rate": 4.990741413912467e-06, + "loss": 0.6045, + "step": 604 + }, + { + "epoch": 0.0981505515898767, + "grad_norm": 0.6202309164562855, + "learning_rate": 4.99070465984982e-06, + "loss": 0.6079, + "step": 605 + }, + { + "epoch": 0.09831278390655418, + "grad_norm": 0.6216456219347162, + "learning_rate": 4.990667833115684e-06, + "loss": 0.6513, + "step": 606 + }, + { + "epoch": 0.09847501622323167, + "grad_norm": 0.6045632952708466, + "learning_rate": 4.990630933711134e-06, + "loss": 0.6282, + "step": 607 + }, + { + "epoch": 0.09863724853990916, + "grad_norm": 0.5978090287164536, + "learning_rate": 4.990593961637245e-06, + "loss": 0.6087, + "step": 608 + }, + { + "epoch": 0.09879948085658663, + "grad_norm": 0.7290033503038483, + "learning_rate": 4.990556916895096e-06, + "loss": 0.6062, + "step": 609 + }, + { + "epoch": 0.09896171317326412, + "grad_norm": 0.5943205350862575, + "learning_rate": 4.990519799485769e-06, + "loss": 0.6143, + "step": 610 + }, + { + "epoch": 0.09912394548994159, + "grad_norm": 0.5973655841631732, + "learning_rate": 4.990482609410347e-06, + "loss": 0.6217, + "step": 611 + }, + { + "epoch": 0.09928617780661908, + "grad_norm": 0.6370591035556559, + "learning_rate": 4.990445346669913e-06, + "loss": 0.633, + "step": 612 + }, + { + "epoch": 0.09944841012329655, + "grad_norm": 0.5833252212606217, + "learning_rate": 4.990408011265556e-06, + "loss": 0.6368, + "step": 613 + }, + { + "epoch": 0.09961064243997404, + "grad_norm": 0.6412653776781858, + "learning_rate": 4.990370603198365e-06, + "loss": 0.6437, + "step": 614 + }, + { + "epoch": 0.09977287475665153, + "grad_norm": 0.6832007971797404, + "learning_rate": 4.9903331224694316e-06, + "loss": 0.5921, + "step": 615 + }, + { + "epoch": 0.099935107073329, + "grad_norm": 0.7035402710372498, + "learning_rate": 4.990295569079849e-06, + "loss": 0.6481, + "step": 616 + }, + { + "epoch": 0.10009733939000649, + "grad_norm": 0.613361722293131, + "learning_rate": 4.990257943030713e-06, + "loss": 0.6529, + "step": 617 + }, + { + "epoch": 0.10025957170668397, + "grad_norm": 0.6343901788388261, + "learning_rate": 4.990220244323122e-06, + "loss": 0.6378, + "step": 618 + }, + { + "epoch": 0.10042180402336146, + "grad_norm": 0.6180894118555761, + "learning_rate": 4.990182472958175e-06, + "loss": 0.607, + "step": 619 + }, + { + "epoch": 0.10058403634003893, + "grad_norm": 0.5812767762143146, + "learning_rate": 4.990144628936974e-06, + "loss": 0.6487, + "step": 620 + }, + { + "epoch": 0.10074626865671642, + "grad_norm": 0.6263767265382002, + "learning_rate": 4.990106712260624e-06, + "loss": 0.6327, + "step": 621 + }, + { + "epoch": 0.1009085009733939, + "grad_norm": 0.5815915653810975, + "learning_rate": 4.990068722930232e-06, + "loss": 0.6541, + "step": 622 + }, + { + "epoch": 0.10107073329007138, + "grad_norm": 0.5810968357850914, + "learning_rate": 4.990030660946904e-06, + "loss": 0.619, + "step": 623 + }, + { + "epoch": 0.10123296560674887, + "grad_norm": 0.5961509377480327, + "learning_rate": 4.989992526311752e-06, + "loss": 0.619, + "step": 624 + }, + { + "epoch": 0.10139519792342634, + "grad_norm": 0.586287316226167, + "learning_rate": 4.989954319025889e-06, + "loss": 0.6284, + "step": 625 + }, + { + "epoch": 0.10155743024010383, + "grad_norm": 0.5963054116793164, + "learning_rate": 4.98991603909043e-06, + "loss": 0.5956, + "step": 626 + }, + { + "epoch": 0.1017196625567813, + "grad_norm": 0.5818276066011274, + "learning_rate": 4.989877686506491e-06, + "loss": 0.6006, + "step": 627 + }, + { + "epoch": 0.10188189487345879, + "grad_norm": 0.5987902083446461, + "learning_rate": 4.989839261275191e-06, + "loss": 0.5934, + "step": 628 + }, + { + "epoch": 0.10204412719013628, + "grad_norm": 0.6212168541435729, + "learning_rate": 4.989800763397651e-06, + "loss": 0.5973, + "step": 629 + }, + { + "epoch": 0.10220635950681375, + "grad_norm": 0.6215950181107167, + "learning_rate": 4.989762192874995e-06, + "loss": 0.6172, + "step": 630 + }, + { + "epoch": 0.10236859182349124, + "grad_norm": 0.5866992877123337, + "learning_rate": 4.989723549708349e-06, + "loss": 0.5959, + "step": 631 + }, + { + "epoch": 0.10253082414016872, + "grad_norm": 0.6001687301567745, + "learning_rate": 4.989684833898838e-06, + "loss": 0.6247, + "step": 632 + }, + { + "epoch": 0.1026930564568462, + "grad_norm": 0.6044109990119116, + "learning_rate": 4.989646045447594e-06, + "loss": 0.6138, + "step": 633 + }, + { + "epoch": 0.10285528877352369, + "grad_norm": 0.596887711578008, + "learning_rate": 4.989607184355747e-06, + "loss": 0.6551, + "step": 634 + }, + { + "epoch": 0.10301752109020117, + "grad_norm": 0.6297726586587677, + "learning_rate": 4.989568250624433e-06, + "loss": 0.6276, + "step": 635 + }, + { + "epoch": 0.10317975340687865, + "grad_norm": 0.6053129914368286, + "learning_rate": 4.989529244254786e-06, + "loss": 0.6217, + "step": 636 + }, + { + "epoch": 0.10334198572355613, + "grad_norm": 0.5900366469721411, + "learning_rate": 4.989490165247945e-06, + "loss": 0.6029, + "step": 637 + }, + { + "epoch": 0.10350421804023362, + "grad_norm": 0.63878971588723, + "learning_rate": 4.98945101360505e-06, + "loss": 0.6157, + "step": 638 + }, + { + "epoch": 0.10366645035691109, + "grad_norm": 0.5869835285778888, + "learning_rate": 4.989411789327243e-06, + "loss": 0.6346, + "step": 639 + }, + { + "epoch": 0.10382868267358858, + "grad_norm": 0.5716786162760494, + "learning_rate": 4.989372492415669e-06, + "loss": 0.6319, + "step": 640 + }, + { + "epoch": 0.10399091499026607, + "grad_norm": 0.583600502181976, + "learning_rate": 4.989333122871474e-06, + "loss": 0.6393, + "step": 641 + }, + { + "epoch": 0.10415314730694354, + "grad_norm": 0.6109442012697807, + "learning_rate": 4.9892936806958085e-06, + "loss": 0.6785, + "step": 642 + }, + { + "epoch": 0.10431537962362103, + "grad_norm": 0.5917513750321668, + "learning_rate": 4.989254165889821e-06, + "loss": 0.5882, + "step": 643 + }, + { + "epoch": 0.1044776119402985, + "grad_norm": 0.560762905756711, + "learning_rate": 4.989214578454665e-06, + "loss": 0.5774, + "step": 644 + }, + { + "epoch": 0.10463984425697599, + "grad_norm": 0.6257476353748889, + "learning_rate": 4.989174918391496e-06, + "loss": 0.6263, + "step": 645 + }, + { + "epoch": 0.10480207657365347, + "grad_norm": 0.5863690563026587, + "learning_rate": 4.989135185701471e-06, + "loss": 0.6487, + "step": 646 + }, + { + "epoch": 0.10496430889033095, + "grad_norm": 0.5864952950486552, + "learning_rate": 4.989095380385749e-06, + "loss": 0.5941, + "step": 647 + }, + { + "epoch": 0.10512654120700844, + "grad_norm": 0.5714230370149399, + "learning_rate": 4.989055502445493e-06, + "loss": 0.6517, + "step": 648 + }, + { + "epoch": 0.10528877352368592, + "grad_norm": 0.5703289962691088, + "learning_rate": 4.989015551881863e-06, + "loss": 0.6161, + "step": 649 + }, + { + "epoch": 0.1054510058403634, + "grad_norm": 0.5941810849129162, + "learning_rate": 4.988975528696028e-06, + "loss": 0.645, + "step": 650 + }, + { + "epoch": 0.10561323815704088, + "grad_norm": 0.6711986155549085, + "learning_rate": 4.988935432889155e-06, + "loss": 0.6773, + "step": 651 + }, + { + "epoch": 0.10577547047371837, + "grad_norm": 0.6689082989999415, + "learning_rate": 4.9888952644624135e-06, + "loss": 0.6166, + "step": 652 + }, + { + "epoch": 0.10593770279039584, + "grad_norm": 0.6474851361388222, + "learning_rate": 4.988855023416975e-06, + "loss": 0.6638, + "step": 653 + }, + { + "epoch": 0.10609993510707333, + "grad_norm": 0.6333002169924219, + "learning_rate": 4.988814709754014e-06, + "loss": 0.6342, + "step": 654 + }, + { + "epoch": 0.10626216742375082, + "grad_norm": 0.6263736236147774, + "learning_rate": 4.988774323474707e-06, + "loss": 0.6497, + "step": 655 + }, + { + "epoch": 0.10642439974042829, + "grad_norm": 0.5987990780096545, + "learning_rate": 4.988733864580232e-06, + "loss": 0.6276, + "step": 656 + }, + { + "epoch": 0.10658663205710578, + "grad_norm": 0.5917095751050008, + "learning_rate": 4.98869333307177e-06, + "loss": 0.603, + "step": 657 + }, + { + "epoch": 0.10674886437378325, + "grad_norm": 0.608379325495087, + "learning_rate": 4.988652728950503e-06, + "loss": 0.5992, + "step": 658 + }, + { + "epoch": 0.10691109669046074, + "grad_norm": 0.5963072391901902, + "learning_rate": 4.988612052217616e-06, + "loss": 0.615, + "step": 659 + }, + { + "epoch": 0.10707332900713822, + "grad_norm": 0.6074239592814855, + "learning_rate": 4.988571302874296e-06, + "loss": 0.5972, + "step": 660 + }, + { + "epoch": 0.1072355613238157, + "grad_norm": 0.5972444991769021, + "learning_rate": 4.988530480921731e-06, + "loss": 0.5987, + "step": 661 + }, + { + "epoch": 0.10739779364049319, + "grad_norm": 0.6119533070669712, + "learning_rate": 4.988489586361114e-06, + "loss": 0.6466, + "step": 662 + }, + { + "epoch": 0.10756002595717067, + "grad_norm": 0.6080485558328418, + "learning_rate": 4.988448619193637e-06, + "loss": 0.6223, + "step": 663 + }, + { + "epoch": 0.10772225827384815, + "grad_norm": 0.6174517451812946, + "learning_rate": 4.988407579420494e-06, + "loss": 0.6484, + "step": 664 + }, + { + "epoch": 0.10788449059052563, + "grad_norm": 0.6159492124204641, + "learning_rate": 4.988366467042884e-06, + "loss": 0.6517, + "step": 665 + }, + { + "epoch": 0.10804672290720312, + "grad_norm": 0.6096084552111702, + "learning_rate": 4.988325282062007e-06, + "loss": 0.627, + "step": 666 + }, + { + "epoch": 0.10820895522388059, + "grad_norm": 0.5974600776840928, + "learning_rate": 4.988284024479064e-06, + "loss": 0.6334, + "step": 667 + }, + { + "epoch": 0.10837118754055808, + "grad_norm": 0.5980915835342598, + "learning_rate": 4.988242694295258e-06, + "loss": 0.6582, + "step": 668 + }, + { + "epoch": 0.10853341985723557, + "grad_norm": 0.6193384673967016, + "learning_rate": 4.988201291511796e-06, + "loss": 0.5907, + "step": 669 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 0.5965197968019856, + "learning_rate": 4.988159816129885e-06, + "loss": 0.6425, + "step": 670 + }, + { + "epoch": 0.10885788449059053, + "grad_norm": 0.5916995945589593, + "learning_rate": 4.988118268150736e-06, + "loss": 0.6471, + "step": 671 + }, + { + "epoch": 0.109020116807268, + "grad_norm": 0.5940553634308091, + "learning_rate": 4.988076647575562e-06, + "loss": 0.6135, + "step": 672 + }, + { + "epoch": 0.10918234912394549, + "grad_norm": 0.597063411587957, + "learning_rate": 4.9880349544055745e-06, + "loss": 0.6252, + "step": 673 + }, + { + "epoch": 0.10934458144062298, + "grad_norm": 0.5678302254994781, + "learning_rate": 4.987993188641993e-06, + "loss": 0.6448, + "step": 674 + }, + { + "epoch": 0.10950681375730045, + "grad_norm": 0.5958313318242434, + "learning_rate": 4.987951350286034e-06, + "loss": 0.6107, + "step": 675 + }, + { + "epoch": 0.10966904607397794, + "grad_norm": 0.6384227835843154, + "learning_rate": 4.98790943933892e-06, + "loss": 0.6024, + "step": 676 + }, + { + "epoch": 0.10983127839065542, + "grad_norm": 0.5873168109782115, + "learning_rate": 4.987867455801873e-06, + "loss": 0.613, + "step": 677 + }, + { + "epoch": 0.1099935107073329, + "grad_norm": 0.6320421024971175, + "learning_rate": 4.987825399676118e-06, + "loss": 0.5973, + "step": 678 + }, + { + "epoch": 0.11015574302401038, + "grad_norm": 0.6346485444303747, + "learning_rate": 4.987783270962881e-06, + "loss": 0.6037, + "step": 679 + }, + { + "epoch": 0.11031797534068787, + "grad_norm": 0.6049961850163522, + "learning_rate": 4.987741069663393e-06, + "loss": 0.6232, + "step": 680 + }, + { + "epoch": 0.11048020765736535, + "grad_norm": 0.6503167490737153, + "learning_rate": 4.987698795778885e-06, + "loss": 0.6282, + "step": 681 + }, + { + "epoch": 0.11064243997404283, + "grad_norm": 0.6455410312114456, + "learning_rate": 4.98765644931059e-06, + "loss": 0.6034, + "step": 682 + }, + { + "epoch": 0.11080467229072032, + "grad_norm": 0.5982952688924502, + "learning_rate": 4.987614030259742e-06, + "loss": 0.6319, + "step": 683 + }, + { + "epoch": 0.11096690460739779, + "grad_norm": 0.6140300520852249, + "learning_rate": 4.987571538627581e-06, + "loss": 0.612, + "step": 684 + }, + { + "epoch": 0.11112913692407528, + "grad_norm": 0.5882171060780191, + "learning_rate": 4.987528974415346e-06, + "loss": 0.616, + "step": 685 + }, + { + "epoch": 0.11129136924075275, + "grad_norm": 0.5980278377789953, + "learning_rate": 4.9874863376242785e-06, + "loss": 0.6385, + "step": 686 + }, + { + "epoch": 0.11145360155743024, + "grad_norm": 0.6052561242069202, + "learning_rate": 4.987443628255624e-06, + "loss": 0.6336, + "step": 687 + }, + { + "epoch": 0.11161583387410773, + "grad_norm": 0.613961448221876, + "learning_rate": 4.987400846310626e-06, + "loss": 0.6032, + "step": 688 + }, + { + "epoch": 0.1117780661907852, + "grad_norm": 0.586979705969461, + "learning_rate": 4.987357991790535e-06, + "loss": 0.6129, + "step": 689 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.6087225015449349, + "learning_rate": 4.987315064696601e-06, + "loss": 0.582, + "step": 690 + }, + { + "epoch": 0.11210253082414016, + "grad_norm": 0.6098937469549578, + "learning_rate": 4.987272065030075e-06, + "loss": 0.6235, + "step": 691 + }, + { + "epoch": 0.11226476314081765, + "grad_norm": 0.6145382840046216, + "learning_rate": 4.987228992792213e-06, + "loss": 0.6231, + "step": 692 + }, + { + "epoch": 0.11242699545749513, + "grad_norm": 0.614055250544281, + "learning_rate": 4.987185847984272e-06, + "loss": 0.6391, + "step": 693 + }, + { + "epoch": 0.11258922777417261, + "grad_norm": 0.615459514653446, + "learning_rate": 4.98714263060751e-06, + "loss": 0.6087, + "step": 694 + }, + { + "epoch": 0.1127514600908501, + "grad_norm": 0.5754601214724121, + "learning_rate": 4.9870993406631885e-06, + "loss": 0.6219, + "step": 695 + }, + { + "epoch": 0.11291369240752758, + "grad_norm": 0.5832216204939539, + "learning_rate": 4.9870559781525695e-06, + "loss": 0.622, + "step": 696 + }, + { + "epoch": 0.11307592472420507, + "grad_norm": 0.6354739419982733, + "learning_rate": 4.98701254307692e-06, + "loss": 0.6502, + "step": 697 + }, + { + "epoch": 0.11323815704088254, + "grad_norm": 0.6143606313497593, + "learning_rate": 4.9869690354375055e-06, + "loss": 0.6696, + "step": 698 + }, + { + "epoch": 0.11340038935756003, + "grad_norm": 0.613251112262801, + "learning_rate": 4.9869254552355965e-06, + "loss": 0.6232, + "step": 699 + }, + { + "epoch": 0.1135626216742375, + "grad_norm": 0.6145680470989177, + "learning_rate": 4.986881802472464e-06, + "loss": 0.6497, + "step": 700 + }, + { + "epoch": 0.11372485399091499, + "grad_norm": 0.5930457606227125, + "learning_rate": 4.986838077149382e-06, + "loss": 0.6584, + "step": 701 + }, + { + "epoch": 0.11388708630759248, + "grad_norm": 0.6459306910376664, + "learning_rate": 4.986794279267627e-06, + "loss": 0.6345, + "step": 702 + }, + { + "epoch": 0.11404931862426995, + "grad_norm": 0.6267337366958773, + "learning_rate": 4.986750408828476e-06, + "loss": 0.6198, + "step": 703 + }, + { + "epoch": 0.11421155094094744, + "grad_norm": 0.5847360680271608, + "learning_rate": 4.986706465833208e-06, + "loss": 0.6256, + "step": 704 + }, + { + "epoch": 0.11437378325762491, + "grad_norm": 0.5948317938617863, + "learning_rate": 4.986662450283107e-06, + "loss": 0.5889, + "step": 705 + }, + { + "epoch": 0.1145360155743024, + "grad_norm": 0.6067074525434181, + "learning_rate": 4.986618362179456e-06, + "loss": 0.6348, + "step": 706 + }, + { + "epoch": 0.11469824789097988, + "grad_norm": 0.578387126199125, + "learning_rate": 4.986574201523542e-06, + "loss": 0.6283, + "step": 707 + }, + { + "epoch": 0.11486048020765736, + "grad_norm": 0.591519204031583, + "learning_rate": 4.986529968316654e-06, + "loss": 0.6041, + "step": 708 + }, + { + "epoch": 0.11502271252433485, + "grad_norm": 0.6078318594384442, + "learning_rate": 4.986485662560081e-06, + "loss": 0.6404, + "step": 709 + }, + { + "epoch": 0.11518494484101233, + "grad_norm": 0.5999583814819922, + "learning_rate": 4.986441284255117e-06, + "loss": 0.61, + "step": 710 + }, + { + "epoch": 0.11534717715768981, + "grad_norm": 0.6163749896964185, + "learning_rate": 4.9863968334030565e-06, + "loss": 0.6172, + "step": 711 + }, + { + "epoch": 0.11550940947436729, + "grad_norm": 0.6119889628345391, + "learning_rate": 4.986352310005197e-06, + "loss": 0.593, + "step": 712 + }, + { + "epoch": 0.11567164179104478, + "grad_norm": 0.6012453653776673, + "learning_rate": 4.986307714062836e-06, + "loss": 0.5862, + "step": 713 + }, + { + "epoch": 0.11583387410772227, + "grad_norm": 0.5696564639467636, + "learning_rate": 4.9862630455772755e-06, + "loss": 0.5818, + "step": 714 + }, + { + "epoch": 0.11599610642439974, + "grad_norm": 0.6163208600786418, + "learning_rate": 4.98621830454982e-06, + "loss": 0.6009, + "step": 715 + }, + { + "epoch": 0.11615833874107723, + "grad_norm": 0.6103770533383867, + "learning_rate": 4.986173490981773e-06, + "loss": 0.6371, + "step": 716 + }, + { + "epoch": 0.1163205710577547, + "grad_norm": 0.6010843923032364, + "learning_rate": 4.986128604874443e-06, + "loss": 0.587, + "step": 717 + }, + { + "epoch": 0.11648280337443219, + "grad_norm": 0.5962460815604594, + "learning_rate": 4.98608364622914e-06, + "loss": 0.6371, + "step": 718 + }, + { + "epoch": 0.11664503569110966, + "grad_norm": 0.6268007352717224, + "learning_rate": 4.986038615047175e-06, + "loss": 0.6538, + "step": 719 + }, + { + "epoch": 0.11680726800778715, + "grad_norm": 0.6439900756157539, + "learning_rate": 4.985993511329863e-06, + "loss": 0.6427, + "step": 720 + }, + { + "epoch": 0.11696950032446464, + "grad_norm": 0.5914435344876913, + "learning_rate": 4.985948335078518e-06, + "loss": 0.6299, + "step": 721 + }, + { + "epoch": 0.11713173264114211, + "grad_norm": 0.6258701171641615, + "learning_rate": 4.985903086294459e-06, + "loss": 0.63, + "step": 722 + }, + { + "epoch": 0.1172939649578196, + "grad_norm": 0.6085762028643285, + "learning_rate": 4.985857764979007e-06, + "loss": 0.6158, + "step": 723 + }, + { + "epoch": 0.11745619727449708, + "grad_norm": 0.5638449086013215, + "learning_rate": 4.985812371133485e-06, + "loss": 0.588, + "step": 724 + }, + { + "epoch": 0.11761842959117456, + "grad_norm": 0.5784249923701319, + "learning_rate": 4.985766904759215e-06, + "loss": 0.6452, + "step": 725 + }, + { + "epoch": 0.11778066190785204, + "grad_norm": 0.5891382338061734, + "learning_rate": 4.985721365857525e-06, + "loss": 0.6146, + "step": 726 + }, + { + "epoch": 0.11794289422452953, + "grad_norm": 0.6217675521919537, + "learning_rate": 4.985675754429744e-06, + "loss": 0.5885, + "step": 727 + }, + { + "epoch": 0.11810512654120701, + "grad_norm": 0.6273730783673459, + "learning_rate": 4.9856300704772025e-06, + "loss": 0.5917, + "step": 728 + }, + { + "epoch": 0.11826735885788449, + "grad_norm": 0.6514451322236303, + "learning_rate": 4.985584314001232e-06, + "loss": 0.617, + "step": 729 + }, + { + "epoch": 0.11842959117456198, + "grad_norm": 0.6137319444317233, + "learning_rate": 4.98553848500317e-06, + "loss": 0.606, + "step": 730 + }, + { + "epoch": 0.11859182349123945, + "grad_norm": 0.6168596123835992, + "learning_rate": 4.985492583484353e-06, + "loss": 0.6498, + "step": 731 + }, + { + "epoch": 0.11875405580791694, + "grad_norm": 0.6365136878022281, + "learning_rate": 4.985446609446118e-06, + "loss": 0.6542, + "step": 732 + }, + { + "epoch": 0.11891628812459441, + "grad_norm": 0.610056312738841, + "learning_rate": 4.98540056288981e-06, + "loss": 0.6007, + "step": 733 + }, + { + "epoch": 0.1190785204412719, + "grad_norm": 0.6032843343633596, + "learning_rate": 4.98535444381677e-06, + "loss": 0.6203, + "step": 734 + }, + { + "epoch": 0.11924075275794939, + "grad_norm": 0.5790409168949657, + "learning_rate": 4.985308252228343e-06, + "loss": 0.6144, + "step": 735 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 0.5652301636840622, + "learning_rate": 4.98526198812588e-06, + "loss": 0.6068, + "step": 736 + }, + { + "epoch": 0.11956521739130435, + "grad_norm": 0.6209670891278528, + "learning_rate": 4.985215651510729e-06, + "loss": 0.6147, + "step": 737 + }, + { + "epoch": 0.11972744970798183, + "grad_norm": 0.6441169679150139, + "learning_rate": 4.9851692423842406e-06, + "loss": 0.6071, + "step": 738 + }, + { + "epoch": 0.11988968202465931, + "grad_norm": 0.6163031930534393, + "learning_rate": 4.98512276074777e-06, + "loss": 0.5817, + "step": 739 + }, + { + "epoch": 0.12005191434133679, + "grad_norm": 0.6259153039493465, + "learning_rate": 4.9850762066026735e-06, + "loss": 0.6052, + "step": 740 + }, + { + "epoch": 0.12021414665801428, + "grad_norm": 0.6352478089519104, + "learning_rate": 4.98502957995031e-06, + "loss": 0.5997, + "step": 741 + }, + { + "epoch": 0.12037637897469176, + "grad_norm": 0.6273728024338969, + "learning_rate": 4.98498288079204e-06, + "loss": 0.641, + "step": 742 + }, + { + "epoch": 0.12053861129136924, + "grad_norm": 0.5994137086076499, + "learning_rate": 4.984936109129225e-06, + "loss": 0.6303, + "step": 743 + }, + { + "epoch": 0.12070084360804673, + "grad_norm": 0.5790513934559129, + "learning_rate": 4.984889264963229e-06, + "loss": 0.5799, + "step": 744 + }, + { + "epoch": 0.1208630759247242, + "grad_norm": 0.5975566961744814, + "learning_rate": 4.984842348295421e-06, + "loss": 0.628, + "step": 745 + }, + { + "epoch": 0.12102530824140169, + "grad_norm": 0.5908973658766827, + "learning_rate": 4.984795359127168e-06, + "loss": 0.6748, + "step": 746 + }, + { + "epoch": 0.12118754055807916, + "grad_norm": 0.5817921948362346, + "learning_rate": 4.984748297459842e-06, + "loss": 0.6337, + "step": 747 + }, + { + "epoch": 0.12134977287475665, + "grad_norm": 0.5976124281847481, + "learning_rate": 4.984701163294817e-06, + "loss": 0.6117, + "step": 748 + }, + { + "epoch": 0.12151200519143414, + "grad_norm": 0.5906591123672558, + "learning_rate": 4.984653956633466e-06, + "loss": 0.6047, + "step": 749 + }, + { + "epoch": 0.12167423750811161, + "grad_norm": 0.5518481089026451, + "learning_rate": 4.984606677477168e-06, + "loss": 0.6099, + "step": 750 + }, + { + "epoch": 0.1218364698247891, + "grad_norm": 0.6617156473316859, + "learning_rate": 4.984559325827301e-06, + "loss": 0.6506, + "step": 751 + }, + { + "epoch": 0.12199870214146658, + "grad_norm": 0.6501905583255096, + "learning_rate": 4.984511901685248e-06, + "loss": 0.5971, + "step": 752 + }, + { + "epoch": 0.12216093445814406, + "grad_norm": 0.6489196010587776, + "learning_rate": 4.984464405052392e-06, + "loss": 0.6243, + "step": 753 + }, + { + "epoch": 0.12232316677482155, + "grad_norm": 0.5955784743723378, + "learning_rate": 4.9844168359301195e-06, + "loss": 0.5901, + "step": 754 + }, + { + "epoch": 0.12248539909149903, + "grad_norm": 0.6036043960870867, + "learning_rate": 4.984369194319818e-06, + "loss": 0.6303, + "step": 755 + }, + { + "epoch": 0.12264763140817651, + "grad_norm": 0.5950966843101957, + "learning_rate": 4.984321480222877e-06, + "loss": 0.5884, + "step": 756 + }, + { + "epoch": 0.12280986372485399, + "grad_norm": 0.5863509392229471, + "learning_rate": 4.984273693640689e-06, + "loss": 0.6393, + "step": 757 + }, + { + "epoch": 0.12297209604153148, + "grad_norm": 0.6184093600971461, + "learning_rate": 4.984225834574648e-06, + "loss": 0.622, + "step": 758 + }, + { + "epoch": 0.12313432835820895, + "grad_norm": 0.6111761085785742, + "learning_rate": 4.984177903026153e-06, + "loss": 0.6605, + "step": 759 + }, + { + "epoch": 0.12329656067488644, + "grad_norm": 0.5765192417122125, + "learning_rate": 4.984129898996599e-06, + "loss": 0.5856, + "step": 760 + }, + { + "epoch": 0.12345879299156393, + "grad_norm": 0.5992396098005033, + "learning_rate": 4.984081822487388e-06, + "loss": 0.5887, + "step": 761 + }, + { + "epoch": 0.1236210253082414, + "grad_norm": 0.6121589127425286, + "learning_rate": 4.984033673499922e-06, + "loss": 0.6048, + "step": 762 + }, + { + "epoch": 0.12378325762491889, + "grad_norm": 0.6201797754253519, + "learning_rate": 4.983985452035607e-06, + "loss": 0.5687, + "step": 763 + }, + { + "epoch": 0.12394548994159636, + "grad_norm": 0.6507647731139429, + "learning_rate": 4.9839371580958496e-06, + "loss": 0.6073, + "step": 764 + }, + { + "epoch": 0.12410772225827385, + "grad_norm": 0.6400634074400299, + "learning_rate": 4.983888791682058e-06, + "loss": 0.6266, + "step": 765 + }, + { + "epoch": 0.12426995457495132, + "grad_norm": 0.6281153259846051, + "learning_rate": 4.9838403527956455e-06, + "loss": 0.6098, + "step": 766 + }, + { + "epoch": 0.12443218689162881, + "grad_norm": 0.6156085947871965, + "learning_rate": 4.983791841438023e-06, + "loss": 0.5691, + "step": 767 + }, + { + "epoch": 0.1245944192083063, + "grad_norm": 0.6104524639336778, + "learning_rate": 4.983743257610607e-06, + "loss": 0.5688, + "step": 768 + }, + { + "epoch": 0.12475665152498377, + "grad_norm": 0.6168730399340607, + "learning_rate": 4.9836946013148155e-06, + "loss": 0.6032, + "step": 769 + }, + { + "epoch": 0.12491888384166126, + "grad_norm": 0.5984815206310637, + "learning_rate": 4.983645872552068e-06, + "loss": 0.6133, + "step": 770 + }, + { + "epoch": 0.12508111615833875, + "grad_norm": 0.6244794037249959, + "learning_rate": 4.983597071323784e-06, + "loss": 0.6344, + "step": 771 + }, + { + "epoch": 0.12524334847501623, + "grad_norm": 0.620207128842261, + "learning_rate": 4.983548197631391e-06, + "loss": 0.6254, + "step": 772 + }, + { + "epoch": 0.1254055807916937, + "grad_norm": 0.6348984806239886, + "learning_rate": 4.9834992514763135e-06, + "loss": 0.6025, + "step": 773 + }, + { + "epoch": 0.1255678131083712, + "grad_norm": 0.6233273577443574, + "learning_rate": 4.983450232859979e-06, + "loss": 0.5889, + "step": 774 + }, + { + "epoch": 0.12573004542504868, + "grad_norm": 0.5969880061726889, + "learning_rate": 4.983401141783818e-06, + "loss": 0.6084, + "step": 775 + }, + { + "epoch": 0.12589227774172615, + "grad_norm": 0.6441488478117032, + "learning_rate": 4.983351978249262e-06, + "loss": 0.6139, + "step": 776 + }, + { + "epoch": 0.12605451005840362, + "grad_norm": 0.5996621380626992, + "learning_rate": 4.983302742257748e-06, + "loss": 0.6093, + "step": 777 + }, + { + "epoch": 0.12621674237508113, + "grad_norm": 0.6609967967407968, + "learning_rate": 4.98325343381071e-06, + "loss": 0.6691, + "step": 778 + }, + { + "epoch": 0.1263789746917586, + "grad_norm": 0.5926947039705023, + "learning_rate": 4.983204052909588e-06, + "loss": 0.6222, + "step": 779 + }, + { + "epoch": 0.12654120700843607, + "grad_norm": 0.6072611741794861, + "learning_rate": 4.9831545995558215e-06, + "loss": 0.6437, + "step": 780 + }, + { + "epoch": 0.12670343932511358, + "grad_norm": 0.6003870058306839, + "learning_rate": 4.983105073750856e-06, + "loss": 0.6371, + "step": 781 + }, + { + "epoch": 0.12686567164179105, + "grad_norm": 0.5877096317911503, + "learning_rate": 4.983055475496134e-06, + "loss": 0.6077, + "step": 782 + }, + { + "epoch": 0.12702790395846852, + "grad_norm": 0.6247578621507172, + "learning_rate": 4.983005804793103e-06, + "loss": 0.6157, + "step": 783 + }, + { + "epoch": 0.127190136275146, + "grad_norm": 0.6441325824172004, + "learning_rate": 4.982956061643212e-06, + "loss": 0.6268, + "step": 784 + }, + { + "epoch": 0.1273523685918235, + "grad_norm": 0.6220193672591529, + "learning_rate": 4.982906246047915e-06, + "loss": 0.6235, + "step": 785 + }, + { + "epoch": 0.12751460090850097, + "grad_norm": 0.6304053260237308, + "learning_rate": 4.982856358008663e-06, + "loss": 0.6407, + "step": 786 + }, + { + "epoch": 0.12767683322517845, + "grad_norm": 0.5773967943086138, + "learning_rate": 4.9828063975269114e-06, + "loss": 0.6167, + "step": 787 + }, + { + "epoch": 0.12783906554185595, + "grad_norm": 0.5661567470190519, + "learning_rate": 4.98275636460412e-06, + "loss": 0.5533, + "step": 788 + }, + { + "epoch": 0.12800129785853342, + "grad_norm": 0.585407359295343, + "learning_rate": 4.9827062592417454e-06, + "loss": 0.6336, + "step": 789 + }, + { + "epoch": 0.1281635301752109, + "grad_norm": 0.6111969017976736, + "learning_rate": 4.9826560814412525e-06, + "loss": 0.6358, + "step": 790 + }, + { + "epoch": 0.12832576249188837, + "grad_norm": 0.603715492631786, + "learning_rate": 4.9826058312041045e-06, + "loss": 0.6387, + "step": 791 + }, + { + "epoch": 0.12848799480856588, + "grad_norm": 0.6043497191316571, + "learning_rate": 4.982555508531767e-06, + "loss": 0.6468, + "step": 792 + }, + { + "epoch": 0.12865022712524335, + "grad_norm": 0.6173567012484944, + "learning_rate": 4.982505113425708e-06, + "loss": 0.6463, + "step": 793 + }, + { + "epoch": 0.12881245944192082, + "grad_norm": 0.6258303861576413, + "learning_rate": 4.982454645887399e-06, + "loss": 0.6103, + "step": 794 + }, + { + "epoch": 0.12897469175859833, + "grad_norm": 0.6006117864855103, + "learning_rate": 4.9824041059183115e-06, + "loss": 0.6277, + "step": 795 + }, + { + "epoch": 0.1291369240752758, + "grad_norm": 0.584966882775351, + "learning_rate": 4.982353493519921e-06, + "loss": 0.5862, + "step": 796 + }, + { + "epoch": 0.12929915639195327, + "grad_norm": 0.5793953452454812, + "learning_rate": 4.982302808693704e-06, + "loss": 0.6186, + "step": 797 + }, + { + "epoch": 0.12946138870863075, + "grad_norm": 0.6308741178042999, + "learning_rate": 4.9822520514411385e-06, + "loss": 0.6436, + "step": 798 + }, + { + "epoch": 0.12962362102530825, + "grad_norm": 0.593891021961268, + "learning_rate": 4.9822012217637065e-06, + "loss": 0.6262, + "step": 799 + }, + { + "epoch": 0.12978585334198572, + "grad_norm": 0.5856448873752901, + "learning_rate": 4.98215031966289e-06, + "loss": 0.5854, + "step": 800 + }, + { + "epoch": 0.1299480856586632, + "grad_norm": 0.5962822471978348, + "learning_rate": 4.982099345140174e-06, + "loss": 0.632, + "step": 801 + }, + { + "epoch": 0.1301103179753407, + "grad_norm": 0.5862018503043995, + "learning_rate": 4.982048298197048e-06, + "loss": 0.6264, + "step": 802 + }, + { + "epoch": 0.13027255029201817, + "grad_norm": 0.6293114329682852, + "learning_rate": 4.981997178835e-06, + "loss": 0.6241, + "step": 803 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.6290010696323325, + "learning_rate": 4.981945987055521e-06, + "loss": 0.6136, + "step": 804 + }, + { + "epoch": 0.13059701492537312, + "grad_norm": 0.6145482170325048, + "learning_rate": 4.981894722860106e-06, + "loss": 0.643, + "step": 805 + }, + { + "epoch": 0.13075924724205062, + "grad_norm": 0.6382206072470977, + "learning_rate": 4.981843386250248e-06, + "loss": 0.6367, + "step": 806 + }, + { + "epoch": 0.1309214795587281, + "grad_norm": 0.6303823871577136, + "learning_rate": 4.981791977227448e-06, + "loss": 0.622, + "step": 807 + }, + { + "epoch": 0.13108371187540557, + "grad_norm": 0.5907645697991404, + "learning_rate": 4.981740495793205e-06, + "loss": 0.6197, + "step": 808 + }, + { + "epoch": 0.13124594419208307, + "grad_norm": 0.584673121427501, + "learning_rate": 4.98168894194902e-06, + "loss": 0.6216, + "step": 809 + }, + { + "epoch": 0.13140817650876055, + "grad_norm": 0.6392254597997278, + "learning_rate": 4.981637315696397e-06, + "loss": 0.6422, + "step": 810 + }, + { + "epoch": 0.13157040882543802, + "grad_norm": 0.641847279252877, + "learning_rate": 4.981585617036845e-06, + "loss": 0.6139, + "step": 811 + }, + { + "epoch": 0.1317326411421155, + "grad_norm": 0.5859333486019102, + "learning_rate": 4.98153384597187e-06, + "loss": 0.6221, + "step": 812 + }, + { + "epoch": 0.131894873458793, + "grad_norm": 0.6316468180325392, + "learning_rate": 4.981482002502983e-06, + "loss": 0.6217, + "step": 813 + }, + { + "epoch": 0.13205710577547047, + "grad_norm": 0.6324665110883183, + "learning_rate": 4.981430086631696e-06, + "loss": 0.6371, + "step": 814 + }, + { + "epoch": 0.13221933809214795, + "grad_norm": 0.6163991940927916, + "learning_rate": 4.981378098359526e-06, + "loss": 0.6333, + "step": 815 + }, + { + "epoch": 0.13238157040882545, + "grad_norm": 0.6001702166436016, + "learning_rate": 4.9813260376879885e-06, + "loss": 0.6507, + "step": 816 + }, + { + "epoch": 0.13254380272550292, + "grad_norm": 0.6160575746296141, + "learning_rate": 4.981273904618602e-06, + "loss": 0.6351, + "step": 817 + }, + { + "epoch": 0.1327060350421804, + "grad_norm": 0.6028646445865672, + "learning_rate": 4.981221699152888e-06, + "loss": 0.632, + "step": 818 + }, + { + "epoch": 0.13286826735885787, + "grad_norm": 0.6142540701445968, + "learning_rate": 4.981169421292369e-06, + "loss": 0.6164, + "step": 819 + }, + { + "epoch": 0.13303049967553537, + "grad_norm": 0.5917869792136329, + "learning_rate": 4.981117071038572e-06, + "loss": 0.621, + "step": 820 + }, + { + "epoch": 0.13319273199221285, + "grad_norm": 0.6045825662575596, + "learning_rate": 4.981064648393023e-06, + "loss": 0.6288, + "step": 821 + }, + { + "epoch": 0.13335496430889032, + "grad_norm": 0.6199107322636234, + "learning_rate": 4.981012153357252e-06, + "loss": 0.656, + "step": 822 + }, + { + "epoch": 0.13351719662556782, + "grad_norm": 0.5826693977080102, + "learning_rate": 4.9809595859327905e-06, + "loss": 0.6169, + "step": 823 + }, + { + "epoch": 0.1336794289422453, + "grad_norm": 0.6331381019185369, + "learning_rate": 4.980906946121173e-06, + "loss": 0.6194, + "step": 824 + }, + { + "epoch": 0.13384166125892277, + "grad_norm": 0.5990995299600678, + "learning_rate": 4.980854233923934e-06, + "loss": 0.6099, + "step": 825 + }, + { + "epoch": 0.13400389357560025, + "grad_norm": 0.5932286095338417, + "learning_rate": 4.9808014493426124e-06, + "loss": 0.6247, + "step": 826 + }, + { + "epoch": 0.13416612589227775, + "grad_norm": 0.5836143224549672, + "learning_rate": 4.980748592378749e-06, + "loss": 0.5897, + "step": 827 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 0.6924390230369861, + "learning_rate": 4.980695663033885e-06, + "loss": 0.5956, + "step": 828 + }, + { + "epoch": 0.1344905905256327, + "grad_norm": 0.6201441959179145, + "learning_rate": 4.980642661309564e-06, + "loss": 0.6304, + "step": 829 + }, + { + "epoch": 0.1346528228423102, + "grad_norm": 0.5879743010767488, + "learning_rate": 4.980589587207334e-06, + "loss": 0.603, + "step": 830 + }, + { + "epoch": 0.13481505515898767, + "grad_norm": 0.6333894547834047, + "learning_rate": 4.980536440728743e-06, + "loss": 0.6176, + "step": 831 + }, + { + "epoch": 0.13497728747566515, + "grad_norm": 0.6442082636619264, + "learning_rate": 4.980483221875342e-06, + "loss": 0.6324, + "step": 832 + }, + { + "epoch": 0.13513951979234262, + "grad_norm": 0.6049570516939456, + "learning_rate": 4.980429930648682e-06, + "loss": 0.6321, + "step": 833 + }, + { + "epoch": 0.13530175210902012, + "grad_norm": 0.6437990214818582, + "learning_rate": 4.9803765670503205e-06, + "loss": 0.596, + "step": 834 + }, + { + "epoch": 0.1354639844256976, + "grad_norm": 0.6341791990339494, + "learning_rate": 4.980323131081812e-06, + "loss": 0.5861, + "step": 835 + }, + { + "epoch": 0.13562621674237507, + "grad_norm": 0.5755205826934249, + "learning_rate": 4.980269622744718e-06, + "loss": 0.5724, + "step": 836 + }, + { + "epoch": 0.13578844905905257, + "grad_norm": 0.5879458172678952, + "learning_rate": 4.980216042040598e-06, + "loss": 0.5913, + "step": 837 + }, + { + "epoch": 0.13595068137573005, + "grad_norm": 0.6014482636290367, + "learning_rate": 4.9801623889710146e-06, + "loss": 0.6189, + "step": 838 + }, + { + "epoch": 0.13611291369240752, + "grad_norm": 0.5814630623988423, + "learning_rate": 4.980108663537536e-06, + "loss": 0.6329, + "step": 839 + }, + { + "epoch": 0.136275146009085, + "grad_norm": 0.6033391752756518, + "learning_rate": 4.980054865741728e-06, + "loss": 0.6376, + "step": 840 + }, + { + "epoch": 0.1364373783257625, + "grad_norm": 0.6206475702793178, + "learning_rate": 4.98000099558516e-06, + "loss": 0.6142, + "step": 841 + }, + { + "epoch": 0.13659961064243997, + "grad_norm": 0.6138760004031152, + "learning_rate": 4.979947053069405e-06, + "loss": 0.6429, + "step": 842 + }, + { + "epoch": 0.13676184295911745, + "grad_norm": 0.6055208728626289, + "learning_rate": 4.979893038196036e-06, + "loss": 0.6224, + "step": 843 + }, + { + "epoch": 0.13692407527579495, + "grad_norm": 0.5815046821082615, + "learning_rate": 4.979838950966629e-06, + "loss": 0.5946, + "step": 844 + }, + { + "epoch": 0.13708630759247242, + "grad_norm": 0.6301235185084472, + "learning_rate": 4.979784791382762e-06, + "loss": 0.6297, + "step": 845 + }, + { + "epoch": 0.1372485399091499, + "grad_norm": 0.6066101322480144, + "learning_rate": 4.9797305594460166e-06, + "loss": 0.5784, + "step": 846 + }, + { + "epoch": 0.13741077222582737, + "grad_norm": 0.6261944590062563, + "learning_rate": 4.979676255157972e-06, + "loss": 0.6079, + "step": 847 + }, + { + "epoch": 0.13757300454250487, + "grad_norm": 0.5919654780903161, + "learning_rate": 4.979621878520217e-06, + "loss": 0.6304, + "step": 848 + }, + { + "epoch": 0.13773523685918235, + "grad_norm": 0.5988060526052267, + "learning_rate": 4.979567429534335e-06, + "loss": 0.583, + "step": 849 + }, + { + "epoch": 0.13789746917585982, + "grad_norm": 0.619617375984754, + "learning_rate": 4.979512908201914e-06, + "loss": 0.6155, + "step": 850 + }, + { + "epoch": 0.13805970149253732, + "grad_norm": 0.6017448538538015, + "learning_rate": 4.979458314524548e-06, + "loss": 0.6021, + "step": 851 + }, + { + "epoch": 0.1382219338092148, + "grad_norm": 0.6254113250893821, + "learning_rate": 4.979403648503827e-06, + "loss": 0.6017, + "step": 852 + }, + { + "epoch": 0.13838416612589227, + "grad_norm": 0.627893331656738, + "learning_rate": 4.9793489101413474e-06, + "loss": 0.6434, + "step": 853 + }, + { + "epoch": 0.13854639844256977, + "grad_norm": 0.596259177147533, + "learning_rate": 4.979294099438706e-06, + "loss": 0.6286, + "step": 854 + }, + { + "epoch": 0.13870863075924725, + "grad_norm": 0.6239094421256801, + "learning_rate": 4.979239216397503e-06, + "loss": 0.6248, + "step": 855 + }, + { + "epoch": 0.13887086307592472, + "grad_norm": 0.6050393253568223, + "learning_rate": 4.979184261019338e-06, + "loss": 0.6307, + "step": 856 + }, + { + "epoch": 0.1390330953926022, + "grad_norm": 0.6367829406842651, + "learning_rate": 4.979129233305815e-06, + "loss": 0.614, + "step": 857 + }, + { + "epoch": 0.1391953277092797, + "grad_norm": 0.617635445152866, + "learning_rate": 4.97907413325854e-06, + "loss": 0.6192, + "step": 858 + }, + { + "epoch": 0.13935756002595717, + "grad_norm": 0.5959391457499478, + "learning_rate": 4.9790189608791205e-06, + "loss": 0.5937, + "step": 859 + }, + { + "epoch": 0.13951979234263465, + "grad_norm": 0.5931868725164852, + "learning_rate": 4.978963716169166e-06, + "loss": 0.6113, + "step": 860 + }, + { + "epoch": 0.13968202465931215, + "grad_norm": 0.6012387719965743, + "learning_rate": 4.978908399130289e-06, + "loss": 0.6003, + "step": 861 + }, + { + "epoch": 0.13984425697598962, + "grad_norm": 0.6054916207427669, + "learning_rate": 4.978853009764103e-06, + "loss": 0.6088, + "step": 862 + }, + { + "epoch": 0.1400064892926671, + "grad_norm": 0.6379191852332907, + "learning_rate": 4.978797548072224e-06, + "loss": 0.6426, + "step": 863 + }, + { + "epoch": 0.14016872160934457, + "grad_norm": 0.5666941249119275, + "learning_rate": 4.97874201405627e-06, + "loss": 0.6182, + "step": 864 + }, + { + "epoch": 0.14033095392602207, + "grad_norm": 0.5951418349522476, + "learning_rate": 4.978686407717862e-06, + "loss": 0.623, + "step": 865 + }, + { + "epoch": 0.14049318624269955, + "grad_norm": 0.6147139038166238, + "learning_rate": 4.9786307290586215e-06, + "loss": 0.6052, + "step": 866 + }, + { + "epoch": 0.14065541855937702, + "grad_norm": 0.609754831793361, + "learning_rate": 4.978574978080174e-06, + "loss": 0.6414, + "step": 867 + }, + { + "epoch": 0.14081765087605452, + "grad_norm": 0.6129820659504895, + "learning_rate": 4.978519154784146e-06, + "loss": 0.5974, + "step": 868 + }, + { + "epoch": 0.140979883192732, + "grad_norm": 0.649759018523284, + "learning_rate": 4.978463259172166e-06, + "loss": 0.6117, + "step": 869 + }, + { + "epoch": 0.14114211550940947, + "grad_norm": 0.6107681478668266, + "learning_rate": 4.978407291245866e-06, + "loss": 0.5965, + "step": 870 + }, + { + "epoch": 0.14130434782608695, + "grad_norm": 0.6068139822326235, + "learning_rate": 4.978351251006876e-06, + "loss": 0.6211, + "step": 871 + }, + { + "epoch": 0.14146658014276445, + "grad_norm": 0.6335920960505405, + "learning_rate": 4.978295138456835e-06, + "loss": 0.611, + "step": 872 + }, + { + "epoch": 0.14162881245944192, + "grad_norm": 0.6177484098607492, + "learning_rate": 4.978238953597376e-06, + "loss": 0.5968, + "step": 873 + }, + { + "epoch": 0.1417910447761194, + "grad_norm": 0.5987422795273865, + "learning_rate": 4.978182696430142e-06, + "loss": 0.5994, + "step": 874 + }, + { + "epoch": 0.1419532770927969, + "grad_norm": 0.6307945247603609, + "learning_rate": 4.978126366956773e-06, + "loss": 0.6292, + "step": 875 + }, + { + "epoch": 0.14211550940947437, + "grad_norm": 0.6372549946656031, + "learning_rate": 4.978069965178912e-06, + "loss": 0.6027, + "step": 876 + }, + { + "epoch": 0.14227774172615185, + "grad_norm": 0.588221870409263, + "learning_rate": 4.9780134910982045e-06, + "loss": 0.6048, + "step": 877 + }, + { + "epoch": 0.14243997404282932, + "grad_norm": 0.6385755474903104, + "learning_rate": 4.9779569447163e-06, + "loss": 0.6116, + "step": 878 + }, + { + "epoch": 0.14260220635950682, + "grad_norm": 0.6095957259292661, + "learning_rate": 4.977900326034847e-06, + "loss": 0.5963, + "step": 879 + }, + { + "epoch": 0.1427644386761843, + "grad_norm": 0.6201248559886144, + "learning_rate": 4.977843635055497e-06, + "loss": 0.6243, + "step": 880 + }, + { + "epoch": 0.14292667099286177, + "grad_norm": 0.6102720799354557, + "learning_rate": 4.977786871779905e-06, + "loss": 0.6454, + "step": 881 + }, + { + "epoch": 0.14308890330953927, + "grad_norm": 0.636392286211165, + "learning_rate": 4.977730036209727e-06, + "loss": 0.6268, + "step": 882 + }, + { + "epoch": 0.14325113562621675, + "grad_norm": 0.5675443558839023, + "learning_rate": 4.977673128346622e-06, + "loss": 0.6052, + "step": 883 + }, + { + "epoch": 0.14341336794289422, + "grad_norm": 0.6116446080641638, + "learning_rate": 4.977616148192249e-06, + "loss": 0.635, + "step": 884 + }, + { + "epoch": 0.1435756002595717, + "grad_norm": 0.620938258797265, + "learning_rate": 4.977559095748271e-06, + "loss": 0.5694, + "step": 885 + }, + { + "epoch": 0.1437378325762492, + "grad_norm": 0.6344261169269049, + "learning_rate": 4.977501971016353e-06, + "loss": 0.5799, + "step": 886 + }, + { + "epoch": 0.14390006489292667, + "grad_norm": 0.593988117149527, + "learning_rate": 4.977444773998161e-06, + "loss": 0.6308, + "step": 887 + }, + { + "epoch": 0.14406229720960415, + "grad_norm": 0.6150987539695634, + "learning_rate": 4.977387504695365e-06, + "loss": 0.6135, + "step": 888 + }, + { + "epoch": 0.14422452952628165, + "grad_norm": 0.5846657221538905, + "learning_rate": 4.977330163109635e-06, + "loss": 0.6139, + "step": 889 + }, + { + "epoch": 0.14438676184295912, + "grad_norm": 0.619617781961449, + "learning_rate": 4.977272749242645e-06, + "loss": 0.6021, + "step": 890 + }, + { + "epoch": 0.1445489941596366, + "grad_norm": 0.58086415902693, + "learning_rate": 4.977215263096069e-06, + "loss": 0.5975, + "step": 891 + }, + { + "epoch": 0.14471122647631407, + "grad_norm": 0.6128510972843537, + "learning_rate": 4.977157704671585e-06, + "loss": 0.5765, + "step": 892 + }, + { + "epoch": 0.14487345879299157, + "grad_norm": 0.6205609028819009, + "learning_rate": 4.977100073970872e-06, + "loss": 0.6562, + "step": 893 + }, + { + "epoch": 0.14503569110966905, + "grad_norm": 0.594167280165316, + "learning_rate": 4.977042370995612e-06, + "loss": 0.6263, + "step": 894 + }, + { + "epoch": 0.14519792342634652, + "grad_norm": 0.6000000728327816, + "learning_rate": 4.976984595747487e-06, + "loss": 0.5887, + "step": 895 + }, + { + "epoch": 0.14536015574302402, + "grad_norm": 0.6135275947066898, + "learning_rate": 4.976926748228185e-06, + "loss": 0.6192, + "step": 896 + }, + { + "epoch": 0.1455223880597015, + "grad_norm": 0.6163617100126443, + "learning_rate": 4.976868828439394e-06, + "loss": 0.56, + "step": 897 + }, + { + "epoch": 0.14568462037637897, + "grad_norm": 0.623175021120429, + "learning_rate": 4.9768108363828016e-06, + "loss": 0.6236, + "step": 898 + }, + { + "epoch": 0.14584685269305644, + "grad_norm": 0.6003827766921609, + "learning_rate": 4.976752772060101e-06, + "loss": 0.5991, + "step": 899 + }, + { + "epoch": 0.14600908500973395, + "grad_norm": 0.6006122988280549, + "learning_rate": 4.976694635472986e-06, + "loss": 0.6175, + "step": 900 + }, + { + "epoch": 0.14617131732641142, + "grad_norm": 0.6049825071019315, + "learning_rate": 4.976636426623153e-06, + "loss": 0.5876, + "step": 901 + }, + { + "epoch": 0.1463335496430889, + "grad_norm": 0.6200906844731876, + "learning_rate": 4.976578145512302e-06, + "loss": 0.5828, + "step": 902 + }, + { + "epoch": 0.1464957819597664, + "grad_norm": 0.5877737170418217, + "learning_rate": 4.976519792142132e-06, + "loss": 0.5708, + "step": 903 + }, + { + "epoch": 0.14665801427644387, + "grad_norm": 0.6016472103082066, + "learning_rate": 4.976461366514345e-06, + "loss": 0.6008, + "step": 904 + }, + { + "epoch": 0.14682024659312135, + "grad_norm": 0.5977130495202795, + "learning_rate": 4.976402868630645e-06, + "loss": 0.594, + "step": 905 + }, + { + "epoch": 0.14698247890979882, + "grad_norm": 0.6244681316806157, + "learning_rate": 4.9763442984927424e-06, + "loss": 0.6047, + "step": 906 + }, + { + "epoch": 0.14714471122647632, + "grad_norm": 0.6069698993063275, + "learning_rate": 4.9762856561023425e-06, + "loss": 0.6199, + "step": 907 + }, + { + "epoch": 0.1473069435431538, + "grad_norm": 0.5850502329646227, + "learning_rate": 4.976226941461158e-06, + "loss": 0.6294, + "step": 908 + }, + { + "epoch": 0.14746917585983127, + "grad_norm": 0.6101082874641243, + "learning_rate": 4.976168154570902e-06, + "loss": 0.64, + "step": 909 + }, + { + "epoch": 0.14763140817650877, + "grad_norm": 0.5783932483061993, + "learning_rate": 4.976109295433289e-06, + "loss": 0.6224, + "step": 910 + }, + { + "epoch": 0.14779364049318625, + "grad_norm": 0.5985120577013504, + "learning_rate": 4.976050364050036e-06, + "loss": 0.6176, + "step": 911 + }, + { + "epoch": 0.14795587280986372, + "grad_norm": 0.585961952108234, + "learning_rate": 4.975991360422864e-06, + "loss": 0.6081, + "step": 912 + }, + { + "epoch": 0.1481181051265412, + "grad_norm": 0.619404685807388, + "learning_rate": 4.975932284553494e-06, + "loss": 0.5893, + "step": 913 + }, + { + "epoch": 0.1482803374432187, + "grad_norm": 0.6273869391578534, + "learning_rate": 4.975873136443649e-06, + "loss": 0.5954, + "step": 914 + }, + { + "epoch": 0.14844256975989617, + "grad_norm": 0.5888205791545073, + "learning_rate": 4.975813916095055e-06, + "loss": 0.6158, + "step": 915 + }, + { + "epoch": 0.14860480207657364, + "grad_norm": 0.5822559858548441, + "learning_rate": 4.9757546235094405e-06, + "loss": 0.5936, + "step": 916 + }, + { + "epoch": 0.14876703439325115, + "grad_norm": 0.6507044850675124, + "learning_rate": 4.9756952586885345e-06, + "loss": 0.6072, + "step": 917 + }, + { + "epoch": 0.14892926670992862, + "grad_norm": 0.5964559908476156, + "learning_rate": 4.975635821634069e-06, + "loss": 0.5688, + "step": 918 + }, + { + "epoch": 0.1490914990266061, + "grad_norm": 0.6219518019414724, + "learning_rate": 4.97557631234778e-06, + "loss": 0.5999, + "step": 919 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.6530567149789034, + "learning_rate": 4.975516730831402e-06, + "loss": 0.5934, + "step": 920 + }, + { + "epoch": 0.14941596365996107, + "grad_norm": 0.6317465377783246, + "learning_rate": 4.9754570770866744e-06, + "loss": 0.6316, + "step": 921 + }, + { + "epoch": 0.14957819597663854, + "grad_norm": 0.620683087946778, + "learning_rate": 4.975397351115337e-06, + "loss": 0.6403, + "step": 922 + }, + { + "epoch": 0.14974042829331602, + "grad_norm": 0.6404170993364097, + "learning_rate": 4.9753375529191335e-06, + "loss": 0.5815, + "step": 923 + }, + { + "epoch": 0.14990266060999352, + "grad_norm": 0.6225004103898745, + "learning_rate": 4.975277682499808e-06, + "loss": 0.6262, + "step": 924 + }, + { + "epoch": 0.150064892926671, + "grad_norm": 0.616079600773955, + "learning_rate": 4.975217739859106e-06, + "loss": 0.5968, + "step": 925 + }, + { + "epoch": 0.15022712524334847, + "grad_norm": 0.5757356816145729, + "learning_rate": 4.975157724998778e-06, + "loss": 0.6077, + "step": 926 + }, + { + "epoch": 0.15038935756002594, + "grad_norm": 0.5784571813671191, + "learning_rate": 4.975097637920575e-06, + "loss": 0.6334, + "step": 927 + }, + { + "epoch": 0.15055158987670345, + "grad_norm": 0.575833542377451, + "learning_rate": 4.975037478626251e-06, + "loss": 0.5994, + "step": 928 + }, + { + "epoch": 0.15071382219338092, + "grad_norm": 0.5636209106938298, + "learning_rate": 4.974977247117559e-06, + "loss": 0.6055, + "step": 929 + }, + { + "epoch": 0.1508760545100584, + "grad_norm": 0.598465508823024, + "learning_rate": 4.974916943396257e-06, + "loss": 0.5852, + "step": 930 + }, + { + "epoch": 0.1510382868267359, + "grad_norm": 0.5875482483355983, + "learning_rate": 4.974856567464107e-06, + "loss": 0.5998, + "step": 931 + }, + { + "epoch": 0.15120051914341337, + "grad_norm": 0.5743446576746822, + "learning_rate": 4.974796119322868e-06, + "loss": 0.6042, + "step": 932 + }, + { + "epoch": 0.15136275146009084, + "grad_norm": 0.6089172407867584, + "learning_rate": 4.974735598974304e-06, + "loss": 0.607, + "step": 933 + }, + { + "epoch": 0.15152498377676835, + "grad_norm": 0.6184893192762578, + "learning_rate": 4.974675006420181e-06, + "loss": 0.5772, + "step": 934 + }, + { + "epoch": 0.15168721609344582, + "grad_norm": 0.6288575100066697, + "learning_rate": 4.974614341662267e-06, + "loss": 0.6228, + "step": 935 + }, + { + "epoch": 0.1518494484101233, + "grad_norm": 0.5979512218032039, + "learning_rate": 4.974553604702332e-06, + "loss": 0.6048, + "step": 936 + }, + { + "epoch": 0.15201168072680077, + "grad_norm": 0.6401374901192448, + "learning_rate": 4.974492795542148e-06, + "loss": 0.6239, + "step": 937 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 0.5883901901752558, + "learning_rate": 4.974431914183491e-06, + "loss": 0.6267, + "step": 938 + }, + { + "epoch": 0.15233614536015574, + "grad_norm": 0.6142387224431242, + "learning_rate": 4.974370960628135e-06, + "loss": 0.5761, + "step": 939 + }, + { + "epoch": 0.15249837767683322, + "grad_norm": 0.6115890669604067, + "learning_rate": 4.974309934877859e-06, + "loss": 0.6305, + "step": 940 + }, + { + "epoch": 0.15266060999351072, + "grad_norm": 0.6520371382461403, + "learning_rate": 4.9742488369344445e-06, + "loss": 0.6165, + "step": 941 + }, + { + "epoch": 0.1528228423101882, + "grad_norm": 0.6074719117739136, + "learning_rate": 4.9741876667996725e-06, + "loss": 0.6074, + "step": 942 + }, + { + "epoch": 0.15298507462686567, + "grad_norm": 0.5880498420391725, + "learning_rate": 4.974126424475329e-06, + "loss": 0.5864, + "step": 943 + }, + { + "epoch": 0.15314730694354314, + "grad_norm": 0.6026432266168775, + "learning_rate": 4.974065109963201e-06, + "loss": 0.6265, + "step": 944 + }, + { + "epoch": 0.15330953926022065, + "grad_norm": 0.6020716346864958, + "learning_rate": 4.974003723265077e-06, + "loss": 0.5938, + "step": 945 + }, + { + "epoch": 0.15347177157689812, + "grad_norm": 0.6485160440036423, + "learning_rate": 4.973942264382748e-06, + "loss": 0.6238, + "step": 946 + }, + { + "epoch": 0.1536340038935756, + "grad_norm": 0.5985323926802604, + "learning_rate": 4.973880733318007e-06, + "loss": 0.6081, + "step": 947 + }, + { + "epoch": 0.1537962362102531, + "grad_norm": 0.6090121300282543, + "learning_rate": 4.97381913007265e-06, + "loss": 0.6273, + "step": 948 + }, + { + "epoch": 0.15395846852693057, + "grad_norm": 0.6022760754169411, + "learning_rate": 4.973757454648475e-06, + "loss": 0.6199, + "step": 949 + }, + { + "epoch": 0.15412070084360804, + "grad_norm": 0.6169571436987275, + "learning_rate": 4.973695707047279e-06, + "loss": 0.5995, + "step": 950 + }, + { + "epoch": 0.15428293316028552, + "grad_norm": 0.5893222968144365, + "learning_rate": 4.973633887270866e-06, + "loss": 0.6151, + "step": 951 + }, + { + "epoch": 0.15444516547696302, + "grad_norm": 0.6408781962766349, + "learning_rate": 4.973571995321039e-06, + "loss": 0.5746, + "step": 952 + }, + { + "epoch": 0.1546073977936405, + "grad_norm": 0.6086350001063424, + "learning_rate": 4.973510031199603e-06, + "loss": 0.6323, + "step": 953 + }, + { + "epoch": 0.15476963011031797, + "grad_norm": 0.6135368671674201, + "learning_rate": 4.973447994908368e-06, + "loss": 0.6529, + "step": 954 + }, + { + "epoch": 0.15493186242699547, + "grad_norm": 0.6238060258890813, + "learning_rate": 4.973385886449141e-06, + "loss": 0.6523, + "step": 955 + }, + { + "epoch": 0.15509409474367294, + "grad_norm": 0.7227128527761003, + "learning_rate": 4.973323705823737e-06, + "loss": 0.6355, + "step": 956 + }, + { + "epoch": 0.15525632706035042, + "grad_norm": 0.6246442729374666, + "learning_rate": 4.9732614530339695e-06, + "loss": 0.6674, + "step": 957 + }, + { + "epoch": 0.1554185593770279, + "grad_norm": 0.6053386715406172, + "learning_rate": 4.9731991280816534e-06, + "loss": 0.6102, + "step": 958 + }, + { + "epoch": 0.1555807916937054, + "grad_norm": 0.6173056650525708, + "learning_rate": 4.973136730968608e-06, + "loss": 0.6367, + "step": 959 + }, + { + "epoch": 0.15574302401038287, + "grad_norm": 0.6399891954140845, + "learning_rate": 4.973074261696655e-06, + "loss": 0.6, + "step": 960 + }, + { + "epoch": 0.15590525632706034, + "grad_norm": 0.6286037296566651, + "learning_rate": 4.9730117202676165e-06, + "loss": 0.6269, + "step": 961 + }, + { + "epoch": 0.15606748864373785, + "grad_norm": 0.6310173207149398, + "learning_rate": 4.972949106683316e-06, + "loss": 0.6075, + "step": 962 + }, + { + "epoch": 0.15622972096041532, + "grad_norm": 0.6111292649368333, + "learning_rate": 4.9728864209455816e-06, + "loss": 0.6185, + "step": 963 + }, + { + "epoch": 0.1563919532770928, + "grad_norm": 0.6035506057907128, + "learning_rate": 4.972823663056242e-06, + "loss": 0.6229, + "step": 964 + }, + { + "epoch": 0.15655418559377027, + "grad_norm": 0.6242945462885627, + "learning_rate": 4.9727608330171285e-06, + "loss": 0.5905, + "step": 965 + }, + { + "epoch": 0.15671641791044777, + "grad_norm": 0.6357161773111434, + "learning_rate": 4.972697930830073e-06, + "loss": 0.5863, + "step": 966 + }, + { + "epoch": 0.15687865022712524, + "grad_norm": 0.6775448791079173, + "learning_rate": 4.972634956496913e-06, + "loss": 0.616, + "step": 967 + }, + { + "epoch": 0.15704088254380272, + "grad_norm": 0.6059480666203664, + "learning_rate": 4.972571910019485e-06, + "loss": 0.5922, + "step": 968 + }, + { + "epoch": 0.15720311486048022, + "grad_norm": 0.6140915783027491, + "learning_rate": 4.972508791399628e-06, + "loss": 0.6325, + "step": 969 + }, + { + "epoch": 0.1573653471771577, + "grad_norm": 0.5957057484120275, + "learning_rate": 4.972445600639183e-06, + "loss": 0.6365, + "step": 970 + }, + { + "epoch": 0.15752757949383517, + "grad_norm": 0.6110673248508176, + "learning_rate": 4.972382337739996e-06, + "loss": 0.6329, + "step": 971 + }, + { + "epoch": 0.15768981181051264, + "grad_norm": 0.5985995084602003, + "learning_rate": 4.9723190027039106e-06, + "loss": 0.6337, + "step": 972 + }, + { + "epoch": 0.15785204412719014, + "grad_norm": 0.5828655867605322, + "learning_rate": 4.972255595532776e-06, + "loss": 0.6327, + "step": 973 + }, + { + "epoch": 0.15801427644386762, + "grad_norm": 0.5797887995112816, + "learning_rate": 4.972192116228442e-06, + "loss": 0.5977, + "step": 974 + }, + { + "epoch": 0.1581765087605451, + "grad_norm": 0.568899779179041, + "learning_rate": 4.972128564792759e-06, + "loss": 0.6202, + "step": 975 + }, + { + "epoch": 0.1583387410772226, + "grad_norm": 0.5997255915415748, + "learning_rate": 4.972064941227585e-06, + "loss": 0.6157, + "step": 976 + }, + { + "epoch": 0.15850097339390007, + "grad_norm": 0.5914090349224548, + "learning_rate": 4.972001245534774e-06, + "loss": 0.6586, + "step": 977 + }, + { + "epoch": 0.15866320571057754, + "grad_norm": 0.6300820369825929, + "learning_rate": 4.971937477716183e-06, + "loss": 0.5784, + "step": 978 + }, + { + "epoch": 0.15882543802725502, + "grad_norm": 0.6085900690560968, + "learning_rate": 4.971873637773675e-06, + "loss": 0.5869, + "step": 979 + }, + { + "epoch": 0.15898767034393252, + "grad_norm": 0.6311972870933331, + "learning_rate": 4.971809725709112e-06, + "loss": 0.5978, + "step": 980 + }, + { + "epoch": 0.15914990266061, + "grad_norm": 0.5701924320254436, + "learning_rate": 4.971745741524358e-06, + "loss": 0.5879, + "step": 981 + }, + { + "epoch": 0.15931213497728747, + "grad_norm": 0.6039136592281663, + "learning_rate": 4.971681685221282e-06, + "loss": 0.5988, + "step": 982 + }, + { + "epoch": 0.15947436729396497, + "grad_norm": 0.6530018151155735, + "learning_rate": 4.97161755680175e-06, + "loss": 0.6416, + "step": 983 + }, + { + "epoch": 0.15963659961064244, + "grad_norm": 0.5558678217790901, + "learning_rate": 4.971553356267635e-06, + "loss": 0.5682, + "step": 984 + }, + { + "epoch": 0.15979883192731992, + "grad_norm": 0.6092817796791554, + "learning_rate": 4.97148908362081e-06, + "loss": 0.6182, + "step": 985 + }, + { + "epoch": 0.1599610642439974, + "grad_norm": 0.5994848019853191, + "learning_rate": 4.9714247388631496e-06, + "loss": 0.5742, + "step": 986 + }, + { + "epoch": 0.1601232965606749, + "grad_norm": 0.576429982285585, + "learning_rate": 4.971360321996532e-06, + "loss": 0.6092, + "step": 987 + }, + { + "epoch": 0.16028552887735237, + "grad_norm": 0.6013954660509494, + "learning_rate": 4.971295833022836e-06, + "loss": 0.6348, + "step": 988 + }, + { + "epoch": 0.16044776119402984, + "grad_norm": 0.6088993039158622, + "learning_rate": 4.971231271943944e-06, + "loss": 0.5978, + "step": 989 + }, + { + "epoch": 0.16060999351070734, + "grad_norm": 0.599619789248101, + "learning_rate": 4.971166638761739e-06, + "loss": 0.6214, + "step": 990 + }, + { + "epoch": 0.16077222582738482, + "grad_norm": 0.5775741083718705, + "learning_rate": 4.971101933478106e-06, + "loss": 0.6041, + "step": 991 + }, + { + "epoch": 0.1609344581440623, + "grad_norm": 0.5860652349147724, + "learning_rate": 4.971037156094935e-06, + "loss": 0.6286, + "step": 992 + }, + { + "epoch": 0.16109669046073977, + "grad_norm": 0.5955534492841615, + "learning_rate": 4.970972306614115e-06, + "loss": 0.6192, + "step": 993 + }, + { + "epoch": 0.16125892277741727, + "grad_norm": 0.5823876472721123, + "learning_rate": 4.970907385037537e-06, + "loss": 0.6142, + "step": 994 + }, + { + "epoch": 0.16142115509409474, + "grad_norm": 0.6117864687241432, + "learning_rate": 4.970842391367097e-06, + "loss": 0.6238, + "step": 995 + }, + { + "epoch": 0.16158338741077222, + "grad_norm": 0.5694510652360035, + "learning_rate": 4.970777325604691e-06, + "loss": 0.5821, + "step": 996 + }, + { + "epoch": 0.16174561972744972, + "grad_norm": 0.6491076177096823, + "learning_rate": 4.970712187752217e-06, + "loss": 0.6548, + "step": 997 + }, + { + "epoch": 0.1619078520441272, + "grad_norm": 0.6223858894592356, + "learning_rate": 4.970646977811575e-06, + "loss": 0.5989, + "step": 998 + }, + { + "epoch": 0.16207008436080467, + "grad_norm": 0.6231390624722054, + "learning_rate": 4.970581695784668e-06, + "loss": 0.6305, + "step": 999 + }, + { + "epoch": 0.16223231667748214, + "grad_norm": 0.6226175694200908, + "learning_rate": 4.970516341673401e-06, + "loss": 0.5839, + "step": 1000 + }, + { + "epoch": 0.16239454899415964, + "grad_norm": 0.5986019257103152, + "learning_rate": 4.970450915479682e-06, + "loss": 0.5738, + "step": 1001 + }, + { + "epoch": 0.16255678131083712, + "grad_norm": 0.5864293773995689, + "learning_rate": 4.970385417205418e-06, + "loss": 0.6182, + "step": 1002 + }, + { + "epoch": 0.1627190136275146, + "grad_norm": 0.5913557297619039, + "learning_rate": 4.970319846852521e-06, + "loss": 0.5809, + "step": 1003 + }, + { + "epoch": 0.1628812459441921, + "grad_norm": 0.604971392467276, + "learning_rate": 4.970254204422903e-06, + "loss": 0.5954, + "step": 1004 + }, + { + "epoch": 0.16304347826086957, + "grad_norm": 0.6019148025711034, + "learning_rate": 4.9701884899184805e-06, + "loss": 0.6026, + "step": 1005 + }, + { + "epoch": 0.16320571057754704, + "grad_norm": 0.6163994340310645, + "learning_rate": 4.970122703341171e-06, + "loss": 0.6407, + "step": 1006 + }, + { + "epoch": 0.16336794289422452, + "grad_norm": 0.5937582212853106, + "learning_rate": 4.970056844692893e-06, + "loss": 0.5857, + "step": 1007 + }, + { + "epoch": 0.16353017521090202, + "grad_norm": 0.7082910843359993, + "learning_rate": 4.96999091397557e-06, + "loss": 0.5995, + "step": 1008 + }, + { + "epoch": 0.1636924075275795, + "grad_norm": 0.5653737104110389, + "learning_rate": 4.969924911191122e-06, + "loss": 0.5757, + "step": 1009 + }, + { + "epoch": 0.16385463984425697, + "grad_norm": 0.5951730230142602, + "learning_rate": 4.969858836341479e-06, + "loss": 0.6435, + "step": 1010 + }, + { + "epoch": 0.16401687216093447, + "grad_norm": 0.6707364680103735, + "learning_rate": 4.969792689428565e-06, + "loss": 0.626, + "step": 1011 + }, + { + "epoch": 0.16417910447761194, + "grad_norm": 0.5955448629923231, + "learning_rate": 4.9697264704543135e-06, + "loss": 0.5909, + "step": 1012 + }, + { + "epoch": 0.16434133679428942, + "grad_norm": 0.6331141388421108, + "learning_rate": 4.969660179420654e-06, + "loss": 0.5953, + "step": 1013 + }, + { + "epoch": 0.16450356911096692, + "grad_norm": 0.6161337530751133, + "learning_rate": 4.969593816329522e-06, + "loss": 0.6037, + "step": 1014 + }, + { + "epoch": 0.1646658014276444, + "grad_norm": 0.6069061938742474, + "learning_rate": 4.969527381182853e-06, + "loss": 0.5925, + "step": 1015 + }, + { + "epoch": 0.16482803374432187, + "grad_norm": 0.6236182869110838, + "learning_rate": 4.9694608739825865e-06, + "loss": 0.6149, + "step": 1016 + }, + { + "epoch": 0.16499026606099934, + "grad_norm": 0.61163961550965, + "learning_rate": 4.969394294730662e-06, + "loss": 0.6188, + "step": 1017 + }, + { + "epoch": 0.16515249837767684, + "grad_norm": 0.6134340854916733, + "learning_rate": 4.969327643429022e-06, + "loss": 0.5914, + "step": 1018 + }, + { + "epoch": 0.16531473069435432, + "grad_norm": 0.608029506208964, + "learning_rate": 4.969260920079611e-06, + "loss": 0.6015, + "step": 1019 + }, + { + "epoch": 0.1654769630110318, + "grad_norm": 0.6065541300611998, + "learning_rate": 4.9691941246843764e-06, + "loss": 0.622, + "step": 1020 + }, + { + "epoch": 0.1656391953277093, + "grad_norm": 0.5717513771884877, + "learning_rate": 4.969127257245268e-06, + "loss": 0.5373, + "step": 1021 + }, + { + "epoch": 0.16580142764438677, + "grad_norm": 0.5954748280178684, + "learning_rate": 4.969060317764236e-06, + "loss": 0.5944, + "step": 1022 + }, + { + "epoch": 0.16596365996106424, + "grad_norm": 0.6271906299272371, + "learning_rate": 4.9689933062432326e-06, + "loss": 0.6344, + "step": 1023 + }, + { + "epoch": 0.16612589227774172, + "grad_norm": 0.5994987575519337, + "learning_rate": 4.968926222684213e-06, + "loss": 0.6359, + "step": 1024 + }, + { + "epoch": 0.16628812459441922, + "grad_norm": 0.5791816174339127, + "learning_rate": 4.9688590670891365e-06, + "loss": 0.6081, + "step": 1025 + }, + { + "epoch": 0.1664503569110967, + "grad_norm": 0.6033124206748912, + "learning_rate": 4.96879183945996e-06, + "loss": 0.6119, + "step": 1026 + }, + { + "epoch": 0.16661258922777417, + "grad_norm": 0.6168529546882716, + "learning_rate": 4.968724539798648e-06, + "loss": 0.6041, + "step": 1027 + }, + { + "epoch": 0.16677482154445167, + "grad_norm": 0.6153954542859396, + "learning_rate": 4.968657168107161e-06, + "loss": 0.6192, + "step": 1028 + }, + { + "epoch": 0.16693705386112914, + "grad_norm": 0.6527052573489793, + "learning_rate": 4.968589724387466e-06, + "loss": 0.5723, + "step": 1029 + }, + { + "epoch": 0.16709928617780662, + "grad_norm": 0.6039340122868346, + "learning_rate": 4.968522208641531e-06, + "loss": 0.5876, + "step": 1030 + }, + { + "epoch": 0.1672615184944841, + "grad_norm": 0.5967571123404928, + "learning_rate": 4.968454620871326e-06, + "loss": 0.6179, + "step": 1031 + }, + { + "epoch": 0.1674237508111616, + "grad_norm": 0.6218467076640427, + "learning_rate": 4.9683869610788225e-06, + "loss": 0.5992, + "step": 1032 + }, + { + "epoch": 0.16758598312783907, + "grad_norm": 0.6236635181125703, + "learning_rate": 4.968319229265995e-06, + "loss": 0.5853, + "step": 1033 + }, + { + "epoch": 0.16774821544451654, + "grad_norm": 0.6324008073656485, + "learning_rate": 4.96825142543482e-06, + "loss": 0.6108, + "step": 1034 + }, + { + "epoch": 0.16791044776119404, + "grad_norm": 0.6099554961918725, + "learning_rate": 4.9681835495872755e-06, + "loss": 0.6176, + "step": 1035 + }, + { + "epoch": 0.16807268007787152, + "grad_norm": 0.6084635673858839, + "learning_rate": 4.9681156017253415e-06, + "loss": 0.6073, + "step": 1036 + }, + { + "epoch": 0.168234912394549, + "grad_norm": 0.5961555269786262, + "learning_rate": 4.968047581851001e-06, + "loss": 0.6194, + "step": 1037 + }, + { + "epoch": 0.16839714471122647, + "grad_norm": 0.6135739027003293, + "learning_rate": 4.967979489966238e-06, + "loss": 0.5914, + "step": 1038 + }, + { + "epoch": 0.16855937702790397, + "grad_norm": 0.6464713024726698, + "learning_rate": 4.967911326073041e-06, + "loss": 0.5865, + "step": 1039 + }, + { + "epoch": 0.16872160934458144, + "grad_norm": 0.586477425756855, + "learning_rate": 4.967843090173396e-06, + "loss": 0.5919, + "step": 1040 + }, + { + "epoch": 0.16888384166125892, + "grad_norm": 0.5572184042372879, + "learning_rate": 4.967774782269296e-06, + "loss": 0.5991, + "step": 1041 + }, + { + "epoch": 0.16904607397793642, + "grad_norm": 0.6298887693439229, + "learning_rate": 4.967706402362733e-06, + "loss": 0.6275, + "step": 1042 + }, + { + "epoch": 0.1692083062946139, + "grad_norm": 0.599387918122626, + "learning_rate": 4.967637950455704e-06, + "loss": 0.645, + "step": 1043 + }, + { + "epoch": 0.16937053861129137, + "grad_norm": 0.6439116173096137, + "learning_rate": 4.967569426550204e-06, + "loss": 0.5983, + "step": 1044 + }, + { + "epoch": 0.16953277092796884, + "grad_norm": 0.6446035942848656, + "learning_rate": 4.967500830648233e-06, + "loss": 0.5986, + "step": 1045 + }, + { + "epoch": 0.16969500324464634, + "grad_norm": 0.5833716654050279, + "learning_rate": 4.967432162751792e-06, + "loss": 0.5933, + "step": 1046 + }, + { + "epoch": 0.16985723556132382, + "grad_norm": 0.6038771446061685, + "learning_rate": 4.967363422862887e-06, + "loss": 0.6295, + "step": 1047 + }, + { + "epoch": 0.1700194678780013, + "grad_norm": 0.6196868553582501, + "learning_rate": 4.96729461098352e-06, + "loss": 0.6508, + "step": 1048 + }, + { + "epoch": 0.1701817001946788, + "grad_norm": 0.5850743994477787, + "learning_rate": 4.967225727115701e-06, + "loss": 0.6249, + "step": 1049 + }, + { + "epoch": 0.17034393251135627, + "grad_norm": 0.6102654632364496, + "learning_rate": 4.96715677126144e-06, + "loss": 0.6209, + "step": 1050 + }, + { + "epoch": 0.17050616482803374, + "grad_norm": 0.5954346831249587, + "learning_rate": 4.967087743422747e-06, + "loss": 0.6008, + "step": 1051 + }, + { + "epoch": 0.17066839714471121, + "grad_norm": 0.5895376452135016, + "learning_rate": 4.967018643601637e-06, + "loss": 0.5916, + "step": 1052 + }, + { + "epoch": 0.17083062946138872, + "grad_norm": 0.604869962127835, + "learning_rate": 4.966949471800128e-06, + "loss": 0.6197, + "step": 1053 + }, + { + "epoch": 0.1709928617780662, + "grad_norm": 0.5962773922505602, + "learning_rate": 4.966880228020237e-06, + "loss": 0.6143, + "step": 1054 + }, + { + "epoch": 0.17115509409474366, + "grad_norm": 0.6137509248321833, + "learning_rate": 4.9668109122639814e-06, + "loss": 0.6321, + "step": 1055 + }, + { + "epoch": 0.17131732641142117, + "grad_norm": 0.6062074739925668, + "learning_rate": 4.966741524533388e-06, + "loss": 0.612, + "step": 1056 + }, + { + "epoch": 0.17147955872809864, + "grad_norm": 0.6231509396014376, + "learning_rate": 4.96667206483048e-06, + "loss": 0.6233, + "step": 1057 + }, + { + "epoch": 0.17164179104477612, + "grad_norm": 0.5816457147164606, + "learning_rate": 4.966602533157283e-06, + "loss": 0.613, + "step": 1058 + }, + { + "epoch": 0.1718040233614536, + "grad_norm": 0.6170563380299164, + "learning_rate": 4.966532929515825e-06, + "loss": 0.5528, + "step": 1059 + }, + { + "epoch": 0.1719662556781311, + "grad_norm": 0.6037434000490827, + "learning_rate": 4.96646325390814e-06, + "loss": 0.6057, + "step": 1060 + }, + { + "epoch": 0.17212848799480857, + "grad_norm": 0.6333455823071592, + "learning_rate": 4.966393506336258e-06, + "loss": 0.5874, + "step": 1061 + }, + { + "epoch": 0.17229072031148604, + "grad_norm": 0.6163379184592133, + "learning_rate": 4.9663236868022155e-06, + "loss": 0.6181, + "step": 1062 + }, + { + "epoch": 0.17245295262816354, + "grad_norm": 0.6377330927048187, + "learning_rate": 4.966253795308049e-06, + "loss": 0.6102, + "step": 1063 + }, + { + "epoch": 0.17261518494484102, + "grad_norm": 0.5733462265964931, + "learning_rate": 4.9661838318557975e-06, + "loss": 0.6252, + "step": 1064 + }, + { + "epoch": 0.1727774172615185, + "grad_norm": 0.590176080017382, + "learning_rate": 4.966113796447504e-06, + "loss": 0.6083, + "step": 1065 + }, + { + "epoch": 0.17293964957819596, + "grad_norm": 0.5922442488209292, + "learning_rate": 4.966043689085209e-06, + "loss": 0.6246, + "step": 1066 + }, + { + "epoch": 0.17310188189487347, + "grad_norm": 0.5803120252129362, + "learning_rate": 4.965973509770962e-06, + "loss": 0.6308, + "step": 1067 + }, + { + "epoch": 0.17326411421155094, + "grad_norm": 0.5996856737845271, + "learning_rate": 4.965903258506806e-06, + "loss": 0.607, + "step": 1068 + }, + { + "epoch": 0.17342634652822841, + "grad_norm": 0.5869326947091971, + "learning_rate": 4.965832935294794e-06, + "loss": 0.6016, + "step": 1069 + }, + { + "epoch": 0.17358857884490592, + "grad_norm": 0.5554016181071924, + "learning_rate": 4.965762540136977e-06, + "loss": 0.5699, + "step": 1070 + }, + { + "epoch": 0.1737508111615834, + "grad_norm": 0.5962723614467997, + "learning_rate": 4.9656920730354095e-06, + "loss": 0.6291, + "step": 1071 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.6106461755553269, + "learning_rate": 4.965621533992146e-06, + "loss": 0.6405, + "step": 1072 + }, + { + "epoch": 0.17407527579493834, + "grad_norm": 0.5783125617216769, + "learning_rate": 4.965550923009246e-06, + "loss": 0.6046, + "step": 1073 + }, + { + "epoch": 0.17423750811161584, + "grad_norm": 0.6165268802157555, + "learning_rate": 4.965480240088769e-06, + "loss": 0.6203, + "step": 1074 + }, + { + "epoch": 0.17439974042829332, + "grad_norm": 0.5774448761075495, + "learning_rate": 4.965409485232777e-06, + "loss": 0.5958, + "step": 1075 + }, + { + "epoch": 0.1745619727449708, + "grad_norm": 0.5954639540032611, + "learning_rate": 4.965338658443335e-06, + "loss": 0.5968, + "step": 1076 + }, + { + "epoch": 0.1747242050616483, + "grad_norm": 0.6328206143092653, + "learning_rate": 4.965267759722511e-06, + "loss": 0.6203, + "step": 1077 + }, + { + "epoch": 0.17488643737832577, + "grad_norm": 0.6115055513702519, + "learning_rate": 4.965196789072371e-06, + "loss": 0.6181, + "step": 1078 + }, + { + "epoch": 0.17504866969500324, + "grad_norm": 0.6019924994059606, + "learning_rate": 4.965125746494986e-06, + "loss": 0.6196, + "step": 1079 + }, + { + "epoch": 0.1752109020116807, + "grad_norm": 0.625354904256166, + "learning_rate": 4.96505463199243e-06, + "loss": 0.6315, + "step": 1080 + }, + { + "epoch": 0.17537313432835822, + "grad_norm": 0.5983619714554361, + "learning_rate": 4.964983445566778e-06, + "loss": 0.6157, + "step": 1081 + }, + { + "epoch": 0.1755353666450357, + "grad_norm": 0.6420316804610261, + "learning_rate": 4.9649121872201065e-06, + "loss": 0.6046, + "step": 1082 + }, + { + "epoch": 0.17569759896171316, + "grad_norm": 0.6172355335525843, + "learning_rate": 4.964840856954495e-06, + "loss": 0.6198, + "step": 1083 + }, + { + "epoch": 0.17585983127839067, + "grad_norm": 0.6315640420192988, + "learning_rate": 4.964769454772024e-06, + "loss": 0.5877, + "step": 1084 + }, + { + "epoch": 0.17602206359506814, + "grad_norm": 0.6289074725518355, + "learning_rate": 4.964697980674776e-06, + "loss": 0.6717, + "step": 1085 + }, + { + "epoch": 0.17618429591174561, + "grad_norm": 0.62789918164221, + "learning_rate": 4.964626434664839e-06, + "loss": 0.6357, + "step": 1086 + }, + { + "epoch": 0.1763465282284231, + "grad_norm": 0.6389873306556212, + "learning_rate": 4.964554816744299e-06, + "loss": 0.5872, + "step": 1087 + }, + { + "epoch": 0.1765087605451006, + "grad_norm": 0.6001707626788508, + "learning_rate": 4.964483126915246e-06, + "loss": 0.5551, + "step": 1088 + }, + { + "epoch": 0.17667099286177806, + "grad_norm": 0.5667051653129146, + "learning_rate": 4.96441136517977e-06, + "loss": 0.6016, + "step": 1089 + }, + { + "epoch": 0.17683322517845554, + "grad_norm": 0.5773619823869277, + "learning_rate": 4.964339531539967e-06, + "loss": 0.6109, + "step": 1090 + }, + { + "epoch": 0.17699545749513304, + "grad_norm": 0.6112366536824503, + "learning_rate": 4.9642676259979325e-06, + "loss": 0.6238, + "step": 1091 + }, + { + "epoch": 0.17715768981181051, + "grad_norm": 0.6357370567014374, + "learning_rate": 4.964195648555763e-06, + "loss": 0.6202, + "step": 1092 + }, + { + "epoch": 0.177319922128488, + "grad_norm": 0.6232202103470114, + "learning_rate": 4.96412359921556e-06, + "loss": 0.6158, + "step": 1093 + }, + { + "epoch": 0.17748215444516546, + "grad_norm": 0.590371266393222, + "learning_rate": 4.9640514779794256e-06, + "loss": 0.5985, + "step": 1094 + }, + { + "epoch": 0.17764438676184297, + "grad_norm": 0.6292683025031878, + "learning_rate": 4.963979284849464e-06, + "loss": 0.6241, + "step": 1095 + }, + { + "epoch": 0.17780661907852044, + "grad_norm": 0.5898640822386814, + "learning_rate": 4.963907019827782e-06, + "loss": 0.6509, + "step": 1096 + }, + { + "epoch": 0.1779688513951979, + "grad_norm": 0.5917063360619585, + "learning_rate": 4.963834682916486e-06, + "loss": 0.5832, + "step": 1097 + }, + { + "epoch": 0.17813108371187542, + "grad_norm": 0.5886826031854161, + "learning_rate": 4.963762274117689e-06, + "loss": 0.5993, + "step": 1098 + }, + { + "epoch": 0.1782933160285529, + "grad_norm": 0.6055994259366927, + "learning_rate": 4.963689793433502e-06, + "loss": 0.5977, + "step": 1099 + }, + { + "epoch": 0.17845554834523036, + "grad_norm": 0.5922305357499035, + "learning_rate": 4.963617240866041e-06, + "loss": 0.5984, + "step": 1100 + }, + { + "epoch": 0.17861778066190787, + "grad_norm": 0.5956274196187563, + "learning_rate": 4.963544616417423e-06, + "loss": 0.5646, + "step": 1101 + }, + { + "epoch": 0.17878001297858534, + "grad_norm": 0.609658090172767, + "learning_rate": 4.963471920089767e-06, + "loss": 0.6067, + "step": 1102 + }, + { + "epoch": 0.17894224529526281, + "grad_norm": 0.5858191672518961, + "learning_rate": 4.963399151885191e-06, + "loss": 0.5892, + "step": 1103 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 0.5667585323608033, + "learning_rate": 4.963326311805822e-06, + "loss": 0.6253, + "step": 1104 + }, + { + "epoch": 0.1792667099286178, + "grad_norm": 0.5809036433884162, + "learning_rate": 4.963253399853783e-06, + "loss": 0.6291, + "step": 1105 + }, + { + "epoch": 0.17942894224529526, + "grad_norm": 0.5982831835233255, + "learning_rate": 4.963180416031203e-06, + "loss": 0.623, + "step": 1106 + }, + { + "epoch": 0.17959117456197274, + "grad_norm": 0.6077636480056947, + "learning_rate": 4.963107360340211e-06, + "loss": 0.605, + "step": 1107 + }, + { + "epoch": 0.17975340687865024, + "grad_norm": 0.592740490558326, + "learning_rate": 4.963034232782938e-06, + "loss": 0.6157, + "step": 1108 + }, + { + "epoch": 0.17991563919532771, + "grad_norm": 0.6016475679820263, + "learning_rate": 4.962961033361518e-06, + "loss": 0.5701, + "step": 1109 + }, + { + "epoch": 0.1800778715120052, + "grad_norm": 0.6029007129468751, + "learning_rate": 4.962887762078087e-06, + "loss": 0.6308, + "step": 1110 + }, + { + "epoch": 0.18024010382868266, + "grad_norm": 0.6026277366925374, + "learning_rate": 4.962814418934782e-06, + "loss": 0.6214, + "step": 1111 + }, + { + "epoch": 0.18040233614536016, + "grad_norm": 0.5971925856354774, + "learning_rate": 4.9627410039337426e-06, + "loss": 0.5832, + "step": 1112 + }, + { + "epoch": 0.18056456846203764, + "grad_norm": 0.5788401759919508, + "learning_rate": 4.962667517077112e-06, + "loss": 0.5726, + "step": 1113 + }, + { + "epoch": 0.1807268007787151, + "grad_norm": 0.6519858706697396, + "learning_rate": 4.962593958367035e-06, + "loss": 0.5932, + "step": 1114 + }, + { + "epoch": 0.18088903309539262, + "grad_norm": 0.6101575277585116, + "learning_rate": 4.9625203278056555e-06, + "loss": 0.593, + "step": 1115 + }, + { + "epoch": 0.1810512654120701, + "grad_norm": 0.5985781025464431, + "learning_rate": 4.962446625395123e-06, + "loss": 0.585, + "step": 1116 + }, + { + "epoch": 0.18121349772874756, + "grad_norm": 0.5980929048692083, + "learning_rate": 4.962372851137589e-06, + "loss": 0.5944, + "step": 1117 + }, + { + "epoch": 0.18137573004542504, + "grad_norm": 0.6063602459365744, + "learning_rate": 4.962299005035205e-06, + "loss": 0.6064, + "step": 1118 + }, + { + "epoch": 0.18153796236210254, + "grad_norm": 0.6134452234864224, + "learning_rate": 4.962225087090126e-06, + "loss": 0.6175, + "step": 1119 + }, + { + "epoch": 0.18170019467878, + "grad_norm": 0.5915702313965355, + "learning_rate": 4.962151097304507e-06, + "loss": 0.6179, + "step": 1120 + }, + { + "epoch": 0.1818624269954575, + "grad_norm": 0.6120680876262137, + "learning_rate": 4.962077035680509e-06, + "loss": 0.5994, + "step": 1121 + }, + { + "epoch": 0.182024659312135, + "grad_norm": 0.6172710635317538, + "learning_rate": 4.962002902220293e-06, + "loss": 0.6268, + "step": 1122 + }, + { + "epoch": 0.18218689162881246, + "grad_norm": 0.5869855787407116, + "learning_rate": 4.961928696926019e-06, + "loss": 0.5395, + "step": 1123 + }, + { + "epoch": 0.18234912394548994, + "grad_norm": 0.6195265930639581, + "learning_rate": 4.961854419799856e-06, + "loss": 0.5802, + "step": 1124 + }, + { + "epoch": 0.1825113562621674, + "grad_norm": 0.613366963421957, + "learning_rate": 4.961780070843969e-06, + "loss": 0.6151, + "step": 1125 + }, + { + "epoch": 0.18267358857884491, + "grad_norm": 0.5991039733407976, + "learning_rate": 4.9617056500605275e-06, + "loss": 0.5341, + "step": 1126 + }, + { + "epoch": 0.1828358208955224, + "grad_norm": 0.5896656854975649, + "learning_rate": 4.961631157451703e-06, + "loss": 0.5754, + "step": 1127 + }, + { + "epoch": 0.18299805321219986, + "grad_norm": 0.6117146262789118, + "learning_rate": 4.961556593019669e-06, + "loss": 0.6139, + "step": 1128 + }, + { + "epoch": 0.18316028552887736, + "grad_norm": 0.5816027416463199, + "learning_rate": 4.9614819567666004e-06, + "loss": 0.613, + "step": 1129 + }, + { + "epoch": 0.18332251784555484, + "grad_norm": 0.6189446519801434, + "learning_rate": 4.961407248694676e-06, + "loss": 0.5847, + "step": 1130 + }, + { + "epoch": 0.1834847501622323, + "grad_norm": 0.558363760803332, + "learning_rate": 4.961332468806076e-06, + "loss": 0.5856, + "step": 1131 + }, + { + "epoch": 0.1836469824789098, + "grad_norm": 0.5945461159711994, + "learning_rate": 4.961257617102981e-06, + "loss": 0.6021, + "step": 1132 + }, + { + "epoch": 0.1838092147955873, + "grad_norm": 0.6481879464353205, + "learning_rate": 4.961182693587575e-06, + "loss": 0.5974, + "step": 1133 + }, + { + "epoch": 0.18397144711226476, + "grad_norm": 0.6016930801052638, + "learning_rate": 4.9611076982620445e-06, + "loss": 0.6389, + "step": 1134 + }, + { + "epoch": 0.18413367942894224, + "grad_norm": 0.6252184677645155, + "learning_rate": 4.961032631128577e-06, + "loss": 0.6086, + "step": 1135 + }, + { + "epoch": 0.18429591174561974, + "grad_norm": 0.5855256948058871, + "learning_rate": 4.9609574921893635e-06, + "loss": 0.5907, + "step": 1136 + }, + { + "epoch": 0.1844581440622972, + "grad_norm": 0.5733679924553502, + "learning_rate": 4.960882281446597e-06, + "loss": 0.6158, + "step": 1137 + }, + { + "epoch": 0.1846203763789747, + "grad_norm": 0.6165761493311235, + "learning_rate": 4.96080699890247e-06, + "loss": 0.6145, + "step": 1138 + }, + { + "epoch": 0.18478260869565216, + "grad_norm": 0.5990795162495907, + "learning_rate": 4.9607316445591805e-06, + "loss": 0.5613, + "step": 1139 + }, + { + "epoch": 0.18494484101232966, + "grad_norm": 0.6177417892485306, + "learning_rate": 4.960656218418926e-06, + "loss": 0.5763, + "step": 1140 + }, + { + "epoch": 0.18510707332900714, + "grad_norm": 0.6057872102106402, + "learning_rate": 4.960580720483909e-06, + "loss": 0.612, + "step": 1141 + }, + { + "epoch": 0.1852693056456846, + "grad_norm": 0.5886201096192528, + "learning_rate": 4.9605051507563306e-06, + "loss": 0.6159, + "step": 1142 + }, + { + "epoch": 0.18543153796236211, + "grad_norm": 0.5733257344435732, + "learning_rate": 4.960429509238397e-06, + "loss": 0.5998, + "step": 1143 + }, + { + "epoch": 0.1855937702790396, + "grad_norm": 0.5779995499647129, + "learning_rate": 4.960353795932313e-06, + "loss": 0.5951, + "step": 1144 + }, + { + "epoch": 0.18575600259571706, + "grad_norm": 0.5502470384276871, + "learning_rate": 4.96027801084029e-06, + "loss": 0.5853, + "step": 1145 + }, + { + "epoch": 0.18591823491239454, + "grad_norm": 0.5885497579731498, + "learning_rate": 4.960202153964539e-06, + "loss": 0.565, + "step": 1146 + }, + { + "epoch": 0.18608046722907204, + "grad_norm": 0.5914413103645989, + "learning_rate": 4.960126225307272e-06, + "loss": 0.599, + "step": 1147 + }, + { + "epoch": 0.1862426995457495, + "grad_norm": 0.615268365665859, + "learning_rate": 4.960050224870706e-06, + "loss": 0.5948, + "step": 1148 + }, + { + "epoch": 0.186404931862427, + "grad_norm": 0.5697282225681489, + "learning_rate": 4.959974152657056e-06, + "loss": 0.595, + "step": 1149 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.6483820697448626, + "learning_rate": 4.959898008668544e-06, + "loss": 0.6114, + "step": 1150 + }, + { + "epoch": 0.18672939649578196, + "grad_norm": 0.5777116742213168, + "learning_rate": 4.959821792907391e-06, + "loss": 0.5894, + "step": 1151 + }, + { + "epoch": 0.18689162881245944, + "grad_norm": 0.6012329329010528, + "learning_rate": 4.9597455053758196e-06, + "loss": 0.5973, + "step": 1152 + }, + { + "epoch": 0.1870538611291369, + "grad_norm": 0.6164257792433125, + "learning_rate": 4.9596691460760574e-06, + "loss": 0.6289, + "step": 1153 + }, + { + "epoch": 0.1872160934458144, + "grad_norm": 0.5920505593377271, + "learning_rate": 4.95959271501033e-06, + "loss": 0.5365, + "step": 1154 + }, + { + "epoch": 0.1873783257624919, + "grad_norm": 0.6307287264382337, + "learning_rate": 4.959516212180871e-06, + "loss": 0.6458, + "step": 1155 + }, + { + "epoch": 0.18754055807916936, + "grad_norm": 0.6322211060099541, + "learning_rate": 4.959439637589909e-06, + "loss": 0.5646, + "step": 1156 + }, + { + "epoch": 0.18770279039584686, + "grad_norm": 0.5815144752195689, + "learning_rate": 4.95936299123968e-06, + "loss": 0.5679, + "step": 1157 + }, + { + "epoch": 0.18786502271252434, + "grad_norm": 0.6341902200743382, + "learning_rate": 4.95928627313242e-06, + "loss": 0.6071, + "step": 1158 + }, + { + "epoch": 0.1880272550292018, + "grad_norm": 0.6571023311063197, + "learning_rate": 4.9592094832703675e-06, + "loss": 0.6289, + "step": 1159 + }, + { + "epoch": 0.18818948734587929, + "grad_norm": 0.5739583505488541, + "learning_rate": 4.959132621655763e-06, + "loss": 0.6103, + "step": 1160 + }, + { + "epoch": 0.1883517196625568, + "grad_norm": 0.6234127482404676, + "learning_rate": 4.959055688290849e-06, + "loss": 0.5882, + "step": 1161 + }, + { + "epoch": 0.18851395197923426, + "grad_norm": 0.5958781155873729, + "learning_rate": 4.9589786831778694e-06, + "loss": 0.6048, + "step": 1162 + }, + { + "epoch": 0.18867618429591174, + "grad_norm": 0.6167796003783553, + "learning_rate": 4.958901606319072e-06, + "loss": 0.607, + "step": 1163 + }, + { + "epoch": 0.18883841661258924, + "grad_norm": 0.6344514478608634, + "learning_rate": 4.958824457716707e-06, + "loss": 0.5673, + "step": 1164 + }, + { + "epoch": 0.1890006489292667, + "grad_norm": 0.5949273359003828, + "learning_rate": 4.958747237373022e-06, + "loss": 0.6014, + "step": 1165 + }, + { + "epoch": 0.1891628812459442, + "grad_norm": 0.5959920638643135, + "learning_rate": 4.9586699452902725e-06, + "loss": 0.6072, + "step": 1166 + }, + { + "epoch": 0.18932511356262166, + "grad_norm": 0.6733563563793203, + "learning_rate": 4.9585925814707135e-06, + "loss": 0.5908, + "step": 1167 + }, + { + "epoch": 0.18948734587929916, + "grad_norm": 0.5917429480057945, + "learning_rate": 4.958515145916602e-06, + "loss": 0.6192, + "step": 1168 + }, + { + "epoch": 0.18964957819597664, + "grad_norm": 0.5959346142215607, + "learning_rate": 4.958437638630196e-06, + "loss": 0.5933, + "step": 1169 + }, + { + "epoch": 0.1898118105126541, + "grad_norm": 0.6722636183269224, + "learning_rate": 4.958360059613758e-06, + "loss": 0.6218, + "step": 1170 + }, + { + "epoch": 0.1899740428293316, + "grad_norm": 0.6175042360541824, + "learning_rate": 4.958282408869552e-06, + "loss": 0.6042, + "step": 1171 + }, + { + "epoch": 0.1901362751460091, + "grad_norm": 0.6228279621409807, + "learning_rate": 4.958204686399843e-06, + "loss": 0.613, + "step": 1172 + }, + { + "epoch": 0.19029850746268656, + "grad_norm": 0.6073849036488516, + "learning_rate": 4.9581268922069e-06, + "loss": 0.6135, + "step": 1173 + }, + { + "epoch": 0.19046073977936404, + "grad_norm": 0.5848210202146589, + "learning_rate": 4.958049026292992e-06, + "loss": 0.6305, + "step": 1174 + }, + { + "epoch": 0.19062297209604154, + "grad_norm": 0.6164825408694329, + "learning_rate": 4.957971088660389e-06, + "loss": 0.5889, + "step": 1175 + }, + { + "epoch": 0.190785204412719, + "grad_norm": 0.6097704233712872, + "learning_rate": 4.957893079311368e-06, + "loss": 0.6161, + "step": 1176 + }, + { + "epoch": 0.19094743672939649, + "grad_norm": 0.598564500239825, + "learning_rate": 4.9578149982482024e-06, + "loss": 0.6084, + "step": 1177 + }, + { + "epoch": 0.191109669046074, + "grad_norm": 0.596865612056636, + "learning_rate": 4.957736845473173e-06, + "loss": 0.6018, + "step": 1178 + }, + { + "epoch": 0.19127190136275146, + "grad_norm": 0.590499712898664, + "learning_rate": 4.957658620988558e-06, + "loss": 0.6402, + "step": 1179 + }, + { + "epoch": 0.19143413367942894, + "grad_norm": 0.6536264996179818, + "learning_rate": 4.95758032479664e-06, + "loss": 0.6294, + "step": 1180 + }, + { + "epoch": 0.19159636599610644, + "grad_norm": 0.5939078100209145, + "learning_rate": 4.957501956899705e-06, + "loss": 0.601, + "step": 1181 + }, + { + "epoch": 0.1917585983127839, + "grad_norm": 0.6096946522777954, + "learning_rate": 4.957423517300038e-06, + "loss": 0.6086, + "step": 1182 + }, + { + "epoch": 0.1919208306294614, + "grad_norm": 0.5997834181255423, + "learning_rate": 4.957345005999929e-06, + "loss": 0.6033, + "step": 1183 + }, + { + "epoch": 0.19208306294613886, + "grad_norm": 0.6479547793803566, + "learning_rate": 4.957266423001667e-06, + "loss": 0.6383, + "step": 1184 + }, + { + "epoch": 0.19224529526281636, + "grad_norm": 0.6527213871968588, + "learning_rate": 4.957187768307546e-06, + "loss": 0.5806, + "step": 1185 + }, + { + "epoch": 0.19240752757949384, + "grad_norm": 0.598446834754155, + "learning_rate": 4.957109041919861e-06, + "loss": 0.5986, + "step": 1186 + }, + { + "epoch": 0.1925697598961713, + "grad_norm": 0.6035707156371687, + "learning_rate": 4.957030243840908e-06, + "loss": 0.6031, + "step": 1187 + }, + { + "epoch": 0.1927319922128488, + "grad_norm": 0.6765309980503842, + "learning_rate": 4.956951374072987e-06, + "loss": 0.5934, + "step": 1188 + }, + { + "epoch": 0.1928942245295263, + "grad_norm": 0.5742117279840303, + "learning_rate": 4.956872432618399e-06, + "loss": 0.5727, + "step": 1189 + }, + { + "epoch": 0.19305645684620376, + "grad_norm": 0.5695453584007483, + "learning_rate": 4.956793419479447e-06, + "loss": 0.6238, + "step": 1190 + }, + { + "epoch": 0.19321868916288124, + "grad_norm": 0.6145408864513074, + "learning_rate": 4.956714334658437e-06, + "loss": 0.6037, + "step": 1191 + }, + { + "epoch": 0.19338092147955874, + "grad_norm": 0.5953351129161855, + "learning_rate": 4.956635178157676e-06, + "loss": 0.5895, + "step": 1192 + }, + { + "epoch": 0.1935431537962362, + "grad_norm": 0.6199142018146915, + "learning_rate": 4.956555949979473e-06, + "loss": 0.5854, + "step": 1193 + }, + { + "epoch": 0.19370538611291369, + "grad_norm": 0.6496344433512758, + "learning_rate": 4.956476650126141e-06, + "loss": 0.5929, + "step": 1194 + }, + { + "epoch": 0.1938676184295912, + "grad_norm": 0.6281757443396719, + "learning_rate": 4.956397278599993e-06, + "loss": 0.5878, + "step": 1195 + }, + { + "epoch": 0.19402985074626866, + "grad_norm": 0.6131634179879207, + "learning_rate": 4.956317835403343e-06, + "loss": 0.6016, + "step": 1196 + }, + { + "epoch": 0.19419208306294614, + "grad_norm": 0.6217579084051886, + "learning_rate": 4.956238320538513e-06, + "loss": 0.5786, + "step": 1197 + }, + { + "epoch": 0.1943543153796236, + "grad_norm": 0.6080912979811621, + "learning_rate": 4.9561587340078195e-06, + "loss": 0.6094, + "step": 1198 + }, + { + "epoch": 0.1945165476963011, + "grad_norm": 0.6217057992992115, + "learning_rate": 4.9560790758135865e-06, + "loss": 0.6299, + "step": 1199 + }, + { + "epoch": 0.1946787800129786, + "grad_norm": 0.5843432626467236, + "learning_rate": 4.9559993459581375e-06, + "loss": 0.5966, + "step": 1200 + }, + { + "epoch": 0.19484101232965606, + "grad_norm": 0.6323924743962996, + "learning_rate": 4.955919544443799e-06, + "loss": 0.5967, + "step": 1201 + }, + { + "epoch": 0.19500324464633356, + "grad_norm": 0.6010147402858631, + "learning_rate": 4.955839671272899e-06, + "loss": 0.5791, + "step": 1202 + }, + { + "epoch": 0.19516547696301104, + "grad_norm": 0.6222490495189206, + "learning_rate": 4.955759726447768e-06, + "loss": 0.5969, + "step": 1203 + }, + { + "epoch": 0.1953277092796885, + "grad_norm": 0.6239210747097769, + "learning_rate": 4.9556797099707385e-06, + "loss": 0.6332, + "step": 1204 + }, + { + "epoch": 0.19548994159636598, + "grad_norm": 0.6124622183612545, + "learning_rate": 4.955599621844146e-06, + "loss": 0.6092, + "step": 1205 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 0.6181367794559042, + "learning_rate": 4.9555194620703265e-06, + "loss": 0.576, + "step": 1206 + }, + { + "epoch": 0.19581440622972096, + "grad_norm": 0.5943303049438871, + "learning_rate": 4.9554392306516185e-06, + "loss": 0.5897, + "step": 1207 + }, + { + "epoch": 0.19597663854639844, + "grad_norm": 0.599901224777169, + "learning_rate": 4.955358927590363e-06, + "loss": 0.6011, + "step": 1208 + }, + { + "epoch": 0.19613887086307594, + "grad_norm": 0.6254336865019409, + "learning_rate": 4.955278552888905e-06, + "loss": 0.6309, + "step": 1209 + }, + { + "epoch": 0.1963011031797534, + "grad_norm": 0.6277745421072424, + "learning_rate": 4.955198106549587e-06, + "loss": 0.641, + "step": 1210 + }, + { + "epoch": 0.19646333549643089, + "grad_norm": 0.614372424377645, + "learning_rate": 4.955117588574757e-06, + "loss": 0.6461, + "step": 1211 + }, + { + "epoch": 0.19662556781310836, + "grad_norm": 0.6130813828614134, + "learning_rate": 4.955036998966764e-06, + "loss": 0.6108, + "step": 1212 + }, + { + "epoch": 0.19678780012978586, + "grad_norm": 0.5887911207887796, + "learning_rate": 4.954956337727961e-06, + "loss": 0.5857, + "step": 1213 + }, + { + "epoch": 0.19695003244646334, + "grad_norm": 0.6224372125914112, + "learning_rate": 4.9548756048607e-06, + "loss": 0.6194, + "step": 1214 + }, + { + "epoch": 0.1971122647631408, + "grad_norm": 0.6375087437056371, + "learning_rate": 4.9547948003673355e-06, + "loss": 0.6228, + "step": 1215 + }, + { + "epoch": 0.1972744970798183, + "grad_norm": 0.5949439230149751, + "learning_rate": 4.954713924250227e-06, + "loss": 0.6374, + "step": 1216 + }, + { + "epoch": 0.19743672939649579, + "grad_norm": 0.6173157593876938, + "learning_rate": 4.954632976511734e-06, + "loss": 0.6098, + "step": 1217 + }, + { + "epoch": 0.19759896171317326, + "grad_norm": 0.6111529870748234, + "learning_rate": 4.9545519571542185e-06, + "loss": 0.6305, + "step": 1218 + }, + { + "epoch": 0.19776119402985073, + "grad_norm": 0.6003072293086307, + "learning_rate": 4.954470866180043e-06, + "loss": 0.5913, + "step": 1219 + }, + { + "epoch": 0.19792342634652824, + "grad_norm": 0.587640653488848, + "learning_rate": 4.954389703591575e-06, + "loss": 0.5919, + "step": 1220 + }, + { + "epoch": 0.1980856586632057, + "grad_norm": 0.6035426688071022, + "learning_rate": 4.954308469391182e-06, + "loss": 0.5752, + "step": 1221 + }, + { + "epoch": 0.19824789097988318, + "grad_norm": 0.6083852172702188, + "learning_rate": 4.954227163581234e-06, + "loss": 0.6275, + "step": 1222 + }, + { + "epoch": 0.1984101232965607, + "grad_norm": 0.6036879017912778, + "learning_rate": 4.954145786164104e-06, + "loss": 0.6223, + "step": 1223 + }, + { + "epoch": 0.19857235561323816, + "grad_norm": 0.6210273027432706, + "learning_rate": 4.954064337142165e-06, + "loss": 0.5862, + "step": 1224 + }, + { + "epoch": 0.19873458792991563, + "grad_norm": 0.6428415561234412, + "learning_rate": 4.953982816517795e-06, + "loss": 0.6255, + "step": 1225 + }, + { + "epoch": 0.1988968202465931, + "grad_norm": 0.6168829795280875, + "learning_rate": 4.953901224293372e-06, + "loss": 0.5557, + "step": 1226 + }, + { + "epoch": 0.1990590525632706, + "grad_norm": 0.6228961186000481, + "learning_rate": 4.9538195604712744e-06, + "loss": 0.5892, + "step": 1227 + }, + { + "epoch": 0.19922128487994809, + "grad_norm": 0.6256856232348587, + "learning_rate": 4.953737825053889e-06, + "loss": 0.6011, + "step": 1228 + }, + { + "epoch": 0.19938351719662556, + "grad_norm": 0.6120531307449295, + "learning_rate": 4.953656018043598e-06, + "loss": 0.5773, + "step": 1229 + }, + { + "epoch": 0.19954574951330306, + "grad_norm": 0.654779243398723, + "learning_rate": 4.953574139442788e-06, + "loss": 0.6413, + "step": 1230 + }, + { + "epoch": 0.19970798182998054, + "grad_norm": 0.6181137456845065, + "learning_rate": 4.953492189253849e-06, + "loss": 0.5881, + "step": 1231 + }, + { + "epoch": 0.199870214146658, + "grad_norm": 0.6051252127025728, + "learning_rate": 4.9534101674791725e-06, + "loss": 0.6071, + "step": 1232 + }, + { + "epoch": 0.20003244646333548, + "grad_norm": 0.66002487738701, + "learning_rate": 4.953328074121151e-06, + "loss": 0.6458, + "step": 1233 + }, + { + "epoch": 0.20019467878001299, + "grad_norm": 0.5713637099925918, + "learning_rate": 4.95324590918218e-06, + "loss": 0.6029, + "step": 1234 + }, + { + "epoch": 0.20035691109669046, + "grad_norm": 0.6131603035664464, + "learning_rate": 4.953163672664655e-06, + "loss": 0.5956, + "step": 1235 + }, + { + "epoch": 0.20051914341336793, + "grad_norm": 0.6567094532479972, + "learning_rate": 4.953081364570978e-06, + "loss": 0.5944, + "step": 1236 + }, + { + "epoch": 0.20068137573004544, + "grad_norm": 0.5880833473157768, + "learning_rate": 4.95299898490355e-06, + "loss": 0.6276, + "step": 1237 + }, + { + "epoch": 0.2008436080467229, + "grad_norm": 0.5949517747154293, + "learning_rate": 4.952916533664774e-06, + "loss": 0.5707, + "step": 1238 + }, + { + "epoch": 0.20100584036340038, + "grad_norm": 0.5947420792785769, + "learning_rate": 4.952834010857055e-06, + "loss": 0.6113, + "step": 1239 + }, + { + "epoch": 0.20116807268007786, + "grad_norm": 0.6185409282537807, + "learning_rate": 4.952751416482801e-06, + "loss": 0.5813, + "step": 1240 + }, + { + "epoch": 0.20133030499675536, + "grad_norm": 0.622016762316573, + "learning_rate": 4.952668750544424e-06, + "loss": 0.5958, + "step": 1241 + }, + { + "epoch": 0.20149253731343283, + "grad_norm": 0.6695272155491899, + "learning_rate": 4.9525860130443335e-06, + "loss": 0.5891, + "step": 1242 + }, + { + "epoch": 0.2016547696301103, + "grad_norm": 0.5998657695056986, + "learning_rate": 4.952503203984945e-06, + "loss": 0.585, + "step": 1243 + }, + { + "epoch": 0.2018170019467878, + "grad_norm": 0.6759050370382578, + "learning_rate": 4.952420323368673e-06, + "loss": 0.6587, + "step": 1244 + }, + { + "epoch": 0.20197923426346528, + "grad_norm": 0.6349719985779108, + "learning_rate": 4.952337371197938e-06, + "loss": 0.5923, + "step": 1245 + }, + { + "epoch": 0.20214146658014276, + "grad_norm": 0.6242484404246827, + "learning_rate": 4.952254347475158e-06, + "loss": 0.6392, + "step": 1246 + }, + { + "epoch": 0.20230369889682023, + "grad_norm": 0.6234772309690443, + "learning_rate": 4.952171252202758e-06, + "loss": 0.6379, + "step": 1247 + }, + { + "epoch": 0.20246593121349774, + "grad_norm": 0.5923770465013234, + "learning_rate": 4.952088085383159e-06, + "loss": 0.5784, + "step": 1248 + }, + { + "epoch": 0.2026281635301752, + "grad_norm": 0.7052024906421073, + "learning_rate": 4.952004847018791e-06, + "loss": 0.6029, + "step": 1249 + }, + { + "epoch": 0.20279039584685268, + "grad_norm": 0.6250633472862352, + "learning_rate": 4.951921537112081e-06, + "loss": 0.5849, + "step": 1250 + }, + { + "epoch": 0.20295262816353019, + "grad_norm": 0.6340025634834924, + "learning_rate": 4.951838155665459e-06, + "loss": 0.6161, + "step": 1251 + }, + { + "epoch": 0.20311486048020766, + "grad_norm": 0.612819103316731, + "learning_rate": 4.951754702681359e-06, + "loss": 0.6014, + "step": 1252 + }, + { + "epoch": 0.20327709279688513, + "grad_norm": 0.6215749851958966, + "learning_rate": 4.951671178162216e-06, + "loss": 0.5866, + "step": 1253 + }, + { + "epoch": 0.2034393251135626, + "grad_norm": 0.5990674260112, + "learning_rate": 4.951587582110466e-06, + "loss": 0.5891, + "step": 1254 + }, + { + "epoch": 0.2036015574302401, + "grad_norm": 0.6241404014810501, + "learning_rate": 4.951503914528549e-06, + "loss": 0.6065, + "step": 1255 + }, + { + "epoch": 0.20376378974691758, + "grad_norm": 0.5798246743675389, + "learning_rate": 4.951420175418906e-06, + "loss": 0.6177, + "step": 1256 + }, + { + "epoch": 0.20392602206359506, + "grad_norm": 0.6169972839230377, + "learning_rate": 4.95133636478398e-06, + "loss": 0.6021, + "step": 1257 + }, + { + "epoch": 0.20408825438027256, + "grad_norm": 0.6632308598895973, + "learning_rate": 4.951252482626218e-06, + "loss": 0.6379, + "step": 1258 + }, + { + "epoch": 0.20425048669695003, + "grad_norm": 0.6072334570415203, + "learning_rate": 4.951168528948064e-06, + "loss": 0.5644, + "step": 1259 + }, + { + "epoch": 0.2044127190136275, + "grad_norm": 0.5876614617280659, + "learning_rate": 4.9510845037519706e-06, + "loss": 0.5919, + "step": 1260 + }, + { + "epoch": 0.204574951330305, + "grad_norm": 0.6120983058715527, + "learning_rate": 4.951000407040387e-06, + "loss": 0.5818, + "step": 1261 + }, + { + "epoch": 0.20473718364698248, + "grad_norm": 0.5912981319137978, + "learning_rate": 4.950916238815769e-06, + "loss": 0.5684, + "step": 1262 + }, + { + "epoch": 0.20489941596365996, + "grad_norm": 0.6240755808477109, + "learning_rate": 4.950831999080572e-06, + "loss": 0.5614, + "step": 1263 + }, + { + "epoch": 0.20506164828033743, + "grad_norm": 0.5986015269921205, + "learning_rate": 4.950747687837253e-06, + "loss": 0.5623, + "step": 1264 + }, + { + "epoch": 0.20522388059701493, + "grad_norm": 0.6165522071244551, + "learning_rate": 4.950663305088273e-06, + "loss": 0.6191, + "step": 1265 + }, + { + "epoch": 0.2053861129136924, + "grad_norm": 0.6071621714927621, + "learning_rate": 4.950578850836092e-06, + "loss": 0.6209, + "step": 1266 + }, + { + "epoch": 0.20554834523036988, + "grad_norm": 0.6099578488169501, + "learning_rate": 4.950494325083176e-06, + "loss": 0.6154, + "step": 1267 + }, + { + "epoch": 0.20571057754704739, + "grad_norm": 0.5732600325938431, + "learning_rate": 4.950409727831992e-06, + "loss": 0.5915, + "step": 1268 + }, + { + "epoch": 0.20587280986372486, + "grad_norm": 0.636875617415641, + "learning_rate": 4.950325059085006e-06, + "loss": 0.6014, + "step": 1269 + }, + { + "epoch": 0.20603504218040233, + "grad_norm": 0.5917183452920914, + "learning_rate": 4.950240318844689e-06, + "loss": 0.612, + "step": 1270 + }, + { + "epoch": 0.2061972744970798, + "grad_norm": 0.5943369143840209, + "learning_rate": 4.950155507113515e-06, + "loss": 0.5796, + "step": 1271 + }, + { + "epoch": 0.2063595068137573, + "grad_norm": 0.6498637957145112, + "learning_rate": 4.950070623893957e-06, + "loss": 0.6326, + "step": 1272 + }, + { + "epoch": 0.20652173913043478, + "grad_norm": 0.6058659370183022, + "learning_rate": 4.9499856691884916e-06, + "loss": 0.5833, + "step": 1273 + }, + { + "epoch": 0.20668397144711226, + "grad_norm": 0.6669009748353031, + "learning_rate": 4.949900642999599e-06, + "loss": 0.6022, + "step": 1274 + }, + { + "epoch": 0.20684620376378976, + "grad_norm": 0.5907094397597059, + "learning_rate": 4.949815545329759e-06, + "loss": 0.6097, + "step": 1275 + }, + { + "epoch": 0.20700843608046723, + "grad_norm": 0.6245642204663755, + "learning_rate": 4.949730376181454e-06, + "loss": 0.6177, + "step": 1276 + }, + { + "epoch": 0.2071706683971447, + "grad_norm": 0.623796761645668, + "learning_rate": 4.94964513555717e-06, + "loss": 0.6035, + "step": 1277 + }, + { + "epoch": 0.20733290071382218, + "grad_norm": 0.601203182219786, + "learning_rate": 4.949559823459393e-06, + "loss": 0.5784, + "step": 1278 + }, + { + "epoch": 0.20749513303049968, + "grad_norm": 0.5916641389097482, + "learning_rate": 4.9494744398906136e-06, + "loss": 0.6291, + "step": 1279 + }, + { + "epoch": 0.20765736534717716, + "grad_norm": 0.6425386450996059, + "learning_rate": 4.949388984853322e-06, + "loss": 0.6121, + "step": 1280 + }, + { + "epoch": 0.20781959766385463, + "grad_norm": 0.5980362865737447, + "learning_rate": 4.949303458350012e-06, + "loss": 0.6109, + "step": 1281 + }, + { + "epoch": 0.20798182998053213, + "grad_norm": 0.5922595168271972, + "learning_rate": 4.949217860383178e-06, + "loss": 0.5831, + "step": 1282 + }, + { + "epoch": 0.2081440622972096, + "grad_norm": 0.6087781990014042, + "learning_rate": 4.949132190955319e-06, + "loss": 0.6191, + "step": 1283 + }, + { + "epoch": 0.20830629461388708, + "grad_norm": 0.6055486741870714, + "learning_rate": 4.949046450068933e-06, + "loss": 0.6392, + "step": 1284 + }, + { + "epoch": 0.20846852693056456, + "grad_norm": 0.6369392630122354, + "learning_rate": 4.948960637726523e-06, + "loss": 0.6102, + "step": 1285 + }, + { + "epoch": 0.20863075924724206, + "grad_norm": 0.5747164223654212, + "learning_rate": 4.948874753930592e-06, + "loss": 0.5932, + "step": 1286 + }, + { + "epoch": 0.20879299156391953, + "grad_norm": 0.6407406151791861, + "learning_rate": 4.948788798683647e-06, + "loss": 0.623, + "step": 1287 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 0.6230276289877978, + "learning_rate": 4.948702771988195e-06, + "loss": 0.5989, + "step": 1288 + }, + { + "epoch": 0.2091174561972745, + "grad_norm": 0.6287834442146435, + "learning_rate": 4.948616673846746e-06, + "loss": 0.6087, + "step": 1289 + }, + { + "epoch": 0.20927968851395198, + "grad_norm": 0.652623843371174, + "learning_rate": 4.948530504261812e-06, + "loss": 0.6034, + "step": 1290 + }, + { + "epoch": 0.20944192083062946, + "grad_norm": 0.6155245141062676, + "learning_rate": 4.948444263235906e-06, + "loss": 0.5736, + "step": 1291 + }, + { + "epoch": 0.20960415314730693, + "grad_norm": 0.6183889674801003, + "learning_rate": 4.948357950771547e-06, + "loss": 0.6038, + "step": 1292 + }, + { + "epoch": 0.20976638546398443, + "grad_norm": 0.6275236062331645, + "learning_rate": 4.948271566871252e-06, + "loss": 0.5859, + "step": 1293 + }, + { + "epoch": 0.2099286177806619, + "grad_norm": 0.6356969387338313, + "learning_rate": 4.948185111537541e-06, + "loss": 0.6251, + "step": 1294 + }, + { + "epoch": 0.21009085009733938, + "grad_norm": 0.6138578015888697, + "learning_rate": 4.948098584772937e-06, + "loss": 0.618, + "step": 1295 + }, + { + "epoch": 0.21025308241401688, + "grad_norm": 0.5792577389446136, + "learning_rate": 4.948011986579964e-06, + "loss": 0.5977, + "step": 1296 + }, + { + "epoch": 0.21041531473069436, + "grad_norm": 0.6350852438541685, + "learning_rate": 4.94792531696115e-06, + "loss": 0.599, + "step": 1297 + }, + { + "epoch": 0.21057754704737183, + "grad_norm": 0.6574300169333106, + "learning_rate": 4.9478385759190224e-06, + "loss": 0.6007, + "step": 1298 + }, + { + "epoch": 0.2107397793640493, + "grad_norm": 0.6740346417171994, + "learning_rate": 4.947751763456112e-06, + "loss": 0.6304, + "step": 1299 + }, + { + "epoch": 0.2109020116807268, + "grad_norm": 0.6272866646241899, + "learning_rate": 4.9476648795749535e-06, + "loss": 0.605, + "step": 1300 + }, + { + "epoch": 0.21106424399740428, + "grad_norm": 0.5976183968599061, + "learning_rate": 4.94757792427808e-06, + "loss": 0.5994, + "step": 1301 + }, + { + "epoch": 0.21122647631408176, + "grad_norm": 0.616011122422227, + "learning_rate": 4.947490897568029e-06, + "loss": 0.5988, + "step": 1302 + }, + { + "epoch": 0.21138870863075926, + "grad_norm": 0.6244588192004803, + "learning_rate": 4.947403799447341e-06, + "loss": 0.5876, + "step": 1303 + }, + { + "epoch": 0.21155094094743673, + "grad_norm": 0.582282567758869, + "learning_rate": 4.947316629918556e-06, + "loss": 0.5641, + "step": 1304 + }, + { + "epoch": 0.2117131732641142, + "grad_norm": 0.6315326429086353, + "learning_rate": 4.947229388984217e-06, + "loss": 0.6201, + "step": 1305 + }, + { + "epoch": 0.21187540558079168, + "grad_norm": 0.6588687130120885, + "learning_rate": 4.94714207664687e-06, + "loss": 0.6214, + "step": 1306 + }, + { + "epoch": 0.21203763789746918, + "grad_norm": 0.5977327392510796, + "learning_rate": 4.947054692909064e-06, + "loss": 0.5963, + "step": 1307 + }, + { + "epoch": 0.21219987021414666, + "grad_norm": 0.5931509683323742, + "learning_rate": 4.946967237773346e-06, + "loss": 0.5737, + "step": 1308 + }, + { + "epoch": 0.21236210253082413, + "grad_norm": 0.6626804922131678, + "learning_rate": 4.946879711242269e-06, + "loss": 0.6076, + "step": 1309 + }, + { + "epoch": 0.21252433484750163, + "grad_norm": 0.6487963684008126, + "learning_rate": 4.9467921133183864e-06, + "loss": 0.5749, + "step": 1310 + }, + { + "epoch": 0.2126865671641791, + "grad_norm": 0.632480874778401, + "learning_rate": 4.946704444004255e-06, + "loss": 0.6179, + "step": 1311 + }, + { + "epoch": 0.21284879948085658, + "grad_norm": 0.6309594774951652, + "learning_rate": 4.946616703302432e-06, + "loss": 0.6063, + "step": 1312 + }, + { + "epoch": 0.21301103179753406, + "grad_norm": 0.6295472379368576, + "learning_rate": 4.946528891215477e-06, + "loss": 0.6278, + "step": 1313 + }, + { + "epoch": 0.21317326411421156, + "grad_norm": 0.6068865489362992, + "learning_rate": 4.946441007745952e-06, + "loss": 0.586, + "step": 1314 + }, + { + "epoch": 0.21333549643088903, + "grad_norm": 0.6036504987485667, + "learning_rate": 4.946353052896424e-06, + "loss": 0.6164, + "step": 1315 + }, + { + "epoch": 0.2134977287475665, + "grad_norm": 0.6020678271503712, + "learning_rate": 4.9462650266694546e-06, + "loss": 0.6272, + "step": 1316 + }, + { + "epoch": 0.213659961064244, + "grad_norm": 0.5967788899048402, + "learning_rate": 4.946176929067615e-06, + "loss": 0.6012, + "step": 1317 + }, + { + "epoch": 0.21382219338092148, + "grad_norm": 0.5853289106024835, + "learning_rate": 4.946088760093475e-06, + "loss": 0.6247, + "step": 1318 + }, + { + "epoch": 0.21398442569759896, + "grad_norm": 0.609852848683426, + "learning_rate": 4.9460005197496085e-06, + "loss": 0.604, + "step": 1319 + }, + { + "epoch": 0.21414665801427643, + "grad_norm": 0.6278754079854114, + "learning_rate": 4.945912208038588e-06, + "loss": 0.5913, + "step": 1320 + }, + { + "epoch": 0.21430889033095393, + "grad_norm": 0.5789986459215876, + "learning_rate": 4.945823824962991e-06, + "loss": 0.616, + "step": 1321 + }, + { + "epoch": 0.2144711226476314, + "grad_norm": 0.5965375405314193, + "learning_rate": 4.945735370525397e-06, + "loss": 0.5643, + "step": 1322 + }, + { + "epoch": 0.21463335496430888, + "grad_norm": 0.5898003343678164, + "learning_rate": 4.9456468447283855e-06, + "loss": 0.5915, + "step": 1323 + }, + { + "epoch": 0.21479558728098638, + "grad_norm": 0.6089815959229987, + "learning_rate": 4.94555824757454e-06, + "loss": 0.6047, + "step": 1324 + }, + { + "epoch": 0.21495781959766386, + "grad_norm": 0.6588336604623748, + "learning_rate": 4.9454695790664466e-06, + "loss": 0.6069, + "step": 1325 + }, + { + "epoch": 0.21512005191434133, + "grad_norm": 0.5902782147628004, + "learning_rate": 4.94538083920669e-06, + "loss": 0.6005, + "step": 1326 + }, + { + "epoch": 0.2152822842310188, + "grad_norm": 0.6267783890697518, + "learning_rate": 4.945292027997862e-06, + "loss": 0.6266, + "step": 1327 + }, + { + "epoch": 0.2154445165476963, + "grad_norm": 0.6046116312648729, + "learning_rate": 4.945203145442552e-06, + "loss": 0.6239, + "step": 1328 + }, + { + "epoch": 0.21560674886437378, + "grad_norm": 0.6161183308442508, + "learning_rate": 4.9451141915433545e-06, + "loss": 0.634, + "step": 1329 + }, + { + "epoch": 0.21576898118105126, + "grad_norm": 0.6606972550023492, + "learning_rate": 4.9450251663028635e-06, + "loss": 0.6099, + "step": 1330 + }, + { + "epoch": 0.21593121349772876, + "grad_norm": 0.5701825091460435, + "learning_rate": 4.944936069723678e-06, + "loss": 0.611, + "step": 1331 + }, + { + "epoch": 0.21609344581440623, + "grad_norm": 0.6091533502635803, + "learning_rate": 4.944846901808397e-06, + "loss": 0.577, + "step": 1332 + }, + { + "epoch": 0.2162556781310837, + "grad_norm": 0.5948201724839125, + "learning_rate": 4.9447576625596216e-06, + "loss": 0.5718, + "step": 1333 + }, + { + "epoch": 0.21641791044776118, + "grad_norm": 0.6285841612992027, + "learning_rate": 4.944668351979956e-06, + "loss": 0.5826, + "step": 1334 + }, + { + "epoch": 0.21658014276443868, + "grad_norm": 0.605175004104568, + "learning_rate": 4.944578970072006e-06, + "loss": 0.6034, + "step": 1335 + }, + { + "epoch": 0.21674237508111616, + "grad_norm": 0.606276056848766, + "learning_rate": 4.94448951683838e-06, + "loss": 0.5774, + "step": 1336 + }, + { + "epoch": 0.21690460739779363, + "grad_norm": 0.5803329457117632, + "learning_rate": 4.944399992281688e-06, + "loss": 0.5919, + "step": 1337 + }, + { + "epoch": 0.21706683971447113, + "grad_norm": 0.622264269632002, + "learning_rate": 4.944310396404541e-06, + "loss": 0.6035, + "step": 1338 + }, + { + "epoch": 0.2172290720311486, + "grad_norm": 0.5998475130582269, + "learning_rate": 4.944220729209553e-06, + "loss": 0.5955, + "step": 1339 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 0.623907119632296, + "learning_rate": 4.9441309906993415e-06, + "loss": 0.5812, + "step": 1340 + }, + { + "epoch": 0.21755353666450358, + "grad_norm": 0.6229595220082642, + "learning_rate": 4.944041180876524e-06, + "loss": 0.6103, + "step": 1341 + }, + { + "epoch": 0.21771576898118106, + "grad_norm": 0.602500773208358, + "learning_rate": 4.943951299743722e-06, + "loss": 0.639, + "step": 1342 + }, + { + "epoch": 0.21787800129785853, + "grad_norm": 0.6360904906299292, + "learning_rate": 4.943861347303557e-06, + "loss": 0.5691, + "step": 1343 + }, + { + "epoch": 0.218040233614536, + "grad_norm": 0.6055555958213834, + "learning_rate": 4.943771323558654e-06, + "loss": 0.6053, + "step": 1344 + }, + { + "epoch": 0.2182024659312135, + "grad_norm": 0.6663446668788658, + "learning_rate": 4.943681228511638e-06, + "loss": 0.5998, + "step": 1345 + }, + { + "epoch": 0.21836469824789098, + "grad_norm": 0.6012350808373588, + "learning_rate": 4.94359106216514e-06, + "loss": 0.5747, + "step": 1346 + }, + { + "epoch": 0.21852693056456846, + "grad_norm": 0.5998326677178355, + "learning_rate": 4.94350082452179e-06, + "loss": 0.5908, + "step": 1347 + }, + { + "epoch": 0.21868916288124596, + "grad_norm": 0.6143549990442115, + "learning_rate": 4.94341051558422e-06, + "loss": 0.5957, + "step": 1348 + }, + { + "epoch": 0.21885139519792343, + "grad_norm": 0.5861286858157845, + "learning_rate": 4.943320135355066e-06, + "loss": 0.607, + "step": 1349 + }, + { + "epoch": 0.2190136275146009, + "grad_norm": 0.6258599310857427, + "learning_rate": 4.943229683836964e-06, + "loss": 0.5967, + "step": 1350 + }, + { + "epoch": 0.21917585983127838, + "grad_norm": 0.5883169247155939, + "learning_rate": 4.943139161032555e-06, + "loss": 0.6246, + "step": 1351 + }, + { + "epoch": 0.21933809214795588, + "grad_norm": 0.6574241580830437, + "learning_rate": 4.943048566944478e-06, + "loss": 0.5516, + "step": 1352 + }, + { + "epoch": 0.21950032446463336, + "grad_norm": 0.7011828091321363, + "learning_rate": 4.942957901575378e-06, + "loss": 0.5826, + "step": 1353 + }, + { + "epoch": 0.21966255678131083, + "grad_norm": 0.6267631170613084, + "learning_rate": 4.942867164927899e-06, + "loss": 0.6284, + "step": 1354 + }, + { + "epoch": 0.21982478909798833, + "grad_norm": 0.619001072949918, + "learning_rate": 4.942776357004689e-06, + "loss": 0.5886, + "step": 1355 + }, + { + "epoch": 0.2199870214146658, + "grad_norm": 0.6425217379749528, + "learning_rate": 4.942685477808398e-06, + "loss": 0.5921, + "step": 1356 + }, + { + "epoch": 0.22014925373134328, + "grad_norm": 0.5842837829350612, + "learning_rate": 4.942594527341678e-06, + "loss": 0.6001, + "step": 1357 + }, + { + "epoch": 0.22031148604802075, + "grad_norm": 0.6114103587625302, + "learning_rate": 4.94250350560718e-06, + "loss": 0.5764, + "step": 1358 + }, + { + "epoch": 0.22047371836469826, + "grad_norm": 0.5964611237627487, + "learning_rate": 4.942412412607562e-06, + "loss": 0.5756, + "step": 1359 + }, + { + "epoch": 0.22063595068137573, + "grad_norm": 0.5980416839934113, + "learning_rate": 4.942321248345482e-06, + "loss": 0.6178, + "step": 1360 + }, + { + "epoch": 0.2207981829980532, + "grad_norm": 0.5812917209827991, + "learning_rate": 4.942230012823599e-06, + "loss": 0.5573, + "step": 1361 + }, + { + "epoch": 0.2209604153147307, + "grad_norm": 0.6204849096274657, + "learning_rate": 4.942138706044575e-06, + "loss": 0.6208, + "step": 1362 + }, + { + "epoch": 0.22112264763140818, + "grad_norm": 0.6372218845966229, + "learning_rate": 4.942047328011075e-06, + "loss": 0.5996, + "step": 1363 + }, + { + "epoch": 0.22128487994808566, + "grad_norm": 0.5972314951164148, + "learning_rate": 4.941955878725764e-06, + "loss": 0.6243, + "step": 1364 + }, + { + "epoch": 0.22144711226476313, + "grad_norm": 0.6170246044446315, + "learning_rate": 4.941864358191311e-06, + "loss": 0.5867, + "step": 1365 + }, + { + "epoch": 0.22160934458144063, + "grad_norm": 0.6054657855131892, + "learning_rate": 4.941772766410385e-06, + "loss": 0.6179, + "step": 1366 + }, + { + "epoch": 0.2217715768981181, + "grad_norm": 0.6508910382413309, + "learning_rate": 4.94168110338566e-06, + "loss": 0.605, + "step": 1367 + }, + { + "epoch": 0.22193380921479558, + "grad_norm": 0.5806807446722222, + "learning_rate": 4.94158936911981e-06, + "loss": 0.5859, + "step": 1368 + }, + { + "epoch": 0.22209604153147308, + "grad_norm": 0.6366847466581883, + "learning_rate": 4.9414975636155115e-06, + "loss": 0.5872, + "step": 1369 + }, + { + "epoch": 0.22225827384815056, + "grad_norm": 0.6166857965729082, + "learning_rate": 4.941405686875444e-06, + "loss": 0.6179, + "step": 1370 + }, + { + "epoch": 0.22242050616482803, + "grad_norm": 0.5849690710032271, + "learning_rate": 4.941313738902285e-06, + "loss": 0.5935, + "step": 1371 + }, + { + "epoch": 0.2225827384815055, + "grad_norm": 0.6805035726000412, + "learning_rate": 4.941221719698721e-06, + "loss": 0.5654, + "step": 1372 + }, + { + "epoch": 0.222744970798183, + "grad_norm": 0.6164290746878315, + "learning_rate": 4.941129629267434e-06, + "loss": 0.6106, + "step": 1373 + }, + { + "epoch": 0.22290720311486048, + "grad_norm": 0.6202601559172385, + "learning_rate": 4.941037467611113e-06, + "loss": 0.6351, + "step": 1374 + }, + { + "epoch": 0.22306943543153795, + "grad_norm": 0.690077228137854, + "learning_rate": 4.940945234732447e-06, + "loss": 0.6086, + "step": 1375 + }, + { + "epoch": 0.22323166774821546, + "grad_norm": 0.6287841088471529, + "learning_rate": 4.940852930634126e-06, + "loss": 0.5858, + "step": 1376 + }, + { + "epoch": 0.22339390006489293, + "grad_norm": 0.6011140890861798, + "learning_rate": 4.940760555318842e-06, + "loss": 0.5958, + "step": 1377 + }, + { + "epoch": 0.2235561323815704, + "grad_norm": 0.6268121220387641, + "learning_rate": 4.940668108789294e-06, + "loss": 0.5889, + "step": 1378 + }, + { + "epoch": 0.22371836469824788, + "grad_norm": 0.6323534824702806, + "learning_rate": 4.940575591048176e-06, + "loss": 0.6281, + "step": 1379 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5848491707746809, + "learning_rate": 4.9404830020981876e-06, + "loss": 0.6465, + "step": 1380 + }, + { + "epoch": 0.22404282933160286, + "grad_norm": 0.6289098136028344, + "learning_rate": 4.9403903419420325e-06, + "loss": 0.5984, + "step": 1381 + }, + { + "epoch": 0.22420506164828033, + "grad_norm": 0.6713587336021313, + "learning_rate": 4.940297610582412e-06, + "loss": 0.6168, + "step": 1382 + }, + { + "epoch": 0.22436729396495783, + "grad_norm": 0.6586954390650538, + "learning_rate": 4.940204808022033e-06, + "loss": 0.6021, + "step": 1383 + }, + { + "epoch": 0.2245295262816353, + "grad_norm": 0.6294432181139292, + "learning_rate": 4.940111934263602e-06, + "loss": 0.5896, + "step": 1384 + }, + { + "epoch": 0.22469175859831278, + "grad_norm": 0.6053082560989665, + "learning_rate": 4.940018989309831e-06, + "loss": 0.6407, + "step": 1385 + }, + { + "epoch": 0.22485399091499025, + "grad_norm": 0.6099026668374585, + "learning_rate": 4.939925973163431e-06, + "loss": 0.5971, + "step": 1386 + }, + { + "epoch": 0.22501622323166776, + "grad_norm": 0.6354915111135996, + "learning_rate": 4.9398328858271136e-06, + "loss": 0.5916, + "step": 1387 + }, + { + "epoch": 0.22517845554834523, + "grad_norm": 0.6067565427203242, + "learning_rate": 4.939739727303597e-06, + "loss": 0.6084, + "step": 1388 + }, + { + "epoch": 0.2253406878650227, + "grad_norm": 0.6007961732796276, + "learning_rate": 4.9396464975956e-06, + "loss": 0.6269, + "step": 1389 + }, + { + "epoch": 0.2255029201817002, + "grad_norm": 0.6044589777670352, + "learning_rate": 4.939553196705841e-06, + "loss": 0.5969, + "step": 1390 + }, + { + "epoch": 0.22566515249837768, + "grad_norm": 0.6652137062800471, + "learning_rate": 4.939459824637043e-06, + "loss": 0.6006, + "step": 1391 + }, + { + "epoch": 0.22582738481505515, + "grad_norm": 0.6649105657541012, + "learning_rate": 4.93936638139193e-06, + "loss": 0.5874, + "step": 1392 + }, + { + "epoch": 0.22598961713173263, + "grad_norm": 0.612565399384748, + "learning_rate": 4.939272866973229e-06, + "loss": 0.5858, + "step": 1393 + }, + { + "epoch": 0.22615184944841013, + "grad_norm": 0.592232403974615, + "learning_rate": 4.9391792813836684e-06, + "loss": 0.5806, + "step": 1394 + }, + { + "epoch": 0.2263140817650876, + "grad_norm": 0.5816194754167735, + "learning_rate": 4.939085624625978e-06, + "loss": 0.6143, + "step": 1395 + }, + { + "epoch": 0.22647631408176508, + "grad_norm": 0.6338029582599887, + "learning_rate": 4.938991896702891e-06, + "loss": 0.5891, + "step": 1396 + }, + { + "epoch": 0.22663854639844258, + "grad_norm": 0.5948829247452744, + "learning_rate": 4.938898097617142e-06, + "loss": 0.5805, + "step": 1397 + }, + { + "epoch": 0.22680077871512005, + "grad_norm": 0.6223315318604311, + "learning_rate": 4.938804227371467e-06, + "loss": 0.6134, + "step": 1398 + }, + { + "epoch": 0.22696301103179753, + "grad_norm": 0.5722596711756192, + "learning_rate": 4.938710285968607e-06, + "loss": 0.6243, + "step": 1399 + }, + { + "epoch": 0.227125243348475, + "grad_norm": 0.6166768264113226, + "learning_rate": 4.938616273411302e-06, + "loss": 0.6292, + "step": 1400 + }, + { + "epoch": 0.2272874756651525, + "grad_norm": 0.5731952367413011, + "learning_rate": 4.938522189702294e-06, + "loss": 0.59, + "step": 1401 + }, + { + "epoch": 0.22744970798182998, + "grad_norm": 0.5891969021891726, + "learning_rate": 4.938428034844328e-06, + "loss": 0.5892, + "step": 1402 + }, + { + "epoch": 0.22761194029850745, + "grad_norm": 0.6045387298547922, + "learning_rate": 4.938333808840153e-06, + "loss": 0.5727, + "step": 1403 + }, + { + "epoch": 0.22777417261518496, + "grad_norm": 0.6159375275706946, + "learning_rate": 4.938239511692517e-06, + "loss": 0.5669, + "step": 1404 + }, + { + "epoch": 0.22793640493186243, + "grad_norm": 0.6293395419323152, + "learning_rate": 4.93814514340417e-06, + "loss": 0.5675, + "step": 1405 + }, + { + "epoch": 0.2280986372485399, + "grad_norm": 0.5837858147369351, + "learning_rate": 4.938050703977868e-06, + "loss": 0.5767, + "step": 1406 + }, + { + "epoch": 0.22826086956521738, + "grad_norm": 0.5877117876128769, + "learning_rate": 4.937956193416366e-06, + "loss": 0.6101, + "step": 1407 + }, + { + "epoch": 0.22842310188189488, + "grad_norm": 0.590646912557577, + "learning_rate": 4.9378616117224195e-06, + "loss": 0.6045, + "step": 1408 + }, + { + "epoch": 0.22858533419857235, + "grad_norm": 0.6102381902703331, + "learning_rate": 4.937766958898789e-06, + "loss": 0.5988, + "step": 1409 + }, + { + "epoch": 0.22874756651524983, + "grad_norm": 0.6404733836565211, + "learning_rate": 4.937672234948239e-06, + "loss": 0.5859, + "step": 1410 + }, + { + "epoch": 0.22890979883192733, + "grad_norm": 0.6280296090704511, + "learning_rate": 4.937577439873529e-06, + "loss": 0.5806, + "step": 1411 + }, + { + "epoch": 0.2290720311486048, + "grad_norm": 0.5839272005262037, + "learning_rate": 4.937482573677427e-06, + "loss": 0.5966, + "step": 1412 + }, + { + "epoch": 0.22923426346528228, + "grad_norm": 0.5569765188695591, + "learning_rate": 4.937387636362701e-06, + "loss": 0.5584, + "step": 1413 + }, + { + "epoch": 0.22939649578195975, + "grad_norm": 0.6995834082013163, + "learning_rate": 4.9372926279321206e-06, + "loss": 0.6255, + "step": 1414 + }, + { + "epoch": 0.22955872809863725, + "grad_norm": 0.6146509477137541, + "learning_rate": 4.9371975483884586e-06, + "loss": 0.5779, + "step": 1415 + }, + { + "epoch": 0.22972096041531473, + "grad_norm": 0.6142471971419473, + "learning_rate": 4.9371023977344875e-06, + "loss": 0.5999, + "step": 1416 + }, + { + "epoch": 0.2298831927319922, + "grad_norm": 0.5826774860484214, + "learning_rate": 4.937007175972986e-06, + "loss": 0.5667, + "step": 1417 + }, + { + "epoch": 0.2300454250486697, + "grad_norm": 0.6223176546586281, + "learning_rate": 4.936911883106729e-06, + "loss": 0.5882, + "step": 1418 + }, + { + "epoch": 0.23020765736534718, + "grad_norm": 0.6041683582446505, + "learning_rate": 4.936816519138499e-06, + "loss": 0.5826, + "step": 1419 + }, + { + "epoch": 0.23036988968202465, + "grad_norm": 0.6152740103821005, + "learning_rate": 4.936721084071079e-06, + "loss": 0.6248, + "step": 1420 + }, + { + "epoch": 0.23053212199870216, + "grad_norm": 0.5835064749199441, + "learning_rate": 4.936625577907252e-06, + "loss": 0.6001, + "step": 1421 + }, + { + "epoch": 0.23069435431537963, + "grad_norm": 0.6093888924141869, + "learning_rate": 4.936530000649806e-06, + "loss": 0.5916, + "step": 1422 + }, + { + "epoch": 0.2308565866320571, + "grad_norm": 0.6241504361140443, + "learning_rate": 4.936434352301529e-06, + "loss": 0.6047, + "step": 1423 + }, + { + "epoch": 0.23101881894873458, + "grad_norm": 0.618266324255466, + "learning_rate": 4.936338632865212e-06, + "loss": 0.5705, + "step": 1424 + }, + { + "epoch": 0.23118105126541208, + "grad_norm": 0.6168527146841144, + "learning_rate": 4.936242842343647e-06, + "loss": 0.5658, + "step": 1425 + }, + { + "epoch": 0.23134328358208955, + "grad_norm": 0.6007682008594244, + "learning_rate": 4.936146980739629e-06, + "loss": 0.5792, + "step": 1426 + }, + { + "epoch": 0.23150551589876703, + "grad_norm": 0.5757097298549826, + "learning_rate": 4.936051048055956e-06, + "loss": 0.6054, + "step": 1427 + }, + { + "epoch": 0.23166774821544453, + "grad_norm": 0.6005292593705729, + "learning_rate": 4.935955044295426e-06, + "loss": 0.5914, + "step": 1428 + }, + { + "epoch": 0.231829980532122, + "grad_norm": 0.6121792518041376, + "learning_rate": 4.935858969460842e-06, + "loss": 0.6089, + "step": 1429 + }, + { + "epoch": 0.23199221284879948, + "grad_norm": 0.6167303917329421, + "learning_rate": 4.935762823555004e-06, + "loss": 0.5917, + "step": 1430 + }, + { + "epoch": 0.23215444516547695, + "grad_norm": 0.6110528113667596, + "learning_rate": 4.935666606580719e-06, + "loss": 0.6344, + "step": 1431 + }, + { + "epoch": 0.23231667748215445, + "grad_norm": 0.6234384123232284, + "learning_rate": 4.935570318540795e-06, + "loss": 0.6052, + "step": 1432 + }, + { + "epoch": 0.23247890979883193, + "grad_norm": 0.6222920378978298, + "learning_rate": 4.935473959438041e-06, + "loss": 0.6069, + "step": 1433 + }, + { + "epoch": 0.2326411421155094, + "grad_norm": 0.5881964800529915, + "learning_rate": 4.935377529275267e-06, + "loss": 0.6031, + "step": 1434 + }, + { + "epoch": 0.2328033744321869, + "grad_norm": 0.6250264236603128, + "learning_rate": 4.935281028055289e-06, + "loss": 0.6179, + "step": 1435 + }, + { + "epoch": 0.23296560674886438, + "grad_norm": 0.5941503061830213, + "learning_rate": 4.93518445578092e-06, + "loss": 0.5635, + "step": 1436 + }, + { + "epoch": 0.23312783906554185, + "grad_norm": 0.5988200184023867, + "learning_rate": 4.93508781245498e-06, + "loss": 0.5906, + "step": 1437 + }, + { + "epoch": 0.23329007138221933, + "grad_norm": 0.5829017724578018, + "learning_rate": 4.934991098080288e-06, + "loss": 0.6434, + "step": 1438 + }, + { + "epoch": 0.23345230369889683, + "grad_norm": 0.6136313057534243, + "learning_rate": 4.934894312659665e-06, + "loss": 0.6211, + "step": 1439 + }, + { + "epoch": 0.2336145360155743, + "grad_norm": 0.6018278214102102, + "learning_rate": 4.9347974561959355e-06, + "loss": 0.6149, + "step": 1440 + }, + { + "epoch": 0.23377676833225178, + "grad_norm": 0.5994596945565375, + "learning_rate": 4.934700528691926e-06, + "loss": 0.6217, + "step": 1441 + }, + { + "epoch": 0.23393900064892928, + "grad_norm": 0.649054959382085, + "learning_rate": 4.9346035301504644e-06, + "loss": 0.6229, + "step": 1442 + }, + { + "epoch": 0.23410123296560675, + "grad_norm": 0.5896564630350069, + "learning_rate": 4.93450646057438e-06, + "loss": 0.6171, + "step": 1443 + }, + { + "epoch": 0.23426346528228423, + "grad_norm": 0.6275019820877349, + "learning_rate": 4.934409319966505e-06, + "loss": 0.6086, + "step": 1444 + }, + { + "epoch": 0.2344256975989617, + "grad_norm": 0.5950544151083221, + "learning_rate": 4.934312108329676e-06, + "loss": 0.6011, + "step": 1445 + }, + { + "epoch": 0.2345879299156392, + "grad_norm": 0.5818416854418135, + "learning_rate": 4.934214825666727e-06, + "loss": 0.5876, + "step": 1446 + }, + { + "epoch": 0.23475016223231668, + "grad_norm": 0.6096467436929608, + "learning_rate": 4.934117471980496e-06, + "loss": 0.5866, + "step": 1447 + }, + { + "epoch": 0.23491239454899415, + "grad_norm": 0.5895643527175217, + "learning_rate": 4.9340200472738255e-06, + "loss": 0.616, + "step": 1448 + }, + { + "epoch": 0.23507462686567165, + "grad_norm": 0.5739966750126183, + "learning_rate": 4.933922551549556e-06, + "loss": 0.6101, + "step": 1449 + }, + { + "epoch": 0.23523685918234913, + "grad_norm": 0.6388939784284482, + "learning_rate": 4.933824984810534e-06, + "loss": 0.6044, + "step": 1450 + }, + { + "epoch": 0.2353990914990266, + "grad_norm": 0.6302654619481463, + "learning_rate": 4.933727347059606e-06, + "loss": 0.6038, + "step": 1451 + }, + { + "epoch": 0.23556132381570408, + "grad_norm": 0.597839377005935, + "learning_rate": 4.93362963829962e-06, + "loss": 0.6109, + "step": 1452 + }, + { + "epoch": 0.23572355613238158, + "grad_norm": 0.5989332219007099, + "learning_rate": 4.933531858533426e-06, + "loss": 0.6233, + "step": 1453 + }, + { + "epoch": 0.23588578844905905, + "grad_norm": 0.6401304807310685, + "learning_rate": 4.933434007763878e-06, + "loss": 0.6221, + "step": 1454 + }, + { + "epoch": 0.23604802076573653, + "grad_norm": 0.6055361071357991, + "learning_rate": 4.933336085993833e-06, + "loss": 0.6113, + "step": 1455 + }, + { + "epoch": 0.23621025308241403, + "grad_norm": 0.6142848588970133, + "learning_rate": 4.933238093226145e-06, + "loss": 0.6136, + "step": 1456 + }, + { + "epoch": 0.2363724853990915, + "grad_norm": 0.576521002188965, + "learning_rate": 4.933140029463673e-06, + "loss": 0.565, + "step": 1457 + }, + { + "epoch": 0.23653471771576898, + "grad_norm": 0.6025067768217808, + "learning_rate": 4.933041894709281e-06, + "loss": 0.6278, + "step": 1458 + }, + { + "epoch": 0.23669695003244645, + "grad_norm": 0.6110490061897867, + "learning_rate": 4.932943688965831e-06, + "loss": 0.5992, + "step": 1459 + }, + { + "epoch": 0.23685918234912395, + "grad_norm": 0.647466500830809, + "learning_rate": 4.932845412236187e-06, + "loss": 0.5903, + "step": 1460 + }, + { + "epoch": 0.23702141466580143, + "grad_norm": 0.604570220914429, + "learning_rate": 4.932747064523218e-06, + "loss": 0.5714, + "step": 1461 + }, + { + "epoch": 0.2371836469824789, + "grad_norm": 0.6145977212588136, + "learning_rate": 4.932648645829793e-06, + "loss": 0.5811, + "step": 1462 + }, + { + "epoch": 0.2373458792991564, + "grad_norm": 0.6103802397049721, + "learning_rate": 4.932550156158783e-06, + "loss": 0.5979, + "step": 1463 + }, + { + "epoch": 0.23750811161583388, + "grad_norm": 0.6054863225942508, + "learning_rate": 4.932451595513063e-06, + "loss": 0.6152, + "step": 1464 + }, + { + "epoch": 0.23767034393251135, + "grad_norm": 0.6098758740480589, + "learning_rate": 4.932352963895507e-06, + "loss": 0.6221, + "step": 1465 + }, + { + "epoch": 0.23783257624918883, + "grad_norm": 0.6138092559252605, + "learning_rate": 4.932254261308994e-06, + "loss": 0.5932, + "step": 1466 + }, + { + "epoch": 0.23799480856586633, + "grad_norm": 0.6061294044005036, + "learning_rate": 4.932155487756405e-06, + "loss": 0.6045, + "step": 1467 + }, + { + "epoch": 0.2381570408825438, + "grad_norm": 0.6225628791555227, + "learning_rate": 4.9320566432406185e-06, + "loss": 0.5944, + "step": 1468 + }, + { + "epoch": 0.23831927319922128, + "grad_norm": 0.6108096608069751, + "learning_rate": 4.931957727764522e-06, + "loss": 0.6409, + "step": 1469 + }, + { + "epoch": 0.23848150551589878, + "grad_norm": 0.6195778255702437, + "learning_rate": 4.931858741331e-06, + "loss": 0.609, + "step": 1470 + }, + { + "epoch": 0.23864373783257625, + "grad_norm": 0.6128710424410406, + "learning_rate": 4.9317596839429394e-06, + "loss": 0.6187, + "step": 1471 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 0.6181273945023696, + "learning_rate": 4.931660555603232e-06, + "loss": 0.5523, + "step": 1472 + }, + { + "epoch": 0.2389682024659312, + "grad_norm": 0.635217682056683, + "learning_rate": 4.9315613563147705e-06, + "loss": 0.6144, + "step": 1473 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 0.5904385856657383, + "learning_rate": 4.931462086080448e-06, + "loss": 0.5962, + "step": 1474 + }, + { + "epoch": 0.23929266709928618, + "grad_norm": 0.6217408376610453, + "learning_rate": 4.931362744903162e-06, + "loss": 0.5769, + "step": 1475 + }, + { + "epoch": 0.23945489941596365, + "grad_norm": 0.6436028727143522, + "learning_rate": 4.9312633327858095e-06, + "loss": 0.6091, + "step": 1476 + }, + { + "epoch": 0.23961713173264115, + "grad_norm": 0.6167693944841254, + "learning_rate": 4.931163849731293e-06, + "loss": 0.605, + "step": 1477 + }, + { + "epoch": 0.23977936404931863, + "grad_norm": 0.6162455822597647, + "learning_rate": 4.931064295742513e-06, + "loss": 0.6112, + "step": 1478 + }, + { + "epoch": 0.2399415963659961, + "grad_norm": 0.597532053597381, + "learning_rate": 4.930964670822376e-06, + "loss": 0.61, + "step": 1479 + }, + { + "epoch": 0.24010382868267358, + "grad_norm": 0.6098739062694608, + "learning_rate": 4.930864974973788e-06, + "loss": 0.5678, + "step": 1480 + }, + { + "epoch": 0.24026606099935108, + "grad_norm": 0.6219941636178266, + "learning_rate": 4.930765208199658e-06, + "loss": 0.5869, + "step": 1481 + }, + { + "epoch": 0.24042829331602855, + "grad_norm": 0.6102935461131636, + "learning_rate": 4.930665370502896e-06, + "loss": 0.6271, + "step": 1482 + }, + { + "epoch": 0.24059052563270603, + "grad_norm": 0.5887580107538054, + "learning_rate": 4.930565461886416e-06, + "loss": 0.5739, + "step": 1483 + }, + { + "epoch": 0.24075275794938353, + "grad_norm": 0.6108031738534028, + "learning_rate": 4.9304654823531335e-06, + "loss": 0.6112, + "step": 1484 + }, + { + "epoch": 0.240914990266061, + "grad_norm": 0.5965339773382069, + "learning_rate": 4.930365431905964e-06, + "loss": 0.6, + "step": 1485 + }, + { + "epoch": 0.24107722258273848, + "grad_norm": 0.5655021443306142, + "learning_rate": 4.930265310547829e-06, + "loss": 0.605, + "step": 1486 + }, + { + "epoch": 0.24123945489941595, + "grad_norm": 0.6227816818922634, + "learning_rate": 4.9301651182816475e-06, + "loss": 0.6246, + "step": 1487 + }, + { + "epoch": 0.24140168721609345, + "grad_norm": 0.6002132310063195, + "learning_rate": 4.9300648551103435e-06, + "loss": 0.5879, + "step": 1488 + }, + { + "epoch": 0.24156391953277093, + "grad_norm": 0.5616342873796213, + "learning_rate": 4.929964521036843e-06, + "loss": 0.5633, + "step": 1489 + }, + { + "epoch": 0.2417261518494484, + "grad_norm": 0.5752201896204491, + "learning_rate": 4.929864116064073e-06, + "loss": 0.5791, + "step": 1490 + }, + { + "epoch": 0.2418883841661259, + "grad_norm": 0.5641386336648613, + "learning_rate": 4.9297636401949625e-06, + "loss": 0.5136, + "step": 1491 + }, + { + "epoch": 0.24205061648280338, + "grad_norm": 0.6304410448050014, + "learning_rate": 4.9296630934324445e-06, + "loss": 0.5625, + "step": 1492 + }, + { + "epoch": 0.24221284879948085, + "grad_norm": 0.6030919596399773, + "learning_rate": 4.9295624757794504e-06, + "loss": 0.6163, + "step": 1493 + }, + { + "epoch": 0.24237508111615833, + "grad_norm": 0.6142549598631218, + "learning_rate": 4.929461787238919e-06, + "loss": 0.583, + "step": 1494 + }, + { + "epoch": 0.24253731343283583, + "grad_norm": 0.5973122662163125, + "learning_rate": 4.929361027813785e-06, + "loss": 0.6169, + "step": 1495 + }, + { + "epoch": 0.2426995457495133, + "grad_norm": 0.6115905386642659, + "learning_rate": 4.92926019750699e-06, + "loss": 0.5951, + "step": 1496 + }, + { + "epoch": 0.24286177806619078, + "grad_norm": 0.6123982938637617, + "learning_rate": 4.929159296321476e-06, + "loss": 0.6138, + "step": 1497 + }, + { + "epoch": 0.24302401038286828, + "grad_norm": 0.6231546114171158, + "learning_rate": 4.9290583242601865e-06, + "loss": 0.5826, + "step": 1498 + }, + { + "epoch": 0.24318624269954575, + "grad_norm": 0.5969602004294278, + "learning_rate": 4.928957281326067e-06, + "loss": 0.6074, + "step": 1499 + }, + { + "epoch": 0.24334847501622323, + "grad_norm": 0.6651791652179367, + "learning_rate": 4.928856167522067e-06, + "loss": 0.5762, + "step": 1500 + }, + { + "epoch": 0.24351070733290073, + "grad_norm": 0.5933272306099158, + "learning_rate": 4.928754982851136e-06, + "loss": 0.5852, + "step": 1501 + }, + { + "epoch": 0.2436729396495782, + "grad_norm": 0.6057355883712134, + "learning_rate": 4.928653727316226e-06, + "loss": 0.584, + "step": 1502 + }, + { + "epoch": 0.24383517196625568, + "grad_norm": 0.5861161173750555, + "learning_rate": 4.928552400920293e-06, + "loss": 0.5655, + "step": 1503 + }, + { + "epoch": 0.24399740428293315, + "grad_norm": 0.5859570076391003, + "learning_rate": 4.92845100366629e-06, + "loss": 0.595, + "step": 1504 + }, + { + "epoch": 0.24415963659961065, + "grad_norm": 0.6209801761399901, + "learning_rate": 4.928349535557179e-06, + "loss": 0.6173, + "step": 1505 + }, + { + "epoch": 0.24432186891628813, + "grad_norm": 0.5806943055342594, + "learning_rate": 4.928247996595918e-06, + "loss": 0.5802, + "step": 1506 + }, + { + "epoch": 0.2444841012329656, + "grad_norm": 0.5888697761164038, + "learning_rate": 4.92814638678547e-06, + "loss": 0.5997, + "step": 1507 + }, + { + "epoch": 0.2446463335496431, + "grad_norm": 0.5868257354571705, + "learning_rate": 4.928044706128803e-06, + "loss": 0.5836, + "step": 1508 + }, + { + "epoch": 0.24480856586632058, + "grad_norm": 0.6055051333758614, + "learning_rate": 4.927942954628879e-06, + "loss": 0.5846, + "step": 1509 + }, + { + "epoch": 0.24497079818299805, + "grad_norm": 0.5816698853340736, + "learning_rate": 4.92784113228867e-06, + "loss": 0.5487, + "step": 1510 + }, + { + "epoch": 0.24513303049967552, + "grad_norm": 0.5906864758630893, + "learning_rate": 4.927739239111144e-06, + "loss": 0.5781, + "step": 1511 + }, + { + "epoch": 0.24529526281635303, + "grad_norm": 0.6088389534019524, + "learning_rate": 4.927637275099278e-06, + "loss": 0.6102, + "step": 1512 + }, + { + "epoch": 0.2454574951330305, + "grad_norm": 0.6225099838006876, + "learning_rate": 4.927535240256044e-06, + "loss": 0.6106, + "step": 1513 + }, + { + "epoch": 0.24561972744970798, + "grad_norm": 0.593916841127765, + "learning_rate": 4.9274331345844194e-06, + "loss": 0.5649, + "step": 1514 + }, + { + "epoch": 0.24578195976638548, + "grad_norm": 0.5804128311380358, + "learning_rate": 4.927330958087384e-06, + "loss": 0.5918, + "step": 1515 + }, + { + "epoch": 0.24594419208306295, + "grad_norm": 0.6049509581755506, + "learning_rate": 4.9272287107679186e-06, + "loss": 0.6256, + "step": 1516 + }, + { + "epoch": 0.24610642439974043, + "grad_norm": 0.6043189055224468, + "learning_rate": 4.927126392629007e-06, + "loss": 0.5718, + "step": 1517 + }, + { + "epoch": 0.2462686567164179, + "grad_norm": 0.6770036447052281, + "learning_rate": 4.927024003673634e-06, + "loss": 0.5922, + "step": 1518 + }, + { + "epoch": 0.2464308890330954, + "grad_norm": 0.6192004116656142, + "learning_rate": 4.926921543904787e-06, + "loss": 0.5524, + "step": 1519 + }, + { + "epoch": 0.24659312134977288, + "grad_norm": 0.5863230068097643, + "learning_rate": 4.926819013325456e-06, + "loss": 0.6, + "step": 1520 + }, + { + "epoch": 0.24675535366645035, + "grad_norm": 0.5837423050679463, + "learning_rate": 4.926716411938632e-06, + "loss": 0.5891, + "step": 1521 + }, + { + "epoch": 0.24691758598312785, + "grad_norm": 0.5760956545664767, + "learning_rate": 4.926613739747309e-06, + "loss": 0.5797, + "step": 1522 + }, + { + "epoch": 0.24707981829980533, + "grad_norm": 0.6207500528048008, + "learning_rate": 4.9265109967544826e-06, + "loss": 0.5783, + "step": 1523 + }, + { + "epoch": 0.2472420506164828, + "grad_norm": 0.6390573759257613, + "learning_rate": 4.92640818296315e-06, + "loss": 0.6216, + "step": 1524 + }, + { + "epoch": 0.24740428293316027, + "grad_norm": 0.6043211737658218, + "learning_rate": 4.926305298376312e-06, + "loss": 0.5992, + "step": 1525 + }, + { + "epoch": 0.24756651524983778, + "grad_norm": 0.6013557766182328, + "learning_rate": 4.92620234299697e-06, + "loss": 0.5762, + "step": 1526 + }, + { + "epoch": 0.24772874756651525, + "grad_norm": 0.6786561245811094, + "learning_rate": 4.926099316828128e-06, + "loss": 0.585, + "step": 1527 + }, + { + "epoch": 0.24789097988319272, + "grad_norm": 0.6072288553333577, + "learning_rate": 4.925996219872792e-06, + "loss": 0.5931, + "step": 1528 + }, + { + "epoch": 0.24805321219987023, + "grad_norm": 0.6065233926941739, + "learning_rate": 4.925893052133969e-06, + "loss": 0.5662, + "step": 1529 + }, + { + "epoch": 0.2482154445165477, + "grad_norm": 0.5919483597052467, + "learning_rate": 4.92578981361467e-06, + "loss": 0.5606, + "step": 1530 + }, + { + "epoch": 0.24837767683322517, + "grad_norm": 0.6214505082414656, + "learning_rate": 4.925686504317908e-06, + "loss": 0.5824, + "step": 1531 + }, + { + "epoch": 0.24853990914990265, + "grad_norm": 0.6189883316300826, + "learning_rate": 4.925583124246697e-06, + "loss": 0.6385, + "step": 1532 + }, + { + "epoch": 0.24870214146658015, + "grad_norm": 0.6228731661539436, + "learning_rate": 4.9254796734040525e-06, + "loss": 0.5978, + "step": 1533 + }, + { + "epoch": 0.24886437378325763, + "grad_norm": 0.6123546203758915, + "learning_rate": 4.925376151792992e-06, + "loss": 0.5935, + "step": 1534 + }, + { + "epoch": 0.2490266060999351, + "grad_norm": 0.580082563433187, + "learning_rate": 4.925272559416539e-06, + "loss": 0.5629, + "step": 1535 + }, + { + "epoch": 0.2491888384166126, + "grad_norm": 0.6300077462237422, + "learning_rate": 4.925168896277713e-06, + "loss": 0.606, + "step": 1536 + }, + { + "epoch": 0.24935107073329008, + "grad_norm": 0.5996477454839667, + "learning_rate": 4.92506516237954e-06, + "loss": 0.5893, + "step": 1537 + }, + { + "epoch": 0.24951330304996755, + "grad_norm": 0.6106952429465968, + "learning_rate": 4.924961357725047e-06, + "loss": 0.5684, + "step": 1538 + }, + { + "epoch": 0.24967553536664502, + "grad_norm": 0.5864985281668241, + "learning_rate": 4.924857482317263e-06, + "loss": 0.6126, + "step": 1539 + }, + { + "epoch": 0.24983776768332253, + "grad_norm": 0.5991978816232497, + "learning_rate": 4.924753536159216e-06, + "loss": 0.5815, + "step": 1540 + }, + { + "epoch": 0.25, + "grad_norm": 0.6338911679113383, + "learning_rate": 4.924649519253942e-06, + "loss": 0.6219, + "step": 1541 + }, + { + "epoch": 0.2501622323166775, + "grad_norm": 0.6268301267450744, + "learning_rate": 4.924545431604475e-06, + "loss": 0.589, + "step": 1542 + }, + { + "epoch": 0.25032446463335495, + "grad_norm": 0.591556851126689, + "learning_rate": 4.924441273213851e-06, + "loss": 0.5851, + "step": 1543 + }, + { + "epoch": 0.25048669695003245, + "grad_norm": 0.6187752077271832, + "learning_rate": 4.9243370440851104e-06, + "loss": 0.6125, + "step": 1544 + }, + { + "epoch": 0.25064892926670995, + "grad_norm": 0.580750048140003, + "learning_rate": 4.924232744221293e-06, + "loss": 0.6162, + "step": 1545 + }, + { + "epoch": 0.2508111615833874, + "grad_norm": 0.608456888501447, + "learning_rate": 4.924128373625443e-06, + "loss": 0.6004, + "step": 1546 + }, + { + "epoch": 0.2509733939000649, + "grad_norm": 0.6042616416161527, + "learning_rate": 4.924023932300604e-06, + "loss": 0.6061, + "step": 1547 + }, + { + "epoch": 0.2511356262167424, + "grad_norm": 0.6225107369634678, + "learning_rate": 4.9239194202498266e-06, + "loss": 0.6148, + "step": 1548 + }, + { + "epoch": 0.25129785853341985, + "grad_norm": 0.5921606158186473, + "learning_rate": 4.923814837476157e-06, + "loss": 0.631, + "step": 1549 + }, + { + "epoch": 0.25146009085009735, + "grad_norm": 0.6025132908493885, + "learning_rate": 4.923710183982648e-06, + "loss": 0.5525, + "step": 1550 + }, + { + "epoch": 0.2516223231667748, + "grad_norm": 0.5893739502376129, + "learning_rate": 4.923605459772353e-06, + "loss": 0.585, + "step": 1551 + }, + { + "epoch": 0.2517845554834523, + "grad_norm": 0.6350581570557441, + "learning_rate": 4.923500664848327e-06, + "loss": 0.6082, + "step": 1552 + }, + { + "epoch": 0.2519467878001298, + "grad_norm": 0.6252205347484454, + "learning_rate": 4.9233957992136275e-06, + "loss": 0.5808, + "step": 1553 + }, + { + "epoch": 0.25210902011680725, + "grad_norm": 0.6215039905804723, + "learning_rate": 4.923290862871315e-06, + "loss": 0.5637, + "step": 1554 + }, + { + "epoch": 0.25227125243348475, + "grad_norm": 0.5994847685984693, + "learning_rate": 4.923185855824451e-06, + "loss": 0.5826, + "step": 1555 + }, + { + "epoch": 0.25243348475016225, + "grad_norm": 0.6094830503253174, + "learning_rate": 4.923080778076099e-06, + "loss": 0.5754, + "step": 1556 + }, + { + "epoch": 0.2525957170668397, + "grad_norm": 0.6057813360385681, + "learning_rate": 4.922975629629325e-06, + "loss": 0.5957, + "step": 1557 + }, + { + "epoch": 0.2527579493835172, + "grad_norm": 0.5846326478616451, + "learning_rate": 4.922870410487197e-06, + "loss": 0.5749, + "step": 1558 + }, + { + "epoch": 0.2529201817001947, + "grad_norm": 0.6007493929688295, + "learning_rate": 4.922765120652786e-06, + "loss": 0.6149, + "step": 1559 + }, + { + "epoch": 0.25308241401687215, + "grad_norm": 0.6228770914367132, + "learning_rate": 4.9226597601291616e-06, + "loss": 0.6307, + "step": 1560 + }, + { + "epoch": 0.25324464633354965, + "grad_norm": 0.5974086796052162, + "learning_rate": 4.9225543289194e-06, + "loss": 0.61, + "step": 1561 + }, + { + "epoch": 0.25340687865022715, + "grad_norm": 0.5992713101612954, + "learning_rate": 4.9224488270265755e-06, + "loss": 0.5656, + "step": 1562 + }, + { + "epoch": 0.2535691109669046, + "grad_norm": 0.5949845293978858, + "learning_rate": 4.922343254453769e-06, + "loss": 0.6056, + "step": 1563 + }, + { + "epoch": 0.2537313432835821, + "grad_norm": 0.5945950392148339, + "learning_rate": 4.922237611204058e-06, + "loss": 0.6177, + "step": 1564 + }, + { + "epoch": 0.25389357560025955, + "grad_norm": 0.5877056364673422, + "learning_rate": 4.922131897280528e-06, + "loss": 0.5732, + "step": 1565 + }, + { + "epoch": 0.25405580791693705, + "grad_norm": 0.5938831035169881, + "learning_rate": 4.92202611268626e-06, + "loss": 0.57, + "step": 1566 + }, + { + "epoch": 0.25421804023361455, + "grad_norm": 0.6149338571643433, + "learning_rate": 4.921920257424343e-06, + "loss": 0.5781, + "step": 1567 + }, + { + "epoch": 0.254380272550292, + "grad_norm": 0.6578280475801392, + "learning_rate": 4.9218143314978636e-06, + "loss": 0.5772, + "step": 1568 + }, + { + "epoch": 0.2545425048669695, + "grad_norm": 0.6076239600505856, + "learning_rate": 4.921708334909914e-06, + "loss": 0.5916, + "step": 1569 + }, + { + "epoch": 0.254704737183647, + "grad_norm": 0.5992204314443029, + "learning_rate": 4.921602267663586e-06, + "loss": 0.5955, + "step": 1570 + }, + { + "epoch": 0.25486696950032445, + "grad_norm": 0.6259014580932237, + "learning_rate": 4.921496129761975e-06, + "loss": 0.6359, + "step": 1571 + }, + { + "epoch": 0.25502920181700195, + "grad_norm": 0.6275142086028078, + "learning_rate": 4.921389921208177e-06, + "loss": 0.6039, + "step": 1572 + }, + { + "epoch": 0.25519143413367945, + "grad_norm": 0.6797292228033308, + "learning_rate": 4.921283642005293e-06, + "loss": 0.5776, + "step": 1573 + }, + { + "epoch": 0.2553536664503569, + "grad_norm": 0.5771927429523598, + "learning_rate": 4.9211772921564205e-06, + "loss": 0.6145, + "step": 1574 + }, + { + "epoch": 0.2555158987670344, + "grad_norm": 0.6024452504967319, + "learning_rate": 4.921070871664664e-06, + "loss": 0.633, + "step": 1575 + }, + { + "epoch": 0.2556781310837119, + "grad_norm": 0.6241461130098599, + "learning_rate": 4.92096438053313e-06, + "loss": 0.5961, + "step": 1576 + }, + { + "epoch": 0.25584036340038935, + "grad_norm": 0.5981490466874902, + "learning_rate": 4.920857818764924e-06, + "loss": 0.5894, + "step": 1577 + }, + { + "epoch": 0.25600259571706685, + "grad_norm": 0.5998091042090675, + "learning_rate": 4.9207511863631545e-06, + "loss": 0.6145, + "step": 1578 + }, + { + "epoch": 0.2561648280337443, + "grad_norm": 0.6117075858087033, + "learning_rate": 4.9206444833309346e-06, + "loss": 0.6263, + "step": 1579 + }, + { + "epoch": 0.2563270603504218, + "grad_norm": 0.6768000902872507, + "learning_rate": 4.920537709671376e-06, + "loss": 0.5944, + "step": 1580 + }, + { + "epoch": 0.2564892926670993, + "grad_norm": 0.6446213172240877, + "learning_rate": 4.9204308653875955e-06, + "loss": 0.6188, + "step": 1581 + }, + { + "epoch": 0.25665152498377675, + "grad_norm": 0.5460857383997616, + "learning_rate": 4.92032395048271e-06, + "loss": 0.5816, + "step": 1582 + }, + { + "epoch": 0.25681375730045425, + "grad_norm": 0.65498993338852, + "learning_rate": 4.9202169649598375e-06, + "loss": 0.6337, + "step": 1583 + }, + { + "epoch": 0.25697598961713175, + "grad_norm": 0.6024790558967993, + "learning_rate": 4.920109908822101e-06, + "loss": 0.5973, + "step": 1584 + }, + { + "epoch": 0.2571382219338092, + "grad_norm": 0.6080455192046009, + "learning_rate": 4.920002782072624e-06, + "loss": 0.6076, + "step": 1585 + }, + { + "epoch": 0.2573004542504867, + "grad_norm": 0.5853143146722577, + "learning_rate": 4.919895584714533e-06, + "loss": 0.5733, + "step": 1586 + }, + { + "epoch": 0.2574626865671642, + "grad_norm": 0.554811522778502, + "learning_rate": 4.9197883167509534e-06, + "loss": 0.532, + "step": 1587 + }, + { + "epoch": 0.25762491888384165, + "grad_norm": 0.6103413610620341, + "learning_rate": 4.9196809781850165e-06, + "loss": 0.5882, + "step": 1588 + }, + { + "epoch": 0.25778715120051915, + "grad_norm": 0.617225401424743, + "learning_rate": 4.919573569019854e-06, + "loss": 0.5868, + "step": 1589 + }, + { + "epoch": 0.25794938351719665, + "grad_norm": 0.566030016860098, + "learning_rate": 4.919466089258599e-06, + "loss": 0.5836, + "step": 1590 + }, + { + "epoch": 0.2581116158338741, + "grad_norm": 0.5551097345467807, + "learning_rate": 4.919358538904389e-06, + "loss": 0.5622, + "step": 1591 + }, + { + "epoch": 0.2582738481505516, + "grad_norm": 0.6286775215456737, + "learning_rate": 4.919250917960361e-06, + "loss": 0.6195, + "step": 1592 + }, + { + "epoch": 0.25843608046722905, + "grad_norm": 0.599276520502202, + "learning_rate": 4.919143226429655e-06, + "loss": 0.5811, + "step": 1593 + }, + { + "epoch": 0.25859831278390655, + "grad_norm": 0.5740112654793429, + "learning_rate": 4.919035464315413e-06, + "loss": 0.5818, + "step": 1594 + }, + { + "epoch": 0.25876054510058405, + "grad_norm": 0.6012925468508018, + "learning_rate": 4.91892763162078e-06, + "loss": 0.5981, + "step": 1595 + }, + { + "epoch": 0.2589227774172615, + "grad_norm": 0.6136888957628708, + "learning_rate": 4.918819728348901e-06, + "loss": 0.6245, + "step": 1596 + }, + { + "epoch": 0.259085009733939, + "grad_norm": 0.5855723339250309, + "learning_rate": 4.918711754502926e-06, + "loss": 0.5184, + "step": 1597 + }, + { + "epoch": 0.2592472420506165, + "grad_norm": 0.6115201066300965, + "learning_rate": 4.918603710086003e-06, + "loss": 0.627, + "step": 1598 + }, + { + "epoch": 0.25940947436729395, + "grad_norm": 0.6119030774283349, + "learning_rate": 4.918495595101287e-06, + "loss": 0.6045, + "step": 1599 + }, + { + "epoch": 0.25957170668397145, + "grad_norm": 0.5516257164995446, + "learning_rate": 4.918387409551931e-06, + "loss": 0.6193, + "step": 1600 + }, + { + "epoch": 0.25973393900064895, + "grad_norm": 0.6267265963398576, + "learning_rate": 4.918279153441091e-06, + "loss": 0.5644, + "step": 1601 + }, + { + "epoch": 0.2598961713173264, + "grad_norm": 0.6043234923102487, + "learning_rate": 4.918170826771928e-06, + "loss": 0.5667, + "step": 1602 + }, + { + "epoch": 0.2600584036340039, + "grad_norm": 0.6088845360269436, + "learning_rate": 4.9180624295475996e-06, + "loss": 0.5935, + "step": 1603 + }, + { + "epoch": 0.2602206359506814, + "grad_norm": 0.6184333020547882, + "learning_rate": 4.9179539617712704e-06, + "loss": 0.6147, + "step": 1604 + }, + { + "epoch": 0.26038286826735885, + "grad_norm": 0.6138965071985524, + "learning_rate": 4.917845423446105e-06, + "loss": 0.6168, + "step": 1605 + }, + { + "epoch": 0.26054510058403635, + "grad_norm": 0.6518463868885784, + "learning_rate": 4.91773681457527e-06, + "loss": 0.5958, + "step": 1606 + }, + { + "epoch": 0.2607073329007138, + "grad_norm": 0.5642950864787748, + "learning_rate": 4.917628135161934e-06, + "loss": 0.576, + "step": 1607 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.5863656083766594, + "learning_rate": 4.917519385209268e-06, + "loss": 0.5929, + "step": 1608 + }, + { + "epoch": 0.2610317975340688, + "grad_norm": 0.5755196632198658, + "learning_rate": 4.917410564720446e-06, + "loss": 0.5841, + "step": 1609 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6088964981099797, + "learning_rate": 4.917301673698642e-06, + "loss": 0.548, + "step": 1610 + }, + { + "epoch": 0.26135626216742375, + "grad_norm": 0.6226422562975986, + "learning_rate": 4.917192712147033e-06, + "loss": 0.5813, + "step": 1611 + }, + { + "epoch": 0.26151849448410125, + "grad_norm": 0.5731261065645408, + "learning_rate": 4.917083680068799e-06, + "loss": 0.5626, + "step": 1612 + }, + { + "epoch": 0.2616807268007787, + "grad_norm": 0.5991058395999327, + "learning_rate": 4.916974577467121e-06, + "loss": 0.6053, + "step": 1613 + }, + { + "epoch": 0.2618429591174562, + "grad_norm": 0.5745556757554533, + "learning_rate": 4.916865404345183e-06, + "loss": 0.6133, + "step": 1614 + }, + { + "epoch": 0.2620051914341337, + "grad_norm": 0.5974217893145104, + "learning_rate": 4.9167561607061674e-06, + "loss": 0.6076, + "step": 1615 + }, + { + "epoch": 0.26216742375081115, + "grad_norm": 0.6176024299774932, + "learning_rate": 4.9166468465532645e-06, + "loss": 0.616, + "step": 1616 + }, + { + "epoch": 0.26232965606748865, + "grad_norm": 0.5903036739638505, + "learning_rate": 4.916537461889663e-06, + "loss": 0.638, + "step": 1617 + }, + { + "epoch": 0.26249188838416615, + "grad_norm": 0.5924654562798496, + "learning_rate": 4.916428006718555e-06, + "loss": 0.5987, + "step": 1618 + }, + { + "epoch": 0.2626541207008436, + "grad_norm": 0.5676949225074378, + "learning_rate": 4.916318481043133e-06, + "loss": 0.6063, + "step": 1619 + }, + { + "epoch": 0.2628163530175211, + "grad_norm": 0.5913837591996434, + "learning_rate": 4.916208884866593e-06, + "loss": 0.6368, + "step": 1620 + }, + { + "epoch": 0.26297858533419854, + "grad_norm": 0.6135182986359653, + "learning_rate": 4.916099218192133e-06, + "loss": 0.6049, + "step": 1621 + }, + { + "epoch": 0.26314081765087605, + "grad_norm": 0.5931708691406037, + "learning_rate": 4.915989481022952e-06, + "loss": 0.6222, + "step": 1622 + }, + { + "epoch": 0.26330304996755355, + "grad_norm": 0.5738766096120105, + "learning_rate": 4.915879673362252e-06, + "loss": 0.5893, + "step": 1623 + }, + { + "epoch": 0.263465282284231, + "grad_norm": 0.602539633044031, + "learning_rate": 4.915769795213239e-06, + "loss": 0.6194, + "step": 1624 + }, + { + "epoch": 0.2636275146009085, + "grad_norm": 0.5751969514361963, + "learning_rate": 4.915659846579116e-06, + "loss": 0.5728, + "step": 1625 + }, + { + "epoch": 0.263789746917586, + "grad_norm": 0.6416329243414098, + "learning_rate": 4.915549827463093e-06, + "loss": 0.5987, + "step": 1626 + }, + { + "epoch": 0.26395197923426345, + "grad_norm": 0.5735376610732351, + "learning_rate": 4.915439737868378e-06, + "loss": 0.574, + "step": 1627 + }, + { + "epoch": 0.26411421155094095, + "grad_norm": 0.5877348250381716, + "learning_rate": 4.915329577798185e-06, + "loss": 0.5873, + "step": 1628 + }, + { + "epoch": 0.26427644386761845, + "grad_norm": 0.5974149673211884, + "learning_rate": 4.915219347255727e-06, + "loss": 0.5948, + "step": 1629 + }, + { + "epoch": 0.2644386761842959, + "grad_norm": 0.6093256467118052, + "learning_rate": 4.9151090462442205e-06, + "loss": 0.5676, + "step": 1630 + }, + { + "epoch": 0.2646009085009734, + "grad_norm": 0.603694968603696, + "learning_rate": 4.914998674766885e-06, + "loss": 0.5878, + "step": 1631 + }, + { + "epoch": 0.2647631408176509, + "grad_norm": 0.5890242658042892, + "learning_rate": 4.914888232826939e-06, + "loss": 0.6308, + "step": 1632 + }, + { + "epoch": 0.26492537313432835, + "grad_norm": 0.608491574448101, + "learning_rate": 4.914777720427605e-06, + "loss": 0.604, + "step": 1633 + }, + { + "epoch": 0.26508760545100585, + "grad_norm": 0.5902077577397945, + "learning_rate": 4.914667137572108e-06, + "loss": 0.5973, + "step": 1634 + }, + { + "epoch": 0.26524983776768335, + "grad_norm": 0.6223199005666674, + "learning_rate": 4.914556484263675e-06, + "loss": 0.6143, + "step": 1635 + }, + { + "epoch": 0.2654120700843608, + "grad_norm": 0.6102917238736127, + "learning_rate": 4.914445760505534e-06, + "loss": 0.6269, + "step": 1636 + }, + { + "epoch": 0.2655743024010383, + "grad_norm": 0.5951582544836101, + "learning_rate": 4.914334966300916e-06, + "loss": 0.5788, + "step": 1637 + }, + { + "epoch": 0.26573653471771574, + "grad_norm": 0.5593496704526513, + "learning_rate": 4.914224101653053e-06, + "loss": 0.5975, + "step": 1638 + }, + { + "epoch": 0.26589876703439325, + "grad_norm": 0.6344897377435784, + "learning_rate": 4.91411316656518e-06, + "loss": 0.5884, + "step": 1639 + }, + { + "epoch": 0.26606099935107075, + "grad_norm": 0.6394032059769162, + "learning_rate": 4.9140021610405335e-06, + "loss": 0.6313, + "step": 1640 + }, + { + "epoch": 0.2662232316677482, + "grad_norm": 0.6061655344260116, + "learning_rate": 4.913891085082352e-06, + "loss": 0.6201, + "step": 1641 + }, + { + "epoch": 0.2663854639844257, + "grad_norm": 0.6360036437198581, + "learning_rate": 4.913779938693878e-06, + "loss": 0.575, + "step": 1642 + }, + { + "epoch": 0.2665476963011032, + "grad_norm": 0.6018370981253137, + "learning_rate": 4.913668721878353e-06, + "loss": 0.5799, + "step": 1643 + }, + { + "epoch": 0.26670992861778064, + "grad_norm": 0.5981429426260368, + "learning_rate": 4.913557434639022e-06, + "loss": 0.5841, + "step": 1644 + }, + { + "epoch": 0.26687216093445815, + "grad_norm": 0.5825705451969819, + "learning_rate": 4.913446076979133e-06, + "loss": 0.6208, + "step": 1645 + }, + { + "epoch": 0.26703439325113565, + "grad_norm": 0.5926574970359081, + "learning_rate": 4.913334648901934e-06, + "loss": 0.6054, + "step": 1646 + }, + { + "epoch": 0.2671966255678131, + "grad_norm": 0.6138594376694035, + "learning_rate": 4.9132231504106765e-06, + "loss": 0.6276, + "step": 1647 + }, + { + "epoch": 0.2673588578844906, + "grad_norm": 0.5875816470123205, + "learning_rate": 4.913111581508614e-06, + "loss": 0.618, + "step": 1648 + }, + { + "epoch": 0.2675210902011681, + "grad_norm": 0.573896782707164, + "learning_rate": 4.912999942199002e-06, + "loss": 0.6242, + "step": 1649 + }, + { + "epoch": 0.26768332251784555, + "grad_norm": 0.6232387363073237, + "learning_rate": 4.912888232485097e-06, + "loss": 0.5902, + "step": 1650 + }, + { + "epoch": 0.26784555483452305, + "grad_norm": 0.6070794452950462, + "learning_rate": 4.912776452370159e-06, + "loss": 0.5961, + "step": 1651 + }, + { + "epoch": 0.2680077871512005, + "grad_norm": 0.5940565167894516, + "learning_rate": 4.912664601857448e-06, + "loss": 0.6074, + "step": 1652 + }, + { + "epoch": 0.268170019467878, + "grad_norm": 0.6068059504984933, + "learning_rate": 4.91255268095023e-06, + "loss": 0.5909, + "step": 1653 + }, + { + "epoch": 0.2683322517845555, + "grad_norm": 0.627962793041811, + "learning_rate": 4.912440689651769e-06, + "loss": 0.6138, + "step": 1654 + }, + { + "epoch": 0.26849448410123294, + "grad_norm": 0.631540353106904, + "learning_rate": 4.912328627965332e-06, + "loss": 0.5897, + "step": 1655 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 0.6056806891871701, + "learning_rate": 4.91221649589419e-06, + "loss": 0.6018, + "step": 1656 + }, + { + "epoch": 0.26881894873458795, + "grad_norm": 0.6076545027851394, + "learning_rate": 4.912104293441614e-06, + "loss": 0.629, + "step": 1657 + }, + { + "epoch": 0.2689811810512654, + "grad_norm": 0.6217197969894299, + "learning_rate": 4.911992020610878e-06, + "loss": 0.5815, + "step": 1658 + }, + { + "epoch": 0.2691434133679429, + "grad_norm": 0.6171495982023256, + "learning_rate": 4.911879677405257e-06, + "loss": 0.5675, + "step": 1659 + }, + { + "epoch": 0.2693056456846204, + "grad_norm": 0.6058668693707528, + "learning_rate": 4.91176726382803e-06, + "loss": 0.5809, + "step": 1660 + }, + { + "epoch": 0.26946787800129784, + "grad_norm": 0.6178145536385271, + "learning_rate": 4.911654779882476e-06, + "loss": 0.6006, + "step": 1661 + }, + { + "epoch": 0.26963011031797535, + "grad_norm": 0.5971397911859851, + "learning_rate": 4.911542225571877e-06, + "loss": 0.5571, + "step": 1662 + }, + { + "epoch": 0.26979234263465285, + "grad_norm": 0.5907914584493887, + "learning_rate": 4.9114296008995176e-06, + "loss": 0.6149, + "step": 1663 + }, + { + "epoch": 0.2699545749513303, + "grad_norm": 0.5996150395547649, + "learning_rate": 4.911316905868684e-06, + "loss": 0.5862, + "step": 1664 + }, + { + "epoch": 0.2701168072680078, + "grad_norm": 0.5783195462255238, + "learning_rate": 4.911204140482663e-06, + "loss": 0.6074, + "step": 1665 + }, + { + "epoch": 0.27027903958468524, + "grad_norm": 0.5811327355551333, + "learning_rate": 4.911091304744746e-06, + "loss": 0.5728, + "step": 1666 + }, + { + "epoch": 0.27044127190136275, + "grad_norm": 0.6165255693815211, + "learning_rate": 4.910978398658225e-06, + "loss": 0.5989, + "step": 1667 + }, + { + "epoch": 0.27060350421804025, + "grad_norm": 0.580964122727551, + "learning_rate": 4.910865422226394e-06, + "loss": 0.577, + "step": 1668 + }, + { + "epoch": 0.2707657365347177, + "grad_norm": 0.6872498989729483, + "learning_rate": 4.9107523754525496e-06, + "loss": 0.6166, + "step": 1669 + }, + { + "epoch": 0.2709279688513952, + "grad_norm": 0.581703379148013, + "learning_rate": 4.91063925833999e-06, + "loss": 0.6001, + "step": 1670 + }, + { + "epoch": 0.2710902011680727, + "grad_norm": 0.6521004003964477, + "learning_rate": 4.910526070892015e-06, + "loss": 0.5885, + "step": 1671 + }, + { + "epoch": 0.27125243348475014, + "grad_norm": 0.5676211915631543, + "learning_rate": 4.910412813111929e-06, + "loss": 0.5505, + "step": 1672 + }, + { + "epoch": 0.27141466580142765, + "grad_norm": 0.6251664019684803, + "learning_rate": 4.910299485003034e-06, + "loss": 0.5651, + "step": 1673 + }, + { + "epoch": 0.27157689811810515, + "grad_norm": 0.5691407757153785, + "learning_rate": 4.910186086568639e-06, + "loss": 0.5849, + "step": 1674 + }, + { + "epoch": 0.2717391304347826, + "grad_norm": 0.5980993151173576, + "learning_rate": 4.91007261781205e-06, + "loss": 0.5874, + "step": 1675 + }, + { + "epoch": 0.2719013627514601, + "grad_norm": 0.6188734766446427, + "learning_rate": 4.90995907873658e-06, + "loss": 0.577, + "step": 1676 + }, + { + "epoch": 0.2720635950681376, + "grad_norm": 0.580767992394302, + "learning_rate": 4.909845469345542e-06, + "loss": 0.6332, + "step": 1677 + }, + { + "epoch": 0.27222582738481504, + "grad_norm": 0.5982954612510294, + "learning_rate": 4.909731789642248e-06, + "loss": 0.6074, + "step": 1678 + }, + { + "epoch": 0.27238805970149255, + "grad_norm": 0.5809005691925144, + "learning_rate": 4.9096180396300175e-06, + "loss": 0.6346, + "step": 1679 + }, + { + "epoch": 0.27255029201817, + "grad_norm": 0.6204375901491888, + "learning_rate": 4.9095042193121676e-06, + "loss": 0.6073, + "step": 1680 + }, + { + "epoch": 0.2727125243348475, + "grad_norm": 0.5999181625061424, + "learning_rate": 4.9093903286920204e-06, + "loss": 0.5781, + "step": 1681 + }, + { + "epoch": 0.272874756651525, + "grad_norm": 0.624031003608979, + "learning_rate": 4.909276367772899e-06, + "loss": 0.6024, + "step": 1682 + }, + { + "epoch": 0.27303698896820244, + "grad_norm": 0.5943482949697014, + "learning_rate": 4.909162336558128e-06, + "loss": 0.5826, + "step": 1683 + }, + { + "epoch": 0.27319922128487995, + "grad_norm": 0.5946198033181797, + "learning_rate": 4.909048235051033e-06, + "loss": 0.5719, + "step": 1684 + }, + { + "epoch": 0.27336145360155745, + "grad_norm": 0.6207089062141998, + "learning_rate": 4.908934063254947e-06, + "loss": 0.5993, + "step": 1685 + }, + { + "epoch": 0.2735236859182349, + "grad_norm": 0.647543429585759, + "learning_rate": 4.908819821173197e-06, + "loss": 0.5772, + "step": 1686 + }, + { + "epoch": 0.2736859182349124, + "grad_norm": 0.5962280022579356, + "learning_rate": 4.908705508809119e-06, + "loss": 0.5811, + "step": 1687 + }, + { + "epoch": 0.2738481505515899, + "grad_norm": 0.6117600245967317, + "learning_rate": 4.9085911261660465e-06, + "loss": 0.617, + "step": 1688 + }, + { + "epoch": 0.27401038286826734, + "grad_norm": 0.6764849331132052, + "learning_rate": 4.9084766732473185e-06, + "loss": 0.5656, + "step": 1689 + }, + { + "epoch": 0.27417261518494485, + "grad_norm": 0.6143251898404244, + "learning_rate": 4.908362150056274e-06, + "loss": 0.6162, + "step": 1690 + }, + { + "epoch": 0.27433484750162235, + "grad_norm": 0.5686433966222779, + "learning_rate": 4.908247556596254e-06, + "loss": 0.5812, + "step": 1691 + }, + { + "epoch": 0.2744970798182998, + "grad_norm": 0.5956470147893165, + "learning_rate": 4.908132892870602e-06, + "loss": 0.607, + "step": 1692 + }, + { + "epoch": 0.2746593121349773, + "grad_norm": 0.6074302500967689, + "learning_rate": 4.908018158882664e-06, + "loss": 0.6189, + "step": 1693 + }, + { + "epoch": 0.27482154445165474, + "grad_norm": 0.5898206343308461, + "learning_rate": 4.907903354635786e-06, + "loss": 0.6029, + "step": 1694 + }, + { + "epoch": 0.27498377676833224, + "grad_norm": 0.624452934046447, + "learning_rate": 4.90778848013332e-06, + "loss": 0.6115, + "step": 1695 + }, + { + "epoch": 0.27514600908500975, + "grad_norm": 0.6161343207256594, + "learning_rate": 4.907673535378617e-06, + "loss": 0.5948, + "step": 1696 + }, + { + "epoch": 0.2753082414016872, + "grad_norm": 0.5897788710048356, + "learning_rate": 4.907558520375029e-06, + "loss": 0.6071, + "step": 1697 + }, + { + "epoch": 0.2754704737183647, + "grad_norm": 0.5967518063084198, + "learning_rate": 4.907443435125914e-06, + "loss": 0.5991, + "step": 1698 + }, + { + "epoch": 0.2756327060350422, + "grad_norm": 0.60768223243065, + "learning_rate": 4.907328279634629e-06, + "loss": 0.6342, + "step": 1699 + }, + { + "epoch": 0.27579493835171964, + "grad_norm": 0.5970052517737029, + "learning_rate": 4.907213053904534e-06, + "loss": 0.5875, + "step": 1700 + }, + { + "epoch": 0.27595717066839714, + "grad_norm": 0.5898240474543287, + "learning_rate": 4.90709775793899e-06, + "loss": 0.584, + "step": 1701 + }, + { + "epoch": 0.27611940298507465, + "grad_norm": 0.6232952432655111, + "learning_rate": 4.906982391741363e-06, + "loss": 0.6048, + "step": 1702 + }, + { + "epoch": 0.2762816353017521, + "grad_norm": 0.5801728491582301, + "learning_rate": 4.9068669553150176e-06, + "loss": 0.5929, + "step": 1703 + }, + { + "epoch": 0.2764438676184296, + "grad_norm": 0.6474264493586634, + "learning_rate": 4.906751448663322e-06, + "loss": 0.5761, + "step": 1704 + }, + { + "epoch": 0.2766060999351071, + "grad_norm": 0.6103423756028233, + "learning_rate": 4.906635871789646e-06, + "loss": 0.5571, + "step": 1705 + }, + { + "epoch": 0.27676833225178454, + "grad_norm": 0.6027847908630622, + "learning_rate": 4.906520224697364e-06, + "loss": 0.6091, + "step": 1706 + }, + { + "epoch": 0.27693056456846205, + "grad_norm": 0.6116480813972469, + "learning_rate": 4.906404507389848e-06, + "loss": 0.588, + "step": 1707 + }, + { + "epoch": 0.27709279688513955, + "grad_norm": 0.5699370766032102, + "learning_rate": 4.906288719870475e-06, + "loss": 0.6113, + "step": 1708 + }, + { + "epoch": 0.277255029201817, + "grad_norm": 0.6440990676742587, + "learning_rate": 4.906172862142622e-06, + "loss": 0.5815, + "step": 1709 + }, + { + "epoch": 0.2774172615184945, + "grad_norm": 0.6019068645771883, + "learning_rate": 4.906056934209672e-06, + "loss": 0.6199, + "step": 1710 + }, + { + "epoch": 0.27757949383517194, + "grad_norm": 0.6016127807441598, + "learning_rate": 4.905940936075006e-06, + "loss": 0.577, + "step": 1711 + }, + { + "epoch": 0.27774172615184944, + "grad_norm": 0.5690735755085262, + "learning_rate": 4.905824867742007e-06, + "loss": 0.6051, + "step": 1712 + }, + { + "epoch": 0.27790395846852695, + "grad_norm": 0.6312404757763108, + "learning_rate": 4.905708729214065e-06, + "loss": 0.5581, + "step": 1713 + }, + { + "epoch": 0.2780661907852044, + "grad_norm": 0.6065184539727829, + "learning_rate": 4.905592520494566e-06, + "loss": 0.6216, + "step": 1714 + }, + { + "epoch": 0.2782284231018819, + "grad_norm": 0.6364076363636294, + "learning_rate": 4.905476241586901e-06, + "loss": 0.604, + "step": 1715 + }, + { + "epoch": 0.2783906554185594, + "grad_norm": 0.5673587966597236, + "learning_rate": 4.905359892494463e-06, + "loss": 0.6321, + "step": 1716 + }, + { + "epoch": 0.27855288773523684, + "grad_norm": 0.6977697129917872, + "learning_rate": 4.9052434732206475e-06, + "loss": 0.6083, + "step": 1717 + }, + { + "epoch": 0.27871512005191434, + "grad_norm": 0.6036851828042582, + "learning_rate": 4.90512698376885e-06, + "loss": 0.6154, + "step": 1718 + }, + { + "epoch": 0.27887735236859185, + "grad_norm": 0.6359636492665637, + "learning_rate": 4.90501042414247e-06, + "loss": 0.572, + "step": 1719 + }, + { + "epoch": 0.2790395846852693, + "grad_norm": 0.5881434977680655, + "learning_rate": 4.904893794344908e-06, + "loss": 0.587, + "step": 1720 + }, + { + "epoch": 0.2792018170019468, + "grad_norm": 0.6081288989047432, + "learning_rate": 4.904777094379567e-06, + "loss": 0.5888, + "step": 1721 + }, + { + "epoch": 0.2793640493186243, + "grad_norm": 0.5655340788117311, + "learning_rate": 4.904660324249852e-06, + "loss": 0.597, + "step": 1722 + }, + { + "epoch": 0.27952628163530174, + "grad_norm": 0.6370389591451591, + "learning_rate": 4.904543483959171e-06, + "loss": 0.5781, + "step": 1723 + }, + { + "epoch": 0.27968851395197925, + "grad_norm": 0.6283636665822752, + "learning_rate": 4.904426573510931e-06, + "loss": 0.6123, + "step": 1724 + }, + { + "epoch": 0.2798507462686567, + "grad_norm": 0.6448571560475073, + "learning_rate": 4.904309592908545e-06, + "loss": 0.6002, + "step": 1725 + }, + { + "epoch": 0.2800129785853342, + "grad_norm": 0.6027241851021123, + "learning_rate": 4.9041925421554244e-06, + "loss": 0.5898, + "step": 1726 + }, + { + "epoch": 0.2801752109020117, + "grad_norm": 0.595523456223511, + "learning_rate": 4.9040754212549864e-06, + "loss": 0.5948, + "step": 1727 + }, + { + "epoch": 0.28033744321868914, + "grad_norm": 0.5995974932441307, + "learning_rate": 4.903958230210647e-06, + "loss": 0.5894, + "step": 1728 + }, + { + "epoch": 0.28049967553536664, + "grad_norm": 0.6006263239196116, + "learning_rate": 4.903840969025826e-06, + "loss": 0.5839, + "step": 1729 + }, + { + "epoch": 0.28066190785204415, + "grad_norm": 0.6038735174395499, + "learning_rate": 4.903723637703944e-06, + "loss": 0.648, + "step": 1730 + }, + { + "epoch": 0.2808241401687216, + "grad_norm": 0.5910741783113453, + "learning_rate": 4.903606236248425e-06, + "loss": 0.5917, + "step": 1731 + }, + { + "epoch": 0.2809863724853991, + "grad_norm": 0.6110618436614418, + "learning_rate": 4.903488764662694e-06, + "loss": 0.587, + "step": 1732 + }, + { + "epoch": 0.2811486048020766, + "grad_norm": 0.575119356197763, + "learning_rate": 4.90337122295018e-06, + "loss": 0.6105, + "step": 1733 + }, + { + "epoch": 0.28131083711875404, + "grad_norm": 0.5779913056994811, + "learning_rate": 4.903253611114309e-06, + "loss": 0.6129, + "step": 1734 + }, + { + "epoch": 0.28147306943543154, + "grad_norm": 0.5970183513246827, + "learning_rate": 4.903135929158517e-06, + "loss": 0.566, + "step": 1735 + }, + { + "epoch": 0.28163530175210905, + "grad_norm": 0.6422048923138157, + "learning_rate": 4.903018177086234e-06, + "loss": 0.5798, + "step": 1736 + }, + { + "epoch": 0.2817975340687865, + "grad_norm": 0.6176150014711261, + "learning_rate": 4.902900354900898e-06, + "loss": 0.5754, + "step": 1737 + }, + { + "epoch": 0.281959766385464, + "grad_norm": 0.6119633410458782, + "learning_rate": 4.902782462605946e-06, + "loss": 0.586, + "step": 1738 + }, + { + "epoch": 0.28212199870214144, + "grad_norm": 0.6520379073037995, + "learning_rate": 4.902664500204817e-06, + "loss": 0.6122, + "step": 1739 + }, + { + "epoch": 0.28228423101881894, + "grad_norm": 0.624933643226949, + "learning_rate": 4.902546467700954e-06, + "loss": 0.6088, + "step": 1740 + }, + { + "epoch": 0.28244646333549644, + "grad_norm": 0.5942182641271482, + "learning_rate": 4.9024283650978e-06, + "loss": 0.6312, + "step": 1741 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 0.6232593679420076, + "learning_rate": 4.9023101923988e-06, + "loss": 0.6094, + "step": 1742 + }, + { + "epoch": 0.2827709279688514, + "grad_norm": 0.6247469024320056, + "learning_rate": 4.902191949607405e-06, + "loss": 0.5669, + "step": 1743 + }, + { + "epoch": 0.2829331602855289, + "grad_norm": 0.6017368727218989, + "learning_rate": 4.9020736367270614e-06, + "loss": 0.5925, + "step": 1744 + }, + { + "epoch": 0.28309539260220634, + "grad_norm": 0.6015756019162717, + "learning_rate": 4.901955253761224e-06, + "loss": 0.5092, + "step": 1745 + }, + { + "epoch": 0.28325762491888384, + "grad_norm": 0.5996402614063474, + "learning_rate": 4.901836800713346e-06, + "loss": 0.5921, + "step": 1746 + }, + { + "epoch": 0.28341985723556135, + "grad_norm": 0.6084910579648697, + "learning_rate": 4.901718277586882e-06, + "loss": 0.5832, + "step": 1747 + }, + { + "epoch": 0.2835820895522388, + "grad_norm": 0.6113738335916555, + "learning_rate": 4.901599684385292e-06, + "loss": 0.559, + "step": 1748 + }, + { + "epoch": 0.2837443218689163, + "grad_norm": 0.5995732400210616, + "learning_rate": 4.9014810211120354e-06, + "loss": 0.6016, + "step": 1749 + }, + { + "epoch": 0.2839065541855938, + "grad_norm": 0.6272709203905092, + "learning_rate": 4.901362287770576e-06, + "loss": 0.5709, + "step": 1750 + }, + { + "epoch": 0.28406878650227124, + "grad_norm": 0.5940533783839406, + "learning_rate": 4.901243484364375e-06, + "loss": 0.582, + "step": 1751 + }, + { + "epoch": 0.28423101881894874, + "grad_norm": 0.6086294670310474, + "learning_rate": 4.901124610896902e-06, + "loss": 0.6205, + "step": 1752 + }, + { + "epoch": 0.2843932511356262, + "grad_norm": 0.6230307032709314, + "learning_rate": 4.901005667371622e-06, + "loss": 0.6034, + "step": 1753 + }, + { + "epoch": 0.2845554834523037, + "grad_norm": 0.5943957638536048, + "learning_rate": 4.9008866537920086e-06, + "loss": 0.5933, + "step": 1754 + }, + { + "epoch": 0.2847177157689812, + "grad_norm": 0.6332131737238315, + "learning_rate": 4.900767570161533e-06, + "loss": 0.5797, + "step": 1755 + }, + { + "epoch": 0.28487994808565864, + "grad_norm": 0.5933362077968596, + "learning_rate": 4.900648416483669e-06, + "loss": 0.613, + "step": 1756 + }, + { + "epoch": 0.28504218040233614, + "grad_norm": 0.5827705965105605, + "learning_rate": 4.900529192761894e-06, + "loss": 0.581, + "step": 1757 + }, + { + "epoch": 0.28520441271901364, + "grad_norm": 0.5922261399479817, + "learning_rate": 4.900409898999688e-06, + "loss": 0.6005, + "step": 1758 + }, + { + "epoch": 0.2853666450356911, + "grad_norm": 0.7001860884835109, + "learning_rate": 4.900290535200528e-06, + "loss": 0.6048, + "step": 1759 + }, + { + "epoch": 0.2855288773523686, + "grad_norm": 0.6118859496893753, + "learning_rate": 4.900171101367899e-06, + "loss": 0.5912, + "step": 1760 + }, + { + "epoch": 0.2856911096690461, + "grad_norm": 0.5908873767482902, + "learning_rate": 4.900051597505287e-06, + "loss": 0.6049, + "step": 1761 + }, + { + "epoch": 0.28585334198572354, + "grad_norm": 0.5851327743015572, + "learning_rate": 4.899932023616175e-06, + "loss": 0.5712, + "step": 1762 + }, + { + "epoch": 0.28601557430240104, + "grad_norm": 0.5872292952945924, + "learning_rate": 4.899812379704056e-06, + "loss": 0.5665, + "step": 1763 + }, + { + "epoch": 0.28617780661907855, + "grad_norm": 0.6123079170480781, + "learning_rate": 4.8996926657724185e-06, + "loss": 0.6112, + "step": 1764 + }, + { + "epoch": 0.286340038935756, + "grad_norm": 0.5974995594023254, + "learning_rate": 4.899572881824755e-06, + "loss": 0.6171, + "step": 1765 + }, + { + "epoch": 0.2865022712524335, + "grad_norm": 0.6197820555454897, + "learning_rate": 4.8994530278645625e-06, + "loss": 0.6024, + "step": 1766 + }, + { + "epoch": 0.28666450356911094, + "grad_norm": 0.5559115245656012, + "learning_rate": 4.899333103895336e-06, + "loss": 0.5865, + "step": 1767 + }, + { + "epoch": 0.28682673588578844, + "grad_norm": 0.5954586138920678, + "learning_rate": 4.899213109920575e-06, + "loss": 0.563, + "step": 1768 + }, + { + "epoch": 0.28698896820246594, + "grad_norm": 0.6171179987499568, + "learning_rate": 4.899093045943782e-06, + "loss": 0.5787, + "step": 1769 + }, + { + "epoch": 0.2871512005191434, + "grad_norm": 0.6071135125014633, + "learning_rate": 4.898972911968458e-06, + "loss": 0.6018, + "step": 1770 + }, + { + "epoch": 0.2873134328358209, + "grad_norm": 0.6151821834982417, + "learning_rate": 4.89885270799811e-06, + "loss": 0.5914, + "step": 1771 + }, + { + "epoch": 0.2874756651524984, + "grad_norm": 0.6275532118163353, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.5675, + "step": 1772 + }, + { + "epoch": 0.28763789746917584, + "grad_norm": 0.6106418955998323, + "learning_rate": 4.89861209008637e-06, + "loss": 0.6065, + "step": 1773 + }, + { + "epoch": 0.28780012978585334, + "grad_norm": 0.6192914845058126, + "learning_rate": 4.898491676151998e-06, + "loss": 0.5577, + "step": 1774 + }, + { + "epoch": 0.28796236210253084, + "grad_norm": 0.6457877547230523, + "learning_rate": 4.8983711922366425e-06, + "loss": 0.5951, + "step": 1775 + }, + { + "epoch": 0.2881245944192083, + "grad_norm": 0.5791121231169156, + "learning_rate": 4.898250638343819e-06, + "loss": 0.5606, + "step": 1776 + }, + { + "epoch": 0.2882868267358858, + "grad_norm": 0.5817156808381615, + "learning_rate": 4.8981300144770434e-06, + "loss": 0.5991, + "step": 1777 + }, + { + "epoch": 0.2884490590525633, + "grad_norm": 0.6023626580305748, + "learning_rate": 4.898009320639838e-06, + "loss": 0.5953, + "step": 1778 + }, + { + "epoch": 0.28861129136924074, + "grad_norm": 0.604192669916429, + "learning_rate": 4.897888556835721e-06, + "loss": 0.6005, + "step": 1779 + }, + { + "epoch": 0.28877352368591824, + "grad_norm": 0.5922102581506684, + "learning_rate": 4.897767723068218e-06, + "loss": 0.5761, + "step": 1780 + }, + { + "epoch": 0.2889357560025957, + "grad_norm": 0.5861087517668516, + "learning_rate": 4.8976468193408546e-06, + "loss": 0.5999, + "step": 1781 + }, + { + "epoch": 0.2890979883192732, + "grad_norm": 0.5544764557252513, + "learning_rate": 4.8975258456571576e-06, + "loss": 0.5846, + "step": 1782 + }, + { + "epoch": 0.2892602206359507, + "grad_norm": 0.5686978971493258, + "learning_rate": 4.897404802020657e-06, + "loss": 0.5711, + "step": 1783 + }, + { + "epoch": 0.28942245295262814, + "grad_norm": 0.5829640020697714, + "learning_rate": 4.897283688434884e-06, + "loss": 0.5997, + "step": 1784 + }, + { + "epoch": 0.28958468526930564, + "grad_norm": 0.5795734736072519, + "learning_rate": 4.897162504903373e-06, + "loss": 0.625, + "step": 1785 + }, + { + "epoch": 0.28974691758598314, + "grad_norm": 0.5963699076604628, + "learning_rate": 4.89704125142966e-06, + "loss": 0.6038, + "step": 1786 + }, + { + "epoch": 0.2899091499026606, + "grad_norm": 0.6075517221534744, + "learning_rate": 4.896919928017282e-06, + "loss": 0.5587, + "step": 1787 + }, + { + "epoch": 0.2900713822193381, + "grad_norm": 0.6017039316187658, + "learning_rate": 4.89679853466978e-06, + "loss": 0.6237, + "step": 1788 + }, + { + "epoch": 0.2902336145360156, + "grad_norm": 0.6231357624056917, + "learning_rate": 4.8966770713906935e-06, + "loss": 0.6038, + "step": 1789 + }, + { + "epoch": 0.29039584685269304, + "grad_norm": 0.6314479198962866, + "learning_rate": 4.896555538183569e-06, + "loss": 0.6087, + "step": 1790 + }, + { + "epoch": 0.29055807916937054, + "grad_norm": 0.6045086622491225, + "learning_rate": 4.896433935051952e-06, + "loss": 0.6034, + "step": 1791 + }, + { + "epoch": 0.29072031148604804, + "grad_norm": 0.6291739831490449, + "learning_rate": 4.89631226199939e-06, + "loss": 0.6036, + "step": 1792 + }, + { + "epoch": 0.2908825438027255, + "grad_norm": 0.5970194555675993, + "learning_rate": 4.8961905190294325e-06, + "loss": 0.5943, + "step": 1793 + }, + { + "epoch": 0.291044776119403, + "grad_norm": 0.5892254673201027, + "learning_rate": 4.896068706145632e-06, + "loss": 0.632, + "step": 1794 + }, + { + "epoch": 0.2912070084360805, + "grad_norm": 0.6178396843675336, + "learning_rate": 4.895946823351544e-06, + "loss": 0.614, + "step": 1795 + }, + { + "epoch": 0.29136924075275794, + "grad_norm": 0.5649336939614403, + "learning_rate": 4.8958248706507225e-06, + "loss": 0.5631, + "step": 1796 + }, + { + "epoch": 0.29153147306943544, + "grad_norm": 0.609665071276488, + "learning_rate": 4.895702848046727e-06, + "loss": 0.5896, + "step": 1797 + }, + { + "epoch": 0.2916937053861129, + "grad_norm": 0.607721569285349, + "learning_rate": 4.895580755543119e-06, + "loss": 0.6213, + "step": 1798 + }, + { + "epoch": 0.2918559377027904, + "grad_norm": 0.5961169308438861, + "learning_rate": 4.895458593143458e-06, + "loss": 0.5601, + "step": 1799 + }, + { + "epoch": 0.2920181700194679, + "grad_norm": 0.5983948996798321, + "learning_rate": 4.89533636085131e-06, + "loss": 0.6108, + "step": 1800 + }, + { + "epoch": 0.29218040233614534, + "grad_norm": 0.606973549927582, + "learning_rate": 4.895214058670241e-06, + "loss": 0.612, + "step": 1801 + }, + { + "epoch": 0.29234263465282284, + "grad_norm": 0.6424653257999662, + "learning_rate": 4.89509168660382e-06, + "loss": 0.6166, + "step": 1802 + }, + { + "epoch": 0.29250486696950034, + "grad_norm": 0.6127120217624906, + "learning_rate": 4.894969244655617e-06, + "loss": 0.5962, + "step": 1803 + }, + { + "epoch": 0.2926670992861778, + "grad_norm": 0.5822172450131162, + "learning_rate": 4.8948467328292036e-06, + "loss": 0.6112, + "step": 1804 + }, + { + "epoch": 0.2928293316028553, + "grad_norm": 0.6104008825252891, + "learning_rate": 4.894724151128156e-06, + "loss": 0.5985, + "step": 1805 + }, + { + "epoch": 0.2929915639195328, + "grad_norm": 0.615933762492425, + "learning_rate": 4.89460149955605e-06, + "loss": 0.5705, + "step": 1806 + }, + { + "epoch": 0.29315379623621024, + "grad_norm": 0.6024522092327739, + "learning_rate": 4.894478778116464e-06, + "loss": 0.6281, + "step": 1807 + }, + { + "epoch": 0.29331602855288774, + "grad_norm": 0.5886624650725427, + "learning_rate": 4.894355986812978e-06, + "loss": 0.5689, + "step": 1808 + }, + { + "epoch": 0.29347826086956524, + "grad_norm": 0.6374706220908632, + "learning_rate": 4.894233125649177e-06, + "loss": 0.5965, + "step": 1809 + }, + { + "epoch": 0.2936404931862427, + "grad_norm": 0.6855222254123254, + "learning_rate": 4.894110194628644e-06, + "loss": 0.5795, + "step": 1810 + }, + { + "epoch": 0.2938027255029202, + "grad_norm": 0.5636422140775493, + "learning_rate": 4.893987193754965e-06, + "loss": 0.6054, + "step": 1811 + }, + { + "epoch": 0.29396495781959764, + "grad_norm": 0.6364601157216366, + "learning_rate": 4.893864123031732e-06, + "loss": 0.5814, + "step": 1812 + }, + { + "epoch": 0.29412719013627514, + "grad_norm": 0.5792007010829785, + "learning_rate": 4.893740982462532e-06, + "loss": 0.5739, + "step": 1813 + }, + { + "epoch": 0.29428942245295264, + "grad_norm": 0.6229360587068721, + "learning_rate": 4.89361777205096e-06, + "loss": 0.5935, + "step": 1814 + }, + { + "epoch": 0.2944516547696301, + "grad_norm": 0.58093435717772, + "learning_rate": 4.89349449180061e-06, + "loss": 0.5915, + "step": 1815 + }, + { + "epoch": 0.2946138870863076, + "grad_norm": 0.6449980197070792, + "learning_rate": 4.89337114171508e-06, + "loss": 0.5646, + "step": 1816 + }, + { + "epoch": 0.2947761194029851, + "grad_norm": 0.6033729070321211, + "learning_rate": 4.893247721797969e-06, + "loss": 0.6054, + "step": 1817 + }, + { + "epoch": 0.29493835171966254, + "grad_norm": 0.5890994943871737, + "learning_rate": 4.8931242320528765e-06, + "loss": 0.5794, + "step": 1818 + }, + { + "epoch": 0.29510058403634004, + "grad_norm": 0.6089289923131822, + "learning_rate": 4.893000672483407e-06, + "loss": 0.5974, + "step": 1819 + }, + { + "epoch": 0.29526281635301754, + "grad_norm": 0.6016154246753841, + "learning_rate": 4.892877043093165e-06, + "loss": 0.5944, + "step": 1820 + }, + { + "epoch": 0.295425048669695, + "grad_norm": 0.5827250942426185, + "learning_rate": 4.892753343885758e-06, + "loss": 0.6041, + "step": 1821 + }, + { + "epoch": 0.2955872809863725, + "grad_norm": 0.6088175963137575, + "learning_rate": 4.892629574864795e-06, + "loss": 0.5703, + "step": 1822 + }, + { + "epoch": 0.29574951330305, + "grad_norm": 0.5925690612892347, + "learning_rate": 4.892505736033887e-06, + "loss": 0.5783, + "step": 1823 + }, + { + "epoch": 0.29591174561972744, + "grad_norm": 0.6477703371198789, + "learning_rate": 4.892381827396648e-06, + "loss": 0.6292, + "step": 1824 + }, + { + "epoch": 0.29607397793640494, + "grad_norm": 0.6190457935169609, + "learning_rate": 4.8922578489566925e-06, + "loss": 0.626, + "step": 1825 + }, + { + "epoch": 0.2962362102530824, + "grad_norm": 0.5839670136991304, + "learning_rate": 4.892133800717638e-06, + "loss": 0.6227, + "step": 1826 + }, + { + "epoch": 0.2963984425697599, + "grad_norm": 0.6409763679999024, + "learning_rate": 4.892009682683104e-06, + "loss": 0.5627, + "step": 1827 + }, + { + "epoch": 0.2965606748864374, + "grad_norm": 0.5950612737963871, + "learning_rate": 4.891885494856712e-06, + "loss": 0.5936, + "step": 1828 + }, + { + "epoch": 0.29672290720311484, + "grad_norm": 0.6552069116531335, + "learning_rate": 4.891761237242086e-06, + "loss": 0.5897, + "step": 1829 + }, + { + "epoch": 0.29688513951979234, + "grad_norm": 0.5565383485735405, + "learning_rate": 4.891636909842849e-06, + "loss": 0.5813, + "step": 1830 + }, + { + "epoch": 0.29704737183646984, + "grad_norm": 0.5692018309959713, + "learning_rate": 4.8915125126626315e-06, + "loss": 0.5402, + "step": 1831 + }, + { + "epoch": 0.2972096041531473, + "grad_norm": 0.6069504476071373, + "learning_rate": 4.891388045705061e-06, + "loss": 0.6003, + "step": 1832 + }, + { + "epoch": 0.2973718364698248, + "grad_norm": 0.5758227219912587, + "learning_rate": 4.8912635089737705e-06, + "loss": 0.6092, + "step": 1833 + }, + { + "epoch": 0.2975340687865023, + "grad_norm": 0.5934680644687208, + "learning_rate": 4.891138902472393e-06, + "loss": 0.6167, + "step": 1834 + }, + { + "epoch": 0.29769630110317974, + "grad_norm": 0.5903254367172225, + "learning_rate": 4.891014226204564e-06, + "loss": 0.6158, + "step": 1835 + }, + { + "epoch": 0.29785853341985724, + "grad_norm": 0.6572726349326106, + "learning_rate": 4.890889480173922e-06, + "loss": 0.5842, + "step": 1836 + }, + { + "epoch": 0.29802076573653474, + "grad_norm": 0.5908516473415195, + "learning_rate": 4.890764664384105e-06, + "loss": 0.5512, + "step": 1837 + }, + { + "epoch": 0.2981829980532122, + "grad_norm": 0.6341108766142453, + "learning_rate": 4.890639778838757e-06, + "loss": 0.645, + "step": 1838 + }, + { + "epoch": 0.2983452303698897, + "grad_norm": 0.6279118957531201, + "learning_rate": 4.890514823541521e-06, + "loss": 0.5802, + "step": 1839 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6199502545889187, + "learning_rate": 4.890389798496041e-06, + "loss": 0.5748, + "step": 1840 + }, + { + "epoch": 0.29866969500324464, + "grad_norm": 0.5760611204351503, + "learning_rate": 4.890264703705967e-06, + "loss": 0.588, + "step": 1841 + }, + { + "epoch": 0.29883192731992214, + "grad_norm": 0.5926725290002555, + "learning_rate": 4.890139539174948e-06, + "loss": 0.5837, + "step": 1842 + }, + { + "epoch": 0.2989941596365996, + "grad_norm": 0.6475823361936721, + "learning_rate": 4.890014304906637e-06, + "loss": 0.5918, + "step": 1843 + }, + { + "epoch": 0.2991563919532771, + "grad_norm": 0.5996608639325409, + "learning_rate": 4.8898890009046865e-06, + "loss": 0.5911, + "step": 1844 + }, + { + "epoch": 0.2993186242699546, + "grad_norm": 0.5915707755351053, + "learning_rate": 4.889763627172754e-06, + "loss": 0.5588, + "step": 1845 + }, + { + "epoch": 0.29948085658663204, + "grad_norm": 0.5439051317147147, + "learning_rate": 4.889638183714496e-06, + "loss": 0.5793, + "step": 1846 + }, + { + "epoch": 0.29964308890330954, + "grad_norm": 0.5924897092369419, + "learning_rate": 4.889512670533573e-06, + "loss": 0.6101, + "step": 1847 + }, + { + "epoch": 0.29980532121998704, + "grad_norm": 0.640833303009937, + "learning_rate": 4.889387087633647e-06, + "loss": 0.6078, + "step": 1848 + }, + { + "epoch": 0.2999675535366645, + "grad_norm": 0.5764171995724736, + "learning_rate": 4.889261435018384e-06, + "loss": 0.5997, + "step": 1849 + }, + { + "epoch": 0.300129785853342, + "grad_norm": 0.6002794785383051, + "learning_rate": 4.889135712691448e-06, + "loss": 0.5641, + "step": 1850 + }, + { + "epoch": 0.3002920181700195, + "grad_norm": 0.6320056471929506, + "learning_rate": 4.889009920656508e-06, + "loss": 0.6039, + "step": 1851 + }, + { + "epoch": 0.30045425048669694, + "grad_norm": 0.6286113455164538, + "learning_rate": 4.888884058917234e-06, + "loss": 0.5517, + "step": 1852 + }, + { + "epoch": 0.30061648280337444, + "grad_norm": 0.5992945517087032, + "learning_rate": 4.888758127477298e-06, + "loss": 0.6112, + "step": 1853 + }, + { + "epoch": 0.3007787151200519, + "grad_norm": 0.5899841041604227, + "learning_rate": 4.888632126340375e-06, + "loss": 0.5724, + "step": 1854 + }, + { + "epoch": 0.3009409474367294, + "grad_norm": 0.5848164683107878, + "learning_rate": 4.888506055510141e-06, + "loss": 0.5581, + "step": 1855 + }, + { + "epoch": 0.3011031797534069, + "grad_norm": 0.6059763725475156, + "learning_rate": 4.888379914990276e-06, + "loss": 0.5798, + "step": 1856 + }, + { + "epoch": 0.30126541207008434, + "grad_norm": 0.5870044146465524, + "learning_rate": 4.888253704784457e-06, + "loss": 0.5885, + "step": 1857 + }, + { + "epoch": 0.30142764438676184, + "grad_norm": 0.610076209625479, + "learning_rate": 4.888127424896369e-06, + "loss": 0.6051, + "step": 1858 + }, + { + "epoch": 0.30158987670343934, + "grad_norm": 0.602594949308699, + "learning_rate": 4.888001075329696e-06, + "loss": 0.5828, + "step": 1859 + }, + { + "epoch": 0.3017521090201168, + "grad_norm": 0.6105118215838953, + "learning_rate": 4.887874656088124e-06, + "loss": 0.5898, + "step": 1860 + }, + { + "epoch": 0.3019143413367943, + "grad_norm": 0.5902200904005711, + "learning_rate": 4.8877481671753415e-06, + "loss": 0.5889, + "step": 1861 + }, + { + "epoch": 0.3020765736534718, + "grad_norm": 0.6050047911905092, + "learning_rate": 4.88762160859504e-06, + "loss": 0.5977, + "step": 1862 + }, + { + "epoch": 0.30223880597014924, + "grad_norm": 0.6207143911897799, + "learning_rate": 4.8874949803509116e-06, + "loss": 0.6097, + "step": 1863 + }, + { + "epoch": 0.30240103828682674, + "grad_norm": 0.6261526743121628, + "learning_rate": 4.8873682824466506e-06, + "loss": 0.5941, + "step": 1864 + }, + { + "epoch": 0.30256327060350424, + "grad_norm": 0.6272583082312932, + "learning_rate": 4.887241514885954e-06, + "loss": 0.5802, + "step": 1865 + }, + { + "epoch": 0.3027255029201817, + "grad_norm": 0.5801554260604885, + "learning_rate": 4.887114677672521e-06, + "loss": 0.5799, + "step": 1866 + }, + { + "epoch": 0.3028877352368592, + "grad_norm": 0.6016967206224503, + "learning_rate": 4.886987770810051e-06, + "loss": 0.6053, + "step": 1867 + }, + { + "epoch": 0.3030499675535367, + "grad_norm": 0.5879232543978045, + "learning_rate": 4.886860794302248e-06, + "loss": 0.5896, + "step": 1868 + }, + { + "epoch": 0.30321219987021414, + "grad_norm": 0.6318719461134966, + "learning_rate": 4.886733748152817e-06, + "loss": 0.5926, + "step": 1869 + }, + { + "epoch": 0.30337443218689164, + "grad_norm": 0.6175435041752712, + "learning_rate": 4.886606632365464e-06, + "loss": 0.607, + "step": 1870 + }, + { + "epoch": 0.3035366645035691, + "grad_norm": 0.5837239795481084, + "learning_rate": 4.886479446943897e-06, + "loss": 0.5693, + "step": 1871 + }, + { + "epoch": 0.3036988968202466, + "grad_norm": 0.5889351076978521, + "learning_rate": 4.8863521918918285e-06, + "loss": 0.5697, + "step": 1872 + }, + { + "epoch": 0.3038611291369241, + "grad_norm": 0.5832922251893682, + "learning_rate": 4.886224867212972e-06, + "loss": 0.5855, + "step": 1873 + }, + { + "epoch": 0.30402336145360154, + "grad_norm": 0.5761915052516524, + "learning_rate": 4.88609747291104e-06, + "loss": 0.6029, + "step": 1874 + }, + { + "epoch": 0.30418559377027904, + "grad_norm": 0.5929532941907221, + "learning_rate": 4.885970008989752e-06, + "loss": 0.5952, + "step": 1875 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.5920326220544384, + "learning_rate": 4.8858424754528246e-06, + "loss": 0.6164, + "step": 1876 + }, + { + "epoch": 0.304510058403634, + "grad_norm": 0.5994764807431621, + "learning_rate": 4.885714872303982e-06, + "loss": 0.5642, + "step": 1877 + }, + { + "epoch": 0.3046722907203115, + "grad_norm": 0.6192193802038534, + "learning_rate": 4.885587199546944e-06, + "loss": 0.5614, + "step": 1878 + }, + { + "epoch": 0.304834523036989, + "grad_norm": 0.6235973563716595, + "learning_rate": 4.8854594571854365e-06, + "loss": 0.5427, + "step": 1879 + }, + { + "epoch": 0.30499675535366644, + "grad_norm": 0.5810407657032116, + "learning_rate": 4.885331645223189e-06, + "loss": 0.5945, + "step": 1880 + }, + { + "epoch": 0.30515898767034394, + "grad_norm": 0.5850507815903552, + "learning_rate": 4.885203763663929e-06, + "loss": 0.6059, + "step": 1881 + }, + { + "epoch": 0.30532121998702144, + "grad_norm": 0.5729542817800093, + "learning_rate": 4.885075812511386e-06, + "loss": 0.5606, + "step": 1882 + }, + { + "epoch": 0.3054834523036989, + "grad_norm": 0.6171331555057723, + "learning_rate": 4.884947791769296e-06, + "loss": 0.5538, + "step": 1883 + }, + { + "epoch": 0.3056456846203764, + "grad_norm": 0.6074559228436445, + "learning_rate": 4.8848197014413935e-06, + "loss": 0.5852, + "step": 1884 + }, + { + "epoch": 0.30580791693705384, + "grad_norm": 0.6204575433017163, + "learning_rate": 4.8846915415314155e-06, + "loss": 0.5971, + "step": 1885 + }, + { + "epoch": 0.30597014925373134, + "grad_norm": 0.5940872415311909, + "learning_rate": 4.884563312043101e-06, + "loss": 0.5767, + "step": 1886 + }, + { + "epoch": 0.30613238157040884, + "grad_norm": 0.6099453342223983, + "learning_rate": 4.884435012980191e-06, + "loss": 0.583, + "step": 1887 + }, + { + "epoch": 0.3062946138870863, + "grad_norm": 0.5834686020973521, + "learning_rate": 4.884306644346431e-06, + "loss": 0.5852, + "step": 1888 + }, + { + "epoch": 0.3064568462037638, + "grad_norm": 0.6062155952681048, + "learning_rate": 4.884178206145565e-06, + "loss": 0.6151, + "step": 1889 + }, + { + "epoch": 0.3066190785204413, + "grad_norm": 0.5630414628230951, + "learning_rate": 4.884049698381339e-06, + "loss": 0.5833, + "step": 1890 + }, + { + "epoch": 0.30678131083711874, + "grad_norm": 0.5880203515077672, + "learning_rate": 4.883921121057506e-06, + "loss": 0.5731, + "step": 1891 + }, + { + "epoch": 0.30694354315379624, + "grad_norm": 0.6078505867329479, + "learning_rate": 4.8837924741778136e-06, + "loss": 0.6086, + "step": 1892 + }, + { + "epoch": 0.30710577547047374, + "grad_norm": 0.6134895229887372, + "learning_rate": 4.8836637577460176e-06, + "loss": 0.562, + "step": 1893 + }, + { + "epoch": 0.3072680077871512, + "grad_norm": 0.5831943013517502, + "learning_rate": 4.883534971765874e-06, + "loss": 0.5765, + "step": 1894 + }, + { + "epoch": 0.3074302401038287, + "grad_norm": 0.6123876423784198, + "learning_rate": 4.8834061162411396e-06, + "loss": 0.5966, + "step": 1895 + }, + { + "epoch": 0.3075924724205062, + "grad_norm": 0.6221259588131586, + "learning_rate": 4.883277191175572e-06, + "loss": 0.5999, + "step": 1896 + }, + { + "epoch": 0.30775470473718364, + "grad_norm": 0.6042288535704967, + "learning_rate": 4.883148196572938e-06, + "loss": 0.5926, + "step": 1897 + }, + { + "epoch": 0.30791693705386114, + "grad_norm": 0.589814808491926, + "learning_rate": 4.883019132436996e-06, + "loss": 0.5721, + "step": 1898 + }, + { + "epoch": 0.3080791693705386, + "grad_norm": 0.6229636742698998, + "learning_rate": 4.882889998771515e-06, + "loss": 0.599, + "step": 1899 + }, + { + "epoch": 0.3082414016872161, + "grad_norm": 0.6102566151748461, + "learning_rate": 4.88276079558026e-06, + "loss": 0.5939, + "step": 1900 + }, + { + "epoch": 0.3084036340038936, + "grad_norm": 0.5978566214116042, + "learning_rate": 4.882631522867004e-06, + "loss": 0.5978, + "step": 1901 + }, + { + "epoch": 0.30856586632057104, + "grad_norm": 0.6003243328641554, + "learning_rate": 4.882502180635516e-06, + "loss": 0.6009, + "step": 1902 + }, + { + "epoch": 0.30872809863724854, + "grad_norm": 0.636335732290755, + "learning_rate": 4.882372768889572e-06, + "loss": 0.6242, + "step": 1903 + }, + { + "epoch": 0.30889033095392604, + "grad_norm": 0.6021984257957742, + "learning_rate": 4.882243287632947e-06, + "loss": 0.6013, + "step": 1904 + }, + { + "epoch": 0.3090525632706035, + "grad_norm": 0.6301764364699273, + "learning_rate": 4.882113736869418e-06, + "loss": 0.6183, + "step": 1905 + }, + { + "epoch": 0.309214795587281, + "grad_norm": 0.6040003447936357, + "learning_rate": 4.881984116602766e-06, + "loss": 0.5897, + "step": 1906 + }, + { + "epoch": 0.3093770279039585, + "grad_norm": 0.5767839954779054, + "learning_rate": 4.881854426836773e-06, + "loss": 0.6042, + "step": 1907 + }, + { + "epoch": 0.30953926022063594, + "grad_norm": 0.642876390954777, + "learning_rate": 4.881724667575222e-06, + "loss": 0.5765, + "step": 1908 + }, + { + "epoch": 0.30970149253731344, + "grad_norm": 0.6042998840073415, + "learning_rate": 4.881594838821899e-06, + "loss": 0.5893, + "step": 1909 + }, + { + "epoch": 0.30986372485399094, + "grad_norm": 0.5756471946329045, + "learning_rate": 4.881464940580594e-06, + "loss": 0.6028, + "step": 1910 + }, + { + "epoch": 0.3100259571706684, + "grad_norm": 0.5780249927390124, + "learning_rate": 4.8813349728550946e-06, + "loss": 0.5891, + "step": 1911 + }, + { + "epoch": 0.3101881894873459, + "grad_norm": 0.5842245856663595, + "learning_rate": 4.881204935649194e-06, + "loss": 0.617, + "step": 1912 + }, + { + "epoch": 0.31035042180402334, + "grad_norm": 0.6096057547543832, + "learning_rate": 4.881074828966687e-06, + "loss": 0.5452, + "step": 1913 + }, + { + "epoch": 0.31051265412070084, + "grad_norm": 0.6132529289343338, + "learning_rate": 4.880944652811368e-06, + "loss": 0.6231, + "step": 1914 + }, + { + "epoch": 0.31067488643737834, + "grad_norm": 0.5933009200399497, + "learning_rate": 4.880814407187037e-06, + "loss": 0.5412, + "step": 1915 + }, + { + "epoch": 0.3108371187540558, + "grad_norm": 0.5627888512169732, + "learning_rate": 4.880684092097493e-06, + "loss": 0.5894, + "step": 1916 + }, + { + "epoch": 0.3109993510707333, + "grad_norm": 0.5961204881377077, + "learning_rate": 4.880553707546538e-06, + "loss": 0.5897, + "step": 1917 + }, + { + "epoch": 0.3111615833874108, + "grad_norm": 0.6137171540462707, + "learning_rate": 4.880423253537977e-06, + "loss": 0.6191, + "step": 1918 + }, + { + "epoch": 0.31132381570408824, + "grad_norm": 0.5835874202554775, + "learning_rate": 4.8802927300756165e-06, + "loss": 0.5613, + "step": 1919 + }, + { + "epoch": 0.31148604802076574, + "grad_norm": 0.5886864700270752, + "learning_rate": 4.880162137163264e-06, + "loss": 0.5878, + "step": 1920 + }, + { + "epoch": 0.31164828033744324, + "grad_norm": 0.5805863069980859, + "learning_rate": 4.880031474804731e-06, + "loss": 0.5603, + "step": 1921 + }, + { + "epoch": 0.3118105126541207, + "grad_norm": 0.6219434197688491, + "learning_rate": 4.879900743003827e-06, + "loss": 0.5694, + "step": 1922 + }, + { + "epoch": 0.3119727449707982, + "grad_norm": 0.6051791702096656, + "learning_rate": 4.8797699417643705e-06, + "loss": 0.5916, + "step": 1923 + }, + { + "epoch": 0.3121349772874757, + "grad_norm": 0.5989194470016085, + "learning_rate": 4.879639071090174e-06, + "loss": 0.5573, + "step": 1924 + }, + { + "epoch": 0.31229720960415314, + "grad_norm": 0.6292418846725175, + "learning_rate": 4.879508130985059e-06, + "loss": 0.6044, + "step": 1925 + }, + { + "epoch": 0.31245944192083064, + "grad_norm": 0.6390937054086089, + "learning_rate": 4.879377121452844e-06, + "loss": 0.612, + "step": 1926 + }, + { + "epoch": 0.3126216742375081, + "grad_norm": 0.5679805869926481, + "learning_rate": 4.879246042497352e-06, + "loss": 0.5832, + "step": 1927 + }, + { + "epoch": 0.3127839065541856, + "grad_norm": 0.5950033537609486, + "learning_rate": 4.879114894122408e-06, + "loss": 0.6083, + "step": 1928 + }, + { + "epoch": 0.3129461388708631, + "grad_norm": 0.5578016051157086, + "learning_rate": 4.8789836763318386e-06, + "loss": 0.5688, + "step": 1929 + }, + { + "epoch": 0.31310837118754054, + "grad_norm": 0.6073397397958753, + "learning_rate": 4.8788523891294715e-06, + "loss": 0.6046, + "step": 1930 + }, + { + "epoch": 0.31327060350421804, + "grad_norm": 0.5889901920638859, + "learning_rate": 4.878721032519137e-06, + "loss": 0.5996, + "step": 1931 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 0.6030807357336815, + "learning_rate": 4.878589606504669e-06, + "loss": 0.5751, + "step": 1932 + }, + { + "epoch": 0.313595068137573, + "grad_norm": 0.5975812384671225, + "learning_rate": 4.878458111089901e-06, + "loss": 0.6062, + "step": 1933 + }, + { + "epoch": 0.3137573004542505, + "grad_norm": 0.6401811097073389, + "learning_rate": 4.878326546278671e-06, + "loss": 0.5619, + "step": 1934 + }, + { + "epoch": 0.313919532770928, + "grad_norm": 0.6654836994692632, + "learning_rate": 4.878194912074816e-06, + "loss": 0.571, + "step": 1935 + }, + { + "epoch": 0.31408176508760544, + "grad_norm": 0.5630110574936043, + "learning_rate": 4.878063208482178e-06, + "loss": 0.5515, + "step": 1936 + }, + { + "epoch": 0.31424399740428294, + "grad_norm": 0.601846356708817, + "learning_rate": 4.877931435504599e-06, + "loss": 0.5964, + "step": 1937 + }, + { + "epoch": 0.31440622972096044, + "grad_norm": 0.618874896905912, + "learning_rate": 4.877799593145924e-06, + "loss": 0.5721, + "step": 1938 + }, + { + "epoch": 0.3145684620376379, + "grad_norm": 0.6110772239946402, + "learning_rate": 4.87766768141e-06, + "loss": 0.5881, + "step": 1939 + }, + { + "epoch": 0.3147306943543154, + "grad_norm": 0.6186091637060314, + "learning_rate": 4.877535700300676e-06, + "loss": 0.5888, + "step": 1940 + }, + { + "epoch": 0.31489292667099283, + "grad_norm": 0.5880010893450455, + "learning_rate": 4.877403649821802e-06, + "loss": 0.6091, + "step": 1941 + }, + { + "epoch": 0.31505515898767034, + "grad_norm": 0.6179946925987735, + "learning_rate": 4.8772715299772315e-06, + "loss": 0.5915, + "step": 1942 + }, + { + "epoch": 0.31521739130434784, + "grad_norm": 0.5976280417726405, + "learning_rate": 4.877139340770818e-06, + "loss": 0.5845, + "step": 1943 + }, + { + "epoch": 0.3153796236210253, + "grad_norm": 0.6206752298276448, + "learning_rate": 4.877007082206421e-06, + "loss": 0.5859, + "step": 1944 + }, + { + "epoch": 0.3155418559377028, + "grad_norm": 0.607324445885701, + "learning_rate": 4.876874754287897e-06, + "loss": 0.573, + "step": 1945 + }, + { + "epoch": 0.3157040882543803, + "grad_norm": 0.608726064566139, + "learning_rate": 4.876742357019109e-06, + "loss": 0.5801, + "step": 1946 + }, + { + "epoch": 0.31586632057105773, + "grad_norm": 0.6238972181787446, + "learning_rate": 4.876609890403917e-06, + "loss": 0.5977, + "step": 1947 + }, + { + "epoch": 0.31602855288773524, + "grad_norm": 0.6004316170635277, + "learning_rate": 4.8764773544461895e-06, + "loss": 0.5774, + "step": 1948 + }, + { + "epoch": 0.31619078520441274, + "grad_norm": 0.5805175917753134, + "learning_rate": 4.876344749149791e-06, + "loss": 0.5559, + "step": 1949 + }, + { + "epoch": 0.3163530175210902, + "grad_norm": 0.6096534296897114, + "learning_rate": 4.876212074518591e-06, + "loss": 0.6132, + "step": 1950 + }, + { + "epoch": 0.3165152498377677, + "grad_norm": 0.5934071432128487, + "learning_rate": 4.876079330556462e-06, + "loss": 0.5763, + "step": 1951 + }, + { + "epoch": 0.3166774821544452, + "grad_norm": 0.6226257594219342, + "learning_rate": 4.875946517267276e-06, + "loss": 0.6082, + "step": 1952 + }, + { + "epoch": 0.31683971447112264, + "grad_norm": 0.5998429545909814, + "learning_rate": 4.8758136346549065e-06, + "loss": 0.6165, + "step": 1953 + }, + { + "epoch": 0.31700194678780014, + "grad_norm": 0.5902207663362795, + "learning_rate": 4.875680682723234e-06, + "loss": 0.5767, + "step": 1954 + }, + { + "epoch": 0.31716417910447764, + "grad_norm": 0.5958490843696861, + "learning_rate": 4.875547661476135e-06, + "loss": 0.5821, + "step": 1955 + }, + { + "epoch": 0.3173264114211551, + "grad_norm": 0.6064557396002956, + "learning_rate": 4.875414570917492e-06, + "loss": 0.5575, + "step": 1956 + }, + { + "epoch": 0.3174886437378326, + "grad_norm": 0.5868150992712532, + "learning_rate": 4.875281411051188e-06, + "loss": 0.5828, + "step": 1957 + }, + { + "epoch": 0.31765087605451003, + "grad_norm": 0.5953231277301905, + "learning_rate": 4.875148181881108e-06, + "loss": 0.6151, + "step": 1958 + }, + { + "epoch": 0.31781310837118754, + "grad_norm": 0.5939767891330241, + "learning_rate": 4.875014883411139e-06, + "loss": 0.5912, + "step": 1959 + }, + { + "epoch": 0.31797534068786504, + "grad_norm": 0.55800397888555, + "learning_rate": 4.87488151564517e-06, + "loss": 0.5644, + "step": 1960 + }, + { + "epoch": 0.3181375730045425, + "grad_norm": 0.6183213303043346, + "learning_rate": 4.874748078587092e-06, + "loss": 0.6027, + "step": 1961 + }, + { + "epoch": 0.31829980532122, + "grad_norm": 0.603968270673476, + "learning_rate": 4.874614572240801e-06, + "loss": 0.5864, + "step": 1962 + }, + { + "epoch": 0.3184620376378975, + "grad_norm": 0.6068811062172137, + "learning_rate": 4.87448099661019e-06, + "loss": 0.6225, + "step": 1963 + }, + { + "epoch": 0.31862426995457493, + "grad_norm": 0.5850052754882173, + "learning_rate": 4.874347351699157e-06, + "loss": 0.5758, + "step": 1964 + }, + { + "epoch": 0.31878650227125244, + "grad_norm": 0.5806954942019807, + "learning_rate": 4.8742136375115995e-06, + "loss": 0.6019, + "step": 1965 + }, + { + "epoch": 0.31894873458792994, + "grad_norm": 0.5969362352077172, + "learning_rate": 4.874079854051421e-06, + "loss": 0.617, + "step": 1966 + }, + { + "epoch": 0.3191109669046074, + "grad_norm": 0.6087519949657332, + "learning_rate": 4.873946001322525e-06, + "loss": 0.5615, + "step": 1967 + }, + { + "epoch": 0.3192731992212849, + "grad_norm": 0.6234429238375625, + "learning_rate": 4.873812079328817e-06, + "loss": 0.562, + "step": 1968 + }, + { + "epoch": 0.3194354315379624, + "grad_norm": 0.6142581174007199, + "learning_rate": 4.873678088074203e-06, + "loss": 0.603, + "step": 1969 + }, + { + "epoch": 0.31959766385463984, + "grad_norm": 0.5836293190065389, + "learning_rate": 4.873544027562593e-06, + "loss": 0.5607, + "step": 1970 + }, + { + "epoch": 0.31975989617131734, + "grad_norm": 0.6181226070124581, + "learning_rate": 4.873409897797899e-06, + "loss": 0.6105, + "step": 1971 + }, + { + "epoch": 0.3199221284879948, + "grad_norm": 0.5981003762243842, + "learning_rate": 4.873275698784035e-06, + "loss": 0.568, + "step": 1972 + }, + { + "epoch": 0.3200843608046723, + "grad_norm": 0.5713735971630024, + "learning_rate": 4.8731414305249145e-06, + "loss": 0.6101, + "step": 1973 + }, + { + "epoch": 0.3202465931213498, + "grad_norm": 0.6079173884557935, + "learning_rate": 4.873007093024456e-06, + "loss": 0.5737, + "step": 1974 + }, + { + "epoch": 0.32040882543802723, + "grad_norm": 0.6067818582156199, + "learning_rate": 4.8728726862865815e-06, + "loss": 0.5626, + "step": 1975 + }, + { + "epoch": 0.32057105775470474, + "grad_norm": 0.593076301678963, + "learning_rate": 4.872738210315209e-06, + "loss": 0.5584, + "step": 1976 + }, + { + "epoch": 0.32073329007138224, + "grad_norm": 0.6194740891207774, + "learning_rate": 4.8726036651142645e-06, + "loss": 0.5712, + "step": 1977 + }, + { + "epoch": 0.3208955223880597, + "grad_norm": 0.5854672634771334, + "learning_rate": 4.872469050687673e-06, + "loss": 0.5961, + "step": 1978 + }, + { + "epoch": 0.3210577547047372, + "grad_norm": 0.6040225783035419, + "learning_rate": 4.8723343670393605e-06, + "loss": 0.5691, + "step": 1979 + }, + { + "epoch": 0.3212199870214147, + "grad_norm": 0.5924255531362471, + "learning_rate": 4.872199614173259e-06, + "loss": 0.6249, + "step": 1980 + }, + { + "epoch": 0.32138221933809213, + "grad_norm": 0.5966700386138455, + "learning_rate": 4.8720647920932995e-06, + "loss": 0.5945, + "step": 1981 + }, + { + "epoch": 0.32154445165476964, + "grad_norm": 0.635611255686691, + "learning_rate": 4.871929900803415e-06, + "loss": 0.5978, + "step": 1982 + }, + { + "epoch": 0.32170668397144714, + "grad_norm": 0.5991559954110832, + "learning_rate": 4.871794940307543e-06, + "loss": 0.5809, + "step": 1983 + }, + { + "epoch": 0.3218689162881246, + "grad_norm": 0.6271960354196736, + "learning_rate": 4.871659910609619e-06, + "loss": 0.6069, + "step": 1984 + }, + { + "epoch": 0.3220311486048021, + "grad_norm": 0.5913617391839056, + "learning_rate": 4.871524811713584e-06, + "loss": 0.5859, + "step": 1985 + }, + { + "epoch": 0.32219338092147953, + "grad_norm": 0.5951500129525231, + "learning_rate": 4.8713896436233784e-06, + "loss": 0.6073, + "step": 1986 + }, + { + "epoch": 0.32235561323815703, + "grad_norm": 0.6009676733021354, + "learning_rate": 4.871254406342949e-06, + "loss": 0.5981, + "step": 1987 + }, + { + "epoch": 0.32251784555483454, + "grad_norm": 0.649507189898911, + "learning_rate": 4.871119099876239e-06, + "loss": 0.5715, + "step": 1988 + }, + { + "epoch": 0.322680077871512, + "grad_norm": 0.5818420996721758, + "learning_rate": 4.870983724227197e-06, + "loss": 0.5688, + "step": 1989 + }, + { + "epoch": 0.3228423101881895, + "grad_norm": 0.5972914716922332, + "learning_rate": 4.870848279399774e-06, + "loss": 0.5744, + "step": 1990 + }, + { + "epoch": 0.323004542504867, + "grad_norm": 0.5953419018569059, + "learning_rate": 4.870712765397919e-06, + "loss": 0.5701, + "step": 1991 + }, + { + "epoch": 0.32316677482154443, + "grad_norm": 0.5843059605691728, + "learning_rate": 4.8705771822255895e-06, + "loss": 0.5991, + "step": 1992 + }, + { + "epoch": 0.32332900713822194, + "grad_norm": 0.6647456153139246, + "learning_rate": 4.870441529886739e-06, + "loss": 0.5369, + "step": 1993 + }, + { + "epoch": 0.32349123945489944, + "grad_norm": 0.5999077643158793, + "learning_rate": 4.870305808385325e-06, + "loss": 0.5763, + "step": 1994 + }, + { + "epoch": 0.3236534717715769, + "grad_norm": 0.5754034193713602, + "learning_rate": 4.87017001772531e-06, + "loss": 0.5882, + "step": 1995 + }, + { + "epoch": 0.3238157040882544, + "grad_norm": 0.6045976327739478, + "learning_rate": 4.870034157910654e-06, + "loss": 0.6067, + "step": 1996 + }, + { + "epoch": 0.3239779364049319, + "grad_norm": 0.5762281489923349, + "learning_rate": 4.8698982289453215e-06, + "loss": 0.5873, + "step": 1997 + }, + { + "epoch": 0.32414016872160933, + "grad_norm": 0.6148169014290199, + "learning_rate": 4.869762230833278e-06, + "loss": 0.5774, + "step": 1998 + }, + { + "epoch": 0.32430240103828684, + "grad_norm": 0.5946527956240234, + "learning_rate": 4.869626163578493e-06, + "loss": 0.5529, + "step": 1999 + }, + { + "epoch": 0.3244646333549643, + "grad_norm": 0.6214733408415113, + "learning_rate": 4.869490027184935e-06, + "loss": 0.5677, + "step": 2000 + }, + { + "epoch": 0.3246268656716418, + "grad_norm": 0.5914584182868036, + "learning_rate": 4.869353821656577e-06, + "loss": 0.59, + "step": 2001 + }, + { + "epoch": 0.3247890979883193, + "grad_norm": 0.6108563602319799, + "learning_rate": 4.869217546997392e-06, + "loss": 0.587, + "step": 2002 + }, + { + "epoch": 0.32495133030499673, + "grad_norm": 0.616030611462366, + "learning_rate": 4.869081203211358e-06, + "loss": 0.5741, + "step": 2003 + }, + { + "epoch": 0.32511356262167423, + "grad_norm": 0.6483948599909051, + "learning_rate": 4.868944790302451e-06, + "loss": 0.6026, + "step": 2004 + }, + { + "epoch": 0.32527579493835174, + "grad_norm": 0.5906083190513722, + "learning_rate": 4.868808308274653e-06, + "loss": 0.5817, + "step": 2005 + }, + { + "epoch": 0.3254380272550292, + "grad_norm": 0.6134841306569694, + "learning_rate": 4.8686717571319444e-06, + "loss": 0.5646, + "step": 2006 + }, + { + "epoch": 0.3256002595717067, + "grad_norm": 0.5913665761870603, + "learning_rate": 4.868535136878311e-06, + "loss": 0.5719, + "step": 2007 + }, + { + "epoch": 0.3257624918883842, + "grad_norm": 0.5792220444324863, + "learning_rate": 4.868398447517737e-06, + "loss": 0.5964, + "step": 2008 + }, + { + "epoch": 0.32592472420506163, + "grad_norm": 0.6319841692433312, + "learning_rate": 4.8682616890542125e-06, + "loss": 0.6195, + "step": 2009 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 0.5721760500908672, + "learning_rate": 4.868124861491727e-06, + "loss": 0.5663, + "step": 2010 + }, + { + "epoch": 0.32624918883841664, + "grad_norm": 0.5981499124842813, + "learning_rate": 4.867987964834272e-06, + "loss": 0.582, + "step": 2011 + }, + { + "epoch": 0.3264114211550941, + "grad_norm": 0.6322927657090588, + "learning_rate": 4.867850999085843e-06, + "loss": 0.6096, + "step": 2012 + }, + { + "epoch": 0.3265736534717716, + "grad_norm": 0.5905447811447536, + "learning_rate": 4.867713964250436e-06, + "loss": 0.6013, + "step": 2013 + }, + { + "epoch": 0.32673588578844903, + "grad_norm": 0.6002965049597947, + "learning_rate": 4.867576860332048e-06, + "loss": 0.601, + "step": 2014 + }, + { + "epoch": 0.32689811810512653, + "grad_norm": 0.5907758751338698, + "learning_rate": 4.867439687334682e-06, + "loss": 0.5607, + "step": 2015 + }, + { + "epoch": 0.32706035042180404, + "grad_norm": 0.5940143079949406, + "learning_rate": 4.8673024452623365e-06, + "loss": 0.5686, + "step": 2016 + }, + { + "epoch": 0.3272225827384815, + "grad_norm": 0.5874072387551806, + "learning_rate": 4.86716513411902e-06, + "loss": 0.5571, + "step": 2017 + }, + { + "epoch": 0.327384815055159, + "grad_norm": 0.5859006357601835, + "learning_rate": 4.867027753908735e-06, + "loss": 0.594, + "step": 2018 + }, + { + "epoch": 0.3275470473718365, + "grad_norm": 0.5890859586576591, + "learning_rate": 4.866890304635492e-06, + "loss": 0.602, + "step": 2019 + }, + { + "epoch": 0.32770927968851393, + "grad_norm": 0.5890213207016589, + "learning_rate": 4.8667527863033015e-06, + "loss": 0.5824, + "step": 2020 + }, + { + "epoch": 0.32787151200519143, + "grad_norm": 0.5988911055551334, + "learning_rate": 4.866615198916175e-06, + "loss": 0.6124, + "step": 2021 + }, + { + "epoch": 0.32803374432186894, + "grad_norm": 0.6198101702924668, + "learning_rate": 4.866477542478127e-06, + "loss": 0.5843, + "step": 2022 + }, + { + "epoch": 0.3281959766385464, + "grad_norm": 0.5938631763647604, + "learning_rate": 4.866339816993175e-06, + "loss": 0.5841, + "step": 2023 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 0.5812428850257193, + "learning_rate": 4.866202022465336e-06, + "loss": 0.6058, + "step": 2024 + }, + { + "epoch": 0.3285204412719014, + "grad_norm": 0.6648365168567227, + "learning_rate": 4.866064158898631e-06, + "loss": 0.5997, + "step": 2025 + }, + { + "epoch": 0.32868267358857883, + "grad_norm": 0.5549809048091607, + "learning_rate": 4.865926226297083e-06, + "loss": 0.554, + "step": 2026 + }, + { + "epoch": 0.32884490590525634, + "grad_norm": 0.5795790571401965, + "learning_rate": 4.865788224664716e-06, + "loss": 0.6001, + "step": 2027 + }, + { + "epoch": 0.32900713822193384, + "grad_norm": 0.591020399046188, + "learning_rate": 4.865650154005556e-06, + "loss": 0.6164, + "step": 2028 + }, + { + "epoch": 0.3291693705386113, + "grad_norm": 0.5803657936986734, + "learning_rate": 4.865512014323633e-06, + "loss": 0.5976, + "step": 2029 + }, + { + "epoch": 0.3293316028552888, + "grad_norm": 0.67215531322637, + "learning_rate": 4.865373805622975e-06, + "loss": 0.5902, + "step": 2030 + }, + { + "epoch": 0.32949383517196623, + "grad_norm": 0.5992020247431968, + "learning_rate": 4.8652355279076155e-06, + "loss": 0.5989, + "step": 2031 + }, + { + "epoch": 0.32965606748864373, + "grad_norm": 0.6269654990328515, + "learning_rate": 4.86509718118159e-06, + "loss": 0.5449, + "step": 2032 + }, + { + "epoch": 0.32981829980532124, + "grad_norm": 0.6158204303513883, + "learning_rate": 4.864958765448936e-06, + "loss": 0.5978, + "step": 2033 + }, + { + "epoch": 0.3299805321219987, + "grad_norm": 0.6150293974069373, + "learning_rate": 4.864820280713689e-06, + "loss": 0.5616, + "step": 2034 + }, + { + "epoch": 0.3301427644386762, + "grad_norm": 0.5904109320133257, + "learning_rate": 4.864681726979893e-06, + "loss": 0.5793, + "step": 2035 + }, + { + "epoch": 0.3303049967553537, + "grad_norm": 0.5971575034651831, + "learning_rate": 4.864543104251587e-06, + "loss": 0.5786, + "step": 2036 + }, + { + "epoch": 0.33046722907203113, + "grad_norm": 0.5847634646464798, + "learning_rate": 4.864404412532818e-06, + "loss": 0.5828, + "step": 2037 + }, + { + "epoch": 0.33062946138870863, + "grad_norm": 0.5980986268080367, + "learning_rate": 4.864265651827632e-06, + "loss": 0.5758, + "step": 2038 + }, + { + "epoch": 0.33079169370538614, + "grad_norm": 0.6105718353973164, + "learning_rate": 4.864126822140079e-06, + "loss": 0.5837, + "step": 2039 + }, + { + "epoch": 0.3309539260220636, + "grad_norm": 0.610257403570718, + "learning_rate": 4.863987923474206e-06, + "loss": 0.5534, + "step": 2040 + }, + { + "epoch": 0.3311161583387411, + "grad_norm": 0.5662358272689563, + "learning_rate": 4.86384895583407e-06, + "loss": 0.6285, + "step": 2041 + }, + { + "epoch": 0.3312783906554186, + "grad_norm": 0.6016919095463616, + "learning_rate": 4.863709919223722e-06, + "loss": 0.5916, + "step": 2042 + }, + { + "epoch": 0.33144062297209603, + "grad_norm": 0.598219403083518, + "learning_rate": 4.8635708136472215e-06, + "loss": 0.5794, + "step": 2043 + }, + { + "epoch": 0.33160285528877353, + "grad_norm": 0.6179222505177872, + "learning_rate": 4.863431639108625e-06, + "loss": 0.579, + "step": 2044 + }, + { + "epoch": 0.331765087605451, + "grad_norm": 0.5785043871769547, + "learning_rate": 4.863292395611996e-06, + "loss": 0.5656, + "step": 2045 + }, + { + "epoch": 0.3319273199221285, + "grad_norm": 0.5932539786297548, + "learning_rate": 4.863153083161394e-06, + "loss": 0.5812, + "step": 2046 + }, + { + "epoch": 0.332089552238806, + "grad_norm": 0.5806302167612019, + "learning_rate": 4.863013701760886e-06, + "loss": 0.5862, + "step": 2047 + }, + { + "epoch": 0.33225178455548343, + "grad_norm": 0.5797680366866698, + "learning_rate": 4.862874251414537e-06, + "loss": 0.5829, + "step": 2048 + }, + { + "epoch": 0.33241401687216093, + "grad_norm": 0.5836305244009815, + "learning_rate": 4.862734732126417e-06, + "loss": 0.5762, + "step": 2049 + }, + { + "epoch": 0.33257624918883844, + "grad_norm": 0.5993854502999827, + "learning_rate": 4.862595143900597e-06, + "loss": 0.5839, + "step": 2050 + }, + { + "epoch": 0.3327384815055159, + "grad_norm": 0.6126165183000467, + "learning_rate": 4.862455486741148e-06, + "loss": 0.573, + "step": 2051 + }, + { + "epoch": 0.3329007138221934, + "grad_norm": 0.5739341543559466, + "learning_rate": 4.862315760652147e-06, + "loss": 0.6017, + "step": 2052 + }, + { + "epoch": 0.3330629461388709, + "grad_norm": 0.6016773638186519, + "learning_rate": 4.86217596563767e-06, + "loss": 0.6154, + "step": 2053 + }, + { + "epoch": 0.33322517845554833, + "grad_norm": 0.6000706946416736, + "learning_rate": 4.8620361017017945e-06, + "loss": 0.591, + "step": 2054 + }, + { + "epoch": 0.33338741077222583, + "grad_norm": 0.6337404074127956, + "learning_rate": 4.861896168848603e-06, + "loss": 0.5726, + "step": 2055 + }, + { + "epoch": 0.33354964308890334, + "grad_norm": 0.5988660118393696, + "learning_rate": 4.861756167082179e-06, + "loss": 0.5886, + "step": 2056 + }, + { + "epoch": 0.3337118754055808, + "grad_norm": 0.5841670379397415, + "learning_rate": 4.861616096406605e-06, + "loss": 0.5239, + "step": 2057 + }, + { + "epoch": 0.3338741077222583, + "grad_norm": 0.6077883280782995, + "learning_rate": 4.8614759568259685e-06, + "loss": 0.5725, + "step": 2058 + }, + { + "epoch": 0.33403634003893573, + "grad_norm": 0.6000555455811704, + "learning_rate": 4.861335748344359e-06, + "loss": 0.5577, + "step": 2059 + }, + { + "epoch": 0.33419857235561323, + "grad_norm": 0.615988170484571, + "learning_rate": 4.861195470965868e-06, + "loss": 0.5909, + "step": 2060 + }, + { + "epoch": 0.33436080467229073, + "grad_norm": 0.5583057119020419, + "learning_rate": 4.861055124694587e-06, + "loss": 0.576, + "step": 2061 + }, + { + "epoch": 0.3345230369889682, + "grad_norm": 0.5899695805069168, + "learning_rate": 4.860914709534612e-06, + "loss": 0.5703, + "step": 2062 + }, + { + "epoch": 0.3346852693056457, + "grad_norm": 0.5929928154074523, + "learning_rate": 4.860774225490038e-06, + "loss": 0.5756, + "step": 2063 + }, + { + "epoch": 0.3348475016223232, + "grad_norm": 0.5695459410211366, + "learning_rate": 4.8606336725649674e-06, + "loss": 0.5839, + "step": 2064 + }, + { + "epoch": 0.33500973393900063, + "grad_norm": 0.5923483053649946, + "learning_rate": 4.860493050763497e-06, + "loss": 0.596, + "step": 2065 + }, + { + "epoch": 0.33517196625567813, + "grad_norm": 0.6229770239071105, + "learning_rate": 4.860352360089733e-06, + "loss": 0.5885, + "step": 2066 + }, + { + "epoch": 0.33533419857235564, + "grad_norm": 0.6036373650147403, + "learning_rate": 4.860211600547778e-06, + "loss": 0.6058, + "step": 2067 + }, + { + "epoch": 0.3354964308890331, + "grad_norm": 0.5997277480485104, + "learning_rate": 4.860070772141741e-06, + "loss": 0.6178, + "step": 2068 + }, + { + "epoch": 0.3356586632057106, + "grad_norm": 0.6096980954360932, + "learning_rate": 4.859929874875731e-06, + "loss": 0.5821, + "step": 2069 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.6079445371765132, + "learning_rate": 4.859788908753856e-06, + "loss": 0.6078, + "step": 2070 + }, + { + "epoch": 0.33598312783906553, + "grad_norm": 0.5902188301382603, + "learning_rate": 4.859647873780232e-06, + "loss": 0.5782, + "step": 2071 + }, + { + "epoch": 0.33614536015574303, + "grad_norm": 0.6133441616204748, + "learning_rate": 4.859506769958973e-06, + "loss": 0.5863, + "step": 2072 + }, + { + "epoch": 0.3363075924724205, + "grad_norm": 0.606929350864094, + "learning_rate": 4.859365597294196e-06, + "loss": 0.6139, + "step": 2073 + }, + { + "epoch": 0.336469824789098, + "grad_norm": 0.5504357231355668, + "learning_rate": 4.859224355790019e-06, + "loss": 0.5971, + "step": 2074 + }, + { + "epoch": 0.3366320571057755, + "grad_norm": 0.5723111272388767, + "learning_rate": 4.859083045450565e-06, + "loss": 0.5685, + "step": 2075 + }, + { + "epoch": 0.33679428942245293, + "grad_norm": 0.5746006981058593, + "learning_rate": 4.858941666279956e-06, + "loss": 0.6157, + "step": 2076 + }, + { + "epoch": 0.33695652173913043, + "grad_norm": 0.6306830984781705, + "learning_rate": 4.858800218282316e-06, + "loss": 0.5994, + "step": 2077 + }, + { + "epoch": 0.33711875405580793, + "grad_norm": 0.5923171286989207, + "learning_rate": 4.858658701461773e-06, + "loss": 0.562, + "step": 2078 + }, + { + "epoch": 0.3372809863724854, + "grad_norm": 0.6147593480519593, + "learning_rate": 4.858517115822458e-06, + "loss": 0.6139, + "step": 2079 + }, + { + "epoch": 0.3374432186891629, + "grad_norm": 0.6046528065251544, + "learning_rate": 4.858375461368499e-06, + "loss": 0.5941, + "step": 2080 + }, + { + "epoch": 0.3376054510058404, + "grad_norm": 0.5693343241408135, + "learning_rate": 4.85823373810403e-06, + "loss": 0.5831, + "step": 2081 + }, + { + "epoch": 0.33776768332251783, + "grad_norm": 0.6055875762277583, + "learning_rate": 4.858091946033186e-06, + "loss": 0.5902, + "step": 2082 + }, + { + "epoch": 0.33792991563919533, + "grad_norm": 0.6014992577044899, + "learning_rate": 4.857950085160105e-06, + "loss": 0.5611, + "step": 2083 + }, + { + "epoch": 0.33809214795587283, + "grad_norm": 0.5899312269817251, + "learning_rate": 4.8578081554889256e-06, + "loss": 0.5881, + "step": 2084 + }, + { + "epoch": 0.3382543802725503, + "grad_norm": 0.5943686605205898, + "learning_rate": 4.857666157023788e-06, + "loss": 0.5983, + "step": 2085 + }, + { + "epoch": 0.3384166125892278, + "grad_norm": 0.5748720240583368, + "learning_rate": 4.857524089768837e-06, + "loss": 0.5614, + "step": 2086 + }, + { + "epoch": 0.33857884490590523, + "grad_norm": 0.6505134684296974, + "learning_rate": 4.857381953728217e-06, + "loss": 0.6244, + "step": 2087 + }, + { + "epoch": 0.33874107722258273, + "grad_norm": 0.5847365173467636, + "learning_rate": 4.857239748906074e-06, + "loss": 0.6116, + "step": 2088 + }, + { + "epoch": 0.33890330953926023, + "grad_norm": 0.5991290294293747, + "learning_rate": 4.857097475306558e-06, + "loss": 0.5776, + "step": 2089 + }, + { + "epoch": 0.3390655418559377, + "grad_norm": 0.589268222724266, + "learning_rate": 4.8569551329338205e-06, + "loss": 0.5986, + "step": 2090 + }, + { + "epoch": 0.3392277741726152, + "grad_norm": 0.6022712198954766, + "learning_rate": 4.8568127217920145e-06, + "loss": 0.6001, + "step": 2091 + }, + { + "epoch": 0.3393900064892927, + "grad_norm": 0.5710718921196751, + "learning_rate": 4.856670241885294e-06, + "loss": 0.5747, + "step": 2092 + }, + { + "epoch": 0.33955223880597013, + "grad_norm": 0.6371840873763449, + "learning_rate": 4.856527693217818e-06, + "loss": 0.5553, + "step": 2093 + }, + { + "epoch": 0.33971447112264763, + "grad_norm": 0.5576607743089049, + "learning_rate": 4.856385075793744e-06, + "loss": 0.5816, + "step": 2094 + }, + { + "epoch": 0.33987670343932513, + "grad_norm": 0.5774623279192348, + "learning_rate": 4.856242389617234e-06, + "loss": 0.5681, + "step": 2095 + }, + { + "epoch": 0.3400389357560026, + "grad_norm": 0.6098904214744898, + "learning_rate": 4.856099634692451e-06, + "loss": 0.5698, + "step": 2096 + }, + { + "epoch": 0.3402011680726801, + "grad_norm": 0.5672828009482832, + "learning_rate": 4.855956811023561e-06, + "loss": 0.6157, + "step": 2097 + }, + { + "epoch": 0.3403634003893576, + "grad_norm": 0.5960149329736231, + "learning_rate": 4.85581391861473e-06, + "loss": 0.5906, + "step": 2098 + }, + { + "epoch": 0.34052563270603503, + "grad_norm": 0.6121904941614972, + "learning_rate": 4.855670957470127e-06, + "loss": 0.5787, + "step": 2099 + }, + { + "epoch": 0.34068786502271253, + "grad_norm": 0.6043226286817057, + "learning_rate": 4.855527927593925e-06, + "loss": 0.5755, + "step": 2100 + }, + { + "epoch": 0.34085009733939, + "grad_norm": 0.6605681943771509, + "learning_rate": 4.855384828990295e-06, + "loss": 0.6299, + "step": 2101 + }, + { + "epoch": 0.3410123296560675, + "grad_norm": 0.5620441244241702, + "learning_rate": 4.855241661663413e-06, + "loss": 0.6266, + "step": 2102 + }, + { + "epoch": 0.341174561972745, + "grad_norm": 0.5985035589942209, + "learning_rate": 4.855098425617457e-06, + "loss": 0.5927, + "step": 2103 + }, + { + "epoch": 0.34133679428942243, + "grad_norm": 0.6197498577992421, + "learning_rate": 4.854955120856605e-06, + "loss": 0.563, + "step": 2104 + }, + { + "epoch": 0.34149902660609993, + "grad_norm": 0.5622329825531281, + "learning_rate": 4.854811747385039e-06, + "loss": 0.6023, + "step": 2105 + }, + { + "epoch": 0.34166125892277743, + "grad_norm": 0.5898666358142575, + "learning_rate": 4.854668305206942e-06, + "loss": 0.5804, + "step": 2106 + }, + { + "epoch": 0.3418234912394549, + "grad_norm": 0.5805644512932059, + "learning_rate": 4.8545247943265e-06, + "loss": 0.5967, + "step": 2107 + }, + { + "epoch": 0.3419857235561324, + "grad_norm": 0.6210473024996194, + "learning_rate": 4.854381214747898e-06, + "loss": 0.6002, + "step": 2108 + }, + { + "epoch": 0.3421479558728099, + "grad_norm": 0.6025780253179297, + "learning_rate": 4.854237566475327e-06, + "loss": 0.5396, + "step": 2109 + }, + { + "epoch": 0.34231018818948733, + "grad_norm": 0.5971817948684516, + "learning_rate": 4.854093849512978e-06, + "loss": 0.5964, + "step": 2110 + }, + { + "epoch": 0.34247242050616483, + "grad_norm": 0.604341634076739, + "learning_rate": 4.853950063865045e-06, + "loss": 0.586, + "step": 2111 + }, + { + "epoch": 0.34263465282284233, + "grad_norm": 0.6618257479742168, + "learning_rate": 4.853806209535722e-06, + "loss": 0.5317, + "step": 2112 + }, + { + "epoch": 0.3427968851395198, + "grad_norm": 0.5977480866403052, + "learning_rate": 4.853662286529207e-06, + "loss": 0.5779, + "step": 2113 + }, + { + "epoch": 0.3429591174561973, + "grad_norm": 0.6118530465321859, + "learning_rate": 4.853518294849698e-06, + "loss": 0.6113, + "step": 2114 + }, + { + "epoch": 0.3431213497728748, + "grad_norm": 0.6058933611844842, + "learning_rate": 4.853374234501398e-06, + "loss": 0.5841, + "step": 2115 + }, + { + "epoch": 0.34328358208955223, + "grad_norm": 0.5705230990777059, + "learning_rate": 4.853230105488509e-06, + "loss": 0.6102, + "step": 2116 + }, + { + "epoch": 0.34344581440622973, + "grad_norm": 0.574981922619092, + "learning_rate": 4.853085907815237e-06, + "loss": 0.5364, + "step": 2117 + }, + { + "epoch": 0.3436080467229072, + "grad_norm": 0.5943949979124892, + "learning_rate": 4.852941641485789e-06, + "loss": 0.5722, + "step": 2118 + }, + { + "epoch": 0.3437702790395847, + "grad_norm": 0.6565818909958798, + "learning_rate": 4.852797306504373e-06, + "loss": 0.595, + "step": 2119 + }, + { + "epoch": 0.3439325113562622, + "grad_norm": 0.6102855707482472, + "learning_rate": 4.8526529028752025e-06, + "loss": 0.5868, + "step": 2120 + }, + { + "epoch": 0.34409474367293963, + "grad_norm": 0.6176270159142836, + "learning_rate": 4.85250843060249e-06, + "loss": 0.5894, + "step": 2121 + }, + { + "epoch": 0.34425697598961713, + "grad_norm": 0.5806281562515374, + "learning_rate": 4.8523638896904505e-06, + "loss": 0.5639, + "step": 2122 + }, + { + "epoch": 0.34441920830629463, + "grad_norm": 0.6302958815819045, + "learning_rate": 4.852219280143301e-06, + "loss": 0.5695, + "step": 2123 + }, + { + "epoch": 0.3445814406229721, + "grad_norm": 0.5880940533036347, + "learning_rate": 4.852074601965261e-06, + "loss": 0.611, + "step": 2124 + }, + { + "epoch": 0.3447436729396496, + "grad_norm": 0.6225466576422233, + "learning_rate": 4.851929855160552e-06, + "loss": 0.5873, + "step": 2125 + }, + { + "epoch": 0.3449059052563271, + "grad_norm": 0.5565302111588255, + "learning_rate": 4.851785039733398e-06, + "loss": 0.6179, + "step": 2126 + }, + { + "epoch": 0.34506813757300453, + "grad_norm": 0.5830949347167043, + "learning_rate": 4.851640155688022e-06, + "loss": 0.5867, + "step": 2127 + }, + { + "epoch": 0.34523036988968203, + "grad_norm": 0.642970827820857, + "learning_rate": 4.851495203028655e-06, + "loss": 0.6142, + "step": 2128 + }, + { + "epoch": 0.34539260220635953, + "grad_norm": 0.6036568899195913, + "learning_rate": 4.851350181759522e-06, + "loss": 0.6046, + "step": 2129 + }, + { + "epoch": 0.345554834523037, + "grad_norm": 0.6079421928133156, + "learning_rate": 4.851205091884857e-06, + "loss": 0.5687, + "step": 2130 + }, + { + "epoch": 0.3457170668397145, + "grad_norm": 0.5684918384409361, + "learning_rate": 4.8510599334088936e-06, + "loss": 0.561, + "step": 2131 + }, + { + "epoch": 0.34587929915639193, + "grad_norm": 0.6085712806996202, + "learning_rate": 4.850914706335865e-06, + "loss": 0.5916, + "step": 2132 + }, + { + "epoch": 0.34604153147306943, + "grad_norm": 0.6544444958384408, + "learning_rate": 4.85076941067001e-06, + "loss": 0.5894, + "step": 2133 + }, + { + "epoch": 0.34620376378974693, + "grad_norm": 0.5927568554597471, + "learning_rate": 4.850624046415567e-06, + "loss": 0.5971, + "step": 2134 + }, + { + "epoch": 0.3463659961064244, + "grad_norm": 0.5681455016702814, + "learning_rate": 4.850478613576779e-06, + "loss": 0.5583, + "step": 2135 + }, + { + "epoch": 0.3465282284231019, + "grad_norm": 0.6074166030865567, + "learning_rate": 4.850333112157888e-06, + "loss": 0.5952, + "step": 2136 + }, + { + "epoch": 0.3466904607397794, + "grad_norm": 0.5839655866876474, + "learning_rate": 4.85018754216314e-06, + "loss": 0.617, + "step": 2137 + }, + { + "epoch": 0.34685269305645683, + "grad_norm": 0.6050657950849183, + "learning_rate": 4.850041903596781e-06, + "loss": 0.5594, + "step": 2138 + }, + { + "epoch": 0.34701492537313433, + "grad_norm": 0.6066281140545428, + "learning_rate": 4.84989619646306e-06, + "loss": 0.586, + "step": 2139 + }, + { + "epoch": 0.34717715768981183, + "grad_norm": 0.5672874642630896, + "learning_rate": 4.84975042076623e-06, + "loss": 0.5783, + "step": 2140 + }, + { + "epoch": 0.3473393900064893, + "grad_norm": 0.6239771129930048, + "learning_rate": 4.849604576510545e-06, + "loss": 0.6026, + "step": 2141 + }, + { + "epoch": 0.3475016223231668, + "grad_norm": 0.6070610381934751, + "learning_rate": 4.849458663700259e-06, + "loss": 0.5706, + "step": 2142 + }, + { + "epoch": 0.3476638546398443, + "grad_norm": 0.5883480361540121, + "learning_rate": 4.849312682339628e-06, + "loss": 0.5981, + "step": 2143 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.5875144644294449, + "learning_rate": 4.849166632432914e-06, + "loss": 0.5899, + "step": 2144 + }, + { + "epoch": 0.34798831927319923, + "grad_norm": 0.6317524700636579, + "learning_rate": 4.849020513984376e-06, + "loss": 0.5775, + "step": 2145 + }, + { + "epoch": 0.3481505515898767, + "grad_norm": 0.6029246627854529, + "learning_rate": 4.848874326998279e-06, + "loss": 0.576, + "step": 2146 + }, + { + "epoch": 0.3483127839065542, + "grad_norm": 0.5828626088373333, + "learning_rate": 4.848728071478887e-06, + "loss": 0.6166, + "step": 2147 + }, + { + "epoch": 0.3484750162232317, + "grad_norm": 0.597509373320515, + "learning_rate": 4.848581747430468e-06, + "loss": 0.5905, + "step": 2148 + }, + { + "epoch": 0.34863724853990913, + "grad_norm": 0.5814239055407198, + "learning_rate": 4.8484353548572915e-06, + "loss": 0.5987, + "step": 2149 + }, + { + "epoch": 0.34879948085658663, + "grad_norm": 0.5933556073404228, + "learning_rate": 4.848288893763628e-06, + "loss": 0.5688, + "step": 2150 + }, + { + "epoch": 0.34896171317326413, + "grad_norm": 0.638850653044345, + "learning_rate": 4.848142364153752e-06, + "loss": 0.5994, + "step": 2151 + }, + { + "epoch": 0.3491239454899416, + "grad_norm": 0.5774396736416015, + "learning_rate": 4.8479957660319375e-06, + "loss": 0.5762, + "step": 2152 + }, + { + "epoch": 0.3492861778066191, + "grad_norm": 0.5690456629592966, + "learning_rate": 4.847849099402463e-06, + "loss": 0.5969, + "step": 2153 + }, + { + "epoch": 0.3494484101232966, + "grad_norm": 0.5609677830292763, + "learning_rate": 4.847702364269607e-06, + "loss": 0.5755, + "step": 2154 + }, + { + "epoch": 0.34961064243997403, + "grad_norm": 0.5531488478240257, + "learning_rate": 4.847555560637651e-06, + "loss": 0.5693, + "step": 2155 + }, + { + "epoch": 0.34977287475665153, + "grad_norm": 0.5945216486642578, + "learning_rate": 4.847408688510878e-06, + "loss": 0.5917, + "step": 2156 + }, + { + "epoch": 0.34993510707332903, + "grad_norm": 0.5905048868350979, + "learning_rate": 4.8472617478935744e-06, + "loss": 0.5882, + "step": 2157 + }, + { + "epoch": 0.3500973393900065, + "grad_norm": 0.5955530891479405, + "learning_rate": 4.847114738790026e-06, + "loss": 0.5691, + "step": 2158 + }, + { + "epoch": 0.350259571706684, + "grad_norm": 0.5885250028335679, + "learning_rate": 4.8469676612045236e-06, + "loss": 0.564, + "step": 2159 + }, + { + "epoch": 0.3504218040233614, + "grad_norm": 0.5993969516103508, + "learning_rate": 4.846820515141357e-06, + "loss": 0.5896, + "step": 2160 + }, + { + "epoch": 0.35058403634003893, + "grad_norm": 0.6067592055173696, + "learning_rate": 4.84667330060482e-06, + "loss": 0.5929, + "step": 2161 + }, + { + "epoch": 0.35074626865671643, + "grad_norm": 0.5805242511732717, + "learning_rate": 4.846526017599209e-06, + "loss": 0.6002, + "step": 2162 + }, + { + "epoch": 0.3509085009733939, + "grad_norm": 0.6311214299090548, + "learning_rate": 4.84637866612882e-06, + "loss": 0.6127, + "step": 2163 + }, + { + "epoch": 0.3510707332900714, + "grad_norm": 0.5853862755914331, + "learning_rate": 4.846231246197954e-06, + "loss": 0.6154, + "step": 2164 + }, + { + "epoch": 0.3512329656067489, + "grad_norm": 0.5664105786743479, + "learning_rate": 4.846083757810909e-06, + "loss": 0.5823, + "step": 2165 + }, + { + "epoch": 0.35139519792342633, + "grad_norm": 0.6057263147587829, + "learning_rate": 4.845936200971991e-06, + "loss": 0.5972, + "step": 2166 + }, + { + "epoch": 0.35155743024010383, + "grad_norm": 0.6103206849564754, + "learning_rate": 4.845788575685505e-06, + "loss": 0.5754, + "step": 2167 + }, + { + "epoch": 0.35171966255678133, + "grad_norm": 0.5958704727307869, + "learning_rate": 4.845640881955757e-06, + "loss": 0.5915, + "step": 2168 + }, + { + "epoch": 0.3518818948734588, + "grad_norm": 0.6090401714484254, + "learning_rate": 4.845493119787057e-06, + "loss": 0.5943, + "step": 2169 + }, + { + "epoch": 0.3520441271901363, + "grad_norm": 0.5889113963377794, + "learning_rate": 4.8453452891837175e-06, + "loss": 0.5842, + "step": 2170 + }, + { + "epoch": 0.3522063595068138, + "grad_norm": 0.5699140151232801, + "learning_rate": 4.845197390150049e-06, + "loss": 0.5908, + "step": 2171 + }, + { + "epoch": 0.35236859182349123, + "grad_norm": 0.5981175649563588, + "learning_rate": 4.845049422690369e-06, + "loss": 0.59, + "step": 2172 + }, + { + "epoch": 0.35253082414016873, + "grad_norm": 0.5941614298587234, + "learning_rate": 4.844901386808994e-06, + "loss": 0.6223, + "step": 2173 + }, + { + "epoch": 0.3526930564568462, + "grad_norm": 0.5680426028938997, + "learning_rate": 4.8447532825102445e-06, + "loss": 0.6176, + "step": 2174 + }, + { + "epoch": 0.3528552887735237, + "grad_norm": 0.6141105363124546, + "learning_rate": 4.8446051097984395e-06, + "loss": 0.6155, + "step": 2175 + }, + { + "epoch": 0.3530175210902012, + "grad_norm": 0.587093731965574, + "learning_rate": 4.844456868677904e-06, + "loss": 0.5527, + "step": 2176 + }, + { + "epoch": 0.3531797534068786, + "grad_norm": 0.5648519881796872, + "learning_rate": 4.844308559152963e-06, + "loss": 0.5717, + "step": 2177 + }, + { + "epoch": 0.35334198572355613, + "grad_norm": 0.5941790677340844, + "learning_rate": 4.844160181227943e-06, + "loss": 0.6071, + "step": 2178 + }, + { + "epoch": 0.35350421804023363, + "grad_norm": 0.6282903072569948, + "learning_rate": 4.844011734907172e-06, + "loss": 0.5969, + "step": 2179 + }, + { + "epoch": 0.3536664503569111, + "grad_norm": 0.8136394308455064, + "learning_rate": 4.843863220194985e-06, + "loss": 0.5688, + "step": 2180 + }, + { + "epoch": 0.3538286826735886, + "grad_norm": 0.6005782423313825, + "learning_rate": 4.8437146370957125e-06, + "loss": 0.596, + "step": 2181 + }, + { + "epoch": 0.3539909149902661, + "grad_norm": 0.602549225988653, + "learning_rate": 4.84356598561369e-06, + "loss": 0.5411, + "step": 2182 + }, + { + "epoch": 0.3541531473069435, + "grad_norm": 0.6101318538730691, + "learning_rate": 4.843417265753255e-06, + "loss": 0.6121, + "step": 2183 + }, + { + "epoch": 0.35431537962362103, + "grad_norm": 0.5704323615796999, + "learning_rate": 4.8432684775187464e-06, + "loss": 0.5501, + "step": 2184 + }, + { + "epoch": 0.35447761194029853, + "grad_norm": 0.6193547873836247, + "learning_rate": 4.843119620914506e-06, + "loss": 0.574, + "step": 2185 + }, + { + "epoch": 0.354639844256976, + "grad_norm": 0.6088140088062853, + "learning_rate": 4.842970695944877e-06, + "loss": 0.615, + "step": 2186 + }, + { + "epoch": 0.3548020765736535, + "grad_norm": 0.5668661976702817, + "learning_rate": 4.842821702614204e-06, + "loss": 0.5688, + "step": 2187 + }, + { + "epoch": 0.3549643088903309, + "grad_norm": 0.5722282231985635, + "learning_rate": 4.8426726409268346e-06, + "loss": 0.585, + "step": 2188 + }, + { + "epoch": 0.35512654120700843, + "grad_norm": 0.5810209595500886, + "learning_rate": 4.842523510887118e-06, + "loss": 0.5824, + "step": 2189 + }, + { + "epoch": 0.35528877352368593, + "grad_norm": 0.5859548829181034, + "learning_rate": 4.842374312499405e-06, + "loss": 0.5657, + "step": 2190 + }, + { + "epoch": 0.3554510058403634, + "grad_norm": 0.546740375891541, + "learning_rate": 4.842225045768049e-06, + "loss": 0.5448, + "step": 2191 + }, + { + "epoch": 0.3556132381570409, + "grad_norm": 0.557663039649576, + "learning_rate": 4.842075710697405e-06, + "loss": 0.6253, + "step": 2192 + }, + { + "epoch": 0.3557754704737184, + "grad_norm": 0.5907314634570128, + "learning_rate": 4.841926307291831e-06, + "loss": 0.5846, + "step": 2193 + }, + { + "epoch": 0.3559377027903958, + "grad_norm": 0.582846865357071, + "learning_rate": 4.841776835555685e-06, + "loss": 0.5756, + "step": 2194 + }, + { + "epoch": 0.35609993510707333, + "grad_norm": 0.5800816419967363, + "learning_rate": 4.841627295493329e-06, + "loss": 0.6068, + "step": 2195 + }, + { + "epoch": 0.35626216742375083, + "grad_norm": 0.5987268410715805, + "learning_rate": 4.841477687109126e-06, + "loss": 0.6137, + "step": 2196 + }, + { + "epoch": 0.3564243997404283, + "grad_norm": 0.5953655465333801, + "learning_rate": 4.84132801040744e-06, + "loss": 0.5795, + "step": 2197 + }, + { + "epoch": 0.3565866320571058, + "grad_norm": 0.5938635841905072, + "learning_rate": 4.84117826539264e-06, + "loss": 0.5937, + "step": 2198 + }, + { + "epoch": 0.3567488643737833, + "grad_norm": 0.612620401475451, + "learning_rate": 4.841028452069094e-06, + "loss": 0.5797, + "step": 2199 + }, + { + "epoch": 0.3569110966904607, + "grad_norm": 0.6088956896769624, + "learning_rate": 4.840878570441173e-06, + "loss": 0.5707, + "step": 2200 + }, + { + "epoch": 0.35707332900713823, + "grad_norm": 0.6331624769557069, + "learning_rate": 4.840728620513251e-06, + "loss": 0.5582, + "step": 2201 + }, + { + "epoch": 0.35723556132381573, + "grad_norm": 0.6039639818697221, + "learning_rate": 4.840578602289702e-06, + "loss": 0.5687, + "step": 2202 + }, + { + "epoch": 0.3573977936404932, + "grad_norm": 0.5954906548892649, + "learning_rate": 4.840428515774904e-06, + "loss": 0.5791, + "step": 2203 + }, + { + "epoch": 0.3575600259571707, + "grad_norm": 0.5734212386614962, + "learning_rate": 4.840278360973235e-06, + "loss": 0.5647, + "step": 2204 + }, + { + "epoch": 0.3577222582738481, + "grad_norm": 0.5607044436224147, + "learning_rate": 4.8401281378890775e-06, + "loss": 0.5925, + "step": 2205 + }, + { + "epoch": 0.35788449059052563, + "grad_norm": 0.6126468574667382, + "learning_rate": 4.8399778465268136e-06, + "loss": 0.5497, + "step": 2206 + }, + { + "epoch": 0.35804672290720313, + "grad_norm": 0.5837041750741918, + "learning_rate": 4.839827486890829e-06, + "loss": 0.5794, + "step": 2207 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 0.6355251089605064, + "learning_rate": 4.83967705898551e-06, + "loss": 0.5854, + "step": 2208 + }, + { + "epoch": 0.3583711875405581, + "grad_norm": 0.580874957227364, + "learning_rate": 4.839526562815246e-06, + "loss": 0.6053, + "step": 2209 + }, + { + "epoch": 0.3585334198572356, + "grad_norm": 0.6169871808662765, + "learning_rate": 4.839375998384428e-06, + "loss": 0.5932, + "step": 2210 + }, + { + "epoch": 0.358695652173913, + "grad_norm": 0.5524336307081513, + "learning_rate": 4.83922536569745e-06, + "loss": 0.5757, + "step": 2211 + }, + { + "epoch": 0.35885788449059053, + "grad_norm": 0.6257430656630173, + "learning_rate": 4.839074664758705e-06, + "loss": 0.5637, + "step": 2212 + }, + { + "epoch": 0.35902011680726803, + "grad_norm": 0.6029731666081886, + "learning_rate": 4.838923895572591e-06, + "loss": 0.6077, + "step": 2213 + }, + { + "epoch": 0.3591823491239455, + "grad_norm": 0.5646788849797524, + "learning_rate": 4.838773058143507e-06, + "loss": 0.5643, + "step": 2214 + }, + { + "epoch": 0.359344581440623, + "grad_norm": 0.548828739954999, + "learning_rate": 4.838622152475855e-06, + "loss": 0.5905, + "step": 2215 + }, + { + "epoch": 0.3595068137573005, + "grad_norm": 0.5619892283204587, + "learning_rate": 4.8384711785740365e-06, + "loss": 0.567, + "step": 2216 + }, + { + "epoch": 0.3596690460739779, + "grad_norm": 0.6118042240399497, + "learning_rate": 4.838320136442458e-06, + "loss": 0.5815, + "step": 2217 + }, + { + "epoch": 0.35983127839065543, + "grad_norm": 0.6181120939032734, + "learning_rate": 4.8381690260855255e-06, + "loss": 0.5758, + "step": 2218 + }, + { + "epoch": 0.3599935107073329, + "grad_norm": 0.6097699829083567, + "learning_rate": 4.8380178475076465e-06, + "loss": 0.5904, + "step": 2219 + }, + { + "epoch": 0.3601557430240104, + "grad_norm": 0.5847271529077546, + "learning_rate": 4.837866600713235e-06, + "loss": 0.6121, + "step": 2220 + }, + { + "epoch": 0.3603179753406879, + "grad_norm": 0.5754183125806094, + "learning_rate": 4.837715285706701e-06, + "loss": 0.6296, + "step": 2221 + }, + { + "epoch": 0.3604802076573653, + "grad_norm": 0.6446651794392131, + "learning_rate": 4.837563902492462e-06, + "loss": 0.5674, + "step": 2222 + }, + { + "epoch": 0.3606424399740428, + "grad_norm": 0.5991548599582475, + "learning_rate": 4.837412451074933e-06, + "loss": 0.5586, + "step": 2223 + }, + { + "epoch": 0.36080467229072033, + "grad_norm": 0.6957371420453982, + "learning_rate": 4.837260931458535e-06, + "loss": 0.5726, + "step": 2224 + }, + { + "epoch": 0.3609669046073978, + "grad_norm": 0.5821038663514689, + "learning_rate": 4.837109343647687e-06, + "loss": 0.5913, + "step": 2225 + }, + { + "epoch": 0.3611291369240753, + "grad_norm": 0.5594270904739752, + "learning_rate": 4.836957687646811e-06, + "loss": 0.5774, + "step": 2226 + }, + { + "epoch": 0.3612913692407528, + "grad_norm": 0.5912874496660436, + "learning_rate": 4.836805963460335e-06, + "loss": 0.5879, + "step": 2227 + }, + { + "epoch": 0.3614536015574302, + "grad_norm": 0.5784965613196391, + "learning_rate": 4.836654171092683e-06, + "loss": 0.5568, + "step": 2228 + }, + { + "epoch": 0.36161583387410773, + "grad_norm": 0.6038938195063456, + "learning_rate": 4.836502310548286e-06, + "loss": 0.5947, + "step": 2229 + }, + { + "epoch": 0.36177806619078523, + "grad_norm": 0.565985345079839, + "learning_rate": 4.836350381831574e-06, + "loss": 0.5855, + "step": 2230 + }, + { + "epoch": 0.3619402985074627, + "grad_norm": 0.6102041738754433, + "learning_rate": 4.836198384946978e-06, + "loss": 0.5682, + "step": 2231 + }, + { + "epoch": 0.3621025308241402, + "grad_norm": 0.5864611107790023, + "learning_rate": 4.8360463198989365e-06, + "loss": 0.6198, + "step": 2232 + }, + { + "epoch": 0.3622647631408176, + "grad_norm": 0.6133855716281266, + "learning_rate": 4.835894186691884e-06, + "loss": 0.5788, + "step": 2233 + }, + { + "epoch": 0.3624269954574951, + "grad_norm": 0.5781962530885957, + "learning_rate": 4.835741985330259e-06, + "loss": 0.5782, + "step": 2234 + }, + { + "epoch": 0.36258922777417263, + "grad_norm": 0.5966385944493345, + "learning_rate": 4.835589715818504e-06, + "loss": 0.555, + "step": 2235 + }, + { + "epoch": 0.3627514600908501, + "grad_norm": 0.7233161503960781, + "learning_rate": 4.83543737816106e-06, + "loss": 0.5812, + "step": 2236 + }, + { + "epoch": 0.3629136924075276, + "grad_norm": 0.585548371272269, + "learning_rate": 4.835284972362373e-06, + "loss": 0.6054, + "step": 2237 + }, + { + "epoch": 0.3630759247242051, + "grad_norm": 0.5592299360203586, + "learning_rate": 4.835132498426889e-06, + "loss": 0.5855, + "step": 2238 + }, + { + "epoch": 0.3632381570408825, + "grad_norm": 0.5987252105679711, + "learning_rate": 4.8349799563590564e-06, + "loss": 0.587, + "step": 2239 + }, + { + "epoch": 0.36340038935756, + "grad_norm": 0.5534770969238942, + "learning_rate": 4.834827346163328e-06, + "loss": 0.6037, + "step": 2240 + }, + { + "epoch": 0.36356262167423753, + "grad_norm": 0.5746648906778583, + "learning_rate": 4.834674667844153e-06, + "loss": 0.5692, + "step": 2241 + }, + { + "epoch": 0.363724853990915, + "grad_norm": 0.5735893415259803, + "learning_rate": 4.83452192140599e-06, + "loss": 0.6266, + "step": 2242 + }, + { + "epoch": 0.3638870863075925, + "grad_norm": 0.6120163084985976, + "learning_rate": 4.8343691068532936e-06, + "loss": 0.5544, + "step": 2243 + }, + { + "epoch": 0.36404931862427, + "grad_norm": 0.6154524102997592, + "learning_rate": 4.834216224190522e-06, + "loss": 0.5968, + "step": 2244 + }, + { + "epoch": 0.3642115509409474, + "grad_norm": 0.6211292001350668, + "learning_rate": 4.834063273422137e-06, + "loss": 0.5904, + "step": 2245 + }, + { + "epoch": 0.36437378325762493, + "grad_norm": 0.5935665103319984, + "learning_rate": 4.8339102545526015e-06, + "loss": 0.5466, + "step": 2246 + }, + { + "epoch": 0.3645360155743024, + "grad_norm": 0.6186527941654721, + "learning_rate": 4.833757167586379e-06, + "loss": 0.5695, + "step": 2247 + }, + { + "epoch": 0.3646982478909799, + "grad_norm": 0.5735388629106659, + "learning_rate": 4.833604012527937e-06, + "loss": 0.5898, + "step": 2248 + }, + { + "epoch": 0.3648604802076574, + "grad_norm": 0.6387001013350714, + "learning_rate": 4.833450789381743e-06, + "loss": 0.5859, + "step": 2249 + }, + { + "epoch": 0.3650227125243348, + "grad_norm": 0.5982665846794116, + "learning_rate": 4.833297498152269e-06, + "loss": 0.5532, + "step": 2250 + }, + { + "epoch": 0.3651849448410123, + "grad_norm": 0.6162458979604307, + "learning_rate": 4.833144138843987e-06, + "loss": 0.5628, + "step": 2251 + }, + { + "epoch": 0.36534717715768983, + "grad_norm": 0.6297463812336666, + "learning_rate": 4.832990711461372e-06, + "loss": 0.5736, + "step": 2252 + }, + { + "epoch": 0.3655094094743673, + "grad_norm": 0.5961445291821609, + "learning_rate": 4.832837216008899e-06, + "loss": 0.6095, + "step": 2253 + }, + { + "epoch": 0.3656716417910448, + "grad_norm": 0.6232160106764657, + "learning_rate": 4.832683652491048e-06, + "loss": 0.6024, + "step": 2254 + }, + { + "epoch": 0.3658338741077223, + "grad_norm": 0.5921103685029948, + "learning_rate": 4.832530020912299e-06, + "loss": 0.5874, + "step": 2255 + }, + { + "epoch": 0.3659961064243997, + "grad_norm": 0.5733211713846806, + "learning_rate": 4.832376321277136e-06, + "loss": 0.5974, + "step": 2256 + }, + { + "epoch": 0.3661583387410772, + "grad_norm": 0.6118591108127222, + "learning_rate": 4.832222553590041e-06, + "loss": 0.5972, + "step": 2257 + }, + { + "epoch": 0.36632057105775473, + "grad_norm": 0.5969443480627727, + "learning_rate": 4.832068717855501e-06, + "loss": 0.6121, + "step": 2258 + }, + { + "epoch": 0.3664828033744322, + "grad_norm": 0.5956280949139843, + "learning_rate": 4.831914814078007e-06, + "loss": 0.6176, + "step": 2259 + }, + { + "epoch": 0.3666450356911097, + "grad_norm": 0.5930587310442242, + "learning_rate": 4.831760842262047e-06, + "loss": 0.5817, + "step": 2260 + }, + { + "epoch": 0.3668072680077871, + "grad_norm": 0.6216943267631627, + "learning_rate": 4.831606802412114e-06, + "loss": 0.6129, + "step": 2261 + }, + { + "epoch": 0.3669695003244646, + "grad_norm": 0.6390246404289197, + "learning_rate": 4.831452694532702e-06, + "loss": 0.5726, + "step": 2262 + }, + { + "epoch": 0.36713173264114213, + "grad_norm": 0.5849813144581058, + "learning_rate": 4.831298518628309e-06, + "loss": 0.5557, + "step": 2263 + }, + { + "epoch": 0.3672939649578196, + "grad_norm": 0.588985346359331, + "learning_rate": 4.831144274703432e-06, + "loss": 0.5799, + "step": 2264 + }, + { + "epoch": 0.3674561972744971, + "grad_norm": 0.5907670103029824, + "learning_rate": 4.830989962762571e-06, + "loss": 0.5979, + "step": 2265 + }, + { + "epoch": 0.3676184295911746, + "grad_norm": 0.5758948810386798, + "learning_rate": 4.83083558281023e-06, + "loss": 0.5803, + "step": 2266 + }, + { + "epoch": 0.367780661907852, + "grad_norm": 0.5905366517739907, + "learning_rate": 4.830681134850912e-06, + "loss": 0.5667, + "step": 2267 + }, + { + "epoch": 0.3679428942245295, + "grad_norm": 0.6079129391896971, + "learning_rate": 4.830526618889124e-06, + "loss": 0.5893, + "step": 2268 + }, + { + "epoch": 0.36810512654120703, + "grad_norm": 0.5616603027358492, + "learning_rate": 4.830372034929375e-06, + "loss": 0.5786, + "step": 2269 + }, + { + "epoch": 0.3682673588578845, + "grad_norm": 0.6068091321568493, + "learning_rate": 4.830217382976173e-06, + "loss": 0.566, + "step": 2270 + }, + { + "epoch": 0.368429591174562, + "grad_norm": 0.6176971875624573, + "learning_rate": 4.830062663034032e-06, + "loss": 0.601, + "step": 2271 + }, + { + "epoch": 0.3685918234912395, + "grad_norm": 0.6098112263641773, + "learning_rate": 4.8299078751074665e-06, + "loss": 0.5852, + "step": 2272 + }, + { + "epoch": 0.3687540558079169, + "grad_norm": 0.5726137865049531, + "learning_rate": 4.829753019200992e-06, + "loss": 0.6097, + "step": 2273 + }, + { + "epoch": 0.3689162881245944, + "grad_norm": 0.5805436457662064, + "learning_rate": 4.829598095319128e-06, + "loss": 0.5736, + "step": 2274 + }, + { + "epoch": 0.36907852044127193, + "grad_norm": 0.5912370456633951, + "learning_rate": 4.8294431034663925e-06, + "loss": 0.5411, + "step": 2275 + }, + { + "epoch": 0.3692407527579494, + "grad_norm": 0.5691577269063307, + "learning_rate": 4.82928804364731e-06, + "loss": 0.5234, + "step": 2276 + }, + { + "epoch": 0.3694029850746269, + "grad_norm": 0.6410076275210854, + "learning_rate": 4.829132915866402e-06, + "loss": 0.5766, + "step": 2277 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 0.5918127275479739, + "learning_rate": 4.828977720128198e-06, + "loss": 0.5735, + "step": 2278 + }, + { + "epoch": 0.3697274497079818, + "grad_norm": 0.5776030590252864, + "learning_rate": 4.8288224564372236e-06, + "loss": 0.5889, + "step": 2279 + }, + { + "epoch": 0.3698896820246593, + "grad_norm": 0.5709658330323572, + "learning_rate": 4.82866712479801e-06, + "loss": 0.5795, + "step": 2280 + }, + { + "epoch": 0.3700519143413368, + "grad_norm": 0.581982716929779, + "learning_rate": 4.828511725215089e-06, + "loss": 0.6026, + "step": 2281 + }, + { + "epoch": 0.3702141466580143, + "grad_norm": 0.5817567074970003, + "learning_rate": 4.828356257692995e-06, + "loss": 0.5872, + "step": 2282 + }, + { + "epoch": 0.3703763789746918, + "grad_norm": 0.5899538847639341, + "learning_rate": 4.828200722236265e-06, + "loss": 0.5896, + "step": 2283 + }, + { + "epoch": 0.3705386112913692, + "grad_norm": 0.5810512551019292, + "learning_rate": 4.828045118849435e-06, + "loss": 0.5704, + "step": 2284 + }, + { + "epoch": 0.3707008436080467, + "grad_norm": 0.6075203836448286, + "learning_rate": 4.8278894475370455e-06, + "loss": 0.5792, + "step": 2285 + }, + { + "epoch": 0.37086307592472423, + "grad_norm": 0.5908108160788682, + "learning_rate": 4.82773370830364e-06, + "loss": 0.5694, + "step": 2286 + }, + { + "epoch": 0.3710253082414017, + "grad_norm": 0.5729377676278816, + "learning_rate": 4.827577901153761e-06, + "loss": 0.5624, + "step": 2287 + }, + { + "epoch": 0.3711875405580792, + "grad_norm": 0.5979848564155273, + "learning_rate": 4.827422026091955e-06, + "loss": 0.571, + "step": 2288 + }, + { + "epoch": 0.3713497728747567, + "grad_norm": 0.598039357720397, + "learning_rate": 4.827266083122772e-06, + "loss": 0.5545, + "step": 2289 + }, + { + "epoch": 0.3715120051914341, + "grad_norm": 0.614489083992024, + "learning_rate": 4.827110072250758e-06, + "loss": 0.5919, + "step": 2290 + }, + { + "epoch": 0.3716742375081116, + "grad_norm": 0.6041830069627625, + "learning_rate": 4.826953993480467e-06, + "loss": 0.5629, + "step": 2291 + }, + { + "epoch": 0.3718364698247891, + "grad_norm": 0.6202583711409788, + "learning_rate": 4.8267978468164535e-06, + "loss": 0.5801, + "step": 2292 + }, + { + "epoch": 0.3719987021414666, + "grad_norm": 0.5991396419195762, + "learning_rate": 4.826641632263273e-06, + "loss": 0.6067, + "step": 2293 + }, + { + "epoch": 0.3721609344581441, + "grad_norm": 0.5945929768201152, + "learning_rate": 4.826485349825482e-06, + "loss": 0.5643, + "step": 2294 + }, + { + "epoch": 0.3723231667748215, + "grad_norm": 0.6041271562307738, + "learning_rate": 4.8263289995076435e-06, + "loss": 0.5693, + "step": 2295 + }, + { + "epoch": 0.372485399091499, + "grad_norm": 0.5826672414036546, + "learning_rate": 4.826172581314317e-06, + "loss": 0.5954, + "step": 2296 + }, + { + "epoch": 0.3726476314081765, + "grad_norm": 0.6033006929786353, + "learning_rate": 4.826016095250066e-06, + "loss": 0.5479, + "step": 2297 + }, + { + "epoch": 0.372809863724854, + "grad_norm": 0.594432281371909, + "learning_rate": 4.825859541319458e-06, + "loss": 0.5617, + "step": 2298 + }, + { + "epoch": 0.3729720960415315, + "grad_norm": 0.6155322703814352, + "learning_rate": 4.825702919527058e-06, + "loss": 0.5443, + "step": 2299 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.6282747779184974, + "learning_rate": 4.825546229877439e-06, + "loss": 0.5987, + "step": 2300 + }, + { + "epoch": 0.3732965606748864, + "grad_norm": 0.6006248440068245, + "learning_rate": 4.8253894723751716e-06, + "loss": 0.5455, + "step": 2301 + }, + { + "epoch": 0.3734587929915639, + "grad_norm": 0.606170778750596, + "learning_rate": 4.825232647024829e-06, + "loss": 0.5842, + "step": 2302 + }, + { + "epoch": 0.37362102530824143, + "grad_norm": 0.6027431698828688, + "learning_rate": 4.825075753830987e-06, + "loss": 0.554, + "step": 2303 + }, + { + "epoch": 0.3737832576249189, + "grad_norm": 0.6062022250581242, + "learning_rate": 4.824918792798222e-06, + "loss": 0.6072, + "step": 2304 + }, + { + "epoch": 0.3739454899415964, + "grad_norm": 0.6037600698198704, + "learning_rate": 4.824761763931117e-06, + "loss": 0.6091, + "step": 2305 + }, + { + "epoch": 0.3741077222582738, + "grad_norm": 0.5847341856394601, + "learning_rate": 4.824604667234252e-06, + "loss": 0.5693, + "step": 2306 + }, + { + "epoch": 0.3742699545749513, + "grad_norm": 0.6046203378178482, + "learning_rate": 4.8244475027122095e-06, + "loss": 0.5824, + "step": 2307 + }, + { + "epoch": 0.3744321868916288, + "grad_norm": 0.5658741935851979, + "learning_rate": 4.824290270369576e-06, + "loss": 0.575, + "step": 2308 + }, + { + "epoch": 0.3745944192083063, + "grad_norm": 0.5914503251933607, + "learning_rate": 4.824132970210939e-06, + "loss": 0.5709, + "step": 2309 + }, + { + "epoch": 0.3747566515249838, + "grad_norm": 0.6073281113274399, + "learning_rate": 4.823975602240889e-06, + "loss": 0.5924, + "step": 2310 + }, + { + "epoch": 0.3749188838416613, + "grad_norm": 0.6012462480395058, + "learning_rate": 4.823818166464016e-06, + "loss": 0.5528, + "step": 2311 + }, + { + "epoch": 0.3750811161583387, + "grad_norm": 0.5946842254488881, + "learning_rate": 4.823660662884915e-06, + "loss": 0.5455, + "step": 2312 + }, + { + "epoch": 0.3752433484750162, + "grad_norm": 0.6108194544337752, + "learning_rate": 4.823503091508179e-06, + "loss": 0.569, + "step": 2313 + }, + { + "epoch": 0.3754055807916937, + "grad_norm": 0.5986077890849886, + "learning_rate": 4.8233454523384085e-06, + "loss": 0.5628, + "step": 2314 + }, + { + "epoch": 0.3755678131083712, + "grad_norm": 0.6114606245893456, + "learning_rate": 4.8231877453802015e-06, + "loss": 0.6247, + "step": 2315 + }, + { + "epoch": 0.3757300454250487, + "grad_norm": 0.6098592469714084, + "learning_rate": 4.823029970638159e-06, + "loss": 0.593, + "step": 2316 + }, + { + "epoch": 0.3758922777417262, + "grad_norm": 0.5729791551288317, + "learning_rate": 4.822872128116885e-06, + "loss": 0.59, + "step": 2317 + }, + { + "epoch": 0.3760545100584036, + "grad_norm": 0.6068056785387306, + "learning_rate": 4.822714217820986e-06, + "loss": 0.5627, + "step": 2318 + }, + { + "epoch": 0.3762167423750811, + "grad_norm": 0.6030086896468344, + "learning_rate": 4.822556239755068e-06, + "loss": 0.5513, + "step": 2319 + }, + { + "epoch": 0.37637897469175857, + "grad_norm": 0.5980724229937339, + "learning_rate": 4.822398193923739e-06, + "loss": 0.5718, + "step": 2320 + }, + { + "epoch": 0.3765412070084361, + "grad_norm": 0.580658401097759, + "learning_rate": 4.8222400803316124e-06, + "loss": 0.5976, + "step": 2321 + }, + { + "epoch": 0.3767034393251136, + "grad_norm": 0.6679149538870208, + "learning_rate": 4.822081898983302e-06, + "loss": 0.5976, + "step": 2322 + }, + { + "epoch": 0.376865671641791, + "grad_norm": 0.5934826468359237, + "learning_rate": 4.821923649883421e-06, + "loss": 0.5646, + "step": 2323 + }, + { + "epoch": 0.3770279039584685, + "grad_norm": 0.5857051432896305, + "learning_rate": 4.821765333036588e-06, + "loss": 0.5723, + "step": 2324 + }, + { + "epoch": 0.377190136275146, + "grad_norm": 0.6019311585876038, + "learning_rate": 4.821606948447421e-06, + "loss": 0.5732, + "step": 2325 + }, + { + "epoch": 0.3773523685918235, + "grad_norm": 0.5904221153536044, + "learning_rate": 4.821448496120543e-06, + "loss": 0.5892, + "step": 2326 + }, + { + "epoch": 0.377514600908501, + "grad_norm": 0.5853775879639672, + "learning_rate": 4.821289976060576e-06, + "loss": 0.618, + "step": 2327 + }, + { + "epoch": 0.3776768332251785, + "grad_norm": 0.5650916476730633, + "learning_rate": 4.821131388272145e-06, + "loss": 0.5816, + "step": 2328 + }, + { + "epoch": 0.3778390655418559, + "grad_norm": 0.5623894834827002, + "learning_rate": 4.820972732759879e-06, + "loss": 0.5784, + "step": 2329 + }, + { + "epoch": 0.3780012978585334, + "grad_norm": 0.6049938382851221, + "learning_rate": 4.820814009528405e-06, + "loss": 0.5847, + "step": 2330 + }, + { + "epoch": 0.3781635301752109, + "grad_norm": 0.5657846150885989, + "learning_rate": 4.820655218582354e-06, + "loss": 0.5794, + "step": 2331 + }, + { + "epoch": 0.3783257624918884, + "grad_norm": 0.6176919494414196, + "learning_rate": 4.820496359926361e-06, + "loss": 0.5511, + "step": 2332 + }, + { + "epoch": 0.3784879948085659, + "grad_norm": 0.5891046131252073, + "learning_rate": 4.820337433565058e-06, + "loss": 0.5843, + "step": 2333 + }, + { + "epoch": 0.3786502271252433, + "grad_norm": 0.5822420256542091, + "learning_rate": 4.8201784395030845e-06, + "loss": 0.6019, + "step": 2334 + }, + { + "epoch": 0.3788124594419208, + "grad_norm": 0.5729197244132147, + "learning_rate": 4.82001937774508e-06, + "loss": 0.6233, + "step": 2335 + }, + { + "epoch": 0.3789746917585983, + "grad_norm": 0.5894244586162558, + "learning_rate": 4.819860248295684e-06, + "loss": 0.5956, + "step": 2336 + }, + { + "epoch": 0.37913692407527577, + "grad_norm": 0.567482608215277, + "learning_rate": 4.819701051159539e-06, + "loss": 0.5814, + "step": 2337 + }, + { + "epoch": 0.3792991563919533, + "grad_norm": 0.6144275721087888, + "learning_rate": 4.81954178634129e-06, + "loss": 0.5585, + "step": 2338 + }, + { + "epoch": 0.3794613887086308, + "grad_norm": 0.5958135330416755, + "learning_rate": 4.819382453845585e-06, + "loss": 0.5796, + "step": 2339 + }, + { + "epoch": 0.3796236210253082, + "grad_norm": 0.575980775507026, + "learning_rate": 4.819223053677073e-06, + "loss": 0.5509, + "step": 2340 + }, + { + "epoch": 0.3797858533419857, + "grad_norm": 0.5826863504917958, + "learning_rate": 4.819063585840404e-06, + "loss": 0.5701, + "step": 2341 + }, + { + "epoch": 0.3799480856586632, + "grad_norm": 0.6001656882823692, + "learning_rate": 4.81890405034023e-06, + "loss": 0.579, + "step": 2342 + }, + { + "epoch": 0.3801103179753407, + "grad_norm": 0.5777142573423882, + "learning_rate": 4.8187444471812076e-06, + "loss": 0.5863, + "step": 2343 + }, + { + "epoch": 0.3802725502920182, + "grad_norm": 0.6216068964002909, + "learning_rate": 4.818584776367992e-06, + "loss": 0.5628, + "step": 2344 + }, + { + "epoch": 0.3804347826086957, + "grad_norm": 0.5720441129641325, + "learning_rate": 4.818425037905243e-06, + "loss": 0.5861, + "step": 2345 + }, + { + "epoch": 0.3805970149253731, + "grad_norm": 0.5860911752303274, + "learning_rate": 4.818265231797622e-06, + "loss": 0.6242, + "step": 2346 + }, + { + "epoch": 0.3807592472420506, + "grad_norm": 0.5678166261352066, + "learning_rate": 4.818105358049789e-06, + "loss": 0.561, + "step": 2347 + }, + { + "epoch": 0.38092147955872807, + "grad_norm": 0.6015860615922095, + "learning_rate": 4.817945416666411e-06, + "loss": 0.6048, + "step": 2348 + }, + { + "epoch": 0.3810837118754056, + "grad_norm": 0.6032649230444879, + "learning_rate": 4.817785407652154e-06, + "loss": 0.5876, + "step": 2349 + }, + { + "epoch": 0.3812459441920831, + "grad_norm": 0.5744331526649801, + "learning_rate": 4.8176253310116864e-06, + "loss": 0.5609, + "step": 2350 + }, + { + "epoch": 0.3814081765087605, + "grad_norm": 0.5690330964680386, + "learning_rate": 4.817465186749679e-06, + "loss": 0.5866, + "step": 2351 + }, + { + "epoch": 0.381570408825438, + "grad_norm": 0.5627016702481104, + "learning_rate": 4.817304974870804e-06, + "loss": 0.5975, + "step": 2352 + }, + { + "epoch": 0.3817326411421155, + "grad_norm": 0.5713780766567409, + "learning_rate": 4.817144695379736e-06, + "loss": 0.5736, + "step": 2353 + }, + { + "epoch": 0.38189487345879297, + "grad_norm": 0.6006567553532638, + "learning_rate": 4.816984348281152e-06, + "loss": 0.5948, + "step": 2354 + }, + { + "epoch": 0.3820571057754705, + "grad_norm": 0.5804626373629473, + "learning_rate": 4.816823933579729e-06, + "loss": 0.5776, + "step": 2355 + }, + { + "epoch": 0.382219338092148, + "grad_norm": 0.6168129570779292, + "learning_rate": 4.81666345128015e-06, + "loss": 0.6092, + "step": 2356 + }, + { + "epoch": 0.3823815704088254, + "grad_norm": 0.5680644227938262, + "learning_rate": 4.816502901387095e-06, + "loss": 0.5515, + "step": 2357 + }, + { + "epoch": 0.3825438027255029, + "grad_norm": 0.5925587219784899, + "learning_rate": 4.81634228390525e-06, + "loss": 0.5784, + "step": 2358 + }, + { + "epoch": 0.3827060350421804, + "grad_norm": 0.6068707271023482, + "learning_rate": 4.816181598839299e-06, + "loss": 0.5912, + "step": 2359 + }, + { + "epoch": 0.38286826735885787, + "grad_norm": 0.6425132651735012, + "learning_rate": 4.816020846193933e-06, + "loss": 0.5919, + "step": 2360 + }, + { + "epoch": 0.3830304996755354, + "grad_norm": 0.6347949029476945, + "learning_rate": 4.8158600259738415e-06, + "loss": 0.6003, + "step": 2361 + }, + { + "epoch": 0.3831927319922129, + "grad_norm": 0.5812779578428482, + "learning_rate": 4.815699138183716e-06, + "loss": 0.5534, + "step": 2362 + }, + { + "epoch": 0.3833549643088903, + "grad_norm": 0.578979552053988, + "learning_rate": 4.815538182828251e-06, + "loss": 0.573, + "step": 2363 + }, + { + "epoch": 0.3835171966255678, + "grad_norm": 0.5678138688549876, + "learning_rate": 4.815377159912144e-06, + "loss": 0.5734, + "step": 2364 + }, + { + "epoch": 0.38367942894224527, + "grad_norm": 0.6096652653618149, + "learning_rate": 4.815216069440092e-06, + "loss": 0.5668, + "step": 2365 + }, + { + "epoch": 0.3838416612589228, + "grad_norm": 0.623437836779074, + "learning_rate": 4.815054911416795e-06, + "loss": 0.578, + "step": 2366 + }, + { + "epoch": 0.3840038935756003, + "grad_norm": 0.5549261479143731, + "learning_rate": 4.814893685846954e-06, + "loss": 0.5931, + "step": 2367 + }, + { + "epoch": 0.3841661258922777, + "grad_norm": 0.574147283808282, + "learning_rate": 4.814732392735276e-06, + "loss": 0.5919, + "step": 2368 + }, + { + "epoch": 0.3843283582089552, + "grad_norm": 0.5785955601545755, + "learning_rate": 4.8145710320864655e-06, + "loss": 0.5414, + "step": 2369 + }, + { + "epoch": 0.3844905905256327, + "grad_norm": 0.5886410521027408, + "learning_rate": 4.81440960390523e-06, + "loss": 0.5826, + "step": 2370 + }, + { + "epoch": 0.38465282284231017, + "grad_norm": 0.6216071633206911, + "learning_rate": 4.81424810819628e-06, + "loss": 0.5808, + "step": 2371 + }, + { + "epoch": 0.3848150551589877, + "grad_norm": 0.5697254422357991, + "learning_rate": 4.814086544964328e-06, + "loss": 0.5774, + "step": 2372 + }, + { + "epoch": 0.3849772874756652, + "grad_norm": 0.6032112339442911, + "learning_rate": 4.813924914214088e-06, + "loss": 0.5771, + "step": 2373 + }, + { + "epoch": 0.3851395197923426, + "grad_norm": 0.5588121552898391, + "learning_rate": 4.813763215950275e-06, + "loss": 0.5606, + "step": 2374 + }, + { + "epoch": 0.3853017521090201, + "grad_norm": 0.625498969389341, + "learning_rate": 4.813601450177607e-06, + "loss": 0.5588, + "step": 2375 + }, + { + "epoch": 0.3854639844256976, + "grad_norm": 0.5914135011512802, + "learning_rate": 4.8134396169008035e-06, + "loss": 0.573, + "step": 2376 + }, + { + "epoch": 0.38562621674237507, + "grad_norm": 0.7739589576195534, + "learning_rate": 4.813277716124588e-06, + "loss": 0.5628, + "step": 2377 + }, + { + "epoch": 0.3857884490590526, + "grad_norm": 0.579812590054506, + "learning_rate": 4.813115747853682e-06, + "loss": 0.599, + "step": 2378 + }, + { + "epoch": 0.38595068137573, + "grad_norm": 0.5915019397420511, + "learning_rate": 4.812953712092814e-06, + "loss": 0.5808, + "step": 2379 + }, + { + "epoch": 0.3861129136924075, + "grad_norm": 0.6154609154406703, + "learning_rate": 4.812791608846709e-06, + "loss": 0.5609, + "step": 2380 + }, + { + "epoch": 0.386275146009085, + "grad_norm": 0.6410662883428793, + "learning_rate": 4.8126294381201e-06, + "loss": 0.5733, + "step": 2381 + }, + { + "epoch": 0.38643737832576247, + "grad_norm": 0.6008401373589519, + "learning_rate": 4.812467199917715e-06, + "loss": 0.5905, + "step": 2382 + }, + { + "epoch": 0.38659961064244, + "grad_norm": 0.6166616915484202, + "learning_rate": 4.812304894244289e-06, + "loss": 0.5631, + "step": 2383 + }, + { + "epoch": 0.3867618429591175, + "grad_norm": 0.5408016817297968, + "learning_rate": 4.812142521104559e-06, + "loss": 0.571, + "step": 2384 + }, + { + "epoch": 0.3869240752757949, + "grad_norm": 0.6092645556909795, + "learning_rate": 4.811980080503261e-06, + "loss": 0.5734, + "step": 2385 + }, + { + "epoch": 0.3870863075924724, + "grad_norm": 0.5707748609315144, + "learning_rate": 4.8118175724451345e-06, + "loss": 0.5483, + "step": 2386 + }, + { + "epoch": 0.3872485399091499, + "grad_norm": 0.5799106218851557, + "learning_rate": 4.811654996934921e-06, + "loss": 0.5978, + "step": 2387 + }, + { + "epoch": 0.38741077222582737, + "grad_norm": 0.5846068700407601, + "learning_rate": 4.811492353977366e-06, + "loss": 0.5674, + "step": 2388 + }, + { + "epoch": 0.3875730045425049, + "grad_norm": 0.5863696502489651, + "learning_rate": 4.811329643577212e-06, + "loss": 0.5682, + "step": 2389 + }, + { + "epoch": 0.3877352368591824, + "grad_norm": 0.5862842487636989, + "learning_rate": 4.811166865739209e-06, + "loss": 0.5735, + "step": 2390 + }, + { + "epoch": 0.3878974691758598, + "grad_norm": 0.6397086868950345, + "learning_rate": 4.811004020468105e-06, + "loss": 0.5972, + "step": 2391 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 0.6266907401446664, + "learning_rate": 4.810841107768651e-06, + "loss": 0.5752, + "step": 2392 + }, + { + "epoch": 0.38822193380921477, + "grad_norm": 0.5993900219244775, + "learning_rate": 4.810678127645601e-06, + "loss": 0.5976, + "step": 2393 + }, + { + "epoch": 0.38838416612589227, + "grad_norm": 0.5870091633143167, + "learning_rate": 4.81051508010371e-06, + "loss": 0.5892, + "step": 2394 + }, + { + "epoch": 0.3885463984425698, + "grad_norm": 0.566633348896261, + "learning_rate": 4.810351965147737e-06, + "loss": 0.5514, + "step": 2395 + }, + { + "epoch": 0.3887086307592472, + "grad_norm": 0.5846048632715368, + "learning_rate": 4.810188782782438e-06, + "loss": 0.5906, + "step": 2396 + }, + { + "epoch": 0.3888708630759247, + "grad_norm": 0.772978105569237, + "learning_rate": 4.810025533012576e-06, + "loss": 0.5927, + "step": 2397 + }, + { + "epoch": 0.3890330953926022, + "grad_norm": 0.6222639415224615, + "learning_rate": 4.809862215842914e-06, + "loss": 0.5719, + "step": 2398 + }, + { + "epoch": 0.38919532770927967, + "grad_norm": 0.5800030352809522, + "learning_rate": 4.809698831278217e-06, + "loss": 0.5742, + "step": 2399 + }, + { + "epoch": 0.3893575600259572, + "grad_norm": 0.5932958678070767, + "learning_rate": 4.809535379323252e-06, + "loss": 0.5935, + "step": 2400 + }, + { + "epoch": 0.3895197923426347, + "grad_norm": 0.597018943544658, + "learning_rate": 4.809371859982789e-06, + "loss": 0.5878, + "step": 2401 + }, + { + "epoch": 0.3896820246593121, + "grad_norm": 0.5923382440436092, + "learning_rate": 4.809208273261598e-06, + "loss": 0.5651, + "step": 2402 + }, + { + "epoch": 0.3898442569759896, + "grad_norm": 0.6130012511953838, + "learning_rate": 4.8090446191644525e-06, + "loss": 0.5419, + "step": 2403 + }, + { + "epoch": 0.3900064892926671, + "grad_norm": 0.6269452021077405, + "learning_rate": 4.808880897696127e-06, + "loss": 0.596, + "step": 2404 + }, + { + "epoch": 0.39016872160934457, + "grad_norm": 0.6048874575661485, + "learning_rate": 4.808717108861398e-06, + "loss": 0.572, + "step": 2405 + }, + { + "epoch": 0.3903309539260221, + "grad_norm": 0.5785430693851359, + "learning_rate": 4.808553252665045e-06, + "loss": 0.5986, + "step": 2406 + }, + { + "epoch": 0.3904931862426995, + "grad_norm": 0.5621636076820771, + "learning_rate": 4.808389329111849e-06, + "loss": 0.5821, + "step": 2407 + }, + { + "epoch": 0.390655418559377, + "grad_norm": 0.6199415050941549, + "learning_rate": 4.808225338206593e-06, + "loss": 0.5876, + "step": 2408 + }, + { + "epoch": 0.3908176508760545, + "grad_norm": 0.6169208011598176, + "learning_rate": 4.808061279954061e-06, + "loss": 0.6152, + "step": 2409 + }, + { + "epoch": 0.39097988319273197, + "grad_norm": 0.6077325901929755, + "learning_rate": 4.80789715435904e-06, + "loss": 0.5766, + "step": 2410 + }, + { + "epoch": 0.39114211550940947, + "grad_norm": 0.5596849056597408, + "learning_rate": 4.8077329614263195e-06, + "loss": 0.5819, + "step": 2411 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.5836471590976678, + "learning_rate": 4.807568701160688e-06, + "loss": 0.569, + "step": 2412 + }, + { + "epoch": 0.3914665801427644, + "grad_norm": 0.5918078206527067, + "learning_rate": 4.807404373566942e-06, + "loss": 0.5652, + "step": 2413 + }, + { + "epoch": 0.3916288124594419, + "grad_norm": 0.6257186414735721, + "learning_rate": 4.807239978649873e-06, + "loss": 0.5578, + "step": 2414 + }, + { + "epoch": 0.3917910447761194, + "grad_norm": 0.5747180580157675, + "learning_rate": 4.807075516414277e-06, + "loss": 0.59, + "step": 2415 + }, + { + "epoch": 0.39195327709279687, + "grad_norm": 0.6249265125285566, + "learning_rate": 4.806910986864954e-06, + "loss": 0.5397, + "step": 2416 + }, + { + "epoch": 0.39211550940947437, + "grad_norm": 0.6226152459800718, + "learning_rate": 4.806746390006706e-06, + "loss": 0.5761, + "step": 2417 + }, + { + "epoch": 0.3922777417261519, + "grad_norm": 0.6051569260517519, + "learning_rate": 4.806581725844333e-06, + "loss": 0.5913, + "step": 2418 + }, + { + "epoch": 0.3924399740428293, + "grad_norm": 0.5798000600136944, + "learning_rate": 4.8064169943826385e-06, + "loss": 0.5997, + "step": 2419 + }, + { + "epoch": 0.3926022063595068, + "grad_norm": 0.6085242684625691, + "learning_rate": 4.806252195626433e-06, + "loss": 0.589, + "step": 2420 + }, + { + "epoch": 0.39276443867618427, + "grad_norm": 0.6011083090284858, + "learning_rate": 4.806087329580521e-06, + "loss": 0.563, + "step": 2421 + }, + { + "epoch": 0.39292667099286177, + "grad_norm": 0.596685674619629, + "learning_rate": 4.805922396249715e-06, + "loss": 0.6202, + "step": 2422 + }, + { + "epoch": 0.3930889033095393, + "grad_norm": 0.575534002434354, + "learning_rate": 4.805757395638826e-06, + "loss": 0.5753, + "step": 2423 + }, + { + "epoch": 0.3932511356262167, + "grad_norm": 0.5968115728563113, + "learning_rate": 4.805592327752669e-06, + "loss": 0.5528, + "step": 2424 + }, + { + "epoch": 0.3934133679428942, + "grad_norm": 0.5778474849868096, + "learning_rate": 4.80542719259606e-06, + "loss": 0.5911, + "step": 2425 + }, + { + "epoch": 0.3935756002595717, + "grad_norm": 0.5746655240562734, + "learning_rate": 4.805261990173817e-06, + "loss": 0.5797, + "step": 2426 + }, + { + "epoch": 0.39373783257624917, + "grad_norm": 0.585237729399825, + "learning_rate": 4.805096720490761e-06, + "loss": 0.5814, + "step": 2427 + }, + { + "epoch": 0.39390006489292667, + "grad_norm": 0.600592602621955, + "learning_rate": 4.804931383551712e-06, + "loss": 0.5496, + "step": 2428 + }, + { + "epoch": 0.3940622972096042, + "grad_norm": 0.6102025785535493, + "learning_rate": 4.8047659793614955e-06, + "loss": 0.5959, + "step": 2429 + }, + { + "epoch": 0.3942245295262816, + "grad_norm": 0.709396418202036, + "learning_rate": 4.804600507924938e-06, + "loss": 0.5924, + "step": 2430 + }, + { + "epoch": 0.3943867618429591, + "grad_norm": 0.5796615075185492, + "learning_rate": 4.804434969246867e-06, + "loss": 0.6116, + "step": 2431 + }, + { + "epoch": 0.3945489941596366, + "grad_norm": 0.5990389413007758, + "learning_rate": 4.804269363332112e-06, + "loss": 0.6071, + "step": 2432 + }, + { + "epoch": 0.39471122647631407, + "grad_norm": 0.5618891865564133, + "learning_rate": 4.804103690185505e-06, + "loss": 0.5783, + "step": 2433 + }, + { + "epoch": 0.39487345879299157, + "grad_norm": 0.6007406555824408, + "learning_rate": 4.803937949811881e-06, + "loss": 0.5966, + "step": 2434 + }, + { + "epoch": 0.3950356911096691, + "grad_norm": 0.5861792741942577, + "learning_rate": 4.8037721422160735e-06, + "loss": 0.5706, + "step": 2435 + }, + { + "epoch": 0.3951979234263465, + "grad_norm": 0.580475874851543, + "learning_rate": 4.803606267402923e-06, + "loss": 0.5138, + "step": 2436 + }, + { + "epoch": 0.395360155743024, + "grad_norm": 0.5893020190642597, + "learning_rate": 4.803440325377267e-06, + "loss": 0.562, + "step": 2437 + }, + { + "epoch": 0.39552238805970147, + "grad_norm": 0.5855857141678723, + "learning_rate": 4.803274316143948e-06, + "loss": 0.5703, + "step": 2438 + }, + { + "epoch": 0.39568462037637897, + "grad_norm": 0.6272751954355744, + "learning_rate": 4.80310823970781e-06, + "loss": 0.5663, + "step": 2439 + }, + { + "epoch": 0.3958468526930565, + "grad_norm": 0.6058188873939679, + "learning_rate": 4.802942096073698e-06, + "loss": 0.5707, + "step": 2440 + }, + { + "epoch": 0.3960090850097339, + "grad_norm": 0.5909299392450983, + "learning_rate": 4.802775885246461e-06, + "loss": 0.6046, + "step": 2441 + }, + { + "epoch": 0.3961713173264114, + "grad_norm": 0.6232898815913072, + "learning_rate": 4.802609607230947e-06, + "loss": 0.5455, + "step": 2442 + }, + { + "epoch": 0.3963335496430889, + "grad_norm": 0.6092293403082228, + "learning_rate": 4.802443262032008e-06, + "loss": 0.5798, + "step": 2443 + }, + { + "epoch": 0.39649578195976637, + "grad_norm": 0.5886638632767431, + "learning_rate": 4.802276849654497e-06, + "loss": 0.5396, + "step": 2444 + }, + { + "epoch": 0.39665801427644387, + "grad_norm": 0.5845016397978328, + "learning_rate": 4.80211037010327e-06, + "loss": 0.6007, + "step": 2445 + }, + { + "epoch": 0.3968202465931214, + "grad_norm": 0.5677907553649145, + "learning_rate": 4.801943823383185e-06, + "loss": 0.544, + "step": 2446 + }, + { + "epoch": 0.3969824789097988, + "grad_norm": 0.6011870415995918, + "learning_rate": 4.801777209499101e-06, + "loss": 0.5669, + "step": 2447 + }, + { + "epoch": 0.3971447112264763, + "grad_norm": 0.5784297590201819, + "learning_rate": 4.801610528455878e-06, + "loss": 0.5695, + "step": 2448 + }, + { + "epoch": 0.3973069435431538, + "grad_norm": 0.6037818718959568, + "learning_rate": 4.80144378025838e-06, + "loss": 0.5782, + "step": 2449 + }, + { + "epoch": 0.39746917585983127, + "grad_norm": 0.606762242500834, + "learning_rate": 4.8012769649114735e-06, + "loss": 0.5808, + "step": 2450 + }, + { + "epoch": 0.39763140817650877, + "grad_norm": 0.5730224698634416, + "learning_rate": 4.801110082420024e-06, + "loss": 0.5694, + "step": 2451 + }, + { + "epoch": 0.3977936404931862, + "grad_norm": 0.6336178083090975, + "learning_rate": 4.800943132788901e-06, + "loss": 0.5733, + "step": 2452 + }, + { + "epoch": 0.3979558728098637, + "grad_norm": 0.5859196752872298, + "learning_rate": 4.800776116022977e-06, + "loss": 0.5693, + "step": 2453 + }, + { + "epoch": 0.3981181051265412, + "grad_norm": 0.5761567761525385, + "learning_rate": 4.800609032127123e-06, + "loss": 0.5991, + "step": 2454 + }, + { + "epoch": 0.39828033744321867, + "grad_norm": 0.6379100071429371, + "learning_rate": 4.800441881106215e-06, + "loss": 0.566, + "step": 2455 + }, + { + "epoch": 0.39844256975989617, + "grad_norm": 0.6146887551948378, + "learning_rate": 4.80027466296513e-06, + "loss": 0.5864, + "step": 2456 + }, + { + "epoch": 0.39860480207657367, + "grad_norm": 0.5946482036005637, + "learning_rate": 4.800107377708747e-06, + "loss": 0.5467, + "step": 2457 + }, + { + "epoch": 0.3987670343932511, + "grad_norm": 0.6391862588590438, + "learning_rate": 4.7999400253419474e-06, + "loss": 0.5748, + "step": 2458 + }, + { + "epoch": 0.3989292667099286, + "grad_norm": 0.6058532095073645, + "learning_rate": 4.799772605869613e-06, + "loss": 0.5829, + "step": 2459 + }, + { + "epoch": 0.3990914990266061, + "grad_norm": 0.5849542182926907, + "learning_rate": 4.799605119296628e-06, + "loss": 0.6017, + "step": 2460 + }, + { + "epoch": 0.39925373134328357, + "grad_norm": 0.6334751091174016, + "learning_rate": 4.799437565627881e-06, + "loss": 0.5961, + "step": 2461 + }, + { + "epoch": 0.39941596365996107, + "grad_norm": 0.6401341477772732, + "learning_rate": 4.79926994486826e-06, + "loss": 0.5753, + "step": 2462 + }, + { + "epoch": 0.3995781959766386, + "grad_norm": 0.6024396748636589, + "learning_rate": 4.799102257022656e-06, + "loss": 0.5843, + "step": 2463 + }, + { + "epoch": 0.399740428293316, + "grad_norm": 0.5698444939426697, + "learning_rate": 4.798934502095961e-06, + "loss": 0.6203, + "step": 2464 + }, + { + "epoch": 0.3999026606099935, + "grad_norm": 0.6415980731637829, + "learning_rate": 4.79876668009307e-06, + "loss": 0.5554, + "step": 2465 + }, + { + "epoch": 0.40006489292667097, + "grad_norm": 0.5962816649928789, + "learning_rate": 4.798598791018878e-06, + "loss": 0.5774, + "step": 2466 + }, + { + "epoch": 0.40022712524334847, + "grad_norm": 0.6062930641247262, + "learning_rate": 4.798430834878287e-06, + "loss": 0.5637, + "step": 2467 + }, + { + "epoch": 0.40038935756002597, + "grad_norm": 0.5617254748153547, + "learning_rate": 4.798262811676194e-06, + "loss": 0.5294, + "step": 2468 + }, + { + "epoch": 0.4005515898767034, + "grad_norm": 0.6148168263675496, + "learning_rate": 4.798094721417504e-06, + "loss": 0.5231, + "step": 2469 + }, + { + "epoch": 0.4007138221933809, + "grad_norm": 0.6111778430399538, + "learning_rate": 4.79792656410712e-06, + "loss": 0.5834, + "step": 2470 + }, + { + "epoch": 0.4008760545100584, + "grad_norm": 0.6152796434064308, + "learning_rate": 4.7977583397499475e-06, + "loss": 0.5722, + "step": 2471 + }, + { + "epoch": 0.40103828682673587, + "grad_norm": 0.6008337228330107, + "learning_rate": 4.797590048350896e-06, + "loss": 0.5882, + "step": 2472 + }, + { + "epoch": 0.40120051914341337, + "grad_norm": 0.6008447017173606, + "learning_rate": 4.797421689914876e-06, + "loss": 0.5411, + "step": 2473 + }, + { + "epoch": 0.40136275146009087, + "grad_norm": 0.5911822020061155, + "learning_rate": 4.797253264446799e-06, + "loss": 0.5651, + "step": 2474 + }, + { + "epoch": 0.4015249837767683, + "grad_norm": 0.6032921870738107, + "learning_rate": 4.797084771951581e-06, + "loss": 0.5759, + "step": 2475 + }, + { + "epoch": 0.4016872160934458, + "grad_norm": 0.5673470265138709, + "learning_rate": 4.7969162124341354e-06, + "loss": 0.5758, + "step": 2476 + }, + { + "epoch": 0.4018494484101233, + "grad_norm": 0.5844240333608619, + "learning_rate": 4.796747585899382e-06, + "loss": 0.5902, + "step": 2477 + }, + { + "epoch": 0.40201168072680077, + "grad_norm": 0.6003909119595685, + "learning_rate": 4.7965788923522395e-06, + "loss": 0.574, + "step": 2478 + }, + { + "epoch": 0.40217391304347827, + "grad_norm": 0.6000606329264218, + "learning_rate": 4.7964101317976305e-06, + "loss": 0.5758, + "step": 2479 + }, + { + "epoch": 0.4023361453601557, + "grad_norm": 0.5735862532184886, + "learning_rate": 4.79624130424048e-06, + "loss": 0.6013, + "step": 2480 + }, + { + "epoch": 0.4024983776768332, + "grad_norm": 0.5723650035810754, + "learning_rate": 4.796072409685713e-06, + "loss": 0.5963, + "step": 2481 + }, + { + "epoch": 0.4026606099935107, + "grad_norm": 0.567020409247674, + "learning_rate": 4.795903448138256e-06, + "loss": 0.6028, + "step": 2482 + }, + { + "epoch": 0.40282284231018817, + "grad_norm": 0.7571498777981329, + "learning_rate": 4.795734419603041e-06, + "loss": 0.5818, + "step": 2483 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 0.5817793929213839, + "learning_rate": 4.795565324084999e-06, + "loss": 0.5967, + "step": 2484 + }, + { + "epoch": 0.40314730694354317, + "grad_norm": 0.6651520463957991, + "learning_rate": 4.795396161589065e-06, + "loss": 0.5797, + "step": 2485 + }, + { + "epoch": 0.4033095392602206, + "grad_norm": 0.6280083450184797, + "learning_rate": 4.795226932120171e-06, + "loss": 0.5573, + "step": 2486 + }, + { + "epoch": 0.4034717715768981, + "grad_norm": 0.6046767682160534, + "learning_rate": 4.795057635683258e-06, + "loss": 0.5519, + "step": 2487 + }, + { + "epoch": 0.4036340038935756, + "grad_norm": 0.5829170278012836, + "learning_rate": 4.794888272283264e-06, + "loss": 0.5656, + "step": 2488 + }, + { + "epoch": 0.40379623621025307, + "grad_norm": 0.5998825236355443, + "learning_rate": 4.794718841925132e-06, + "loss": 0.5947, + "step": 2489 + }, + { + "epoch": 0.40395846852693057, + "grad_norm": 0.581584294130756, + "learning_rate": 4.794549344613804e-06, + "loss": 0.5823, + "step": 2490 + }, + { + "epoch": 0.40412070084360807, + "grad_norm": 0.6128865780246724, + "learning_rate": 4.794379780354226e-06, + "loss": 0.596, + "step": 2491 + }, + { + "epoch": 0.4042829331602855, + "grad_norm": 0.6028333851554403, + "learning_rate": 4.7942101491513445e-06, + "loss": 0.6016, + "step": 2492 + }, + { + "epoch": 0.404445165476963, + "grad_norm": 0.6090587789666261, + "learning_rate": 4.7940404510101115e-06, + "loss": 0.5745, + "step": 2493 + }, + { + "epoch": 0.40460739779364047, + "grad_norm": 0.6025150630098927, + "learning_rate": 4.793870685935475e-06, + "loss": 0.549, + "step": 2494 + }, + { + "epoch": 0.40476963011031797, + "grad_norm": 0.6020497416596824, + "learning_rate": 4.79370085393239e-06, + "loss": 0.578, + "step": 2495 + }, + { + "epoch": 0.40493186242699547, + "grad_norm": 0.6082883005575795, + "learning_rate": 4.793530955005812e-06, + "loss": 0.5669, + "step": 2496 + }, + { + "epoch": 0.4050940947436729, + "grad_norm": 0.5755827576649821, + "learning_rate": 4.793360989160697e-06, + "loss": 0.5545, + "step": 2497 + }, + { + "epoch": 0.4052563270603504, + "grad_norm": 0.5653892605945517, + "learning_rate": 4.793190956402005e-06, + "loss": 0.5965, + "step": 2498 + }, + { + "epoch": 0.4054185593770279, + "grad_norm": 0.6110420671223552, + "learning_rate": 4.793020856734697e-06, + "loss": 0.5628, + "step": 2499 + }, + { + "epoch": 0.40558079169370537, + "grad_norm": 0.6166994491278082, + "learning_rate": 4.792850690163735e-06, + "loss": 0.5456, + "step": 2500 + }, + { + "epoch": 0.40574302401038287, + "grad_norm": 0.603784384278216, + "learning_rate": 4.792680456694085e-06, + "loss": 0.5912, + "step": 2501 + }, + { + "epoch": 0.40590525632706037, + "grad_norm": 0.6104208203330691, + "learning_rate": 4.792510156330714e-06, + "loss": 0.6145, + "step": 2502 + }, + { + "epoch": 0.4060674886437378, + "grad_norm": 0.5611507041458116, + "learning_rate": 4.79233978907859e-06, + "loss": 0.581, + "step": 2503 + }, + { + "epoch": 0.4062297209604153, + "grad_norm": 0.5547038172109285, + "learning_rate": 4.792169354942685e-06, + "loss": 0.5809, + "step": 2504 + }, + { + "epoch": 0.4063919532770928, + "grad_norm": 0.5779512756623515, + "learning_rate": 4.791998853927971e-06, + "loss": 0.536, + "step": 2505 + }, + { + "epoch": 0.40655418559377027, + "grad_norm": 0.6263021890651329, + "learning_rate": 4.7918282860394225e-06, + "loss": 0.5651, + "step": 2506 + }, + { + "epoch": 0.40671641791044777, + "grad_norm": 0.6001699846382711, + "learning_rate": 4.791657651282017e-06, + "loss": 0.5638, + "step": 2507 + }, + { + "epoch": 0.4068786502271252, + "grad_norm": 0.5919233661923227, + "learning_rate": 4.791486949660732e-06, + "loss": 0.6036, + "step": 2508 + }, + { + "epoch": 0.4070408825438027, + "grad_norm": 0.6409979347177103, + "learning_rate": 4.791316181180549e-06, + "loss": 0.5436, + "step": 2509 + }, + { + "epoch": 0.4072031148604802, + "grad_norm": 0.5972037546835487, + "learning_rate": 4.79114534584645e-06, + "loss": 0.6039, + "step": 2510 + }, + { + "epoch": 0.40736534717715767, + "grad_norm": 0.5750702560775373, + "learning_rate": 4.79097444366342e-06, + "loss": 0.5922, + "step": 2511 + }, + { + "epoch": 0.40752757949383517, + "grad_norm": 0.5946834716959438, + "learning_rate": 4.790803474636445e-06, + "loss": 0.5694, + "step": 2512 + }, + { + "epoch": 0.40768981181051267, + "grad_norm": 0.5848554749288483, + "learning_rate": 4.790632438770513e-06, + "loss": 0.5771, + "step": 2513 + }, + { + "epoch": 0.4078520441271901, + "grad_norm": 0.6153695929193597, + "learning_rate": 4.790461336070615e-06, + "loss": 0.5851, + "step": 2514 + }, + { + "epoch": 0.4080142764438676, + "grad_norm": 0.5884945881065894, + "learning_rate": 4.790290166541744e-06, + "loss": 0.5709, + "step": 2515 + }, + { + "epoch": 0.4081765087605451, + "grad_norm": 0.618143809689855, + "learning_rate": 4.7901189301888925e-06, + "loss": 0.5352, + "step": 2516 + }, + { + "epoch": 0.40833874107722257, + "grad_norm": 0.6276573776896889, + "learning_rate": 4.789947627017058e-06, + "loss": 0.6228, + "step": 2517 + }, + { + "epoch": 0.40850097339390007, + "grad_norm": 0.5668358413609691, + "learning_rate": 4.789776257031238e-06, + "loss": 0.5442, + "step": 2518 + }, + { + "epoch": 0.40866320571057757, + "grad_norm": 0.6301939120478178, + "learning_rate": 4.789604820236432e-06, + "loss": 0.575, + "step": 2519 + }, + { + "epoch": 0.408825438027255, + "grad_norm": 0.5920552220465437, + "learning_rate": 4.789433316637644e-06, + "loss": 0.6203, + "step": 2520 + }, + { + "epoch": 0.4089876703439325, + "grad_norm": 0.6157878097974501, + "learning_rate": 4.789261746239876e-06, + "loss": 0.5772, + "step": 2521 + }, + { + "epoch": 0.40914990266061, + "grad_norm": 0.566961889383489, + "learning_rate": 4.789090109048134e-06, + "loss": 0.5497, + "step": 2522 + }, + { + "epoch": 0.40931213497728747, + "grad_norm": 0.5968062136976231, + "learning_rate": 4.788918405067428e-06, + "loss": 0.5649, + "step": 2523 + }, + { + "epoch": 0.40947436729396497, + "grad_norm": 0.6158270144695419, + "learning_rate": 4.788746634302766e-06, + "loss": 0.5608, + "step": 2524 + }, + { + "epoch": 0.4096365996106424, + "grad_norm": 0.613146476686902, + "learning_rate": 4.788574796759159e-06, + "loss": 0.5808, + "step": 2525 + }, + { + "epoch": 0.4097988319273199, + "grad_norm": 0.5742278784994689, + "learning_rate": 4.788402892441622e-06, + "loss": 0.569, + "step": 2526 + }, + { + "epoch": 0.4099610642439974, + "grad_norm": 0.5863563172580828, + "learning_rate": 4.788230921355171e-06, + "loss": 0.5817, + "step": 2527 + }, + { + "epoch": 0.41012329656067487, + "grad_norm": 0.610710970679401, + "learning_rate": 4.788058883504824e-06, + "loss": 0.6057, + "step": 2528 + }, + { + "epoch": 0.41028552887735237, + "grad_norm": 0.5853426742288244, + "learning_rate": 4.787886778895599e-06, + "loss": 0.6102, + "step": 2529 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.5899142726043575, + "learning_rate": 4.787714607532518e-06, + "loss": 0.6158, + "step": 2530 + }, + { + "epoch": 0.4106099935107073, + "grad_norm": 0.5832736046536072, + "learning_rate": 4.787542369420604e-06, + "loss": 0.5434, + "step": 2531 + }, + { + "epoch": 0.4107722258273848, + "grad_norm": 0.5788368638439344, + "learning_rate": 4.787370064564884e-06, + "loss": 0.5797, + "step": 2532 + }, + { + "epoch": 0.4109344581440623, + "grad_norm": 0.5922531615035892, + "learning_rate": 4.787197692970383e-06, + "loss": 0.5572, + "step": 2533 + }, + { + "epoch": 0.41109669046073977, + "grad_norm": 0.5757173935498983, + "learning_rate": 4.787025254642133e-06, + "loss": 0.5995, + "step": 2534 + }, + { + "epoch": 0.41125892277741727, + "grad_norm": 0.586077612686955, + "learning_rate": 4.786852749585164e-06, + "loss": 0.5883, + "step": 2535 + }, + { + "epoch": 0.41142115509409477, + "grad_norm": 0.6080461113005592, + "learning_rate": 4.786680177804508e-06, + "loss": 0.5734, + "step": 2536 + }, + { + "epoch": 0.4115833874107722, + "grad_norm": 0.6155504489736219, + "learning_rate": 4.786507539305202e-06, + "loss": 0.5768, + "step": 2537 + }, + { + "epoch": 0.4117456197274497, + "grad_norm": 0.577169917897951, + "learning_rate": 4.786334834092282e-06, + "loss": 0.5604, + "step": 2538 + }, + { + "epoch": 0.41190785204412717, + "grad_norm": 0.5791543265757385, + "learning_rate": 4.786162062170788e-06, + "loss": 0.5613, + "step": 2539 + }, + { + "epoch": 0.41207008436080467, + "grad_norm": 0.5819174970708489, + "learning_rate": 4.785989223545759e-06, + "loss": 0.5897, + "step": 2540 + }, + { + "epoch": 0.41223231667748217, + "grad_norm": 0.6005852461770023, + "learning_rate": 4.785816318222241e-06, + "loss": 0.5868, + "step": 2541 + }, + { + "epoch": 0.4123945489941596, + "grad_norm": 0.5521269091996329, + "learning_rate": 4.785643346205277e-06, + "loss": 0.5667, + "step": 2542 + }, + { + "epoch": 0.4125567813108371, + "grad_norm": 0.6113738916452722, + "learning_rate": 4.785470307499913e-06, + "loss": 0.5857, + "step": 2543 + }, + { + "epoch": 0.4127190136275146, + "grad_norm": 0.6180231441531823, + "learning_rate": 4.7852972021112e-06, + "loss": 0.544, + "step": 2544 + }, + { + "epoch": 0.41288124594419207, + "grad_norm": 0.588818365514136, + "learning_rate": 4.785124030044186e-06, + "loss": 0.6135, + "step": 2545 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 0.5884682016374965, + "learning_rate": 4.784950791303926e-06, + "loss": 0.6008, + "step": 2546 + }, + { + "epoch": 0.41320571057754707, + "grad_norm": 0.5829089433994632, + "learning_rate": 4.784777485895473e-06, + "loss": 0.5917, + "step": 2547 + }, + { + "epoch": 0.4133679428942245, + "grad_norm": 0.6295247640793052, + "learning_rate": 4.784604113823885e-06, + "loss": 0.5885, + "step": 2548 + }, + { + "epoch": 0.413530175210902, + "grad_norm": 0.6064294468051956, + "learning_rate": 4.78443067509422e-06, + "loss": 0.5768, + "step": 2549 + }, + { + "epoch": 0.4136924075275795, + "grad_norm": 0.6109791732706203, + "learning_rate": 4.784257169711537e-06, + "loss": 0.6044, + "step": 2550 + }, + { + "epoch": 0.41385463984425697, + "grad_norm": 0.6057507757271221, + "learning_rate": 4.784083597680901e-06, + "loss": 0.5925, + "step": 2551 + }, + { + "epoch": 0.41401687216093447, + "grad_norm": 0.5798975942602511, + "learning_rate": 4.783909959007374e-06, + "loss": 0.575, + "step": 2552 + }, + { + "epoch": 0.4141791044776119, + "grad_norm": 0.5784779818508826, + "learning_rate": 4.783736253696023e-06, + "loss": 0.6172, + "step": 2553 + }, + { + "epoch": 0.4143413367942894, + "grad_norm": 0.6025654857575776, + "learning_rate": 4.783562481751916e-06, + "loss": 0.5616, + "step": 2554 + }, + { + "epoch": 0.4145035691109669, + "grad_norm": 0.6836149154791662, + "learning_rate": 4.783388643180124e-06, + "loss": 0.561, + "step": 2555 + }, + { + "epoch": 0.41466580142764436, + "grad_norm": 0.5781606237232327, + "learning_rate": 4.78321473798572e-06, + "loss": 0.5574, + "step": 2556 + }, + { + "epoch": 0.41482803374432187, + "grad_norm": 0.6337165624808063, + "learning_rate": 4.783040766173775e-06, + "loss": 0.5448, + "step": 2557 + }, + { + "epoch": 0.41499026606099937, + "grad_norm": 0.5644320337625124, + "learning_rate": 4.782866727749368e-06, + "loss": 0.5647, + "step": 2558 + }, + { + "epoch": 0.4151524983776768, + "grad_norm": 0.6778659146550582, + "learning_rate": 4.782692622717574e-06, + "loss": 0.6062, + "step": 2559 + }, + { + "epoch": 0.4153147306943543, + "grad_norm": 0.6107793144879182, + "learning_rate": 4.782518451083476e-06, + "loss": 0.5284, + "step": 2560 + }, + { + "epoch": 0.4154769630110318, + "grad_norm": 0.5748509439471161, + "learning_rate": 4.782344212852154e-06, + "loss": 0.604, + "step": 2561 + }, + { + "epoch": 0.41563919532770927, + "grad_norm": 0.5761833982495265, + "learning_rate": 4.782169908028691e-06, + "loss": 0.5741, + "step": 2562 + }, + { + "epoch": 0.41580142764438677, + "grad_norm": 0.61887834493641, + "learning_rate": 4.781995536618174e-06, + "loss": 0.5758, + "step": 2563 + }, + { + "epoch": 0.41596365996106427, + "grad_norm": 0.6120893342349748, + "learning_rate": 4.781821098625691e-06, + "loss": 0.617, + "step": 2564 + }, + { + "epoch": 0.4161258922777417, + "grad_norm": 0.5614340893395604, + "learning_rate": 4.781646594056331e-06, + "loss": 0.552, + "step": 2565 + }, + { + "epoch": 0.4162881245944192, + "grad_norm": 0.5764806113596944, + "learning_rate": 4.7814720229151855e-06, + "loss": 0.567, + "step": 2566 + }, + { + "epoch": 0.41645035691109666, + "grad_norm": 0.5795867578256201, + "learning_rate": 4.781297385207348e-06, + "loss": 0.5676, + "step": 2567 + }, + { + "epoch": 0.41661258922777417, + "grad_norm": 0.5915913782164697, + "learning_rate": 4.781122680937914e-06, + "loss": 0.5717, + "step": 2568 + }, + { + "epoch": 0.41677482154445167, + "grad_norm": 0.5742021579969073, + "learning_rate": 4.78094791011198e-06, + "loss": 0.5473, + "step": 2569 + }, + { + "epoch": 0.4169370538611291, + "grad_norm": 0.5945314550487244, + "learning_rate": 4.780773072734647e-06, + "loss": 0.6145, + "step": 2570 + }, + { + "epoch": 0.4170992861778066, + "grad_norm": 0.6014580081257915, + "learning_rate": 4.780598168811015e-06, + "loss": 0.5866, + "step": 2571 + }, + { + "epoch": 0.4172615184944841, + "grad_norm": 0.5997925446578941, + "learning_rate": 4.780423198346188e-06, + "loss": 0.5542, + "step": 2572 + }, + { + "epoch": 0.41742375081116156, + "grad_norm": 0.5733552884425099, + "learning_rate": 4.78024816134527e-06, + "loss": 0.57, + "step": 2573 + }, + { + "epoch": 0.41758598312783907, + "grad_norm": 0.6302523649763313, + "learning_rate": 4.7800730578133695e-06, + "loss": 0.58, + "step": 2574 + }, + { + "epoch": 0.41774821544451657, + "grad_norm": 0.576032634638381, + "learning_rate": 4.7798978877555934e-06, + "loss": 0.586, + "step": 2575 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 0.5915152023245063, + "learning_rate": 4.7797226511770556e-06, + "loss": 0.5756, + "step": 2576 + }, + { + "epoch": 0.4180726800778715, + "grad_norm": 0.5924802472333962, + "learning_rate": 4.779547348082866e-06, + "loss": 0.5608, + "step": 2577 + }, + { + "epoch": 0.418234912394549, + "grad_norm": 0.5943027580318778, + "learning_rate": 4.779371978478142e-06, + "loss": 0.5942, + "step": 2578 + }, + { + "epoch": 0.41839714471122647, + "grad_norm": 0.5626467330589882, + "learning_rate": 4.779196542367998e-06, + "loss": 0.5487, + "step": 2579 + }, + { + "epoch": 0.41855937702790397, + "grad_norm": 0.6129091200737772, + "learning_rate": 4.779021039757555e-06, + "loss": 0.612, + "step": 2580 + }, + { + "epoch": 0.4187216093445814, + "grad_norm": 0.5639480977343119, + "learning_rate": 4.7788454706519315e-06, + "loss": 0.5697, + "step": 2581 + }, + { + "epoch": 0.4188838416612589, + "grad_norm": 0.5351895565106778, + "learning_rate": 4.778669835056252e-06, + "loss": 0.5754, + "step": 2582 + }, + { + "epoch": 0.4190460739779364, + "grad_norm": 0.5712087263242777, + "learning_rate": 4.778494132975639e-06, + "loss": 0.6202, + "step": 2583 + }, + { + "epoch": 0.41920830629461386, + "grad_norm": 0.5792882591743366, + "learning_rate": 4.778318364415222e-06, + "loss": 0.5861, + "step": 2584 + }, + { + "epoch": 0.41937053861129137, + "grad_norm": 0.5881279620639216, + "learning_rate": 4.778142529380127e-06, + "loss": 0.5519, + "step": 2585 + }, + { + "epoch": 0.41953277092796887, + "grad_norm": 0.5892854791530476, + "learning_rate": 4.777966627875484e-06, + "loss": 0.5774, + "step": 2586 + }, + { + "epoch": 0.4196950032446463, + "grad_norm": 0.5744222719622005, + "learning_rate": 4.777790659906426e-06, + "loss": 0.5897, + "step": 2587 + }, + { + "epoch": 0.4198572355613238, + "grad_norm": 0.5663056473015057, + "learning_rate": 4.777614625478089e-06, + "loss": 0.5707, + "step": 2588 + }, + { + "epoch": 0.4200194678780013, + "grad_norm": 0.6460333594056936, + "learning_rate": 4.777438524595607e-06, + "loss": 0.5836, + "step": 2589 + }, + { + "epoch": 0.42018170019467876, + "grad_norm": 0.6080676975738246, + "learning_rate": 4.777262357264118e-06, + "loss": 0.5947, + "step": 2590 + }, + { + "epoch": 0.42034393251135627, + "grad_norm": 0.5961524633096948, + "learning_rate": 4.7770861234887636e-06, + "loss": 0.5895, + "step": 2591 + }, + { + "epoch": 0.42050616482803377, + "grad_norm": 0.5569013589359719, + "learning_rate": 4.776909823274684e-06, + "loss": 0.5533, + "step": 2592 + }, + { + "epoch": 0.4206683971447112, + "grad_norm": 0.5955953836548744, + "learning_rate": 4.776733456627025e-06, + "loss": 0.6121, + "step": 2593 + }, + { + "epoch": 0.4208306294613887, + "grad_norm": 0.568719084866718, + "learning_rate": 4.776557023550931e-06, + "loss": 0.5657, + "step": 2594 + }, + { + "epoch": 0.42099286177806616, + "grad_norm": 0.5584533406417707, + "learning_rate": 4.77638052405155e-06, + "loss": 0.577, + "step": 2595 + }, + { + "epoch": 0.42115509409474366, + "grad_norm": 0.6120928007052299, + "learning_rate": 4.776203958134034e-06, + "loss": 0.5547, + "step": 2596 + }, + { + "epoch": 0.42131732641142117, + "grad_norm": 0.5642587616475564, + "learning_rate": 4.776027325803531e-06, + "loss": 0.5758, + "step": 2597 + }, + { + "epoch": 0.4214795587280986, + "grad_norm": 0.5811413878793643, + "learning_rate": 4.775850627065197e-06, + "loss": 0.5838, + "step": 2598 + }, + { + "epoch": 0.4216417910447761, + "grad_norm": 0.6102425066095117, + "learning_rate": 4.7756738619241864e-06, + "loss": 0.5734, + "step": 2599 + }, + { + "epoch": 0.4218040233614536, + "grad_norm": 0.5827022532442118, + "learning_rate": 4.775497030385657e-06, + "loss": 0.5554, + "step": 2600 + }, + { + "epoch": 0.42196625567813106, + "grad_norm": 0.6173542391805671, + "learning_rate": 4.775320132454769e-06, + "loss": 0.5913, + "step": 2601 + }, + { + "epoch": 0.42212848799480857, + "grad_norm": 0.5693000103749799, + "learning_rate": 4.775143168136684e-06, + "loss": 0.5506, + "step": 2602 + }, + { + "epoch": 0.42229072031148607, + "grad_norm": 0.5939617540651325, + "learning_rate": 4.774966137436564e-06, + "loss": 0.5631, + "step": 2603 + }, + { + "epoch": 0.4224529526281635, + "grad_norm": 0.5767574289043331, + "learning_rate": 4.7747890403595744e-06, + "loss": 0.5608, + "step": 2604 + }, + { + "epoch": 0.422615184944841, + "grad_norm": 0.593343884908159, + "learning_rate": 4.774611876910883e-06, + "loss": 0.5801, + "step": 2605 + }, + { + "epoch": 0.4227774172615185, + "grad_norm": 0.6774025127688286, + "learning_rate": 4.774434647095658e-06, + "loss": 0.5668, + "step": 2606 + }, + { + "epoch": 0.42293964957819596, + "grad_norm": 0.5973399456966011, + "learning_rate": 4.774257350919071e-06, + "loss": 0.5866, + "step": 2607 + }, + { + "epoch": 0.42310188189487347, + "grad_norm": 0.6141835890484942, + "learning_rate": 4.7740799883862966e-06, + "loss": 0.5966, + "step": 2608 + }, + { + "epoch": 0.42326411421155097, + "grad_norm": 0.5862256137459089, + "learning_rate": 4.773902559502507e-06, + "loss": 0.5657, + "step": 2609 + }, + { + "epoch": 0.4234263465282284, + "grad_norm": 0.5697382357512834, + "learning_rate": 4.77372506427288e-06, + "loss": 0.5912, + "step": 2610 + }, + { + "epoch": 0.4235885788449059, + "grad_norm": 0.6081576995050642, + "learning_rate": 4.773547502702596e-06, + "loss": 0.5917, + "step": 2611 + }, + { + "epoch": 0.42375081116158336, + "grad_norm": 0.5806549360630612, + "learning_rate": 4.773369874796833e-06, + "loss": 0.6208, + "step": 2612 + }, + { + "epoch": 0.42391304347826086, + "grad_norm": 0.5925229376221971, + "learning_rate": 4.773192180560776e-06, + "loss": 0.5915, + "step": 2613 + }, + { + "epoch": 0.42407527579493837, + "grad_norm": 0.5800049490085614, + "learning_rate": 4.773014419999608e-06, + "loss": 0.594, + "step": 2614 + }, + { + "epoch": 0.4242375081116158, + "grad_norm": 0.5799095644778031, + "learning_rate": 4.772836593118516e-06, + "loss": 0.6011, + "step": 2615 + }, + { + "epoch": 0.4243997404282933, + "grad_norm": 0.5978610616380214, + "learning_rate": 4.77265869992269e-06, + "loss": 0.5801, + "step": 2616 + }, + { + "epoch": 0.4245619727449708, + "grad_norm": 0.590879927989352, + "learning_rate": 4.7724807404173175e-06, + "loss": 0.597, + "step": 2617 + }, + { + "epoch": 0.42472420506164826, + "grad_norm": 0.5847942925942509, + "learning_rate": 4.772302714607593e-06, + "loss": 0.5768, + "step": 2618 + }, + { + "epoch": 0.42488643737832577, + "grad_norm": 0.5920460520658977, + "learning_rate": 4.77212462249871e-06, + "loss": 0.5633, + "step": 2619 + }, + { + "epoch": 0.42504866969500327, + "grad_norm": 0.6026444538955404, + "learning_rate": 4.7719464640958655e-06, + "loss": 0.5835, + "step": 2620 + }, + { + "epoch": 0.4252109020116807, + "grad_norm": 0.5740703758203162, + "learning_rate": 4.771768239404256e-06, + "loss": 0.6047, + "step": 2621 + }, + { + "epoch": 0.4253731343283582, + "grad_norm": 0.5941097868485823, + "learning_rate": 4.771589948429084e-06, + "loss": 0.5535, + "step": 2622 + }, + { + "epoch": 0.4255353666450357, + "grad_norm": 0.5855696870185824, + "learning_rate": 4.771411591175548e-06, + "loss": 0.5941, + "step": 2623 + }, + { + "epoch": 0.42569759896171316, + "grad_norm": 0.5743394071681784, + "learning_rate": 4.771233167648857e-06, + "loss": 0.5542, + "step": 2624 + }, + { + "epoch": 0.42585983127839067, + "grad_norm": 0.6097763308781411, + "learning_rate": 4.771054677854211e-06, + "loss": 0.5467, + "step": 2625 + }, + { + "epoch": 0.4260220635950681, + "grad_norm": 0.59433316479298, + "learning_rate": 4.770876121796822e-06, + "loss": 0.5834, + "step": 2626 + }, + { + "epoch": 0.4261842959117456, + "grad_norm": 0.6193413306678746, + "learning_rate": 4.7706974994818985e-06, + "loss": 0.5585, + "step": 2627 + }, + { + "epoch": 0.4263465282284231, + "grad_norm": 0.58748895560283, + "learning_rate": 4.770518810914653e-06, + "loss": 0.5959, + "step": 2628 + }, + { + "epoch": 0.42650876054510056, + "grad_norm": 0.6109455173620658, + "learning_rate": 4.770340056100297e-06, + "loss": 0.5791, + "step": 2629 + }, + { + "epoch": 0.42667099286177806, + "grad_norm": 0.5839875644331571, + "learning_rate": 4.770161235044047e-06, + "loss": 0.5589, + "step": 2630 + }, + { + "epoch": 0.42683322517845557, + "grad_norm": 0.5933444589411291, + "learning_rate": 4.769982347751122e-06, + "loss": 0.5871, + "step": 2631 + }, + { + "epoch": 0.426995457495133, + "grad_norm": 0.6088268107889145, + "learning_rate": 4.76980339422674e-06, + "loss": 0.5953, + "step": 2632 + }, + { + "epoch": 0.4271576898118105, + "grad_norm": 0.5764693739451742, + "learning_rate": 4.769624374476122e-06, + "loss": 0.5728, + "step": 2633 + }, + { + "epoch": 0.427319922128488, + "grad_norm": 0.6348509443187902, + "learning_rate": 4.7694452885044915e-06, + "loss": 0.5956, + "step": 2634 + }, + { + "epoch": 0.42748215444516546, + "grad_norm": 0.6109610196865238, + "learning_rate": 4.769266136317074e-06, + "loss": 0.5694, + "step": 2635 + }, + { + "epoch": 0.42764438676184297, + "grad_norm": 0.5990280164107563, + "learning_rate": 4.769086917919098e-06, + "loss": 0.5851, + "step": 2636 + }, + { + "epoch": 0.42780661907852047, + "grad_norm": 0.6143605358790499, + "learning_rate": 4.76890763331579e-06, + "loss": 0.6094, + "step": 2637 + }, + { + "epoch": 0.4279688513951979, + "grad_norm": 0.5819384946797214, + "learning_rate": 4.768728282512383e-06, + "loss": 0.575, + "step": 2638 + }, + { + "epoch": 0.4281310837118754, + "grad_norm": 0.5916150946768985, + "learning_rate": 4.768548865514108e-06, + "loss": 0.6092, + "step": 2639 + }, + { + "epoch": 0.42829331602855286, + "grad_norm": 0.6323023095228372, + "learning_rate": 4.768369382326202e-06, + "loss": 0.5456, + "step": 2640 + }, + { + "epoch": 0.42845554834523036, + "grad_norm": 0.6016632857800692, + "learning_rate": 4.7681898329539004e-06, + "loss": 0.58, + "step": 2641 + }, + { + "epoch": 0.42861778066190787, + "grad_norm": 0.6080331514398087, + "learning_rate": 4.768010217402442e-06, + "loss": 0.582, + "step": 2642 + }, + { + "epoch": 0.4287800129785853, + "grad_norm": 0.5736412382761938, + "learning_rate": 4.767830535677068e-06, + "loss": 0.5746, + "step": 2643 + }, + { + "epoch": 0.4289422452952628, + "grad_norm": 0.5966135134103223, + "learning_rate": 4.767650787783021e-06, + "loss": 0.5821, + "step": 2644 + }, + { + "epoch": 0.4291044776119403, + "grad_norm": 0.5647004992840037, + "learning_rate": 4.767470973725546e-06, + "loss": 0.5771, + "step": 2645 + }, + { + "epoch": 0.42926670992861776, + "grad_norm": 0.6044659522194662, + "learning_rate": 4.767291093509888e-06, + "loss": 0.5974, + "step": 2646 + }, + { + "epoch": 0.42942894224529526, + "grad_norm": 0.5885790253831598, + "learning_rate": 4.7671111471412965e-06, + "loss": 0.5313, + "step": 2647 + }, + { + "epoch": 0.42959117456197277, + "grad_norm": 0.5910068458832021, + "learning_rate": 4.766931134625021e-06, + "loss": 0.576, + "step": 2648 + }, + { + "epoch": 0.4297534068786502, + "grad_norm": 0.5739474698713553, + "learning_rate": 4.7667510559663145e-06, + "loss": 0.6028, + "step": 2649 + }, + { + "epoch": 0.4299156391953277, + "grad_norm": 0.5824626061620859, + "learning_rate": 4.766570911170431e-06, + "loss": 0.5517, + "step": 2650 + }, + { + "epoch": 0.4300778715120052, + "grad_norm": 0.5976506344663008, + "learning_rate": 4.766390700242625e-06, + "loss": 0.5636, + "step": 2651 + }, + { + "epoch": 0.43024010382868266, + "grad_norm": 0.6298303481167344, + "learning_rate": 4.766210423188158e-06, + "loss": 0.5481, + "step": 2652 + }, + { + "epoch": 0.43040233614536016, + "grad_norm": 0.6265111073988321, + "learning_rate": 4.7660300800122875e-06, + "loss": 0.5477, + "step": 2653 + }, + { + "epoch": 0.4305645684620376, + "grad_norm": 0.6079985779145424, + "learning_rate": 4.7658496707202755e-06, + "loss": 0.5676, + "step": 2654 + }, + { + "epoch": 0.4307268007787151, + "grad_norm": 0.5623624243478698, + "learning_rate": 4.765669195317385e-06, + "loss": 0.5696, + "step": 2655 + }, + { + "epoch": 0.4308890330953926, + "grad_norm": 0.5695252081765518, + "learning_rate": 4.765488653808884e-06, + "loss": 0.6128, + "step": 2656 + }, + { + "epoch": 0.43105126541207006, + "grad_norm": 0.5862312808030029, + "learning_rate": 4.765308046200039e-06, + "loss": 0.6136, + "step": 2657 + }, + { + "epoch": 0.43121349772874756, + "grad_norm": 0.6082571262200138, + "learning_rate": 4.76512737249612e-06, + "loss": 0.5989, + "step": 2658 + }, + { + "epoch": 0.43137573004542507, + "grad_norm": 0.582748712798373, + "learning_rate": 4.7649466327023975e-06, + "loss": 0.5769, + "step": 2659 + }, + { + "epoch": 0.4315379623621025, + "grad_norm": 0.5736182854427428, + "learning_rate": 4.764765826824146e-06, + "loss": 0.5841, + "step": 2660 + }, + { + "epoch": 0.43170019467878, + "grad_norm": 0.6138622593097999, + "learning_rate": 4.76458495486664e-06, + "loss": 0.5773, + "step": 2661 + }, + { + "epoch": 0.4318624269954575, + "grad_norm": 0.6036837443022227, + "learning_rate": 4.764404016835158e-06, + "loss": 0.5659, + "step": 2662 + }, + { + "epoch": 0.43202465931213496, + "grad_norm": 0.6337796339247261, + "learning_rate": 4.764223012734979e-06, + "loss": 0.6043, + "step": 2663 + }, + { + "epoch": 0.43218689162881246, + "grad_norm": 0.5831566838845382, + "learning_rate": 4.764041942571383e-06, + "loss": 0.5771, + "step": 2664 + }, + { + "epoch": 0.43234912394548997, + "grad_norm": 0.5896843630029875, + "learning_rate": 4.763860806349654e-06, + "loss": 0.589, + "step": 2665 + }, + { + "epoch": 0.4325113562621674, + "grad_norm": 0.5949158446772639, + "learning_rate": 4.763679604075077e-06, + "loss": 0.5806, + "step": 2666 + }, + { + "epoch": 0.4326735885788449, + "grad_norm": 0.6291877887228932, + "learning_rate": 4.763498335752939e-06, + "loss": 0.6165, + "step": 2667 + }, + { + "epoch": 0.43283582089552236, + "grad_norm": 0.592135329629373, + "learning_rate": 4.763317001388529e-06, + "loss": 0.5931, + "step": 2668 + }, + { + "epoch": 0.43299805321219986, + "grad_norm": 0.6027223839083737, + "learning_rate": 4.763135600987136e-06, + "loss": 0.581, + "step": 2669 + }, + { + "epoch": 0.43316028552887736, + "grad_norm": 0.5863010969594553, + "learning_rate": 4.762954134554056e-06, + "loss": 0.573, + "step": 2670 + }, + { + "epoch": 0.4333225178455548, + "grad_norm": 0.5827036637041861, + "learning_rate": 4.762772602094581e-06, + "loss": 0.583, + "step": 2671 + }, + { + "epoch": 0.4334847501622323, + "grad_norm": 0.5773329125070338, + "learning_rate": 4.762591003614009e-06, + "loss": 0.5722, + "step": 2672 + }, + { + "epoch": 0.4336469824789098, + "grad_norm": 0.5439130285586354, + "learning_rate": 4.762409339117638e-06, + "loss": 0.5561, + "step": 2673 + }, + { + "epoch": 0.43380921479558726, + "grad_norm": 0.5877633504770815, + "learning_rate": 4.7622276086107685e-06, + "loss": 0.5843, + "step": 2674 + }, + { + "epoch": 0.43397144711226476, + "grad_norm": 0.5768345860170825, + "learning_rate": 4.762045812098702e-06, + "loss": 0.5754, + "step": 2675 + }, + { + "epoch": 0.43413367942894227, + "grad_norm": 0.5663529202088048, + "learning_rate": 4.7618639495867445e-06, + "loss": 0.5479, + "step": 2676 + }, + { + "epoch": 0.4342959117456197, + "grad_norm": 0.5708428462770408, + "learning_rate": 4.761682021080202e-06, + "loss": 0.5908, + "step": 2677 + }, + { + "epoch": 0.4344581440622972, + "grad_norm": 0.5639259785964033, + "learning_rate": 4.761500026584381e-06, + "loss": 0.576, + "step": 2678 + }, + { + "epoch": 0.4346203763789747, + "grad_norm": 0.5684536496818907, + "learning_rate": 4.761317966104592e-06, + "loss": 0.5864, + "step": 2679 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.6184939972242979, + "learning_rate": 4.761135839646148e-06, + "loss": 0.5879, + "step": 2680 + }, + { + "epoch": 0.43494484101232966, + "grad_norm": 0.6422453923213021, + "learning_rate": 4.760953647214363e-06, + "loss": 0.578, + "step": 2681 + }, + { + "epoch": 0.43510707332900717, + "grad_norm": 0.5713663660926092, + "learning_rate": 4.760771388814552e-06, + "loss": 0.5883, + "step": 2682 + }, + { + "epoch": 0.4352693056456846, + "grad_norm": 0.5718315585019639, + "learning_rate": 4.7605890644520335e-06, + "loss": 0.575, + "step": 2683 + }, + { + "epoch": 0.4354315379623621, + "grad_norm": 0.5722916732423451, + "learning_rate": 4.760406674132126e-06, + "loss": 0.5789, + "step": 2684 + }, + { + "epoch": 0.43559377027903956, + "grad_norm": 0.5853025492620662, + "learning_rate": 4.7602242178601525e-06, + "loss": 0.5641, + "step": 2685 + }, + { + "epoch": 0.43575600259571706, + "grad_norm": 0.612929223085608, + "learning_rate": 4.7600416956414355e-06, + "loss": 0.6023, + "step": 2686 + }, + { + "epoch": 0.43591823491239456, + "grad_norm": 0.6031645503679804, + "learning_rate": 4.7598591074813e-06, + "loss": 0.5695, + "step": 2687 + }, + { + "epoch": 0.436080467229072, + "grad_norm": 0.6019383123359212, + "learning_rate": 4.759676453385076e-06, + "loss": 0.5911, + "step": 2688 + }, + { + "epoch": 0.4362426995457495, + "grad_norm": 0.5813705712661092, + "learning_rate": 4.759493733358091e-06, + "loss": 0.554, + "step": 2689 + }, + { + "epoch": 0.436404931862427, + "grad_norm": 0.5941963569214187, + "learning_rate": 4.759310947405676e-06, + "loss": 0.5568, + "step": 2690 + }, + { + "epoch": 0.43656716417910446, + "grad_norm": 0.5775985248584257, + "learning_rate": 4.759128095533164e-06, + "loss": 0.5673, + "step": 2691 + }, + { + "epoch": 0.43672939649578196, + "grad_norm": 0.6246052147181993, + "learning_rate": 4.758945177745891e-06, + "loss": 0.5959, + "step": 2692 + }, + { + "epoch": 0.43689162881245946, + "grad_norm": 0.6146420227715057, + "learning_rate": 4.758762194049193e-06, + "loss": 0.5533, + "step": 2693 + }, + { + "epoch": 0.4370538611291369, + "grad_norm": 0.5847081204516998, + "learning_rate": 4.75857914444841e-06, + "loss": 0.5599, + "step": 2694 + }, + { + "epoch": 0.4372160934458144, + "grad_norm": 0.5960636170515679, + "learning_rate": 4.758396028948882e-06, + "loss": 0.5496, + "step": 2695 + }, + { + "epoch": 0.4373783257624919, + "grad_norm": 0.5882960382257633, + "learning_rate": 4.758212847555953e-06, + "loss": 0.5909, + "step": 2696 + }, + { + "epoch": 0.43754055807916936, + "grad_norm": 0.6103967541061762, + "learning_rate": 4.758029600274966e-06, + "loss": 0.5651, + "step": 2697 + }, + { + "epoch": 0.43770279039584686, + "grad_norm": 0.5780348260988986, + "learning_rate": 4.757846287111268e-06, + "loss": 0.5949, + "step": 2698 + }, + { + "epoch": 0.4378650227125243, + "grad_norm": 0.5781543397007766, + "learning_rate": 4.75766290807021e-06, + "loss": 0.5622, + "step": 2699 + }, + { + "epoch": 0.4380272550292018, + "grad_norm": 0.5932233907953698, + "learning_rate": 4.7574794631571385e-06, + "loss": 0.5587, + "step": 2700 + }, + { + "epoch": 0.4381894873458793, + "grad_norm": 0.6220432996512543, + "learning_rate": 4.757295952377409e-06, + "loss": 0.5956, + "step": 2701 + }, + { + "epoch": 0.43835171966255676, + "grad_norm": 0.5770311418147593, + "learning_rate": 4.757112375736374e-06, + "loss": 0.593, + "step": 2702 + }, + { + "epoch": 0.43851395197923426, + "grad_norm": 0.5768913480893427, + "learning_rate": 4.756928733239391e-06, + "loss": 0.5198, + "step": 2703 + }, + { + "epoch": 0.43867618429591176, + "grad_norm": 0.6096030378625558, + "learning_rate": 4.756745024891817e-06, + "loss": 0.5444, + "step": 2704 + }, + { + "epoch": 0.4388384166125892, + "grad_norm": 0.6421713098303797, + "learning_rate": 4.756561250699012e-06, + "loss": 0.5549, + "step": 2705 + }, + { + "epoch": 0.4390006489292667, + "grad_norm": 0.58834705470837, + "learning_rate": 4.7563774106663395e-06, + "loss": 0.5569, + "step": 2706 + }, + { + "epoch": 0.4391628812459442, + "grad_norm": 0.63620102478732, + "learning_rate": 4.7561935047991625e-06, + "loss": 0.5853, + "step": 2707 + }, + { + "epoch": 0.43932511356262166, + "grad_norm": 0.6147908739552451, + "learning_rate": 4.756009533102847e-06, + "loss": 0.5923, + "step": 2708 + }, + { + "epoch": 0.43948734587929916, + "grad_norm": 0.572207292255178, + "learning_rate": 4.755825495582761e-06, + "loss": 0.5737, + "step": 2709 + }, + { + "epoch": 0.43964957819597666, + "grad_norm": 0.6003611225661591, + "learning_rate": 4.7556413922442725e-06, + "loss": 0.5774, + "step": 2710 + }, + { + "epoch": 0.4398118105126541, + "grad_norm": 0.6159117608967799, + "learning_rate": 4.755457223092755e-06, + "loss": 0.5731, + "step": 2711 + }, + { + "epoch": 0.4399740428293316, + "grad_norm": 0.5931577373175332, + "learning_rate": 4.755272988133582e-06, + "loss": 0.6001, + "step": 2712 + }, + { + "epoch": 0.44013627514600906, + "grad_norm": 0.6087555505808223, + "learning_rate": 4.755088687372128e-06, + "loss": 0.5906, + "step": 2713 + }, + { + "epoch": 0.44029850746268656, + "grad_norm": 0.5875620508735967, + "learning_rate": 4.75490432081377e-06, + "loss": 0.5612, + "step": 2714 + }, + { + "epoch": 0.44046073977936406, + "grad_norm": 0.5705050550242017, + "learning_rate": 4.754719888463889e-06, + "loss": 0.5352, + "step": 2715 + }, + { + "epoch": 0.4406229720960415, + "grad_norm": 0.6282201097132373, + "learning_rate": 4.754535390327865e-06, + "loss": 0.5783, + "step": 2716 + }, + { + "epoch": 0.440785204412719, + "grad_norm": 0.6380686993459628, + "learning_rate": 4.754350826411082e-06, + "loss": 0.5612, + "step": 2717 + }, + { + "epoch": 0.4409474367293965, + "grad_norm": 0.586717554883721, + "learning_rate": 4.7541661967189225e-06, + "loss": 0.566, + "step": 2718 + }, + { + "epoch": 0.44110966904607396, + "grad_norm": 0.5920833316413845, + "learning_rate": 4.753981501256777e-06, + "loss": 0.5984, + "step": 2719 + }, + { + "epoch": 0.44127190136275146, + "grad_norm": 0.5921155974955967, + "learning_rate": 4.753796740030031e-06, + "loss": 0.5668, + "step": 2720 + }, + { + "epoch": 0.44143413367942896, + "grad_norm": 0.5844327829724616, + "learning_rate": 4.753611913044078e-06, + "loss": 0.5783, + "step": 2721 + }, + { + "epoch": 0.4415963659961064, + "grad_norm": 0.5977764627509782, + "learning_rate": 4.7534270203043096e-06, + "loss": 0.6071, + "step": 2722 + }, + { + "epoch": 0.4417585983127839, + "grad_norm": 0.5752937274244742, + "learning_rate": 4.75324206181612e-06, + "loss": 0.6095, + "step": 2723 + }, + { + "epoch": 0.4419208306294614, + "grad_norm": 0.573872290507913, + "learning_rate": 4.753057037584907e-06, + "loss": 0.5882, + "step": 2724 + }, + { + "epoch": 0.44208306294613886, + "grad_norm": 0.6038486013208911, + "learning_rate": 4.752871947616068e-06, + "loss": 0.5703, + "step": 2725 + }, + { + "epoch": 0.44224529526281636, + "grad_norm": 0.6015628452114571, + "learning_rate": 4.752686791915004e-06, + "loss": 0.5531, + "step": 2726 + }, + { + "epoch": 0.4424075275794938, + "grad_norm": 0.555189838885122, + "learning_rate": 4.752501570487117e-06, + "loss": 0.6038, + "step": 2727 + }, + { + "epoch": 0.4425697598961713, + "grad_norm": 0.6064925002974884, + "learning_rate": 4.752316283337811e-06, + "loss": 0.5485, + "step": 2728 + }, + { + "epoch": 0.4427319922128488, + "grad_norm": 0.5938877871932949, + "learning_rate": 4.752130930472492e-06, + "loss": 0.6001, + "step": 2729 + }, + { + "epoch": 0.44289422452952626, + "grad_norm": 0.610223663456583, + "learning_rate": 4.751945511896568e-06, + "loss": 0.5577, + "step": 2730 + }, + { + "epoch": 0.44305645684620376, + "grad_norm": 0.5878525223069618, + "learning_rate": 4.751760027615451e-06, + "loss": 0.5899, + "step": 2731 + }, + { + "epoch": 0.44321868916288126, + "grad_norm": 0.6536243488808644, + "learning_rate": 4.7515744776345505e-06, + "loss": 0.5305, + "step": 2732 + }, + { + "epoch": 0.4433809214795587, + "grad_norm": 0.6086224618806646, + "learning_rate": 4.751388861959281e-06, + "loss": 0.5717, + "step": 2733 + }, + { + "epoch": 0.4435431537962362, + "grad_norm": 0.5818293232773404, + "learning_rate": 4.751203180595059e-06, + "loss": 0.5296, + "step": 2734 + }, + { + "epoch": 0.4437053861129137, + "grad_norm": 0.5793319280193048, + "learning_rate": 4.7510174335473e-06, + "loss": 0.5865, + "step": 2735 + }, + { + "epoch": 0.44386761842959116, + "grad_norm": 0.5742431926781972, + "learning_rate": 4.750831620821426e-06, + "loss": 0.568, + "step": 2736 + }, + { + "epoch": 0.44402985074626866, + "grad_norm": 0.5854346852202915, + "learning_rate": 4.750645742422857e-06, + "loss": 0.5059, + "step": 2737 + }, + { + "epoch": 0.44419208306294616, + "grad_norm": 0.5641655797991637, + "learning_rate": 4.750459798357017e-06, + "loss": 0.5511, + "step": 2738 + }, + { + "epoch": 0.4443543153796236, + "grad_norm": 0.5735724198021591, + "learning_rate": 4.750273788629331e-06, + "loss": 0.6164, + "step": 2739 + }, + { + "epoch": 0.4445165476963011, + "grad_norm": 0.5614414445706148, + "learning_rate": 4.750087713245227e-06, + "loss": 0.5891, + "step": 2740 + }, + { + "epoch": 0.44467878001297856, + "grad_norm": 0.5775723239057463, + "learning_rate": 4.749901572210134e-06, + "loss": 0.5413, + "step": 2741 + }, + { + "epoch": 0.44484101232965606, + "grad_norm": 0.6001072109292541, + "learning_rate": 4.749715365529481e-06, + "loss": 0.57, + "step": 2742 + }, + { + "epoch": 0.44500324464633356, + "grad_norm": 0.5677407088127061, + "learning_rate": 4.749529093208704e-06, + "loss": 0.5635, + "step": 2743 + }, + { + "epoch": 0.445165476963011, + "grad_norm": 0.5713404712561223, + "learning_rate": 4.7493427552532355e-06, + "loss": 0.5741, + "step": 2744 + }, + { + "epoch": 0.4453277092796885, + "grad_norm": 0.5881176903303377, + "learning_rate": 4.749156351668514e-06, + "loss": 0.6063, + "step": 2745 + }, + { + "epoch": 0.445489941596366, + "grad_norm": 0.5992522750499403, + "learning_rate": 4.748969882459977e-06, + "loss": 0.583, + "step": 2746 + }, + { + "epoch": 0.44565217391304346, + "grad_norm": 0.5714222845786318, + "learning_rate": 4.748783347633066e-06, + "loss": 0.5626, + "step": 2747 + }, + { + "epoch": 0.44581440622972096, + "grad_norm": 0.5772173328157996, + "learning_rate": 4.7485967471932225e-06, + "loss": 0.5795, + "step": 2748 + }, + { + "epoch": 0.44597663854639846, + "grad_norm": 0.6053874075262748, + "learning_rate": 4.748410081145892e-06, + "loss": 0.5945, + "step": 2749 + }, + { + "epoch": 0.4461388708630759, + "grad_norm": 0.6085823286117469, + "learning_rate": 4.74822334949652e-06, + "loss": 0.5876, + "step": 2750 + }, + { + "epoch": 0.4463011031797534, + "grad_norm": 0.6097074643717804, + "learning_rate": 4.7480365522505555e-06, + "loss": 0.6389, + "step": 2751 + }, + { + "epoch": 0.4464633354964309, + "grad_norm": 0.5639805393812254, + "learning_rate": 4.7478496894134485e-06, + "loss": 0.5788, + "step": 2752 + }, + { + "epoch": 0.44662556781310836, + "grad_norm": 0.5847819090522554, + "learning_rate": 4.7476627609906515e-06, + "loss": 0.6009, + "step": 2753 + }, + { + "epoch": 0.44678780012978586, + "grad_norm": 0.5778411325774061, + "learning_rate": 4.747475766987617e-06, + "loss": 0.6005, + "step": 2754 + }, + { + "epoch": 0.4469500324464633, + "grad_norm": 0.575024908585363, + "learning_rate": 4.747288707409803e-06, + "loss": 0.5514, + "step": 2755 + }, + { + "epoch": 0.4471122647631408, + "grad_norm": 0.6092130317657508, + "learning_rate": 4.747101582262666e-06, + "loss": 0.5988, + "step": 2756 + }, + { + "epoch": 0.4472744970798183, + "grad_norm": 0.5896481069703754, + "learning_rate": 4.746914391551665e-06, + "loss": 0.5825, + "step": 2757 + }, + { + "epoch": 0.44743672939649576, + "grad_norm": 0.6046631584789806, + "learning_rate": 4.746727135282264e-06, + "loss": 0.5665, + "step": 2758 + }, + { + "epoch": 0.44759896171317326, + "grad_norm": 0.6249491134415271, + "learning_rate": 4.746539813459925e-06, + "loss": 0.5395, + "step": 2759 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.5813734547999229, + "learning_rate": 4.746352426090114e-06, + "loss": 0.5726, + "step": 2760 + }, + { + "epoch": 0.4479234263465282, + "grad_norm": 0.5711878907301837, + "learning_rate": 4.746164973178299e-06, + "loss": 0.5742, + "step": 2761 + }, + { + "epoch": 0.4480856586632057, + "grad_norm": 0.6027958597074003, + "learning_rate": 4.745977454729947e-06, + "loss": 0.5859, + "step": 2762 + }, + { + "epoch": 0.4482478909798832, + "grad_norm": 0.5861639512182303, + "learning_rate": 4.7457898707505324e-06, + "loss": 0.5738, + "step": 2763 + }, + { + "epoch": 0.44841012329656066, + "grad_norm": 0.6244869884200646, + "learning_rate": 4.745602221245526e-06, + "loss": 0.5765, + "step": 2764 + }, + { + "epoch": 0.44857235561323816, + "grad_norm": 0.5692712756093691, + "learning_rate": 4.745414506220404e-06, + "loss": 0.5199, + "step": 2765 + }, + { + "epoch": 0.44873458792991566, + "grad_norm": 0.6039352053911934, + "learning_rate": 4.745226725680644e-06, + "loss": 0.6217, + "step": 2766 + }, + { + "epoch": 0.4488968202465931, + "grad_norm": 0.5917265550755458, + "learning_rate": 4.745038879631723e-06, + "loss": 0.6022, + "step": 2767 + }, + { + "epoch": 0.4490590525632706, + "grad_norm": 0.5841912881283929, + "learning_rate": 4.744850968079122e-06, + "loss": 0.5791, + "step": 2768 + }, + { + "epoch": 0.4492212848799481, + "grad_norm": 0.6234249282276453, + "learning_rate": 4.744662991028325e-06, + "loss": 0.5997, + "step": 2769 + }, + { + "epoch": 0.44938351719662556, + "grad_norm": 0.5974087455648689, + "learning_rate": 4.7444749484848165e-06, + "loss": 0.557, + "step": 2770 + }, + { + "epoch": 0.44954574951330306, + "grad_norm": 0.5602619742159083, + "learning_rate": 4.7442868404540835e-06, + "loss": 0.5639, + "step": 2771 + }, + { + "epoch": 0.4497079818299805, + "grad_norm": 0.5945177876168423, + "learning_rate": 4.744098666941612e-06, + "loss": 0.5825, + "step": 2772 + }, + { + "epoch": 0.449870214146658, + "grad_norm": 0.620246194570808, + "learning_rate": 4.7439104279528945e-06, + "loss": 0.5604, + "step": 2773 + }, + { + "epoch": 0.4500324464633355, + "grad_norm": 0.5929866774654371, + "learning_rate": 4.743722123493423e-06, + "loss": 0.6115, + "step": 2774 + }, + { + "epoch": 0.45019467878001296, + "grad_norm": 0.5801009874794807, + "learning_rate": 4.743533753568691e-06, + "loss": 0.5848, + "step": 2775 + }, + { + "epoch": 0.45035691109669046, + "grad_norm": 0.6183956973427526, + "learning_rate": 4.743345318184194e-06, + "loss": 0.5815, + "step": 2776 + }, + { + "epoch": 0.45051914341336796, + "grad_norm": 0.5892408109710766, + "learning_rate": 4.743156817345432e-06, + "loss": 0.5978, + "step": 2777 + }, + { + "epoch": 0.4506813757300454, + "grad_norm": 0.5810532324109129, + "learning_rate": 4.742968251057904e-06, + "loss": 0.6006, + "step": 2778 + }, + { + "epoch": 0.4508436080467229, + "grad_norm": 0.5688629159150709, + "learning_rate": 4.742779619327111e-06, + "loss": 0.5583, + "step": 2779 + }, + { + "epoch": 0.4510058403634004, + "grad_norm": 0.6283236603305615, + "learning_rate": 4.742590922158558e-06, + "loss": 0.5929, + "step": 2780 + }, + { + "epoch": 0.45116807268007786, + "grad_norm": 0.6103197087717053, + "learning_rate": 4.74240215955775e-06, + "loss": 0.5598, + "step": 2781 + }, + { + "epoch": 0.45133030499675536, + "grad_norm": 0.5690356249736849, + "learning_rate": 4.742213331530195e-06, + "loss": 0.5677, + "step": 2782 + }, + { + "epoch": 0.45149253731343286, + "grad_norm": 0.5765041158599524, + "learning_rate": 4.7420244380814e-06, + "loss": 0.5729, + "step": 2783 + }, + { + "epoch": 0.4516547696301103, + "grad_norm": 0.579326901429301, + "learning_rate": 4.74183547921688e-06, + "loss": 0.5803, + "step": 2784 + }, + { + "epoch": 0.4518170019467878, + "grad_norm": 0.5951901142705649, + "learning_rate": 4.741646454942146e-06, + "loss": 0.5799, + "step": 2785 + }, + { + "epoch": 0.45197923426346526, + "grad_norm": 0.5626321859139095, + "learning_rate": 4.7414573652627145e-06, + "loss": 0.5872, + "step": 2786 + }, + { + "epoch": 0.45214146658014276, + "grad_norm": 0.5807778678888017, + "learning_rate": 4.741268210184101e-06, + "loss": 0.6097, + "step": 2787 + }, + { + "epoch": 0.45230369889682026, + "grad_norm": 0.565127597716174, + "learning_rate": 4.741078989711826e-06, + "loss": 0.549, + "step": 2788 + }, + { + "epoch": 0.4524659312134977, + "grad_norm": 0.6016353318772737, + "learning_rate": 4.74088970385141e-06, + "loss": 0.5356, + "step": 2789 + }, + { + "epoch": 0.4526281635301752, + "grad_norm": 0.5639977103228636, + "learning_rate": 4.740700352608375e-06, + "loss": 0.6014, + "step": 2790 + }, + { + "epoch": 0.4527903958468527, + "grad_norm": 0.6192625226518728, + "learning_rate": 4.740510935988247e-06, + "loss": 0.5301, + "step": 2791 + }, + { + "epoch": 0.45295262816353016, + "grad_norm": 0.6035693496732397, + "learning_rate": 4.740321453996551e-06, + "loss": 0.5521, + "step": 2792 + }, + { + "epoch": 0.45311486048020766, + "grad_norm": 0.6003204546858913, + "learning_rate": 4.740131906638817e-06, + "loss": 0.6144, + "step": 2793 + }, + { + "epoch": 0.45327709279688516, + "grad_norm": 0.5779288480585744, + "learning_rate": 4.739942293920575e-06, + "loss": 0.6002, + "step": 2794 + }, + { + "epoch": 0.4534393251135626, + "grad_norm": 0.5814276408598736, + "learning_rate": 4.739752615847357e-06, + "loss": 0.5631, + "step": 2795 + }, + { + "epoch": 0.4536015574302401, + "grad_norm": 0.6167896233864666, + "learning_rate": 4.7395628724246976e-06, + "loss": 0.584, + "step": 2796 + }, + { + "epoch": 0.4537637897469176, + "grad_norm": 0.5797415784281283, + "learning_rate": 4.739373063658133e-06, + "loss": 0.5965, + "step": 2797 + }, + { + "epoch": 0.45392602206359506, + "grad_norm": 0.5496978501220576, + "learning_rate": 4.739183189553201e-06, + "loss": 0.5614, + "step": 2798 + }, + { + "epoch": 0.45408825438027256, + "grad_norm": 0.6037116528288327, + "learning_rate": 4.738993250115442e-06, + "loss": 0.555, + "step": 2799 + }, + { + "epoch": 0.45425048669695, + "grad_norm": 0.640699265354275, + "learning_rate": 4.7388032453503964e-06, + "loss": 0.5436, + "step": 2800 + }, + { + "epoch": 0.4544127190136275, + "grad_norm": 0.5574891643163256, + "learning_rate": 4.738613175263611e-06, + "loss": 0.5707, + "step": 2801 + }, + { + "epoch": 0.454574951330305, + "grad_norm": 0.562033104923672, + "learning_rate": 4.738423039860628e-06, + "loss": 0.5558, + "step": 2802 + }, + { + "epoch": 0.45473718364698246, + "grad_norm": 0.5633701316791259, + "learning_rate": 4.7382328391469975e-06, + "loss": 0.5886, + "step": 2803 + }, + { + "epoch": 0.45489941596365996, + "grad_norm": 0.5903819116241754, + "learning_rate": 4.738042573128268e-06, + "loss": 0.5689, + "step": 2804 + }, + { + "epoch": 0.45506164828033746, + "grad_norm": 0.6362442583230092, + "learning_rate": 4.737852241809991e-06, + "loss": 0.5647, + "step": 2805 + }, + { + "epoch": 0.4552238805970149, + "grad_norm": 0.5567594215861481, + "learning_rate": 4.7376618451977195e-06, + "loss": 0.5455, + "step": 2806 + }, + { + "epoch": 0.4553861129136924, + "grad_norm": 0.5734411101892031, + "learning_rate": 4.73747138329701e-06, + "loss": 0.5615, + "step": 2807 + }, + { + "epoch": 0.4555483452303699, + "grad_norm": 0.601563870160068, + "learning_rate": 4.737280856113417e-06, + "loss": 0.5934, + "step": 2808 + }, + { + "epoch": 0.45571057754704736, + "grad_norm": 0.5783365669559261, + "learning_rate": 4.737090263652503e-06, + "loss": 0.5738, + "step": 2809 + }, + { + "epoch": 0.45587280986372486, + "grad_norm": 0.5692158200378235, + "learning_rate": 4.736899605919827e-06, + "loss": 0.5954, + "step": 2810 + }, + { + "epoch": 0.45603504218040236, + "grad_norm": 0.6011653461540277, + "learning_rate": 4.7367088829209515e-06, + "loss": 0.5857, + "step": 2811 + }, + { + "epoch": 0.4561972744970798, + "grad_norm": 0.5769515136078545, + "learning_rate": 4.736518094661442e-06, + "loss": 0.5667, + "step": 2812 + }, + { + "epoch": 0.4563595068137573, + "grad_norm": 0.5863432387505548, + "learning_rate": 4.736327241146864e-06, + "loss": 0.5618, + "step": 2813 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 0.5938457370099602, + "learning_rate": 4.736136322382788e-06, + "loss": 0.5696, + "step": 2814 + }, + { + "epoch": 0.45668397144711226, + "grad_norm": 0.6225817309739253, + "learning_rate": 4.735945338374783e-06, + "loss": 0.6007, + "step": 2815 + }, + { + "epoch": 0.45684620376378976, + "grad_norm": 0.5952494499920044, + "learning_rate": 4.735754289128422e-06, + "loss": 0.5638, + "step": 2816 + }, + { + "epoch": 0.4570084360804672, + "grad_norm": 0.5598574094646817, + "learning_rate": 4.735563174649278e-06, + "loss": 0.5968, + "step": 2817 + }, + { + "epoch": 0.4571706683971447, + "grad_norm": 0.573645777143262, + "learning_rate": 4.735371994942929e-06, + "loss": 0.5701, + "step": 2818 + }, + { + "epoch": 0.4573329007138222, + "grad_norm": 0.5878786738649963, + "learning_rate": 4.7351807500149525e-06, + "loss": 0.564, + "step": 2819 + }, + { + "epoch": 0.45749513303049966, + "grad_norm": 0.5930223941187546, + "learning_rate": 4.7349894398709285e-06, + "loss": 0.5769, + "step": 2820 + }, + { + "epoch": 0.45765736534717716, + "grad_norm": 0.6091283221491666, + "learning_rate": 4.734798064516437e-06, + "loss": 0.5738, + "step": 2821 + }, + { + "epoch": 0.45781959766385466, + "grad_norm": 0.5923939197827519, + "learning_rate": 4.734606623957065e-06, + "loss": 0.5757, + "step": 2822 + }, + { + "epoch": 0.4579818299805321, + "grad_norm": 0.6037830561045708, + "learning_rate": 4.734415118198395e-06, + "loss": 0.591, + "step": 2823 + }, + { + "epoch": 0.4581440622972096, + "grad_norm": 0.5831125065820023, + "learning_rate": 4.734223547246018e-06, + "loss": 0.5594, + "step": 2824 + }, + { + "epoch": 0.4583062946138871, + "grad_norm": 0.5932691249711848, + "learning_rate": 4.73403191110552e-06, + "loss": 0.5939, + "step": 2825 + }, + { + "epoch": 0.45846852693056456, + "grad_norm": 0.6123996055080595, + "learning_rate": 4.733840209782494e-06, + "loss": 0.6055, + "step": 2826 + }, + { + "epoch": 0.45863075924724206, + "grad_norm": 0.5756589730992232, + "learning_rate": 4.733648443282533e-06, + "loss": 0.5427, + "step": 2827 + }, + { + "epoch": 0.4587929915639195, + "grad_norm": 0.5911616605115093, + "learning_rate": 4.733456611611233e-06, + "loss": 0.6021, + "step": 2828 + }, + { + "epoch": 0.458955223880597, + "grad_norm": 0.6361201407805736, + "learning_rate": 4.733264714774192e-06, + "loss": 0.5819, + "step": 2829 + }, + { + "epoch": 0.4591174561972745, + "grad_norm": 0.6334167696715267, + "learning_rate": 4.733072752777005e-06, + "loss": 0.5373, + "step": 2830 + }, + { + "epoch": 0.45927968851395196, + "grad_norm": 0.5857821360508122, + "learning_rate": 4.732880725625277e-06, + "loss": 0.565, + "step": 2831 + }, + { + "epoch": 0.45944192083062946, + "grad_norm": 0.6141976319954868, + "learning_rate": 4.732688633324609e-06, + "loss": 0.5694, + "step": 2832 + }, + { + "epoch": 0.45960415314730696, + "grad_norm": 0.6099995701618257, + "learning_rate": 4.732496475880605e-06, + "loss": 0.5952, + "step": 2833 + }, + { + "epoch": 0.4597663854639844, + "grad_norm": 0.6121804029768582, + "learning_rate": 4.7323042532988735e-06, + "loss": 0.5823, + "step": 2834 + }, + { + "epoch": 0.4599286177806619, + "grad_norm": 0.6248284472752788, + "learning_rate": 4.732111965585021e-06, + "loss": 0.5851, + "step": 2835 + }, + { + "epoch": 0.4600908500973394, + "grad_norm": 0.5967379528466451, + "learning_rate": 4.73191961274466e-06, + "loss": 0.5847, + "step": 2836 + }, + { + "epoch": 0.46025308241401686, + "grad_norm": 0.619766588405587, + "learning_rate": 4.7317271947834006e-06, + "loss": 0.5581, + "step": 2837 + }, + { + "epoch": 0.46041531473069436, + "grad_norm": 0.5976859014398991, + "learning_rate": 4.731534711706859e-06, + "loss": 0.5778, + "step": 2838 + }, + { + "epoch": 0.46057754704737186, + "grad_norm": 0.6022077545073372, + "learning_rate": 4.731342163520649e-06, + "loss": 0.5848, + "step": 2839 + }, + { + "epoch": 0.4607397793640493, + "grad_norm": 0.5856373061490405, + "learning_rate": 4.731149550230391e-06, + "loss": 0.5715, + "step": 2840 + }, + { + "epoch": 0.4609020116807268, + "grad_norm": 0.6248760496400995, + "learning_rate": 4.7309568718417035e-06, + "loss": 0.5799, + "step": 2841 + }, + { + "epoch": 0.4610642439974043, + "grad_norm": 0.5947657506596375, + "learning_rate": 4.730764128360209e-06, + "loss": 0.5775, + "step": 2842 + }, + { + "epoch": 0.46122647631408176, + "grad_norm": 0.6007088862010928, + "learning_rate": 4.73057131979153e-06, + "loss": 0.6028, + "step": 2843 + }, + { + "epoch": 0.46138870863075926, + "grad_norm": 0.6090552551505268, + "learning_rate": 4.730378446141294e-06, + "loss": 0.573, + "step": 2844 + }, + { + "epoch": 0.4615509409474367, + "grad_norm": 0.5598395029383118, + "learning_rate": 4.730185507415127e-06, + "loss": 0.5536, + "step": 2845 + }, + { + "epoch": 0.4617131732641142, + "grad_norm": 0.5680602353366911, + "learning_rate": 4.7299925036186585e-06, + "loss": 0.5691, + "step": 2846 + }, + { + "epoch": 0.4618754055807917, + "grad_norm": 0.5765525888532548, + "learning_rate": 4.7297994347575205e-06, + "loss": 0.573, + "step": 2847 + }, + { + "epoch": 0.46203763789746916, + "grad_norm": 0.6015971833560259, + "learning_rate": 4.729606300837345e-06, + "loss": 0.5465, + "step": 2848 + }, + { + "epoch": 0.46219987021414666, + "grad_norm": 0.5925430261155317, + "learning_rate": 4.729413101863769e-06, + "loss": 0.5937, + "step": 2849 + }, + { + "epoch": 0.46236210253082416, + "grad_norm": 0.6155357624260948, + "learning_rate": 4.729219837842427e-06, + "loss": 0.5948, + "step": 2850 + }, + { + "epoch": 0.4625243348475016, + "grad_norm": 0.5689735402405708, + "learning_rate": 4.72902650877896e-06, + "loss": 0.5495, + "step": 2851 + }, + { + "epoch": 0.4626865671641791, + "grad_norm": 0.5878202168153764, + "learning_rate": 4.728833114679008e-06, + "loss": 0.6084, + "step": 2852 + }, + { + "epoch": 0.4628487994808566, + "grad_norm": 0.5853780210789685, + "learning_rate": 4.728639655548214e-06, + "loss": 0.5886, + "step": 2853 + }, + { + "epoch": 0.46301103179753406, + "grad_norm": 0.5876667742914752, + "learning_rate": 4.728446131392221e-06, + "loss": 0.5534, + "step": 2854 + }, + { + "epoch": 0.46317326411421156, + "grad_norm": 0.5993362542951924, + "learning_rate": 4.728252542216678e-06, + "loss": 0.5671, + "step": 2855 + }, + { + "epoch": 0.46333549643088906, + "grad_norm": 0.5697195647325213, + "learning_rate": 4.728058888027232e-06, + "loss": 0.5749, + "step": 2856 + }, + { + "epoch": 0.4634977287475665, + "grad_norm": 0.5897106326788831, + "learning_rate": 4.727865168829532e-06, + "loss": 0.6011, + "step": 2857 + }, + { + "epoch": 0.463659961064244, + "grad_norm": 0.6010749806170853, + "learning_rate": 4.727671384629233e-06, + "loss": 0.5723, + "step": 2858 + }, + { + "epoch": 0.46382219338092145, + "grad_norm": 0.5475838689673102, + "learning_rate": 4.727477535431987e-06, + "loss": 0.5706, + "step": 2859 + }, + { + "epoch": 0.46398442569759896, + "grad_norm": 0.576677511138582, + "learning_rate": 4.72728362124345e-06, + "loss": 0.5552, + "step": 2860 + }, + { + "epoch": 0.46414665801427646, + "grad_norm": 0.595960316690509, + "learning_rate": 4.727089642069281e-06, + "loss": 0.5579, + "step": 2861 + }, + { + "epoch": 0.4643088903309539, + "grad_norm": 0.61536458231776, + "learning_rate": 4.726895597915139e-06, + "loss": 0.5798, + "step": 2862 + }, + { + "epoch": 0.4644711226476314, + "grad_norm": 0.5980653433288717, + "learning_rate": 4.7267014887866855e-06, + "loss": 0.5716, + "step": 2863 + }, + { + "epoch": 0.4646333549643089, + "grad_norm": 0.6018001534749108, + "learning_rate": 4.726507314689584e-06, + "loss": 0.5847, + "step": 2864 + }, + { + "epoch": 0.46479558728098636, + "grad_norm": 0.5834315809619463, + "learning_rate": 4.7263130756295006e-06, + "loss": 0.5826, + "step": 2865 + }, + { + "epoch": 0.46495781959766386, + "grad_norm": 0.5936057806258349, + "learning_rate": 4.726118771612102e-06, + "loss": 0.5764, + "step": 2866 + }, + { + "epoch": 0.46512005191434136, + "grad_norm": 0.5760000143102503, + "learning_rate": 4.725924402643058e-06, + "loss": 0.5968, + "step": 2867 + }, + { + "epoch": 0.4652822842310188, + "grad_norm": 0.6422731241339628, + "learning_rate": 4.72572996872804e-06, + "loss": 0.5602, + "step": 2868 + }, + { + "epoch": 0.4654445165476963, + "grad_norm": 0.5689594986426235, + "learning_rate": 4.72553546987272e-06, + "loss": 0.5616, + "step": 2869 + }, + { + "epoch": 0.4656067488643738, + "grad_norm": 0.58385147602686, + "learning_rate": 4.725340906082772e-06, + "loss": 0.558, + "step": 2870 + }, + { + "epoch": 0.46576898118105126, + "grad_norm": 0.5866943847962158, + "learning_rate": 4.725146277363876e-06, + "loss": 0.6056, + "step": 2871 + }, + { + "epoch": 0.46593121349772876, + "grad_norm": 0.5738320139779864, + "learning_rate": 4.7249515837217075e-06, + "loss": 0.5515, + "step": 2872 + }, + { + "epoch": 0.4660934458144062, + "grad_norm": 1.0563887566146186, + "learning_rate": 4.724756825161949e-06, + "loss": 0.5597, + "step": 2873 + }, + { + "epoch": 0.4662556781310837, + "grad_norm": 0.5658695448492341, + "learning_rate": 4.724562001690282e-06, + "loss": 0.5304, + "step": 2874 + }, + { + "epoch": 0.4664179104477612, + "grad_norm": 0.5773908345981955, + "learning_rate": 4.724367113312391e-06, + "loss": 0.5791, + "step": 2875 + }, + { + "epoch": 0.46658014276443865, + "grad_norm": 0.5834070474023445, + "learning_rate": 4.724172160033964e-06, + "loss": 0.5797, + "step": 2876 + }, + { + "epoch": 0.46674237508111616, + "grad_norm": 0.5700593091035321, + "learning_rate": 4.723977141860686e-06, + "loss": 0.5448, + "step": 2877 + }, + { + "epoch": 0.46690460739779366, + "grad_norm": 0.6420620918149706, + "learning_rate": 4.72378205879825e-06, + "loss": 0.5704, + "step": 2878 + }, + { + "epoch": 0.4670668397144711, + "grad_norm": 0.5741812788561698, + "learning_rate": 4.723586910852346e-06, + "loss": 0.5715, + "step": 2879 + }, + { + "epoch": 0.4672290720311486, + "grad_norm": 0.5935694152387888, + "learning_rate": 4.723391698028668e-06, + "loss": 0.597, + "step": 2880 + }, + { + "epoch": 0.4673913043478261, + "grad_norm": 0.604400910270354, + "learning_rate": 4.723196420332914e-06, + "loss": 0.6044, + "step": 2881 + }, + { + "epoch": 0.46755353666450356, + "grad_norm": 0.5831589589884921, + "learning_rate": 4.723001077770779e-06, + "loss": 0.5922, + "step": 2882 + }, + { + "epoch": 0.46771576898118106, + "grad_norm": 0.558839465568742, + "learning_rate": 4.722805670347963e-06, + "loss": 0.575, + "step": 2883 + }, + { + "epoch": 0.46787800129785856, + "grad_norm": 0.6013244763238862, + "learning_rate": 4.7226101980701685e-06, + "loss": 0.5749, + "step": 2884 + }, + { + "epoch": 0.468040233614536, + "grad_norm": 0.6168609136839679, + "learning_rate": 4.7224146609430985e-06, + "loss": 0.5921, + "step": 2885 + }, + { + "epoch": 0.4682024659312135, + "grad_norm": 0.5658004392135701, + "learning_rate": 4.722219058972457e-06, + "loss": 0.5378, + "step": 2886 + }, + { + "epoch": 0.46836469824789095, + "grad_norm": 0.5852848502420257, + "learning_rate": 4.7220233921639525e-06, + "loss": 0.5602, + "step": 2887 + }, + { + "epoch": 0.46852693056456846, + "grad_norm": 0.5550246029844577, + "learning_rate": 4.721827660523294e-06, + "loss": 0.5947, + "step": 2888 + }, + { + "epoch": 0.46868916288124596, + "grad_norm": 0.5567968476584537, + "learning_rate": 4.721631864056191e-06, + "loss": 0.5407, + "step": 2889 + }, + { + "epoch": 0.4688513951979234, + "grad_norm": 0.6118377748972579, + "learning_rate": 4.721436002768357e-06, + "loss": 0.5563, + "step": 2890 + }, + { + "epoch": 0.4690136275146009, + "grad_norm": 0.5958419082335413, + "learning_rate": 4.721240076665508e-06, + "loss": 0.5454, + "step": 2891 + }, + { + "epoch": 0.4691758598312784, + "grad_norm": 0.5762632898878733, + "learning_rate": 4.721044085753359e-06, + "loss": 0.5976, + "step": 2892 + }, + { + "epoch": 0.46933809214795585, + "grad_norm": 0.5954959446567062, + "learning_rate": 4.720848030037628e-06, + "loss": 0.5664, + "step": 2893 + }, + { + "epoch": 0.46950032446463336, + "grad_norm": 0.6016320666061175, + "learning_rate": 4.720651909524037e-06, + "loss": 0.6125, + "step": 2894 + }, + { + "epoch": 0.46966255678131086, + "grad_norm": 0.6276628575096457, + "learning_rate": 4.720455724218306e-06, + "loss": 0.5602, + "step": 2895 + }, + { + "epoch": 0.4698247890979883, + "grad_norm": 0.5721623615120608, + "learning_rate": 4.720259474126162e-06, + "loss": 0.5689, + "step": 2896 + }, + { + "epoch": 0.4699870214146658, + "grad_norm": 0.5942056241043089, + "learning_rate": 4.720063159253328e-06, + "loss": 0.5641, + "step": 2897 + }, + { + "epoch": 0.4701492537313433, + "grad_norm": 0.5796645464904735, + "learning_rate": 4.719866779605534e-06, + "loss": 0.5791, + "step": 2898 + }, + { + "epoch": 0.47031148604802075, + "grad_norm": 0.5824636341932151, + "learning_rate": 4.719670335188509e-06, + "loss": 0.5754, + "step": 2899 + }, + { + "epoch": 0.47047371836469826, + "grad_norm": 0.5909372817921209, + "learning_rate": 4.719473826007985e-06, + "loss": 0.5577, + "step": 2900 + }, + { + "epoch": 0.4706359506813757, + "grad_norm": 0.6083161198359004, + "learning_rate": 4.719277252069696e-06, + "loss": 0.5785, + "step": 2901 + }, + { + "epoch": 0.4707981829980532, + "grad_norm": 0.6093114018448432, + "learning_rate": 4.719080613379376e-06, + "loss": 0.5989, + "step": 2902 + }, + { + "epoch": 0.4709604153147307, + "grad_norm": 0.5752891976721326, + "learning_rate": 4.7188839099427636e-06, + "loss": 0.5444, + "step": 2903 + }, + { + "epoch": 0.47112264763140815, + "grad_norm": 0.6398107541011486, + "learning_rate": 4.718687141765598e-06, + "loss": 0.5693, + "step": 2904 + }, + { + "epoch": 0.47128487994808566, + "grad_norm": 0.5578234767649559, + "learning_rate": 4.718490308853618e-06, + "loss": 0.5461, + "step": 2905 + }, + { + "epoch": 0.47144711226476316, + "grad_norm": 0.6187259045059361, + "learning_rate": 4.718293411212571e-06, + "loss": 0.5605, + "step": 2906 + }, + { + "epoch": 0.4716093445814406, + "grad_norm": 0.5873956681140583, + "learning_rate": 4.718096448848198e-06, + "loss": 0.5609, + "step": 2907 + }, + { + "epoch": 0.4717715768981181, + "grad_norm": 0.5826910142679117, + "learning_rate": 4.717899421766248e-06, + "loss": 0.576, + "step": 2908 + }, + { + "epoch": 0.4719338092147956, + "grad_norm": 0.5947413938723318, + "learning_rate": 4.7177023299724676e-06, + "loss": 0.5689, + "step": 2909 + }, + { + "epoch": 0.47209604153147305, + "grad_norm": 0.6015070456788019, + "learning_rate": 4.71750517347261e-06, + "loss": 0.5801, + "step": 2910 + }, + { + "epoch": 0.47225827384815056, + "grad_norm": 0.5737222926059935, + "learning_rate": 4.717307952272425e-06, + "loss": 0.5889, + "step": 2911 + }, + { + "epoch": 0.47242050616482806, + "grad_norm": 0.5947783844958623, + "learning_rate": 4.71711066637767e-06, + "loss": 0.563, + "step": 2912 + }, + { + "epoch": 0.4725827384815055, + "grad_norm": 0.6186696781671458, + "learning_rate": 4.7169133157940974e-06, + "loss": 0.5497, + "step": 2913 + }, + { + "epoch": 0.472744970798183, + "grad_norm": 0.5959951509425293, + "learning_rate": 4.716715900527468e-06, + "loss": 0.5767, + "step": 2914 + }, + { + "epoch": 0.47290720311486045, + "grad_norm": 0.6189833284432286, + "learning_rate": 4.716518420583542e-06, + "loss": 0.5961, + "step": 2915 + }, + { + "epoch": 0.47306943543153795, + "grad_norm": 0.5817972210524378, + "learning_rate": 4.716320875968081e-06, + "loss": 0.5805, + "step": 2916 + }, + { + "epoch": 0.47323166774821546, + "grad_norm": 0.5812701074519461, + "learning_rate": 4.716123266686847e-06, + "loss": 0.6003, + "step": 2917 + }, + { + "epoch": 0.4733939000648929, + "grad_norm": 0.5819932131948535, + "learning_rate": 4.715925592745607e-06, + "loss": 0.5919, + "step": 2918 + }, + { + "epoch": 0.4735561323815704, + "grad_norm": 0.568715370544355, + "learning_rate": 4.71572785415013e-06, + "loss": 0.5989, + "step": 2919 + }, + { + "epoch": 0.4737183646982479, + "grad_norm": 0.5714253760065326, + "learning_rate": 4.715530050906182e-06, + "loss": 0.5806, + "step": 2920 + }, + { + "epoch": 0.47388059701492535, + "grad_norm": 0.571595719761358, + "learning_rate": 4.715332183019537e-06, + "loss": 0.5968, + "step": 2921 + }, + { + "epoch": 0.47404282933160286, + "grad_norm": 0.5756886865796028, + "learning_rate": 4.715134250495968e-06, + "loss": 0.5703, + "step": 2922 + }, + { + "epoch": 0.47420506164828036, + "grad_norm": 0.6113574657631687, + "learning_rate": 4.714936253341248e-06, + "loss": 0.585, + "step": 2923 + }, + { + "epoch": 0.4743672939649578, + "grad_norm": 0.5716317510176707, + "learning_rate": 4.714738191561157e-06, + "loss": 0.5675, + "step": 2924 + }, + { + "epoch": 0.4745295262816353, + "grad_norm": 0.6205487490069749, + "learning_rate": 4.714540065161471e-06, + "loss": 0.5575, + "step": 2925 + }, + { + "epoch": 0.4746917585983128, + "grad_norm": 0.6179975416930655, + "learning_rate": 4.714341874147973e-06, + "loss": 0.5663, + "step": 2926 + }, + { + "epoch": 0.47485399091499025, + "grad_norm": 0.5574137986413679, + "learning_rate": 4.7141436185264445e-06, + "loss": 0.559, + "step": 2927 + }, + { + "epoch": 0.47501622323166776, + "grad_norm": 0.5766500658630481, + "learning_rate": 4.71394529830267e-06, + "loss": 0.5645, + "step": 2928 + }, + { + "epoch": 0.47517845554834526, + "grad_norm": 0.5900992764187126, + "learning_rate": 4.713746913482437e-06, + "loss": 0.5639, + "step": 2929 + }, + { + "epoch": 0.4753406878650227, + "grad_norm": 0.5610149406568353, + "learning_rate": 4.713548464071532e-06, + "loss": 0.5658, + "step": 2930 + }, + { + "epoch": 0.4755029201817002, + "grad_norm": 0.6073123182898998, + "learning_rate": 4.713349950075747e-06, + "loss": 0.5915, + "step": 2931 + }, + { + "epoch": 0.47566515249837765, + "grad_norm": 0.5934044428529008, + "learning_rate": 4.7131513715008725e-06, + "loss": 0.5799, + "step": 2932 + }, + { + "epoch": 0.47582738481505515, + "grad_norm": 0.6555823317689228, + "learning_rate": 4.712952728352703e-06, + "loss": 0.5862, + "step": 2933 + }, + { + "epoch": 0.47598961713173266, + "grad_norm": 0.5882991421275049, + "learning_rate": 4.712754020637035e-06, + "loss": 0.5386, + "step": 2934 + }, + { + "epoch": 0.4761518494484101, + "grad_norm": 0.6019383096144783, + "learning_rate": 4.712555248359666e-06, + "loss": 0.5975, + "step": 2935 + }, + { + "epoch": 0.4763140817650876, + "grad_norm": 0.6416604116780628, + "learning_rate": 4.712356411526395e-06, + "loss": 0.5389, + "step": 2936 + }, + { + "epoch": 0.4764763140817651, + "grad_norm": 0.5680443882983734, + "learning_rate": 4.712157510143023e-06, + "loss": 0.5889, + "step": 2937 + }, + { + "epoch": 0.47663854639844255, + "grad_norm": 0.6639479386218358, + "learning_rate": 4.711958544215355e-06, + "loss": 0.5871, + "step": 2938 + }, + { + "epoch": 0.47680077871512005, + "grad_norm": 0.6036897908436568, + "learning_rate": 4.711759513749196e-06, + "loss": 0.5778, + "step": 2939 + }, + { + "epoch": 0.47696301103179756, + "grad_norm": 0.5965378607543892, + "learning_rate": 4.71156041875035e-06, + "loss": 0.5404, + "step": 2940 + }, + { + "epoch": 0.477125243348475, + "grad_norm": 0.5819102781783284, + "learning_rate": 4.71136125922463e-06, + "loss": 0.5763, + "step": 2941 + }, + { + "epoch": 0.4772874756651525, + "grad_norm": 0.6154920212710657, + "learning_rate": 4.711162035177847e-06, + "loss": 0.5569, + "step": 2942 + }, + { + "epoch": 0.47744970798183, + "grad_norm": 0.6197451548939836, + "learning_rate": 4.7109627466158095e-06, + "loss": 0.5783, + "step": 2943 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 0.6216181407704316, + "learning_rate": 4.710763393544337e-06, + "loss": 0.5789, + "step": 2944 + }, + { + "epoch": 0.47777417261518496, + "grad_norm": 0.616518821014084, + "learning_rate": 4.710563975969242e-06, + "loss": 0.5563, + "step": 2945 + }, + { + "epoch": 0.4779364049318624, + "grad_norm": 0.5883084753030933, + "learning_rate": 4.710364493896345e-06, + "loss": 0.5662, + "step": 2946 + }, + { + "epoch": 0.4780986372485399, + "grad_norm": 0.6142310179770484, + "learning_rate": 4.710164947331467e-06, + "loss": 0.5678, + "step": 2947 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.5789001590237162, + "learning_rate": 4.709965336280429e-06, + "loss": 0.5723, + "step": 2948 + }, + { + "epoch": 0.47842310188189485, + "grad_norm": 0.6119096814286118, + "learning_rate": 4.709765660749055e-06, + "loss": 0.5924, + "step": 2949 + }, + { + "epoch": 0.47858533419857235, + "grad_norm": 0.5716792859448188, + "learning_rate": 4.709565920743172e-06, + "loss": 0.5537, + "step": 2950 + }, + { + "epoch": 0.47874756651524986, + "grad_norm": 0.5748453331473319, + "learning_rate": 4.7093661162686065e-06, + "loss": 0.5867, + "step": 2951 + }, + { + "epoch": 0.4789097988319273, + "grad_norm": 0.594828662736246, + "learning_rate": 4.709166247331189e-06, + "loss": 0.5886, + "step": 2952 + }, + { + "epoch": 0.4790720311486048, + "grad_norm": 0.6692391391936335, + "learning_rate": 4.708966313936751e-06, + "loss": 0.5704, + "step": 2953 + }, + { + "epoch": 0.4792342634652823, + "grad_norm": 0.6604537992359393, + "learning_rate": 4.708766316091125e-06, + "loss": 0.5689, + "step": 2954 + }, + { + "epoch": 0.47939649578195975, + "grad_norm": 0.6236643997465959, + "learning_rate": 4.708566253800148e-06, + "loss": 0.5889, + "step": 2955 + }, + { + "epoch": 0.47955872809863725, + "grad_norm": 0.619110416703065, + "learning_rate": 4.708366127069657e-06, + "loss": 0.587, + "step": 2956 + }, + { + "epoch": 0.47972096041531476, + "grad_norm": 0.5873612619464377, + "learning_rate": 4.708165935905491e-06, + "loss": 0.6099, + "step": 2957 + }, + { + "epoch": 0.4798831927319922, + "grad_norm": 0.6026018562934763, + "learning_rate": 4.707965680313489e-06, + "loss": 0.5792, + "step": 2958 + }, + { + "epoch": 0.4800454250486697, + "grad_norm": 0.6070740531045948, + "learning_rate": 4.707765360299496e-06, + "loss": 0.5903, + "step": 2959 + }, + { + "epoch": 0.48020765736534715, + "grad_norm": 0.6148424734821515, + "learning_rate": 4.707564975869357e-06, + "loss": 0.6064, + "step": 2960 + }, + { + "epoch": 0.48036988968202465, + "grad_norm": 0.5738051910898567, + "learning_rate": 4.707364527028917e-06, + "loss": 0.5803, + "step": 2961 + }, + { + "epoch": 0.48053212199870216, + "grad_norm": 0.5891962843147898, + "learning_rate": 4.707164013784026e-06, + "loss": 0.5393, + "step": 2962 + }, + { + "epoch": 0.4806943543153796, + "grad_norm": 0.5943458692765409, + "learning_rate": 4.706963436140533e-06, + "loss": 0.5656, + "step": 2963 + }, + { + "epoch": 0.4808565866320571, + "grad_norm": 0.5916423439394801, + "learning_rate": 4.706762794104292e-06, + "loss": 0.5832, + "step": 2964 + }, + { + "epoch": 0.4810188189487346, + "grad_norm": 0.6150406938985132, + "learning_rate": 4.7065620876811544e-06, + "loss": 0.5882, + "step": 2965 + }, + { + "epoch": 0.48118105126541205, + "grad_norm": 0.6079875162064441, + "learning_rate": 4.706361316876979e-06, + "loss": 0.5916, + "step": 2966 + }, + { + "epoch": 0.48134328358208955, + "grad_norm": 0.6034098897167325, + "learning_rate": 4.706160481697624e-06, + "loss": 0.5746, + "step": 2967 + }, + { + "epoch": 0.48150551589876706, + "grad_norm": 0.5854913602808118, + "learning_rate": 4.705959582148947e-06, + "loss": 0.5767, + "step": 2968 + }, + { + "epoch": 0.4816677482154445, + "grad_norm": 0.6130379100321822, + "learning_rate": 4.70575861823681e-06, + "loss": 0.5469, + "step": 2969 + }, + { + "epoch": 0.481829980532122, + "grad_norm": 0.5670623355690518, + "learning_rate": 4.705557589967077e-06, + "loss": 0.6071, + "step": 2970 + }, + { + "epoch": 0.4819922128487995, + "grad_norm": 0.5982171425988408, + "learning_rate": 4.705356497345615e-06, + "loss": 0.5563, + "step": 2971 + }, + { + "epoch": 0.48215444516547695, + "grad_norm": 0.5626498302325501, + "learning_rate": 4.705155340378288e-06, + "loss": 0.5532, + "step": 2972 + }, + { + "epoch": 0.48231667748215445, + "grad_norm": 0.5792500310532362, + "learning_rate": 4.704954119070968e-06, + "loss": 0.5786, + "step": 2973 + }, + { + "epoch": 0.4824789097988319, + "grad_norm": 0.5708059747446181, + "learning_rate": 4.704752833429524e-06, + "loss": 0.5669, + "step": 2974 + }, + { + "epoch": 0.4826411421155094, + "grad_norm": 0.580245616306666, + "learning_rate": 4.7045514834598314e-06, + "loss": 0.5977, + "step": 2975 + }, + { + "epoch": 0.4828033744321869, + "grad_norm": 0.6034594152370609, + "learning_rate": 4.704350069167763e-06, + "loss": 0.5856, + "step": 2976 + }, + { + "epoch": 0.48296560674886435, + "grad_norm": 0.5983131706016466, + "learning_rate": 4.704148590559196e-06, + "loss": 0.5522, + "step": 2977 + }, + { + "epoch": 0.48312783906554185, + "grad_norm": 0.574480204152832, + "learning_rate": 4.703947047640009e-06, + "loss": 0.5708, + "step": 2978 + }, + { + "epoch": 0.48329007138221936, + "grad_norm": 0.597933345315651, + "learning_rate": 4.703745440416081e-06, + "loss": 0.5501, + "step": 2979 + }, + { + "epoch": 0.4834523036988968, + "grad_norm": 0.5900518015655387, + "learning_rate": 4.703543768893297e-06, + "loss": 0.6079, + "step": 2980 + }, + { + "epoch": 0.4836145360155743, + "grad_norm": 0.596537582493592, + "learning_rate": 4.703342033077541e-06, + "loss": 0.5437, + "step": 2981 + }, + { + "epoch": 0.4837767683322518, + "grad_norm": 0.6013490088912506, + "learning_rate": 4.703140232974697e-06, + "loss": 0.5677, + "step": 2982 + }, + { + "epoch": 0.48393900064892925, + "grad_norm": 0.5808871536019967, + "learning_rate": 4.702938368590654e-06, + "loss": 0.5672, + "step": 2983 + }, + { + "epoch": 0.48410123296560675, + "grad_norm": 0.6188086713012392, + "learning_rate": 4.702736439931301e-06, + "loss": 0.5703, + "step": 2984 + }, + { + "epoch": 0.48426346528228426, + "grad_norm": 0.603020834789301, + "learning_rate": 4.702534447002531e-06, + "loss": 0.5682, + "step": 2985 + }, + { + "epoch": 0.4844256975989617, + "grad_norm": 0.6009717868385643, + "learning_rate": 4.702332389810237e-06, + "loss": 0.556, + "step": 2986 + }, + { + "epoch": 0.4845879299156392, + "grad_norm": 0.5881728645684521, + "learning_rate": 4.702130268360315e-06, + "loss": 0.548, + "step": 2987 + }, + { + "epoch": 0.48475016223231665, + "grad_norm": 0.613583388874319, + "learning_rate": 4.701928082658661e-06, + "loss": 0.6014, + "step": 2988 + }, + { + "epoch": 0.48491239454899415, + "grad_norm": 0.5666550002672867, + "learning_rate": 4.7017258327111756e-06, + "loss": 0.5817, + "step": 2989 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.5738180377811414, + "learning_rate": 4.701523518523758e-06, + "loss": 0.5498, + "step": 2990 + }, + { + "epoch": 0.4852368591823491, + "grad_norm": 0.5884977178287962, + "learning_rate": 4.701321140102314e-06, + "loss": 0.5597, + "step": 2991 + }, + { + "epoch": 0.4853990914990266, + "grad_norm": 0.5779223628986562, + "learning_rate": 4.701118697452746e-06, + "loss": 0.5574, + "step": 2992 + }, + { + "epoch": 0.4855613238157041, + "grad_norm": 0.6522807615423071, + "learning_rate": 4.700916190580961e-06, + "loss": 0.5793, + "step": 2993 + }, + { + "epoch": 0.48572355613238155, + "grad_norm": 0.5519257981954262, + "learning_rate": 4.700713619492868e-06, + "loss": 0.5702, + "step": 2994 + }, + { + "epoch": 0.48588578844905905, + "grad_norm": 0.5482370900278921, + "learning_rate": 4.700510984194378e-06, + "loss": 0.5114, + "step": 2995 + }, + { + "epoch": 0.48604802076573655, + "grad_norm": 0.5856034680633094, + "learning_rate": 4.700308284691402e-06, + "loss": 0.5913, + "step": 2996 + }, + { + "epoch": 0.486210253082414, + "grad_norm": 0.5695828033416095, + "learning_rate": 4.700105520989856e-06, + "loss": 0.5474, + "step": 2997 + }, + { + "epoch": 0.4863724853990915, + "grad_norm": 0.5829291587641044, + "learning_rate": 4.699902693095654e-06, + "loss": 0.6053, + "step": 2998 + }, + { + "epoch": 0.486534717715769, + "grad_norm": 0.5811932085535905, + "learning_rate": 4.699699801014715e-06, + "loss": 0.567, + "step": 2999 + }, + { + "epoch": 0.48669695003244645, + "grad_norm": 0.6181887737413575, + "learning_rate": 4.6994968447529586e-06, + "loss": 0.5496, + "step": 3000 + }, + { + "epoch": 0.48685918234912395, + "grad_norm": 0.6143574600095996, + "learning_rate": 4.699293824316307e-06, + "loss": 0.5574, + "step": 3001 + }, + { + "epoch": 0.48702141466580146, + "grad_norm": 0.5776261410745649, + "learning_rate": 4.6990907397106835e-06, + "loss": 0.5492, + "step": 3002 + }, + { + "epoch": 0.4871836469824789, + "grad_norm": 0.6305163459505783, + "learning_rate": 4.6988875909420126e-06, + "loss": 0.5948, + "step": 3003 + }, + { + "epoch": 0.4873458792991564, + "grad_norm": 0.6047662382062678, + "learning_rate": 4.698684378016223e-06, + "loss": 0.5738, + "step": 3004 + }, + { + "epoch": 0.48750811161583385, + "grad_norm": 0.5646915858576516, + "learning_rate": 4.698481100939243e-06, + "loss": 0.563, + "step": 3005 + }, + { + "epoch": 0.48767034393251135, + "grad_norm": 0.5568382975892205, + "learning_rate": 4.698277759717004e-06, + "loss": 0.5702, + "step": 3006 + }, + { + "epoch": 0.48783257624918885, + "grad_norm": 0.5634319159456742, + "learning_rate": 4.698074354355438e-06, + "loss": 0.5519, + "step": 3007 + }, + { + "epoch": 0.4879948085658663, + "grad_norm": 0.6222901277725462, + "learning_rate": 4.697870884860481e-06, + "loss": 0.5752, + "step": 3008 + }, + { + "epoch": 0.4881570408825438, + "grad_norm": 0.5917797114785296, + "learning_rate": 4.6976673512380685e-06, + "loss": 0.5501, + "step": 3009 + }, + { + "epoch": 0.4883192731992213, + "grad_norm": 0.5881275980120656, + "learning_rate": 4.69746375349414e-06, + "loss": 0.5585, + "step": 3010 + }, + { + "epoch": 0.48848150551589875, + "grad_norm": 0.5855079572327792, + "learning_rate": 4.697260091634635e-06, + "loss": 0.5821, + "step": 3011 + }, + { + "epoch": 0.48864373783257625, + "grad_norm": 0.5634184607610245, + "learning_rate": 4.697056365665498e-06, + "loss": 0.5516, + "step": 3012 + }, + { + "epoch": 0.48880597014925375, + "grad_norm": 0.5893594529961734, + "learning_rate": 4.69685257559267e-06, + "loss": 0.5584, + "step": 3013 + }, + { + "epoch": 0.4889682024659312, + "grad_norm": 0.6253002599130737, + "learning_rate": 4.696648721422099e-06, + "loss": 0.5992, + "step": 3014 + }, + { + "epoch": 0.4891304347826087, + "grad_norm": 0.5899647235662876, + "learning_rate": 4.696444803159732e-06, + "loss": 0.5757, + "step": 3015 + }, + { + "epoch": 0.4892926670992862, + "grad_norm": 0.6061628934263243, + "learning_rate": 4.696240820811519e-06, + "loss": 0.5239, + "step": 3016 + }, + { + "epoch": 0.48945489941596365, + "grad_norm": 0.5801225090436037, + "learning_rate": 4.696036774383411e-06, + "loss": 0.5563, + "step": 3017 + }, + { + "epoch": 0.48961713173264115, + "grad_norm": 0.58387911901327, + "learning_rate": 4.695832663881364e-06, + "loss": 0.5789, + "step": 3018 + }, + { + "epoch": 0.4897793640493186, + "grad_norm": 0.5965421850339511, + "learning_rate": 4.695628489311329e-06, + "loss": 0.5637, + "step": 3019 + }, + { + "epoch": 0.4899415963659961, + "grad_norm": 0.5820304786760458, + "learning_rate": 4.695424250679267e-06, + "loss": 0.5635, + "step": 3020 + }, + { + "epoch": 0.4901038286826736, + "grad_norm": 0.5931679966487949, + "learning_rate": 4.695219947991136e-06, + "loss": 0.5739, + "step": 3021 + }, + { + "epoch": 0.49026606099935105, + "grad_norm": 0.578064332214059, + "learning_rate": 4.6950155812528965e-06, + "loss": 0.5775, + "step": 3022 + }, + { + "epoch": 0.49042829331602855, + "grad_norm": 0.6025464918719083, + "learning_rate": 4.694811150470511e-06, + "loss": 0.5475, + "step": 3023 + }, + { + "epoch": 0.49059052563270605, + "grad_norm": 0.5694432842033574, + "learning_rate": 4.694606655649945e-06, + "loss": 0.5733, + "step": 3024 + }, + { + "epoch": 0.4907527579493835, + "grad_norm": 0.5937117191279394, + "learning_rate": 4.694402096797165e-06, + "loss": 0.5469, + "step": 3025 + }, + { + "epoch": 0.490914990266061, + "grad_norm": 0.5674017461619509, + "learning_rate": 4.694197473918139e-06, + "loss": 0.5754, + "step": 3026 + }, + { + "epoch": 0.4910772225827385, + "grad_norm": 0.5786114736082378, + "learning_rate": 4.6939927870188385e-06, + "loss": 0.5595, + "step": 3027 + }, + { + "epoch": 0.49123945489941595, + "grad_norm": 0.5655694706352324, + "learning_rate": 4.693788036105234e-06, + "loss": 0.5548, + "step": 3028 + }, + { + "epoch": 0.49140168721609345, + "grad_norm": 0.562057554397056, + "learning_rate": 4.6935832211833e-06, + "loss": 0.612, + "step": 3029 + }, + { + "epoch": 0.49156391953277095, + "grad_norm": 0.5856149945893121, + "learning_rate": 4.693378342259013e-06, + "loss": 0.5845, + "step": 3030 + }, + { + "epoch": 0.4917261518494484, + "grad_norm": 0.6352426742168503, + "learning_rate": 4.69317339933835e-06, + "loss": 0.6031, + "step": 3031 + }, + { + "epoch": 0.4918883841661259, + "grad_norm": 0.5742764023023739, + "learning_rate": 4.69296839242729e-06, + "loss": 0.5448, + "step": 3032 + }, + { + "epoch": 0.49205061648280335, + "grad_norm": 0.5818897084860429, + "learning_rate": 4.692763321531818e-06, + "loss": 0.5794, + "step": 3033 + }, + { + "epoch": 0.49221284879948085, + "grad_norm": 0.5947891399575757, + "learning_rate": 4.692558186657913e-06, + "loss": 0.5579, + "step": 3034 + }, + { + "epoch": 0.49237508111615835, + "grad_norm": 0.6082831308982856, + "learning_rate": 4.692352987811563e-06, + "loss": 0.5552, + "step": 3035 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 0.5934080260031049, + "learning_rate": 4.692147724998754e-06, + "loss": 0.6128, + "step": 3036 + }, + { + "epoch": 0.4926995457495133, + "grad_norm": 0.5901921745572306, + "learning_rate": 4.691942398225475e-06, + "loss": 0.5855, + "step": 3037 + }, + { + "epoch": 0.4928617780661908, + "grad_norm": 0.5795087365160841, + "learning_rate": 4.691737007497717e-06, + "loss": 0.5287, + "step": 3038 + }, + { + "epoch": 0.49302401038286825, + "grad_norm": 0.5954211173867612, + "learning_rate": 4.691531552821474e-06, + "loss": 0.5896, + "step": 3039 + }, + { + "epoch": 0.49318624269954575, + "grad_norm": 0.6103164956959776, + "learning_rate": 4.691326034202738e-06, + "loss": 0.5762, + "step": 3040 + }, + { + "epoch": 0.49334847501622325, + "grad_norm": 0.6040396132753018, + "learning_rate": 4.691120451647506e-06, + "loss": 0.5497, + "step": 3041 + }, + { + "epoch": 0.4935107073329007, + "grad_norm": 0.5652508595092977, + "learning_rate": 4.690914805161779e-06, + "loss": 0.6202, + "step": 3042 + }, + { + "epoch": 0.4936729396495782, + "grad_norm": 0.5614941098351647, + "learning_rate": 4.6907090947515545e-06, + "loss": 0.5775, + "step": 3043 + }, + { + "epoch": 0.4938351719662557, + "grad_norm": 0.5756411459736583, + "learning_rate": 4.690503320422834e-06, + "loss": 0.5946, + "step": 3044 + }, + { + "epoch": 0.49399740428293315, + "grad_norm": 0.5971682887387643, + "learning_rate": 4.690297482181625e-06, + "loss": 0.5651, + "step": 3045 + }, + { + "epoch": 0.49415963659961065, + "grad_norm": 0.5923861266804124, + "learning_rate": 4.6900915800339285e-06, + "loss": 0.5863, + "step": 3046 + }, + { + "epoch": 0.4943218689162881, + "grad_norm": 0.5914340599074139, + "learning_rate": 4.689885613985757e-06, + "loss": 0.5923, + "step": 3047 + }, + { + "epoch": 0.4944841012329656, + "grad_norm": 0.6073615085503504, + "learning_rate": 4.6896795840431155e-06, + "loss": 0.5694, + "step": 3048 + }, + { + "epoch": 0.4946463335496431, + "grad_norm": 0.5541212006254369, + "learning_rate": 4.689473490212017e-06, + "loss": 0.5584, + "step": 3049 + }, + { + "epoch": 0.49480856586632055, + "grad_norm": 0.6029600217619598, + "learning_rate": 4.689267332498477e-06, + "loss": 0.5611, + "step": 3050 + }, + { + "epoch": 0.49497079818299805, + "grad_norm": 0.610087515684271, + "learning_rate": 4.6890611109085075e-06, + "loss": 0.5726, + "step": 3051 + }, + { + "epoch": 0.49513303049967555, + "grad_norm": 0.5963258504455631, + "learning_rate": 4.688854825448127e-06, + "loss": 0.6215, + "step": 3052 + }, + { + "epoch": 0.495295262816353, + "grad_norm": 0.6018435118360806, + "learning_rate": 4.688648476123354e-06, + "loss": 0.5761, + "step": 3053 + }, + { + "epoch": 0.4954574951330305, + "grad_norm": 0.6014780201945389, + "learning_rate": 4.6884420629402085e-06, + "loss": 0.5339, + "step": 3054 + }, + { + "epoch": 0.495619727449708, + "grad_norm": 0.6030588428664504, + "learning_rate": 4.688235585904714e-06, + "loss": 0.5834, + "step": 3055 + }, + { + "epoch": 0.49578195976638545, + "grad_norm": 0.5614972777056569, + "learning_rate": 4.6880290450228945e-06, + "loss": 0.5497, + "step": 3056 + }, + { + "epoch": 0.49594419208306295, + "grad_norm": 0.6197844595233879, + "learning_rate": 4.687822440300776e-06, + "loss": 0.5644, + "step": 3057 + }, + { + "epoch": 0.49610642439974045, + "grad_norm": 0.5616975634265642, + "learning_rate": 4.687615771744387e-06, + "loss": 0.5955, + "step": 3058 + }, + { + "epoch": 0.4962686567164179, + "grad_norm": 0.5941214476274016, + "learning_rate": 4.687409039359757e-06, + "loss": 0.5898, + "step": 3059 + }, + { + "epoch": 0.4964308890330954, + "grad_norm": 0.6139047506383737, + "learning_rate": 4.6872022431529195e-06, + "loss": 0.5807, + "step": 3060 + }, + { + "epoch": 0.49659312134977285, + "grad_norm": 0.604833084377649, + "learning_rate": 4.686995383129906e-06, + "loss": 0.5527, + "step": 3061 + }, + { + "epoch": 0.49675535366645035, + "grad_norm": 0.6075515564009808, + "learning_rate": 4.686788459296753e-06, + "loss": 0.5943, + "step": 3062 + }, + { + "epoch": 0.49691758598312785, + "grad_norm": 0.5656076181687131, + "learning_rate": 4.6865814716594985e-06, + "loss": 0.5979, + "step": 3063 + }, + { + "epoch": 0.4970798182998053, + "grad_norm": 0.5950377150705342, + "learning_rate": 4.6863744202241805e-06, + "loss": 0.5427, + "step": 3064 + }, + { + "epoch": 0.4972420506164828, + "grad_norm": 0.5615569451197322, + "learning_rate": 4.686167304996842e-06, + "loss": 0.5694, + "step": 3065 + }, + { + "epoch": 0.4974042829331603, + "grad_norm": 0.6225353441449623, + "learning_rate": 4.685960125983524e-06, + "loss": 0.5704, + "step": 3066 + }, + { + "epoch": 0.49756651524983775, + "grad_norm": 0.5890385481847616, + "learning_rate": 4.685752883190272e-06, + "loss": 0.57, + "step": 3067 + }, + { + "epoch": 0.49772874756651525, + "grad_norm": 0.6173043357228699, + "learning_rate": 4.685545576623134e-06, + "loss": 0.568, + "step": 3068 + }, + { + "epoch": 0.49789097988319275, + "grad_norm": 0.588759973731356, + "learning_rate": 4.685338206288157e-06, + "loss": 0.5914, + "step": 3069 + }, + { + "epoch": 0.4980532121998702, + "grad_norm": 0.5847415428061854, + "learning_rate": 4.685130772191392e-06, + "loss": 0.5571, + "step": 3070 + }, + { + "epoch": 0.4982154445165477, + "grad_norm": 0.6065077062692628, + "learning_rate": 4.684923274338891e-06, + "loss": 0.5697, + "step": 3071 + }, + { + "epoch": 0.4983776768332252, + "grad_norm": 0.6377474622785221, + "learning_rate": 4.684715712736709e-06, + "loss": 0.5811, + "step": 3072 + }, + { + "epoch": 0.49853990914990265, + "grad_norm": 0.5489410436695459, + "learning_rate": 4.684508087390902e-06, + "loss": 0.5426, + "step": 3073 + }, + { + "epoch": 0.49870214146658015, + "grad_norm": 0.5621428734874402, + "learning_rate": 4.684300398307527e-06, + "loss": 0.562, + "step": 3074 + }, + { + "epoch": 0.4988643737832576, + "grad_norm": 0.5687901661030501, + "learning_rate": 4.684092645492645e-06, + "loss": 0.5579, + "step": 3075 + }, + { + "epoch": 0.4990266060999351, + "grad_norm": 0.5455215920776565, + "learning_rate": 4.683884828952316e-06, + "loss": 0.5345, + "step": 3076 + }, + { + "epoch": 0.4991888384166126, + "grad_norm": 0.6074341086187802, + "learning_rate": 4.683676948692606e-06, + "loss": 0.5653, + "step": 3077 + }, + { + "epoch": 0.49935107073329005, + "grad_norm": 0.6098225715397326, + "learning_rate": 4.683469004719577e-06, + "loss": 0.5572, + "step": 3078 + }, + { + "epoch": 0.49951330304996755, + "grad_norm": 0.5735010368941411, + "learning_rate": 4.6832609970392985e-06, + "loss": 0.5697, + "step": 3079 + }, + { + "epoch": 0.49967553536664505, + "grad_norm": 0.5776958657564294, + "learning_rate": 4.683052925657839e-06, + "loss": 0.556, + "step": 3080 + }, + { + "epoch": 0.4998377676833225, + "grad_norm": 0.6481571164902609, + "learning_rate": 4.68284479058127e-06, + "loss": 0.586, + "step": 3081 + }, + { + "epoch": 0.5, + "grad_norm": 0.5640002283336603, + "learning_rate": 4.682636591815663e-06, + "loss": 0.5357, + "step": 3082 + }, + { + "epoch": 0.5001622323166774, + "grad_norm": 0.573333709991381, + "learning_rate": 4.682428329367094e-06, + "loss": 0.5934, + "step": 3083 + }, + { + "epoch": 0.500324464633355, + "grad_norm": 0.5646339672685371, + "learning_rate": 4.682220003241638e-06, + "loss": 0.5334, + "step": 3084 + }, + { + "epoch": 0.5004866969500325, + "grad_norm": 0.6116643612062166, + "learning_rate": 4.682011613445374e-06, + "loss": 0.5898, + "step": 3085 + }, + { + "epoch": 0.5006489292667099, + "grad_norm": 0.6124751995558879, + "learning_rate": 4.681803159984383e-06, + "loss": 0.5943, + "step": 3086 + }, + { + "epoch": 0.5008111615833875, + "grad_norm": 0.5797383508493393, + "learning_rate": 4.681594642864746e-06, + "loss": 0.5793, + "step": 3087 + }, + { + "epoch": 0.5009733939000649, + "grad_norm": 0.5900266945507584, + "learning_rate": 4.681386062092548e-06, + "loss": 0.5676, + "step": 3088 + }, + { + "epoch": 0.5011356262167423, + "grad_norm": 0.5847822271391004, + "learning_rate": 4.681177417673873e-06, + "loss": 0.5514, + "step": 3089 + }, + { + "epoch": 0.5012978585334199, + "grad_norm": 0.5866738227484052, + "learning_rate": 4.68096870961481e-06, + "loss": 0.5634, + "step": 3090 + }, + { + "epoch": 0.5014600908500974, + "grad_norm": 0.5936464245608601, + "learning_rate": 4.680759937921449e-06, + "loss": 0.5965, + "step": 3091 + }, + { + "epoch": 0.5016223231667748, + "grad_norm": 0.6168671385081664, + "learning_rate": 4.680551102599881e-06, + "loss": 0.5441, + "step": 3092 + }, + { + "epoch": 0.5017845554834524, + "grad_norm": 0.5872902385558721, + "learning_rate": 4.680342203656197e-06, + "loss": 0.5765, + "step": 3093 + }, + { + "epoch": 0.5019467878001298, + "grad_norm": 0.6085802610014861, + "learning_rate": 4.680133241096495e-06, + "loss": 0.5731, + "step": 3094 + }, + { + "epoch": 0.5021090201168072, + "grad_norm": 0.5968974902681469, + "learning_rate": 4.679924214926871e-06, + "loss": 0.5373, + "step": 3095 + }, + { + "epoch": 0.5022712524334848, + "grad_norm": 0.6050826409128445, + "learning_rate": 4.679715125153423e-06, + "loss": 0.5713, + "step": 3096 + }, + { + "epoch": 0.5024334847501623, + "grad_norm": 0.5904000843175923, + "learning_rate": 4.679505971782252e-06, + "loss": 0.584, + "step": 3097 + }, + { + "epoch": 0.5025957170668397, + "grad_norm": 0.5822059111891702, + "learning_rate": 4.679296754819461e-06, + "loss": 0.5918, + "step": 3098 + }, + { + "epoch": 0.5027579493835171, + "grad_norm": 0.6280806103440904, + "learning_rate": 4.679087474271155e-06, + "loss": 0.5678, + "step": 3099 + }, + { + "epoch": 0.5029201817001947, + "grad_norm": 0.5730329890785419, + "learning_rate": 4.678878130143437e-06, + "loss": 0.5665, + "step": 3100 + }, + { + "epoch": 0.5030824140168721, + "grad_norm": 0.5788452315036924, + "learning_rate": 4.6786687224424185e-06, + "loss": 0.5396, + "step": 3101 + }, + { + "epoch": 0.5032446463335496, + "grad_norm": 0.6003088110891359, + "learning_rate": 4.678459251174209e-06, + "loss": 0.56, + "step": 3102 + }, + { + "epoch": 0.5034068786502272, + "grad_norm": 0.5836803410764825, + "learning_rate": 4.6782497163449185e-06, + "loss": 0.5627, + "step": 3103 + }, + { + "epoch": 0.5035691109669046, + "grad_norm": 0.5880659838894269, + "learning_rate": 4.678040117960661e-06, + "loss": 0.5384, + "step": 3104 + }, + { + "epoch": 0.503731343283582, + "grad_norm": 0.6039991518865446, + "learning_rate": 4.677830456027553e-06, + "loss": 0.578, + "step": 3105 + }, + { + "epoch": 0.5038935756002596, + "grad_norm": 0.6373995772560228, + "learning_rate": 4.677620730551712e-06, + "loss": 0.5866, + "step": 3106 + }, + { + "epoch": 0.504055807916937, + "grad_norm": 0.5994003943166231, + "learning_rate": 4.677410941539255e-06, + "loss": 0.5711, + "step": 3107 + }, + { + "epoch": 0.5042180402336145, + "grad_norm": 0.5950955054467982, + "learning_rate": 4.677201088996306e-06, + "loss": 0.5598, + "step": 3108 + }, + { + "epoch": 0.504380272550292, + "grad_norm": 0.5605540351536087, + "learning_rate": 4.676991172928985e-06, + "loss": 0.5803, + "step": 3109 + }, + { + "epoch": 0.5045425048669695, + "grad_norm": 0.5821741031288277, + "learning_rate": 4.676781193343419e-06, + "loss": 0.5503, + "step": 3110 + }, + { + "epoch": 0.504704737183647, + "grad_norm": 0.5930225010202409, + "learning_rate": 4.676571150245734e-06, + "loss": 0.5575, + "step": 3111 + }, + { + "epoch": 0.5048669695003245, + "grad_norm": 0.5798973894725135, + "learning_rate": 4.676361043642058e-06, + "loss": 0.5855, + "step": 3112 + }, + { + "epoch": 0.505029201817002, + "grad_norm": 0.5862545156581345, + "learning_rate": 4.676150873538522e-06, + "loss": 0.5507, + "step": 3113 + }, + { + "epoch": 0.5051914341336794, + "grad_norm": 0.590235053173034, + "learning_rate": 4.675940639941256e-06, + "loss": 0.5264, + "step": 3114 + }, + { + "epoch": 0.505353666450357, + "grad_norm": 0.5949695730858209, + "learning_rate": 4.675730342856397e-06, + "loss": 0.5708, + "step": 3115 + }, + { + "epoch": 0.5055158987670344, + "grad_norm": 0.5590042971398442, + "learning_rate": 4.6755199822900785e-06, + "loss": 0.5661, + "step": 3116 + }, + { + "epoch": 0.5056781310837118, + "grad_norm": 0.6034877980818438, + "learning_rate": 4.67530955824844e-06, + "loss": 0.5693, + "step": 3117 + }, + { + "epoch": 0.5058403634003894, + "grad_norm": 0.5789670880051849, + "learning_rate": 4.675099070737621e-06, + "loss": 0.6062, + "step": 3118 + }, + { + "epoch": 0.5060025957170668, + "grad_norm": 0.6238661194055346, + "learning_rate": 4.674888519763761e-06, + "loss": 0.591, + "step": 3119 + }, + { + "epoch": 0.5061648280337443, + "grad_norm": 0.5659195833985091, + "learning_rate": 4.674677905333004e-06, + "loss": 0.5495, + "step": 3120 + }, + { + "epoch": 0.5063270603504219, + "grad_norm": 0.5609278064311987, + "learning_rate": 4.674467227451496e-06, + "loss": 0.5905, + "step": 3121 + }, + { + "epoch": 0.5064892926670993, + "grad_norm": 0.5940891309854873, + "learning_rate": 4.674256486125384e-06, + "loss": 0.5932, + "step": 3122 + }, + { + "epoch": 0.5066515249837767, + "grad_norm": 0.5600425979158039, + "learning_rate": 4.674045681360816e-06, + "loss": 0.5568, + "step": 3123 + }, + { + "epoch": 0.5068137573004543, + "grad_norm": 0.6169610529435906, + "learning_rate": 4.673834813163943e-06, + "loss": 0.5582, + "step": 3124 + }, + { + "epoch": 0.5069759896171318, + "grad_norm": 0.6348546885219459, + "learning_rate": 4.673623881540917e-06, + "loss": 0.4521, + "step": 3125 + }, + { + "epoch": 0.5071382219338092, + "grad_norm": 0.602191747314424, + "learning_rate": 4.673412886497894e-06, + "loss": 0.5891, + "step": 3126 + }, + { + "epoch": 0.5073004542504866, + "grad_norm": 0.5703312183983111, + "learning_rate": 4.6732018280410284e-06, + "loss": 0.5472, + "step": 3127 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 0.6195034239868843, + "learning_rate": 4.672990706176479e-06, + "loss": 0.5962, + "step": 3128 + }, + { + "epoch": 0.5076249188838416, + "grad_norm": 0.6109719188998961, + "learning_rate": 4.672779520910406e-06, + "loss": 0.5765, + "step": 3129 + }, + { + "epoch": 0.5077871512005191, + "grad_norm": 0.6029995629234529, + "learning_rate": 4.672568272248971e-06, + "loss": 0.5571, + "step": 3130 + }, + { + "epoch": 0.5079493835171967, + "grad_norm": 0.5756022687870869, + "learning_rate": 4.672356960198336e-06, + "loss": 0.5442, + "step": 3131 + }, + { + "epoch": 0.5081116158338741, + "grad_norm": 0.5970326178131251, + "learning_rate": 4.67214558476467e-06, + "loss": 0.5567, + "step": 3132 + }, + { + "epoch": 0.5082738481505515, + "grad_norm": 0.600324038509054, + "learning_rate": 4.671934145954137e-06, + "loss": 0.5951, + "step": 3133 + }, + { + "epoch": 0.5084360804672291, + "grad_norm": 0.5910739561814162, + "learning_rate": 4.671722643772907e-06, + "loss": 0.6024, + "step": 3134 + }, + { + "epoch": 0.5085983127839065, + "grad_norm": 0.5795834679957317, + "learning_rate": 4.671511078227153e-06, + "loss": 0.5624, + "step": 3135 + }, + { + "epoch": 0.508760545100584, + "grad_norm": 0.5726121279376956, + "learning_rate": 4.671299449323045e-06, + "loss": 0.5674, + "step": 3136 + }, + { + "epoch": 0.5089227774172616, + "grad_norm": 0.5400452091182958, + "learning_rate": 4.671087757066759e-06, + "loss": 0.5523, + "step": 3137 + }, + { + "epoch": 0.509085009733939, + "grad_norm": 0.5927258020311363, + "learning_rate": 4.670876001464472e-06, + "loss": 0.5843, + "step": 3138 + }, + { + "epoch": 0.5092472420506164, + "grad_norm": 0.5886809256739741, + "learning_rate": 4.670664182522363e-06, + "loss": 0.5697, + "step": 3139 + }, + { + "epoch": 0.509409474367294, + "grad_norm": 0.5684717790347322, + "learning_rate": 4.67045230024661e-06, + "loss": 0.5621, + "step": 3140 + }, + { + "epoch": 0.5095717066839714, + "grad_norm": 0.5955236414364129, + "learning_rate": 4.670240354643396e-06, + "loss": 0.5643, + "step": 3141 + }, + { + "epoch": 0.5097339390006489, + "grad_norm": 0.5859235125343994, + "learning_rate": 4.6700283457189065e-06, + "loss": 0.5695, + "step": 3142 + }, + { + "epoch": 0.5098961713173265, + "grad_norm": 0.608714309509769, + "learning_rate": 4.6698162734793264e-06, + "loss": 0.5692, + "step": 3143 + }, + { + "epoch": 0.5100584036340039, + "grad_norm": 0.5670459846695682, + "learning_rate": 4.669604137930842e-06, + "loss": 0.5502, + "step": 3144 + }, + { + "epoch": 0.5102206359506813, + "grad_norm": 0.6103766240559085, + "learning_rate": 4.669391939079644e-06, + "loss": 0.5957, + "step": 3145 + }, + { + "epoch": 0.5103828682673589, + "grad_norm": 0.5802845452811055, + "learning_rate": 4.669179676931924e-06, + "loss": 0.5885, + "step": 3146 + }, + { + "epoch": 0.5105451005840363, + "grad_norm": 0.6039668401854372, + "learning_rate": 4.668967351493874e-06, + "loss": 0.5521, + "step": 3147 + }, + { + "epoch": 0.5107073329007138, + "grad_norm": 0.5952769348900537, + "learning_rate": 4.66875496277169e-06, + "loss": 0.5676, + "step": 3148 + }, + { + "epoch": 0.5108695652173914, + "grad_norm": 0.5828634698566567, + "learning_rate": 4.66854251077157e-06, + "loss": 0.5758, + "step": 3149 + }, + { + "epoch": 0.5110317975340688, + "grad_norm": 0.56414197848268, + "learning_rate": 4.66832999549971e-06, + "loss": 0.5432, + "step": 3150 + }, + { + "epoch": 0.5111940298507462, + "grad_norm": 0.5908232155997057, + "learning_rate": 4.668117416962312e-06, + "loss": 0.5483, + "step": 3151 + }, + { + "epoch": 0.5113562621674238, + "grad_norm": 0.572753099573934, + "learning_rate": 4.6679047751655796e-06, + "loss": 0.5696, + "step": 3152 + }, + { + "epoch": 0.5115184944841012, + "grad_norm": 0.6071979884206398, + "learning_rate": 4.667692070115715e-06, + "loss": 0.61, + "step": 3153 + }, + { + "epoch": 0.5116807268007787, + "grad_norm": 0.5756885143786142, + "learning_rate": 4.667479301818926e-06, + "loss": 0.578, + "step": 3154 + }, + { + "epoch": 0.5118429591174561, + "grad_norm": 0.5995699796186426, + "learning_rate": 4.667266470281419e-06, + "loss": 0.5615, + "step": 3155 + }, + { + "epoch": 0.5120051914341337, + "grad_norm": 0.5876539128986688, + "learning_rate": 4.667053575509405e-06, + "loss": 0.5477, + "step": 3156 + }, + { + "epoch": 0.5121674237508111, + "grad_norm": 0.5427664111472679, + "learning_rate": 4.666840617509095e-06, + "loss": 0.5384, + "step": 3157 + }, + { + "epoch": 0.5123296560674886, + "grad_norm": 0.5832248822385135, + "learning_rate": 4.666627596286702e-06, + "loss": 0.5474, + "step": 3158 + }, + { + "epoch": 0.5124918883841661, + "grad_norm": 0.5906316479325465, + "learning_rate": 4.666414511848443e-06, + "loss": 0.5694, + "step": 3159 + }, + { + "epoch": 0.5126541207008436, + "grad_norm": 0.6114181082431805, + "learning_rate": 4.666201364200534e-06, + "loss": 0.5771, + "step": 3160 + }, + { + "epoch": 0.512816353017521, + "grad_norm": 0.6136194741935518, + "learning_rate": 4.665988153349195e-06, + "loss": 0.5974, + "step": 3161 + }, + { + "epoch": 0.5129785853341986, + "grad_norm": 0.5798612219126277, + "learning_rate": 4.665774879300645e-06, + "loss": 0.5847, + "step": 3162 + }, + { + "epoch": 0.513140817650876, + "grad_norm": 0.5792128180447729, + "learning_rate": 4.665561542061109e-06, + "loss": 0.5594, + "step": 3163 + }, + { + "epoch": 0.5133030499675535, + "grad_norm": 0.6216398812326169, + "learning_rate": 4.66534814163681e-06, + "loss": 0.5752, + "step": 3164 + }, + { + "epoch": 0.513465282284231, + "grad_norm": 0.5594236211750564, + "learning_rate": 4.665134678033973e-06, + "loss": 0.5578, + "step": 3165 + }, + { + "epoch": 0.5136275146009085, + "grad_norm": 0.5570463454438936, + "learning_rate": 4.664921151258831e-06, + "loss": 0.5784, + "step": 3166 + }, + { + "epoch": 0.5137897469175859, + "grad_norm": 0.6097344097935329, + "learning_rate": 4.664707561317609e-06, + "loss": 0.5726, + "step": 3167 + }, + { + "epoch": 0.5139519792342635, + "grad_norm": 0.6020044579936772, + "learning_rate": 4.664493908216542e-06, + "loss": 0.5446, + "step": 3168 + }, + { + "epoch": 0.514114211550941, + "grad_norm": 0.5449960713564291, + "learning_rate": 4.664280191961862e-06, + "loss": 0.5737, + "step": 3169 + }, + { + "epoch": 0.5142764438676184, + "grad_norm": 0.5630397819330999, + "learning_rate": 4.664066412559807e-06, + "loss": 0.5722, + "step": 3170 + }, + { + "epoch": 0.514438676184296, + "grad_norm": 0.5895734555558366, + "learning_rate": 4.663852570016611e-06, + "loss": 0.6283, + "step": 3171 + }, + { + "epoch": 0.5146009085009734, + "grad_norm": 0.6191155697845935, + "learning_rate": 4.663638664338517e-06, + "loss": 0.5932, + "step": 3172 + }, + { + "epoch": 0.5147631408176508, + "grad_norm": 0.5640077205032173, + "learning_rate": 4.663424695531763e-06, + "loss": 0.5807, + "step": 3173 + }, + { + "epoch": 0.5149253731343284, + "grad_norm": 0.5847116798629203, + "learning_rate": 4.663210663602594e-06, + "loss": 0.5691, + "step": 3174 + }, + { + "epoch": 0.5150876054510058, + "grad_norm": 0.6354747044572385, + "learning_rate": 4.662996568557254e-06, + "loss": 0.5287, + "step": 3175 + }, + { + "epoch": 0.5152498377676833, + "grad_norm": 0.6294718455392261, + "learning_rate": 4.66278241040199e-06, + "loss": 0.5733, + "step": 3176 + }, + { + "epoch": 0.5154120700843609, + "grad_norm": 0.5827914501801416, + "learning_rate": 4.662568189143051e-06, + "loss": 0.5953, + "step": 3177 + }, + { + "epoch": 0.5155743024010383, + "grad_norm": 0.5891533943867682, + "learning_rate": 4.662353904786685e-06, + "loss": 0.5651, + "step": 3178 + }, + { + "epoch": 0.5157365347177157, + "grad_norm": 0.5971959147487204, + "learning_rate": 4.662139557339147e-06, + "loss": 0.6167, + "step": 3179 + }, + { + "epoch": 0.5158987670343933, + "grad_norm": 0.5898227068959043, + "learning_rate": 4.66192514680669e-06, + "loss": 0.6081, + "step": 3180 + }, + { + "epoch": 0.5160609993510707, + "grad_norm": 0.5969259553979965, + "learning_rate": 4.66171067319557e-06, + "loss": 0.5213, + "step": 3181 + }, + { + "epoch": 0.5162232316677482, + "grad_norm": 0.6051905632005564, + "learning_rate": 4.661496136512044e-06, + "loss": 0.5949, + "step": 3182 + }, + { + "epoch": 0.5163854639844258, + "grad_norm": 0.611223120222808, + "learning_rate": 4.661281536762372e-06, + "loss": 0.5732, + "step": 3183 + }, + { + "epoch": 0.5165476963011032, + "grad_norm": 0.6115095153600051, + "learning_rate": 4.661066873952815e-06, + "loss": 0.5759, + "step": 3184 + }, + { + "epoch": 0.5167099286177806, + "grad_norm": 0.593192184628107, + "learning_rate": 4.6608521480896375e-06, + "loss": 0.5925, + "step": 3185 + }, + { + "epoch": 0.5168721609344581, + "grad_norm": 0.6018369437898577, + "learning_rate": 4.660637359179104e-06, + "loss": 0.5563, + "step": 3186 + }, + { + "epoch": 0.5170343932511356, + "grad_norm": 0.6045046720473252, + "learning_rate": 4.660422507227481e-06, + "loss": 0.5735, + "step": 3187 + }, + { + "epoch": 0.5171966255678131, + "grad_norm": 0.6015717752333976, + "learning_rate": 4.660207592241037e-06, + "loss": 0.5862, + "step": 3188 + }, + { + "epoch": 0.5173588578844905, + "grad_norm": 0.6117414411855874, + "learning_rate": 4.659992614226043e-06, + "loss": 0.5724, + "step": 3189 + }, + { + "epoch": 0.5175210902011681, + "grad_norm": 0.6065074634257784, + "learning_rate": 4.659777573188772e-06, + "loss": 0.5363, + "step": 3190 + }, + { + "epoch": 0.5176833225178455, + "grad_norm": 0.5898810139887594, + "learning_rate": 4.659562469135498e-06, + "loss": 0.5739, + "step": 3191 + }, + { + "epoch": 0.517845554834523, + "grad_norm": 0.5937727931244673, + "learning_rate": 4.659347302072495e-06, + "loss": 0.604, + "step": 3192 + }, + { + "epoch": 0.5180077871512005, + "grad_norm": 0.5653092232966918, + "learning_rate": 4.659132072006045e-06, + "loss": 0.5891, + "step": 3193 + }, + { + "epoch": 0.518170019467878, + "grad_norm": 0.5904953074532479, + "learning_rate": 4.658916778942424e-06, + "loss": 0.5363, + "step": 3194 + }, + { + "epoch": 0.5183322517845554, + "grad_norm": 0.5868393188147932, + "learning_rate": 4.658701422887916e-06, + "loss": 0.5527, + "step": 3195 + }, + { + "epoch": 0.518494484101233, + "grad_norm": 0.5827675974873103, + "learning_rate": 4.658486003848803e-06, + "loss": 0.5666, + "step": 3196 + }, + { + "epoch": 0.5186567164179104, + "grad_norm": 0.6026094585070543, + "learning_rate": 4.658270521831371e-06, + "loss": 0.6048, + "step": 3197 + }, + { + "epoch": 0.5188189487345879, + "grad_norm": 0.5646059189876328, + "learning_rate": 4.658054976841908e-06, + "loss": 0.5496, + "step": 3198 + }, + { + "epoch": 0.5189811810512654, + "grad_norm": 0.5967464524934084, + "learning_rate": 4.657839368886702e-06, + "loss": 0.5892, + "step": 3199 + }, + { + "epoch": 0.5191434133679429, + "grad_norm": 0.5806719417814465, + "learning_rate": 4.657623697972043e-06, + "loss": 0.5784, + "step": 3200 + }, + { + "epoch": 0.5193056456846203, + "grad_norm": 0.5798551335785851, + "learning_rate": 4.657407964104225e-06, + "loss": 0.5472, + "step": 3201 + }, + { + "epoch": 0.5194678780012979, + "grad_norm": 0.5525209924344484, + "learning_rate": 4.657192167289542e-06, + "loss": 0.6008, + "step": 3202 + }, + { + "epoch": 0.5196301103179753, + "grad_norm": 0.5998657416668861, + "learning_rate": 4.65697630753429e-06, + "loss": 0.5704, + "step": 3203 + }, + { + "epoch": 0.5197923426346528, + "grad_norm": 0.6024229641434509, + "learning_rate": 4.656760384844768e-06, + "loss": 0.5428, + "step": 3204 + }, + { + "epoch": 0.5199545749513304, + "grad_norm": 0.5458900103013548, + "learning_rate": 4.656544399227275e-06, + "loss": 0.5624, + "step": 3205 + }, + { + "epoch": 0.5201168072680078, + "grad_norm": 0.6151741896213466, + "learning_rate": 4.656328350688114e-06, + "loss": 0.5969, + "step": 3206 + }, + { + "epoch": 0.5202790395846852, + "grad_norm": 0.5809626243589932, + "learning_rate": 4.656112239233587e-06, + "loss": 0.559, + "step": 3207 + }, + { + "epoch": 0.5204412719013628, + "grad_norm": 0.5559040445824268, + "learning_rate": 4.655896064870001e-06, + "loss": 0.5469, + "step": 3208 + }, + { + "epoch": 0.5206035042180402, + "grad_norm": 0.6664785282599375, + "learning_rate": 4.655679827603663e-06, + "loss": 0.5509, + "step": 3209 + }, + { + "epoch": 0.5207657365347177, + "grad_norm": 0.5744319701893185, + "learning_rate": 4.655463527440882e-06, + "loss": 0.5967, + "step": 3210 + }, + { + "epoch": 0.5209279688513953, + "grad_norm": 0.6107792263312622, + "learning_rate": 4.6552471643879685e-06, + "loss": 0.598, + "step": 3211 + }, + { + "epoch": 0.5210902011680727, + "grad_norm": 0.5736097865420442, + "learning_rate": 4.655030738451236e-06, + "loss": 0.5565, + "step": 3212 + }, + { + "epoch": 0.5212524334847501, + "grad_norm": 0.5795356975397457, + "learning_rate": 4.654814249636999e-06, + "loss": 0.5933, + "step": 3213 + }, + { + "epoch": 0.5214146658014276, + "grad_norm": 0.5901869283752482, + "learning_rate": 4.654597697951574e-06, + "loss": 0.5685, + "step": 3214 + }, + { + "epoch": 0.5215768981181051, + "grad_norm": 0.5696067766389329, + "learning_rate": 4.654381083401279e-06, + "loss": 0.6182, + "step": 3215 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.6382892720317856, + "learning_rate": 4.654164405992435e-06, + "loss": 0.5842, + "step": 3216 + }, + { + "epoch": 0.52190136275146, + "grad_norm": 0.62324443685293, + "learning_rate": 4.653947665731364e-06, + "loss": 0.565, + "step": 3217 + }, + { + "epoch": 0.5220635950681376, + "grad_norm": 0.6190213148550153, + "learning_rate": 4.653730862624388e-06, + "loss": 0.5443, + "step": 3218 + }, + { + "epoch": 0.522225827384815, + "grad_norm": 0.6016306492957186, + "learning_rate": 4.6535139966778355e-06, + "loss": 0.5589, + "step": 3219 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5553671990068113, + "learning_rate": 4.653297067898032e-06, + "loss": 0.557, + "step": 3220 + }, + { + "epoch": 0.52255029201817, + "grad_norm": 0.6085955016951854, + "learning_rate": 4.653080076291308e-06, + "loss": 0.5777, + "step": 3221 + }, + { + "epoch": 0.5227125243348475, + "grad_norm": 0.5696395774217725, + "learning_rate": 4.6528630218639935e-06, + "loss": 0.5735, + "step": 3222 + }, + { + "epoch": 0.5228747566515249, + "grad_norm": 0.6006928880355792, + "learning_rate": 4.6526459046224225e-06, + "loss": 0.577, + "step": 3223 + }, + { + "epoch": 0.5230369889682025, + "grad_norm": 0.5936349276906735, + "learning_rate": 4.652428724572929e-06, + "loss": 0.5967, + "step": 3224 + }, + { + "epoch": 0.52319922128488, + "grad_norm": 0.5836433840972046, + "learning_rate": 4.6522114817218505e-06, + "loss": 0.5497, + "step": 3225 + }, + { + "epoch": 0.5233614536015574, + "grad_norm": 0.6069969323828294, + "learning_rate": 4.6519941760755246e-06, + "loss": 0.5895, + "step": 3226 + }, + { + "epoch": 0.523523685918235, + "grad_norm": 0.6049056734814018, + "learning_rate": 4.651776807640294e-06, + "loss": 0.5517, + "step": 3227 + }, + { + "epoch": 0.5236859182349124, + "grad_norm": 0.581252284304049, + "learning_rate": 4.651559376422497e-06, + "loss": 0.5457, + "step": 3228 + }, + { + "epoch": 0.5238481505515898, + "grad_norm": 0.5489062075776477, + "learning_rate": 4.651341882428481e-06, + "loss": 0.5784, + "step": 3229 + }, + { + "epoch": 0.5240103828682674, + "grad_norm": 0.5469453443798072, + "learning_rate": 4.65112432566459e-06, + "loss": 0.5495, + "step": 3230 + }, + { + "epoch": 0.5241726151849448, + "grad_norm": 0.5761343865481545, + "learning_rate": 4.650906706137173e-06, + "loss": 0.5547, + "step": 3231 + }, + { + "epoch": 0.5243348475016223, + "grad_norm": 0.5939269951748002, + "learning_rate": 4.6506890238525775e-06, + "loss": 0.5572, + "step": 3232 + }, + { + "epoch": 0.5244970798182998, + "grad_norm": 0.5693271568075019, + "learning_rate": 4.650471278817157e-06, + "loss": 0.524, + "step": 3233 + }, + { + "epoch": 0.5246593121349773, + "grad_norm": 0.6127624040739817, + "learning_rate": 4.650253471037263e-06, + "loss": 0.5635, + "step": 3234 + }, + { + "epoch": 0.5248215444516547, + "grad_norm": 0.5542825314802606, + "learning_rate": 4.6500356005192514e-06, + "loss": 0.5713, + "step": 3235 + }, + { + "epoch": 0.5249837767683323, + "grad_norm": 0.5862505023103403, + "learning_rate": 4.64981766726948e-06, + "loss": 0.5827, + "step": 3236 + }, + { + "epoch": 0.5251460090850097, + "grad_norm": 0.6062967117846672, + "learning_rate": 4.649599671294305e-06, + "loss": 0.551, + "step": 3237 + }, + { + "epoch": 0.5253082414016872, + "grad_norm": 0.5920676579087133, + "learning_rate": 4.649381612600088e-06, + "loss": 0.5967, + "step": 3238 + }, + { + "epoch": 0.5254704737183648, + "grad_norm": 0.6087620490636475, + "learning_rate": 4.649163491193192e-06, + "loss": 0.5394, + "step": 3239 + }, + { + "epoch": 0.5256327060350422, + "grad_norm": 0.6076828709208048, + "learning_rate": 4.648945307079981e-06, + "loss": 0.553, + "step": 3240 + }, + { + "epoch": 0.5257949383517196, + "grad_norm": 0.5891407131114726, + "learning_rate": 4.64872706026682e-06, + "loss": 0.5684, + "step": 3241 + }, + { + "epoch": 0.5259571706683971, + "grad_norm": 0.5527505011670463, + "learning_rate": 4.648508750760078e-06, + "loss": 0.5234, + "step": 3242 + }, + { + "epoch": 0.5261194029850746, + "grad_norm": 0.626395275741191, + "learning_rate": 4.6482903785661234e-06, + "loss": 0.5708, + "step": 3243 + }, + { + "epoch": 0.5262816353017521, + "grad_norm": 0.5756230944636185, + "learning_rate": 4.648071943691329e-06, + "loss": 0.5644, + "step": 3244 + }, + { + "epoch": 0.5264438676184295, + "grad_norm": 0.5885471302993125, + "learning_rate": 4.647853446142068e-06, + "loss": 0.5466, + "step": 3245 + }, + { + "epoch": 0.5266060999351071, + "grad_norm": 0.6014693185683403, + "learning_rate": 4.647634885924713e-06, + "loss": 0.5582, + "step": 3246 + }, + { + "epoch": 0.5267683322517845, + "grad_norm": 0.5936090618217894, + "learning_rate": 4.647416263045644e-06, + "loss": 0.5759, + "step": 3247 + }, + { + "epoch": 0.526930564568462, + "grad_norm": 0.6096222949660495, + "learning_rate": 4.647197577511239e-06, + "loss": 0.6063, + "step": 3248 + }, + { + "epoch": 0.5270927968851395, + "grad_norm": 0.589655261848266, + "learning_rate": 4.646978829327878e-06, + "loss": 0.5501, + "step": 3249 + }, + { + "epoch": 0.527255029201817, + "grad_norm": 0.5604913542934952, + "learning_rate": 4.646760018501944e-06, + "loss": 0.5852, + "step": 3250 + }, + { + "epoch": 0.5274172615184944, + "grad_norm": 0.602383349858829, + "learning_rate": 4.64654114503982e-06, + "loss": 0.5993, + "step": 3251 + }, + { + "epoch": 0.527579493835172, + "grad_norm": 0.6024566105636301, + "learning_rate": 4.646322208947893e-06, + "loss": 0.6009, + "step": 3252 + }, + { + "epoch": 0.5277417261518494, + "grad_norm": 0.5705526596454511, + "learning_rate": 4.646103210232552e-06, + "loss": 0.5228, + "step": 3253 + }, + { + "epoch": 0.5279039584685269, + "grad_norm": 0.5756968327130912, + "learning_rate": 4.6458841489001855e-06, + "loss": 0.5451, + "step": 3254 + }, + { + "epoch": 0.5280661907852044, + "grad_norm": 0.5809889381215397, + "learning_rate": 4.645665024957185e-06, + "loss": 0.5616, + "step": 3255 + }, + { + "epoch": 0.5282284231018819, + "grad_norm": 0.5803748592255485, + "learning_rate": 4.6454458384099445e-06, + "loss": 0.5654, + "step": 3256 + }, + { + "epoch": 0.5283906554185593, + "grad_norm": 0.5814404163290071, + "learning_rate": 4.6452265892648585e-06, + "loss": 0.5946, + "step": 3257 + }, + { + "epoch": 0.5285528877352369, + "grad_norm": 0.638808489922318, + "learning_rate": 4.6450072775283245e-06, + "loss": 0.6165, + "step": 3258 + }, + { + "epoch": 0.5287151200519143, + "grad_norm": 0.5696621368803289, + "learning_rate": 4.644787903206741e-06, + "loss": 0.5563, + "step": 3259 + }, + { + "epoch": 0.5288773523685918, + "grad_norm": 0.5694526589134903, + "learning_rate": 4.64456846630651e-06, + "loss": 0.5637, + "step": 3260 + }, + { + "epoch": 0.5290395846852693, + "grad_norm": 0.63058604053028, + "learning_rate": 4.644348966834032e-06, + "loss": 0.5917, + "step": 3261 + }, + { + "epoch": 0.5292018170019468, + "grad_norm": 0.5932673654320806, + "learning_rate": 4.644129404795713e-06, + "loss": 0.6186, + "step": 3262 + }, + { + "epoch": 0.5293640493186242, + "grad_norm": 0.6168958328518646, + "learning_rate": 4.643909780197958e-06, + "loss": 0.5782, + "step": 3263 + }, + { + "epoch": 0.5295262816353018, + "grad_norm": 0.5643717286264265, + "learning_rate": 4.643690093047177e-06, + "loss": 0.5618, + "step": 3264 + }, + { + "epoch": 0.5296885139519792, + "grad_norm": 0.6024808952619587, + "learning_rate": 4.643470343349777e-06, + "loss": 0.6007, + "step": 3265 + }, + { + "epoch": 0.5298507462686567, + "grad_norm": 0.6207626433754516, + "learning_rate": 4.643250531112172e-06, + "loss": 0.5764, + "step": 3266 + }, + { + "epoch": 0.5300129785853342, + "grad_norm": 0.5989497775920326, + "learning_rate": 4.6430306563407746e-06, + "loss": 0.5843, + "step": 3267 + }, + { + "epoch": 0.5301752109020117, + "grad_norm": 0.5973062681694901, + "learning_rate": 4.642810719041999e-06, + "loss": 0.5276, + "step": 3268 + }, + { + "epoch": 0.5303374432186891, + "grad_norm": 0.6042923716017121, + "learning_rate": 4.642590719222264e-06, + "loss": 0.5559, + "step": 3269 + }, + { + "epoch": 0.5304996755353667, + "grad_norm": 0.5826245091141893, + "learning_rate": 4.642370656887988e-06, + "loss": 0.5944, + "step": 3270 + }, + { + "epoch": 0.5306619078520441, + "grad_norm": 0.587839163709597, + "learning_rate": 4.6421505320455915e-06, + "loss": 0.5751, + "step": 3271 + }, + { + "epoch": 0.5308241401687216, + "grad_norm": 0.6073562355744859, + "learning_rate": 4.641930344701498e-06, + "loss": 0.571, + "step": 3272 + }, + { + "epoch": 0.530986372485399, + "grad_norm": 0.5801595434988438, + "learning_rate": 4.641710094862131e-06, + "loss": 0.5556, + "step": 3273 + }, + { + "epoch": 0.5311486048020766, + "grad_norm": 0.5413591508738309, + "learning_rate": 4.641489782533916e-06, + "loss": 0.5534, + "step": 3274 + }, + { + "epoch": 0.531310837118754, + "grad_norm": 0.5835904874722824, + "learning_rate": 4.641269407723283e-06, + "loss": 0.5416, + "step": 3275 + }, + { + "epoch": 0.5314730694354315, + "grad_norm": 0.5878564594469422, + "learning_rate": 4.64104897043666e-06, + "loss": 0.5619, + "step": 3276 + }, + { + "epoch": 0.531635301752109, + "grad_norm": 0.5927125262701407, + "learning_rate": 4.640828470680481e-06, + "loss": 0.5812, + "step": 3277 + }, + { + "epoch": 0.5317975340687865, + "grad_norm": 0.6278734731942959, + "learning_rate": 4.640607908461178e-06, + "loss": 0.5689, + "step": 3278 + }, + { + "epoch": 0.5319597663854639, + "grad_norm": 0.5712527375872138, + "learning_rate": 4.640387283785186e-06, + "loss": 0.5567, + "step": 3279 + }, + { + "epoch": 0.5321219987021415, + "grad_norm": 0.5866500454458281, + "learning_rate": 4.640166596658943e-06, + "loss": 0.5568, + "step": 3280 + }, + { + "epoch": 0.5322842310188189, + "grad_norm": 0.5749377821288866, + "learning_rate": 4.639945847088888e-06, + "loss": 0.5308, + "step": 3281 + }, + { + "epoch": 0.5324464633354964, + "grad_norm": 0.6130215951299555, + "learning_rate": 4.63972503508146e-06, + "loss": 0.6102, + "step": 3282 + }, + { + "epoch": 0.532608695652174, + "grad_norm": 0.5731851825434086, + "learning_rate": 4.639504160643106e-06, + "loss": 0.5277, + "step": 3283 + }, + { + "epoch": 0.5327709279688514, + "grad_norm": 0.6147771909073083, + "learning_rate": 4.639283223780265e-06, + "loss": 0.5779, + "step": 3284 + }, + { + "epoch": 0.5329331602855288, + "grad_norm": 0.5778559686954868, + "learning_rate": 4.639062224499387e-06, + "loss": 0.5876, + "step": 3285 + }, + { + "epoch": 0.5330953926022064, + "grad_norm": 0.554464743292619, + "learning_rate": 4.6388411628069196e-06, + "loss": 0.5796, + "step": 3286 + }, + { + "epoch": 0.5332576249188838, + "grad_norm": 0.5777362541433996, + "learning_rate": 4.638620038709313e-06, + "loss": 0.5749, + "step": 3287 + }, + { + "epoch": 0.5334198572355613, + "grad_norm": 0.6155770318399084, + "learning_rate": 4.638398852213016e-06, + "loss": 0.5525, + "step": 3288 + }, + { + "epoch": 0.5335820895522388, + "grad_norm": 0.6097294890279923, + "learning_rate": 4.6381776033244865e-06, + "loss": 0.5847, + "step": 3289 + }, + { + "epoch": 0.5337443218689163, + "grad_norm": 0.607436339212107, + "learning_rate": 4.637956292050176e-06, + "loss": 0.5827, + "step": 3290 + }, + { + "epoch": 0.5339065541855937, + "grad_norm": 0.60299905483409, + "learning_rate": 4.637734918396545e-06, + "loss": 0.5791, + "step": 3291 + }, + { + "epoch": 0.5340687865022713, + "grad_norm": 0.5669504220213535, + "learning_rate": 4.6375134823700505e-06, + "loss": 0.5594, + "step": 3292 + }, + { + "epoch": 0.5342310188189487, + "grad_norm": 0.5844658713815282, + "learning_rate": 4.637291983977153e-06, + "loss": 0.6062, + "step": 3293 + }, + { + "epoch": 0.5343932511356262, + "grad_norm": 0.6267088672414737, + "learning_rate": 4.637070423224317e-06, + "loss": 0.5731, + "step": 3294 + }, + { + "epoch": 0.5345554834523037, + "grad_norm": 0.6101039012114878, + "learning_rate": 4.636848800118007e-06, + "loss": 0.5881, + "step": 3295 + }, + { + "epoch": 0.5347177157689812, + "grad_norm": 0.5823884712879076, + "learning_rate": 4.636627114664687e-06, + "loss": 0.5856, + "step": 3296 + }, + { + "epoch": 0.5348799480856586, + "grad_norm": 0.6093039761154071, + "learning_rate": 4.6364053668708265e-06, + "loss": 0.5806, + "step": 3297 + }, + { + "epoch": 0.5350421804023362, + "grad_norm": 0.5749390159836717, + "learning_rate": 4.636183556742896e-06, + "loss": 0.5725, + "step": 3298 + }, + { + "epoch": 0.5352044127190136, + "grad_norm": 0.6102531627681171, + "learning_rate": 4.635961684287366e-06, + "loss": 0.5869, + "step": 3299 + }, + { + "epoch": 0.5353666450356911, + "grad_norm": 0.6071570880316554, + "learning_rate": 4.635739749510711e-06, + "loss": 0.5838, + "step": 3300 + }, + { + "epoch": 0.5355288773523685, + "grad_norm": 0.6112503938991805, + "learning_rate": 4.6355177524194075e-06, + "loss": 0.5764, + "step": 3301 + }, + { + "epoch": 0.5356911096690461, + "grad_norm": 0.6235287733027562, + "learning_rate": 4.63529569301993e-06, + "loss": 0.5423, + "step": 3302 + }, + { + "epoch": 0.5358533419857235, + "grad_norm": 0.5618690167745484, + "learning_rate": 4.6350735713187595e-06, + "loss": 0.5808, + "step": 3303 + }, + { + "epoch": 0.536015574302401, + "grad_norm": 0.6015871036567797, + "learning_rate": 4.634851387322377e-06, + "loss": 0.5808, + "step": 3304 + }, + { + "epoch": 0.5361778066190785, + "grad_norm": 0.6412152610663698, + "learning_rate": 4.634629141037264e-06, + "loss": 0.5529, + "step": 3305 + }, + { + "epoch": 0.536340038935756, + "grad_norm": 0.6181270847668935, + "learning_rate": 4.6344068324699045e-06, + "loss": 0.6012, + "step": 3306 + }, + { + "epoch": 0.5365022712524334, + "grad_norm": 0.6437004543290253, + "learning_rate": 4.634184461626787e-06, + "loss": 0.5618, + "step": 3307 + }, + { + "epoch": 0.536664503569111, + "grad_norm": 0.5894719358220125, + "learning_rate": 4.633962028514398e-06, + "loss": 0.5663, + "step": 3308 + }, + { + "epoch": 0.5368267358857884, + "grad_norm": 0.585368698095879, + "learning_rate": 4.633739533139229e-06, + "loss": 0.5717, + "step": 3309 + }, + { + "epoch": 0.5369889682024659, + "grad_norm": 0.6043909962876572, + "learning_rate": 4.63351697550777e-06, + "loss": 0.5632, + "step": 3310 + }, + { + "epoch": 0.5371512005191434, + "grad_norm": 0.6157097009141331, + "learning_rate": 4.633294355626515e-06, + "loss": 0.5777, + "step": 3311 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 0.6183499082912408, + "learning_rate": 4.63307167350196e-06, + "loss": 0.5182, + "step": 3312 + }, + { + "epoch": 0.5374756651524983, + "grad_norm": 0.5808467895138741, + "learning_rate": 4.632848929140601e-06, + "loss": 0.5788, + "step": 3313 + }, + { + "epoch": 0.5376378974691759, + "grad_norm": 0.5818930879574227, + "learning_rate": 4.632626122548939e-06, + "loss": 0.5564, + "step": 3314 + }, + { + "epoch": 0.5378001297858533, + "grad_norm": 0.6224739128605867, + "learning_rate": 4.632403253733474e-06, + "loss": 0.5391, + "step": 3315 + }, + { + "epoch": 0.5379623621025308, + "grad_norm": 0.6422233666653232, + "learning_rate": 4.632180322700708e-06, + "loss": 0.561, + "step": 3316 + }, + { + "epoch": 0.5381245944192083, + "grad_norm": 0.6034981143767578, + "learning_rate": 4.631957329457147e-06, + "loss": 0.5902, + "step": 3317 + }, + { + "epoch": 0.5382868267358858, + "grad_norm": 0.5730396598547186, + "learning_rate": 4.631734274009294e-06, + "loss": 0.5758, + "step": 3318 + }, + { + "epoch": 0.5384490590525632, + "grad_norm": 0.5755916919170235, + "learning_rate": 4.631511156363661e-06, + "loss": 0.5793, + "step": 3319 + }, + { + "epoch": 0.5386112913692408, + "grad_norm": 0.6014396075924275, + "learning_rate": 4.631287976526755e-06, + "loss": 0.5821, + "step": 3320 + }, + { + "epoch": 0.5387735236859182, + "grad_norm": 0.6324878927970645, + "learning_rate": 4.63106473450509e-06, + "loss": 0.5516, + "step": 3321 + }, + { + "epoch": 0.5389357560025957, + "grad_norm": 0.6128896012607334, + "learning_rate": 4.630841430305177e-06, + "loss": 0.5369, + "step": 3322 + }, + { + "epoch": 0.5390979883192732, + "grad_norm": 0.5961446944832377, + "learning_rate": 4.630618063933535e-06, + "loss": 0.5636, + "step": 3323 + }, + { + "epoch": 0.5392602206359507, + "grad_norm": 0.5701736066993799, + "learning_rate": 4.630394635396678e-06, + "loss": 0.5836, + "step": 3324 + }, + { + "epoch": 0.5394224529526281, + "grad_norm": 0.571791459040998, + "learning_rate": 4.630171144701126e-06, + "loss": 0.5797, + "step": 3325 + }, + { + "epoch": 0.5395846852693057, + "grad_norm": 0.6027406562852743, + "learning_rate": 4.6299475918534e-06, + "loss": 0.5631, + "step": 3326 + }, + { + "epoch": 0.5397469175859831, + "grad_norm": 0.5886650437031864, + "learning_rate": 4.629723976860023e-06, + "loss": 0.5782, + "step": 3327 + }, + { + "epoch": 0.5399091499026606, + "grad_norm": 0.580308573609172, + "learning_rate": 4.629500299727518e-06, + "loss": 0.5693, + "step": 3328 + }, + { + "epoch": 0.5400713822193381, + "grad_norm": 0.574122498484463, + "learning_rate": 4.629276560462413e-06, + "loss": 0.548, + "step": 3329 + }, + { + "epoch": 0.5402336145360156, + "grad_norm": 0.5802627619335031, + "learning_rate": 4.629052759071234e-06, + "loss": 0.5698, + "step": 3330 + }, + { + "epoch": 0.540395846852693, + "grad_norm": 0.6177317104194492, + "learning_rate": 4.628828895560513e-06, + "loss": 0.5845, + "step": 3331 + }, + { + "epoch": 0.5405580791693705, + "grad_norm": 0.6071982974112874, + "learning_rate": 4.628604969936781e-06, + "loss": 0.5605, + "step": 3332 + }, + { + "epoch": 0.540720311486048, + "grad_norm": 0.6025699107525511, + "learning_rate": 4.6283809822065704e-06, + "loss": 0.6114, + "step": 3333 + }, + { + "epoch": 0.5408825438027255, + "grad_norm": 0.5701723011864805, + "learning_rate": 4.628156932376419e-06, + "loss": 0.5728, + "step": 3334 + }, + { + "epoch": 0.5410447761194029, + "grad_norm": 0.6460008879125567, + "learning_rate": 4.6279328204528605e-06, + "loss": 0.5392, + "step": 3335 + }, + { + "epoch": 0.5412070084360805, + "grad_norm": 0.5655629537244791, + "learning_rate": 4.627708646442437e-06, + "loss": 0.5494, + "step": 3336 + }, + { + "epoch": 0.5413692407527579, + "grad_norm": 0.6114205629506305, + "learning_rate": 4.6274844103516865e-06, + "loss": 0.5732, + "step": 3337 + }, + { + "epoch": 0.5415314730694354, + "grad_norm": 0.5624314047618397, + "learning_rate": 4.627260112187154e-06, + "loss": 0.5552, + "step": 3338 + }, + { + "epoch": 0.541693705386113, + "grad_norm": 0.5665050350569958, + "learning_rate": 4.6270357519553825e-06, + "loss": 0.559, + "step": 3339 + }, + { + "epoch": 0.5418559377027904, + "grad_norm": 0.6015535592641544, + "learning_rate": 4.626811329662918e-06, + "loss": 0.5815, + "step": 3340 + }, + { + "epoch": 0.5420181700194678, + "grad_norm": 0.6036729983701081, + "learning_rate": 4.6265868453163095e-06, + "loss": 0.5505, + "step": 3341 + }, + { + "epoch": 0.5421804023361454, + "grad_norm": 0.5729271563886841, + "learning_rate": 4.626362298922106e-06, + "loss": 0.5366, + "step": 3342 + }, + { + "epoch": 0.5423426346528228, + "grad_norm": 0.5854108591940047, + "learning_rate": 4.626137690486859e-06, + "loss": 0.5972, + "step": 3343 + }, + { + "epoch": 0.5425048669695003, + "grad_norm": 0.5820905975833013, + "learning_rate": 4.625913020017123e-06, + "loss": 0.5604, + "step": 3344 + }, + { + "epoch": 0.5426670992861778, + "grad_norm": 0.6001251973243297, + "learning_rate": 4.625688287519452e-06, + "loss": 0.5715, + "step": 3345 + }, + { + "epoch": 0.5428293316028553, + "grad_norm": 0.5926309537724574, + "learning_rate": 4.625463493000404e-06, + "loss": 0.5824, + "step": 3346 + }, + { + "epoch": 0.5429915639195327, + "grad_norm": 0.5539747427167593, + "learning_rate": 4.625238636466537e-06, + "loss": 0.5496, + "step": 3347 + }, + { + "epoch": 0.5431537962362103, + "grad_norm": 0.6263711720607865, + "learning_rate": 4.625013717924412e-06, + "loss": 0.6151, + "step": 3348 + }, + { + "epoch": 0.5433160285528877, + "grad_norm": 0.5906907994712522, + "learning_rate": 4.6247887373805925e-06, + "loss": 0.585, + "step": 3349 + }, + { + "epoch": 0.5434782608695652, + "grad_norm": 0.6045303653726675, + "learning_rate": 4.62456369484164e-06, + "loss": 0.5632, + "step": 3350 + }, + { + "epoch": 0.5436404931862427, + "grad_norm": 0.5610575935178092, + "learning_rate": 4.624338590314124e-06, + "loss": 0.5553, + "step": 3351 + }, + { + "epoch": 0.5438027255029202, + "grad_norm": 0.5815839778731118, + "learning_rate": 4.6241134238046106e-06, + "loss": 0.5869, + "step": 3352 + }, + { + "epoch": 0.5439649578195976, + "grad_norm": 0.5731014543818114, + "learning_rate": 4.623888195319669e-06, + "loss": 0.5227, + "step": 3353 + }, + { + "epoch": 0.5441271901362752, + "grad_norm": 0.5701946705568965, + "learning_rate": 4.623662904865872e-06, + "loss": 0.5947, + "step": 3354 + }, + { + "epoch": 0.5442894224529526, + "grad_norm": 0.6032645907511804, + "learning_rate": 4.623437552449792e-06, + "loss": 0.5906, + "step": 3355 + }, + { + "epoch": 0.5444516547696301, + "grad_norm": 0.5787984892792469, + "learning_rate": 4.623212138078004e-06, + "loss": 0.5469, + "step": 3356 + }, + { + "epoch": 0.5446138870863076, + "grad_norm": 0.598013113771502, + "learning_rate": 4.622986661757086e-06, + "loss": 0.5539, + "step": 3357 + }, + { + "epoch": 0.5447761194029851, + "grad_norm": 0.5973402298884831, + "learning_rate": 4.622761123493616e-06, + "loss": 0.5618, + "step": 3358 + }, + { + "epoch": 0.5449383517196625, + "grad_norm": 0.5596259473717059, + "learning_rate": 4.622535523294174e-06, + "loss": 0.5658, + "step": 3359 + }, + { + "epoch": 0.54510058403634, + "grad_norm": 0.5615506357330131, + "learning_rate": 4.622309861165343e-06, + "loss": 0.5524, + "step": 3360 + }, + { + "epoch": 0.5452628163530175, + "grad_norm": 0.5755201252955386, + "learning_rate": 4.622084137113708e-06, + "loss": 0.5639, + "step": 3361 + }, + { + "epoch": 0.545425048669695, + "grad_norm": 0.5762527970338218, + "learning_rate": 4.621858351145854e-06, + "loss": 0.5418, + "step": 3362 + }, + { + "epoch": 0.5455872809863724, + "grad_norm": 0.5589540804729426, + "learning_rate": 4.621632503268369e-06, + "loss": 0.5302, + "step": 3363 + }, + { + "epoch": 0.54574951330305, + "grad_norm": 0.5762712598334466, + "learning_rate": 4.621406593487842e-06, + "loss": 0.5734, + "step": 3364 + }, + { + "epoch": 0.5459117456197274, + "grad_norm": 0.5816744329259294, + "learning_rate": 4.6211806218108644e-06, + "loss": 0.5468, + "step": 3365 + }, + { + "epoch": 0.5460739779364049, + "grad_norm": 0.5706913864360722, + "learning_rate": 4.6209545882440305e-06, + "loss": 0.5725, + "step": 3366 + }, + { + "epoch": 0.5462362102530824, + "grad_norm": 0.5738111768090484, + "learning_rate": 4.620728492793934e-06, + "loss": 0.5892, + "step": 3367 + }, + { + "epoch": 0.5463984425697599, + "grad_norm": 0.6549103872037585, + "learning_rate": 4.620502335467174e-06, + "loss": 0.5572, + "step": 3368 + }, + { + "epoch": 0.5465606748864373, + "grad_norm": 0.5352376707888802, + "learning_rate": 4.620276116270346e-06, + "loss": 0.5339, + "step": 3369 + }, + { + "epoch": 0.5467229072031149, + "grad_norm": 0.6060635840754208, + "learning_rate": 4.620049835210053e-06, + "loss": 0.5259, + "step": 3370 + }, + { + "epoch": 0.5468851395197923, + "grad_norm": 0.5909235742938126, + "learning_rate": 4.619823492292895e-06, + "loss": 0.5708, + "step": 3371 + }, + { + "epoch": 0.5470473718364698, + "grad_norm": 0.6309110665953718, + "learning_rate": 4.619597087525478e-06, + "loss": 0.5632, + "step": 3372 + }, + { + "epoch": 0.5472096041531473, + "grad_norm": 0.5727564248823847, + "learning_rate": 4.619370620914406e-06, + "loss": 0.5648, + "step": 3373 + }, + { + "epoch": 0.5473718364698248, + "grad_norm": 0.566412656158293, + "learning_rate": 4.619144092466289e-06, + "loss": 0.5885, + "step": 3374 + }, + { + "epoch": 0.5475340687865022, + "grad_norm": 0.5725352478712398, + "learning_rate": 4.618917502187734e-06, + "loss": 0.5827, + "step": 3375 + }, + { + "epoch": 0.5476963011031798, + "grad_norm": 0.5717114080983559, + "learning_rate": 4.618690850085353e-06, + "loss": 0.5865, + "step": 3376 + }, + { + "epoch": 0.5478585334198572, + "grad_norm": 0.5504630191652619, + "learning_rate": 4.61846413616576e-06, + "loss": 0.5568, + "step": 3377 + }, + { + "epoch": 0.5480207657365347, + "grad_norm": 0.6260352435917242, + "learning_rate": 4.61823736043557e-06, + "loss": 0.521, + "step": 3378 + }, + { + "epoch": 0.5481829980532122, + "grad_norm": 0.5888997928038443, + "learning_rate": 4.6180105229013976e-06, + "loss": 0.5485, + "step": 3379 + }, + { + "epoch": 0.5483452303698897, + "grad_norm": 0.5800354203786664, + "learning_rate": 4.617783623569863e-06, + "loss": 0.5611, + "step": 3380 + }, + { + "epoch": 0.5485074626865671, + "grad_norm": 0.5676243202815306, + "learning_rate": 4.617556662447586e-06, + "loss": 0.5379, + "step": 3381 + }, + { + "epoch": 0.5486696950032447, + "grad_norm": 0.5794703971852491, + "learning_rate": 4.617329639541188e-06, + "loss": 0.5794, + "step": 3382 + }, + { + "epoch": 0.5488319273199221, + "grad_norm": 0.5682574793619196, + "learning_rate": 4.617102554857295e-06, + "loss": 0.5844, + "step": 3383 + }, + { + "epoch": 0.5489941596365996, + "grad_norm": 0.606800381902768, + "learning_rate": 4.616875408402529e-06, + "loss": 0.5777, + "step": 3384 + }, + { + "epoch": 0.5491563919532771, + "grad_norm": 0.6246190671797189, + "learning_rate": 4.616648200183521e-06, + "loss": 0.5819, + "step": 3385 + }, + { + "epoch": 0.5493186242699546, + "grad_norm": 0.5968276812951216, + "learning_rate": 4.616420930206899e-06, + "loss": 0.5236, + "step": 3386 + }, + { + "epoch": 0.549480856586632, + "grad_norm": 0.5942407570679462, + "learning_rate": 4.616193598479293e-06, + "loss": 0.5558, + "step": 3387 + }, + { + "epoch": 0.5496430889033095, + "grad_norm": 0.5757122021412944, + "learning_rate": 4.6159662050073375e-06, + "loss": 0.5615, + "step": 3388 + }, + { + "epoch": 0.549805321219987, + "grad_norm": 0.5764792251421049, + "learning_rate": 4.6157387497976666e-06, + "loss": 0.5764, + "step": 3389 + }, + { + "epoch": 0.5499675535366645, + "grad_norm": 0.6376785675448483, + "learning_rate": 4.615511232856916e-06, + "loss": 0.6012, + "step": 3390 + }, + { + "epoch": 0.5501297858533419, + "grad_norm": 0.5561827084273128, + "learning_rate": 4.615283654191726e-06, + "loss": 0.5307, + "step": 3391 + }, + { + "epoch": 0.5502920181700195, + "grad_norm": 0.5809523936916101, + "learning_rate": 4.615056013808734e-06, + "loss": 0.5728, + "step": 3392 + }, + { + "epoch": 0.5504542504866969, + "grad_norm": 0.6117346697537628, + "learning_rate": 4.614828311714584e-06, + "loss": 0.5766, + "step": 3393 + }, + { + "epoch": 0.5506164828033744, + "grad_norm": 0.5982604463154504, + "learning_rate": 4.614600547915919e-06, + "loss": 0.5963, + "step": 3394 + }, + { + "epoch": 0.5507787151200519, + "grad_norm": 0.5747832967309002, + "learning_rate": 4.614372722419385e-06, + "loss": 0.565, + "step": 3395 + }, + { + "epoch": 0.5509409474367294, + "grad_norm": 0.5698342147435853, + "learning_rate": 4.614144835231627e-06, + "loss": 0.5792, + "step": 3396 + }, + { + "epoch": 0.5511031797534068, + "grad_norm": 0.587852825198704, + "learning_rate": 4.613916886359297e-06, + "loss": 0.5851, + "step": 3397 + }, + { + "epoch": 0.5512654120700844, + "grad_norm": 0.5712988974805963, + "learning_rate": 4.613688875809044e-06, + "loss": 0.5553, + "step": 3398 + }, + { + "epoch": 0.5514276443867618, + "grad_norm": 0.5933331480713667, + "learning_rate": 4.613460803587522e-06, + "loss": 0.5845, + "step": 3399 + }, + { + "epoch": 0.5515898767034393, + "grad_norm": 0.5860548384500489, + "learning_rate": 4.613232669701384e-06, + "loss": 0.5527, + "step": 3400 + }, + { + "epoch": 0.5517521090201168, + "grad_norm": 0.580183996475999, + "learning_rate": 4.613004474157288e-06, + "loss": 0.5363, + "step": 3401 + }, + { + "epoch": 0.5519143413367943, + "grad_norm": 0.6115728227309919, + "learning_rate": 4.612776216961891e-06, + "loss": 0.5862, + "step": 3402 + }, + { + "epoch": 0.5520765736534717, + "grad_norm": 0.6058721702579617, + "learning_rate": 4.612547898121853e-06, + "loss": 0.5769, + "step": 3403 + }, + { + "epoch": 0.5522388059701493, + "grad_norm": 0.606291012501118, + "learning_rate": 4.612319517643835e-06, + "loss": 0.5607, + "step": 3404 + }, + { + "epoch": 0.5524010382868267, + "grad_norm": 0.5958505398078534, + "learning_rate": 4.612091075534502e-06, + "loss": 0.5696, + "step": 3405 + }, + { + "epoch": 0.5525632706035042, + "grad_norm": 0.6056107027738579, + "learning_rate": 4.611862571800519e-06, + "loss": 0.5877, + "step": 3406 + }, + { + "epoch": 0.5527255029201817, + "grad_norm": 0.541696795333791, + "learning_rate": 4.611634006448551e-06, + "loss": 0.5813, + "step": 3407 + }, + { + "epoch": 0.5528877352368592, + "grad_norm": 0.575697821369238, + "learning_rate": 4.61140537948527e-06, + "loss": 0.5583, + "step": 3408 + }, + { + "epoch": 0.5530499675535366, + "grad_norm": 0.5713715434216871, + "learning_rate": 4.611176690917344e-06, + "loss": 0.555, + "step": 3409 + }, + { + "epoch": 0.5532121998702142, + "grad_norm": 0.587152132923863, + "learning_rate": 4.610947940751447e-06, + "loss": 0.5586, + "step": 3410 + }, + { + "epoch": 0.5533744321868916, + "grad_norm": 0.5939532177792536, + "learning_rate": 4.610719128994252e-06, + "loss": 0.5486, + "step": 3411 + }, + { + "epoch": 0.5535366645035691, + "grad_norm": 0.5505743830013561, + "learning_rate": 4.6104902556524365e-06, + "loss": 0.57, + "step": 3412 + }, + { + "epoch": 0.5536988968202466, + "grad_norm": 0.5895841899296974, + "learning_rate": 4.6102613207326784e-06, + "loss": 0.5725, + "step": 3413 + }, + { + "epoch": 0.5538611291369241, + "grad_norm": 0.5801079815301161, + "learning_rate": 4.610032324241657e-06, + "loss": 0.55, + "step": 3414 + }, + { + "epoch": 0.5540233614536015, + "grad_norm": 0.5797579318606412, + "learning_rate": 4.609803266186052e-06, + "loss": 0.5491, + "step": 3415 + }, + { + "epoch": 0.5541855937702791, + "grad_norm": 0.6072511192359675, + "learning_rate": 4.6095741465725484e-06, + "loss": 0.5437, + "step": 3416 + }, + { + "epoch": 0.5543478260869565, + "grad_norm": 0.5768611306379312, + "learning_rate": 4.6093449654078316e-06, + "loss": 0.5705, + "step": 3417 + }, + { + "epoch": 0.554510058403634, + "grad_norm": 0.5877661809568308, + "learning_rate": 4.609115722698588e-06, + "loss": 0.5555, + "step": 3418 + }, + { + "epoch": 0.5546722907203114, + "grad_norm": 0.6129105322544747, + "learning_rate": 4.608886418451505e-06, + "loss": 0.5658, + "step": 3419 + }, + { + "epoch": 0.554834523036989, + "grad_norm": 0.6091535644952389, + "learning_rate": 4.608657052673274e-06, + "loss": 0.5827, + "step": 3420 + }, + { + "epoch": 0.5549967553536664, + "grad_norm": 0.6465840339228145, + "learning_rate": 4.6084276253705874e-06, + "loss": 0.5772, + "step": 3421 + }, + { + "epoch": 0.5551589876703439, + "grad_norm": 0.563800002453669, + "learning_rate": 4.60819813655014e-06, + "loss": 0.53, + "step": 3422 + }, + { + "epoch": 0.5553212199870214, + "grad_norm": 0.545948760575199, + "learning_rate": 4.607968586218626e-06, + "loss": 0.5403, + "step": 3423 + }, + { + "epoch": 0.5554834523036989, + "grad_norm": 0.5841016819636614, + "learning_rate": 4.607738974382744e-06, + "loss": 0.519, + "step": 3424 + }, + { + "epoch": 0.5556456846203763, + "grad_norm": 0.5712380020412248, + "learning_rate": 4.607509301049192e-06, + "loss": 0.5582, + "step": 3425 + }, + { + "epoch": 0.5558079169370539, + "grad_norm": 0.5902647906685313, + "learning_rate": 4.6072795662246735e-06, + "loss": 0.5626, + "step": 3426 + }, + { + "epoch": 0.5559701492537313, + "grad_norm": 0.6098204349933296, + "learning_rate": 4.60704976991589e-06, + "loss": 0.5548, + "step": 3427 + }, + { + "epoch": 0.5561323815704088, + "grad_norm": 0.560306636656885, + "learning_rate": 4.606819912129547e-06, + "loss": 0.5652, + "step": 3428 + }, + { + "epoch": 0.5562946138870863, + "grad_norm": 0.5960687531974586, + "learning_rate": 4.606589992872349e-06, + "loss": 0.5704, + "step": 3429 + }, + { + "epoch": 0.5564568462037638, + "grad_norm": 0.5708031638003629, + "learning_rate": 4.606360012151007e-06, + "loss": 0.5608, + "step": 3430 + }, + { + "epoch": 0.5566190785204412, + "grad_norm": 0.5610164341424063, + "learning_rate": 4.606129969972231e-06, + "loss": 0.5791, + "step": 3431 + }, + { + "epoch": 0.5567813108371188, + "grad_norm": 0.6005120594408079, + "learning_rate": 4.605899866342731e-06, + "loss": 0.5946, + "step": 3432 + }, + { + "epoch": 0.5569435431537962, + "grad_norm": 0.5925917159298859, + "learning_rate": 4.6056697012692225e-06, + "loss": 0.5963, + "step": 3433 + }, + { + "epoch": 0.5571057754704737, + "grad_norm": 0.6291787614039749, + "learning_rate": 4.6054394747584196e-06, + "loss": 0.5869, + "step": 3434 + }, + { + "epoch": 0.5572680077871512, + "grad_norm": 0.6116527217090734, + "learning_rate": 4.605209186817042e-06, + "loss": 0.5965, + "step": 3435 + }, + { + "epoch": 0.5574302401038287, + "grad_norm": 0.5983138110759227, + "learning_rate": 4.604978837451806e-06, + "loss": 0.5921, + "step": 3436 + }, + { + "epoch": 0.5575924724205061, + "grad_norm": 0.6224663214579571, + "learning_rate": 4.604748426669434e-06, + "loss": 0.5919, + "step": 3437 + }, + { + "epoch": 0.5577547047371837, + "grad_norm": 0.5712508686938733, + "learning_rate": 4.604517954476649e-06, + "loss": 0.566, + "step": 3438 + }, + { + "epoch": 0.5579169370538611, + "grad_norm": 0.5609091603462959, + "learning_rate": 4.6042874208801754e-06, + "loss": 0.5481, + "step": 3439 + }, + { + "epoch": 0.5580791693705386, + "grad_norm": 0.5958843777204105, + "learning_rate": 4.604056825886738e-06, + "loss": 0.5601, + "step": 3440 + }, + { + "epoch": 0.5582414016872161, + "grad_norm": 0.6250931437601608, + "learning_rate": 4.6038261695030675e-06, + "loss": 0.5659, + "step": 3441 + }, + { + "epoch": 0.5584036340038936, + "grad_norm": 0.5684130510983466, + "learning_rate": 4.603595451735891e-06, + "loss": 0.5626, + "step": 3442 + }, + { + "epoch": 0.558565866320571, + "grad_norm": 0.5869562664695174, + "learning_rate": 4.603364672591942e-06, + "loss": 0.5603, + "step": 3443 + }, + { + "epoch": 0.5587280986372486, + "grad_norm": 0.5655531319648297, + "learning_rate": 4.603133832077953e-06, + "loss": 0.5904, + "step": 3444 + }, + { + "epoch": 0.558890330953926, + "grad_norm": 0.5523552635899406, + "learning_rate": 4.60290293020066e-06, + "loss": 0.545, + "step": 3445 + }, + { + "epoch": 0.5590525632706035, + "grad_norm": 0.5876050007129057, + "learning_rate": 4.602671966966801e-06, + "loss": 0.5869, + "step": 3446 + }, + { + "epoch": 0.5592147955872809, + "grad_norm": 0.5770199024210089, + "learning_rate": 4.602440942383112e-06, + "loss": 0.6124, + "step": 3447 + }, + { + "epoch": 0.5593770279039585, + "grad_norm": 0.5776660653072955, + "learning_rate": 4.6022098564563355e-06, + "loss": 0.5515, + "step": 3448 + }, + { + "epoch": 0.5595392602206359, + "grad_norm": 0.5862024028269601, + "learning_rate": 4.601978709193213e-06, + "loss": 0.5534, + "step": 3449 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.6035874198976269, + "learning_rate": 4.601747500600491e-06, + "loss": 0.557, + "step": 3450 + }, + { + "epoch": 0.5598637248539909, + "grad_norm": 0.5908969472959811, + "learning_rate": 4.601516230684912e-06, + "loss": 0.5755, + "step": 3451 + }, + { + "epoch": 0.5600259571706684, + "grad_norm": 0.5794595566503873, + "learning_rate": 4.601284899453227e-06, + "loss": 0.5628, + "step": 3452 + }, + { + "epoch": 0.5601881894873458, + "grad_norm": 0.584306086511997, + "learning_rate": 4.601053506912183e-06, + "loss": 0.5279, + "step": 3453 + }, + { + "epoch": 0.5603504218040234, + "grad_norm": 0.5910604626175225, + "learning_rate": 4.600822053068533e-06, + "loss": 0.5633, + "step": 3454 + }, + { + "epoch": 0.5605126541207008, + "grad_norm": 0.6197847660400212, + "learning_rate": 4.60059053792903e-06, + "loss": 0.6102, + "step": 3455 + }, + { + "epoch": 0.5606748864373783, + "grad_norm": 0.5795052227878397, + "learning_rate": 4.6003589615004285e-06, + "loss": 0.5535, + "step": 3456 + }, + { + "epoch": 0.5608371187540558, + "grad_norm": 0.596629662533553, + "learning_rate": 4.600127323789485e-06, + "loss": 0.5764, + "step": 3457 + }, + { + "epoch": 0.5609993510707333, + "grad_norm": 0.6223571921677883, + "learning_rate": 4.5998956248029584e-06, + "loss": 0.5692, + "step": 3458 + }, + { + "epoch": 0.5611615833874107, + "grad_norm": 0.5606612862481195, + "learning_rate": 4.59966386454761e-06, + "loss": 0.5489, + "step": 3459 + }, + { + "epoch": 0.5613238157040883, + "grad_norm": 0.5737856787380742, + "learning_rate": 4.599432043030199e-06, + "loss": 0.5552, + "step": 3460 + }, + { + "epoch": 0.5614860480207657, + "grad_norm": 0.5790308556522883, + "learning_rate": 4.599200160257492e-06, + "loss": 0.5773, + "step": 3461 + }, + { + "epoch": 0.5616482803374432, + "grad_norm": 0.5790947959075746, + "learning_rate": 4.598968216236254e-06, + "loss": 0.5537, + "step": 3462 + }, + { + "epoch": 0.5618105126541207, + "grad_norm": 0.6008326206625983, + "learning_rate": 4.598736210973251e-06, + "loss": 0.5384, + "step": 3463 + }, + { + "epoch": 0.5619727449707982, + "grad_norm": 0.5961217506926564, + "learning_rate": 4.598504144475256e-06, + "loss": 0.5535, + "step": 3464 + }, + { + "epoch": 0.5621349772874756, + "grad_norm": 0.6057605833585593, + "learning_rate": 4.598272016749034e-06, + "loss": 0.5849, + "step": 3465 + }, + { + "epoch": 0.5622972096041532, + "grad_norm": 0.5705818763349955, + "learning_rate": 4.598039827801364e-06, + "loss": 0.5883, + "step": 3466 + }, + { + "epoch": 0.5624594419208306, + "grad_norm": 0.6920986350953279, + "learning_rate": 4.597807577639017e-06, + "loss": 0.5648, + "step": 3467 + }, + { + "epoch": 0.5626216742375081, + "grad_norm": 0.5874569528059865, + "learning_rate": 4.59757526626877e-06, + "loss": 0.5935, + "step": 3468 + }, + { + "epoch": 0.5627839065541856, + "grad_norm": 0.6157418514003645, + "learning_rate": 4.597342893697402e-06, + "loss": 0.5422, + "step": 3469 + }, + { + "epoch": 0.5629461388708631, + "grad_norm": 0.6575082004243539, + "learning_rate": 4.597110459931692e-06, + "loss": 0.566, + "step": 3470 + }, + { + "epoch": 0.5631083711875405, + "grad_norm": 0.6033244164648564, + "learning_rate": 4.596877964978421e-06, + "loss": 0.5485, + "step": 3471 + }, + { + "epoch": 0.5632706035042181, + "grad_norm": 0.6108315346255332, + "learning_rate": 4.5966454088443755e-06, + "loss": 0.6011, + "step": 3472 + }, + { + "epoch": 0.5634328358208955, + "grad_norm": 0.580489365638392, + "learning_rate": 4.596412791536338e-06, + "loss": 0.5625, + "step": 3473 + }, + { + "epoch": 0.563595068137573, + "grad_norm": 0.6432591902692338, + "learning_rate": 4.596180113061098e-06, + "loss": 0.5279, + "step": 3474 + }, + { + "epoch": 0.5637573004542504, + "grad_norm": 0.6228065541001132, + "learning_rate": 4.59594737342544e-06, + "loss": 0.564, + "step": 3475 + }, + { + "epoch": 0.563919532770928, + "grad_norm": 0.5828346484888652, + "learning_rate": 4.59571457263616e-06, + "loss": 0.547, + "step": 3476 + }, + { + "epoch": 0.5640817650876054, + "grad_norm": 0.5978550399834407, + "learning_rate": 4.595481710700047e-06, + "loss": 0.5717, + "step": 3477 + }, + { + "epoch": 0.5642439974042829, + "grad_norm": 0.6411544401104701, + "learning_rate": 4.595248787623896e-06, + "loss": 0.5588, + "step": 3478 + }, + { + "epoch": 0.5644062297209604, + "grad_norm": 0.584510918152808, + "learning_rate": 4.595015803414504e-06, + "loss": 0.5609, + "step": 3479 + }, + { + "epoch": 0.5645684620376379, + "grad_norm": 0.6113241187712997, + "learning_rate": 4.594782758078668e-06, + "loss": 0.5666, + "step": 3480 + }, + { + "epoch": 0.5647306943543153, + "grad_norm": 0.572401801860372, + "learning_rate": 4.594549651623188e-06, + "loss": 0.5751, + "step": 3481 + }, + { + "epoch": 0.5648929266709929, + "grad_norm": 0.5985548372342452, + "learning_rate": 4.594316484054864e-06, + "loss": 0.5546, + "step": 3482 + }, + { + "epoch": 0.5650551589876703, + "grad_norm": 0.6049386558639569, + "learning_rate": 4.594083255380501e-06, + "loss": 0.5509, + "step": 3483 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.6227307283564903, + "learning_rate": 4.593849965606902e-06, + "loss": 0.5759, + "step": 3484 + }, + { + "epoch": 0.5653796236210253, + "grad_norm": 0.5786261332353422, + "learning_rate": 4.593616614740876e-06, + "loss": 0.5489, + "step": 3485 + }, + { + "epoch": 0.5655418559377028, + "grad_norm": 0.5948000075198029, + "learning_rate": 4.59338320278923e-06, + "loss": 0.5798, + "step": 3486 + }, + { + "epoch": 0.5657040882543802, + "grad_norm": 0.5977872047837889, + "learning_rate": 4.593149729758774e-06, + "loss": 0.5588, + "step": 3487 + }, + { + "epoch": 0.5658663205710578, + "grad_norm": 0.6298671628063639, + "learning_rate": 4.592916195656322e-06, + "loss": 0.5853, + "step": 3488 + }, + { + "epoch": 0.5660285528877352, + "grad_norm": 0.5861283910496485, + "learning_rate": 4.592682600488685e-06, + "loss": 0.5658, + "step": 3489 + }, + { + "epoch": 0.5661907852044127, + "grad_norm": 0.5908571439072468, + "learning_rate": 4.592448944262681e-06, + "loss": 0.5388, + "step": 3490 + }, + { + "epoch": 0.5663530175210902, + "grad_norm": 0.5985507833250712, + "learning_rate": 4.592215226985127e-06, + "loss": 0.5761, + "step": 3491 + }, + { + "epoch": 0.5665152498377677, + "grad_norm": 0.5977934991228618, + "learning_rate": 4.59198144866284e-06, + "loss": 0.5846, + "step": 3492 + }, + { + "epoch": 0.5666774821544451, + "grad_norm": 0.6007776557355982, + "learning_rate": 4.591747609302644e-06, + "loss": 0.5539, + "step": 3493 + }, + { + "epoch": 0.5668397144711227, + "grad_norm": 0.5814011043455327, + "learning_rate": 4.59151370891136e-06, + "loss": 0.5416, + "step": 3494 + }, + { + "epoch": 0.5670019467878001, + "grad_norm": 0.6114060941141918, + "learning_rate": 4.591279747495814e-06, + "loss": 0.576, + "step": 3495 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 0.5918046994627894, + "learning_rate": 4.59104572506283e-06, + "loss": 0.5167, + "step": 3496 + }, + { + "epoch": 0.5673264114211551, + "grad_norm": 0.5922509524057581, + "learning_rate": 4.590811641619237e-06, + "loss": 0.5572, + "step": 3497 + }, + { + "epoch": 0.5674886437378326, + "grad_norm": 0.5733841120482274, + "learning_rate": 4.590577497171866e-06, + "loss": 0.5773, + "step": 3498 + }, + { + "epoch": 0.56765087605451, + "grad_norm": 0.6407914779127294, + "learning_rate": 4.590343291727548e-06, + "loss": 0.5807, + "step": 3499 + }, + { + "epoch": 0.5678131083711876, + "grad_norm": 0.5725015078839928, + "learning_rate": 4.590109025293117e-06, + "loss": 0.5958, + "step": 3500 + }, + { + "epoch": 0.567975340687865, + "grad_norm": 0.5879741980221017, + "learning_rate": 4.589874697875406e-06, + "loss": 0.5746, + "step": 3501 + }, + { + "epoch": 0.5681375730045425, + "grad_norm": 0.6191891688160922, + "learning_rate": 4.589640309481255e-06, + "loss": 0.5508, + "step": 3502 + }, + { + "epoch": 0.56829980532122, + "grad_norm": 0.5900153583977624, + "learning_rate": 4.5894058601175e-06, + "loss": 0.558, + "step": 3503 + }, + { + "epoch": 0.5684620376378975, + "grad_norm": 0.5843768037663125, + "learning_rate": 4.589171349790983e-06, + "loss": 0.584, + "step": 3504 + }, + { + "epoch": 0.5686242699545749, + "grad_norm": 0.6352603276388253, + "learning_rate": 4.588936778508547e-06, + "loss": 0.5645, + "step": 3505 + }, + { + "epoch": 0.5687865022712524, + "grad_norm": 0.606762473116442, + "learning_rate": 4.5887021462770346e-06, + "loss": 0.5689, + "step": 3506 + }, + { + "epoch": 0.5689487345879299, + "grad_norm": 0.5911924726166962, + "learning_rate": 4.588467453103292e-06, + "loss": 0.5965, + "step": 3507 + }, + { + "epoch": 0.5691109669046074, + "grad_norm": 0.5734921749055456, + "learning_rate": 4.588232698994169e-06, + "loss": 0.5572, + "step": 3508 + }, + { + "epoch": 0.5692731992212848, + "grad_norm": 0.5799326959463613, + "learning_rate": 4.587997883956511e-06, + "loss": 0.5758, + "step": 3509 + }, + { + "epoch": 0.5694354315379624, + "grad_norm": 0.588689546617233, + "learning_rate": 4.587763007997173e-06, + "loss": 0.6009, + "step": 3510 + }, + { + "epoch": 0.5695976638546398, + "grad_norm": 0.597704035444241, + "learning_rate": 4.587528071123006e-06, + "loss": 0.5737, + "step": 3511 + }, + { + "epoch": 0.5697598961713173, + "grad_norm": 0.5991804058366775, + "learning_rate": 4.5872930733408646e-06, + "loss": 0.5701, + "step": 3512 + }, + { + "epoch": 0.5699221284879948, + "grad_norm": 0.633105880215588, + "learning_rate": 4.587058014657607e-06, + "loss": 0.5507, + "step": 3513 + }, + { + "epoch": 0.5700843608046723, + "grad_norm": 0.5928063155593015, + "learning_rate": 4.58682289508009e-06, + "loss": 0.6046, + "step": 3514 + }, + { + "epoch": 0.5702465931213497, + "grad_norm": 0.6080565082426151, + "learning_rate": 4.586587714615174e-06, + "loss": 0.5945, + "step": 3515 + }, + { + "epoch": 0.5704088254380273, + "grad_norm": 0.5807615999408571, + "learning_rate": 4.586352473269722e-06, + "loss": 0.5523, + "step": 3516 + }, + { + "epoch": 0.5705710577547047, + "grad_norm": 0.6178489881883309, + "learning_rate": 4.5861171710505956e-06, + "loss": 0.5494, + "step": 3517 + }, + { + "epoch": 0.5707332900713822, + "grad_norm": 0.575874129429192, + "learning_rate": 4.5858818079646614e-06, + "loss": 0.5887, + "step": 3518 + }, + { + "epoch": 0.5708955223880597, + "grad_norm": 0.5985415619900628, + "learning_rate": 4.5856463840187875e-06, + "loss": 0.5563, + "step": 3519 + }, + { + "epoch": 0.5710577547047372, + "grad_norm": 0.6300811597142741, + "learning_rate": 4.585410899219842e-06, + "loss": 0.617, + "step": 3520 + }, + { + "epoch": 0.5712199870214146, + "grad_norm": 0.6263607660032835, + "learning_rate": 4.5851753535746945e-06, + "loss": 0.5394, + "step": 3521 + }, + { + "epoch": 0.5713822193380922, + "grad_norm": 0.5955485556832532, + "learning_rate": 4.584939747090219e-06, + "loss": 0.5503, + "step": 3522 + }, + { + "epoch": 0.5715444516547696, + "grad_norm": 0.5643017772353475, + "learning_rate": 4.584704079773291e-06, + "loss": 0.5856, + "step": 3523 + }, + { + "epoch": 0.5717066839714471, + "grad_norm": 0.5696104263363895, + "learning_rate": 4.584468351630783e-06, + "loss": 0.544, + "step": 3524 + }, + { + "epoch": 0.5718689162881246, + "grad_norm": 0.5832401805022018, + "learning_rate": 4.584232562669576e-06, + "loss": 0.5513, + "step": 3525 + }, + { + "epoch": 0.5720311486048021, + "grad_norm": 0.6172296204900724, + "learning_rate": 4.583996712896548e-06, + "loss": 0.5734, + "step": 3526 + }, + { + "epoch": 0.5721933809214795, + "grad_norm": 0.5552928661130699, + "learning_rate": 4.583760802318582e-06, + "loss": 0.5529, + "step": 3527 + }, + { + "epoch": 0.5723556132381571, + "grad_norm": 0.5769406192061155, + "learning_rate": 4.5835248309425595e-06, + "loss": 0.5866, + "step": 3528 + }, + { + "epoch": 0.5725178455548345, + "grad_norm": 0.5708142133692576, + "learning_rate": 4.583288798775366e-06, + "loss": 0.5557, + "step": 3529 + }, + { + "epoch": 0.572680077871512, + "grad_norm": 0.5822772055719688, + "learning_rate": 4.583052705823889e-06, + "loss": 0.5274, + "step": 3530 + }, + { + "epoch": 0.5728423101881895, + "grad_norm": 0.5802335015032342, + "learning_rate": 4.582816552095015e-06, + "loss": 0.5461, + "step": 3531 + }, + { + "epoch": 0.573004542504867, + "grad_norm": 0.5669249197830493, + "learning_rate": 4.582580337595636e-06, + "loss": 0.5662, + "step": 3532 + }, + { + "epoch": 0.5731667748215444, + "grad_norm": 0.5910481582978774, + "learning_rate": 4.582344062332644e-06, + "loss": 0.5533, + "step": 3533 + }, + { + "epoch": 0.5733290071382219, + "grad_norm": 0.5884998805017715, + "learning_rate": 4.582107726312933e-06, + "loss": 0.5648, + "step": 3534 + }, + { + "epoch": 0.5734912394548994, + "grad_norm": 0.5763659750425261, + "learning_rate": 4.581871329543397e-06, + "loss": 0.5792, + "step": 3535 + }, + { + "epoch": 0.5736534717715769, + "grad_norm": 0.6151765802565269, + "learning_rate": 4.581634872030935e-06, + "loss": 0.5532, + "step": 3536 + }, + { + "epoch": 0.5738157040882543, + "grad_norm": 0.5964861093698505, + "learning_rate": 4.581398353782446e-06, + "loss": 0.6101, + "step": 3537 + }, + { + "epoch": 0.5739779364049319, + "grad_norm": 0.5828117023289333, + "learning_rate": 4.5811617748048294e-06, + "loss": 0.571, + "step": 3538 + }, + { + "epoch": 0.5741401687216093, + "grad_norm": 0.5837727869178971, + "learning_rate": 4.58092513510499e-06, + "loss": 0.57, + "step": 3539 + }, + { + "epoch": 0.5743024010382868, + "grad_norm": 0.5735325788248904, + "learning_rate": 4.580688434689831e-06, + "loss": 0.5591, + "step": 3540 + }, + { + "epoch": 0.5744646333549643, + "grad_norm": 0.6075737436159584, + "learning_rate": 4.580451673566258e-06, + "loss": 0.5554, + "step": 3541 + }, + { + "epoch": 0.5746268656716418, + "grad_norm": 0.6066570796263056, + "learning_rate": 4.580214851741181e-06, + "loss": 0.5582, + "step": 3542 + }, + { + "epoch": 0.5747890979883192, + "grad_norm": 0.6113757763914537, + "learning_rate": 4.579977969221508e-06, + "loss": 0.5663, + "step": 3543 + }, + { + "epoch": 0.5749513303049968, + "grad_norm": 0.5645378542960796, + "learning_rate": 4.579741026014151e-06, + "loss": 0.5296, + "step": 3544 + }, + { + "epoch": 0.5751135626216742, + "grad_norm": 0.573738744602486, + "learning_rate": 4.5795040221260245e-06, + "loss": 0.582, + "step": 3545 + }, + { + "epoch": 0.5752757949383517, + "grad_norm": 0.5779758016278155, + "learning_rate": 4.579266957564042e-06, + "loss": 0.606, + "step": 3546 + }, + { + "epoch": 0.5754380272550292, + "grad_norm": 0.5731123256427891, + "learning_rate": 4.5790298323351205e-06, + "loss": 0.56, + "step": 3547 + }, + { + "epoch": 0.5756002595717067, + "grad_norm": 0.6002200372337906, + "learning_rate": 4.578792646446179e-06, + "loss": 0.5838, + "step": 3548 + }, + { + "epoch": 0.5757624918883841, + "grad_norm": 0.5882399534051945, + "learning_rate": 4.578555399904138e-06, + "loss": 0.5795, + "step": 3549 + }, + { + "epoch": 0.5759247242050617, + "grad_norm": 0.5672673493739194, + "learning_rate": 4.57831809271592e-06, + "loss": 0.5528, + "step": 3550 + }, + { + "epoch": 0.5760869565217391, + "grad_norm": 0.5904429958390381, + "learning_rate": 4.578080724888449e-06, + "loss": 0.548, + "step": 3551 + }, + { + "epoch": 0.5762491888384166, + "grad_norm": 0.614426248867219, + "learning_rate": 4.577843296428649e-06, + "loss": 0.546, + "step": 3552 + }, + { + "epoch": 0.5764114211550941, + "grad_norm": 0.5906192525163038, + "learning_rate": 4.577605807343449e-06, + "loss": 0.5545, + "step": 3553 + }, + { + "epoch": 0.5765736534717716, + "grad_norm": 0.5880024080907575, + "learning_rate": 4.577368257639778e-06, + "loss": 0.5703, + "step": 3554 + }, + { + "epoch": 0.576735885788449, + "grad_norm": 0.6136635344387802, + "learning_rate": 4.577130647324567e-06, + "loss": 0.5181, + "step": 3555 + }, + { + "epoch": 0.5768981181051266, + "grad_norm": 0.590499004880639, + "learning_rate": 4.576892976404749e-06, + "loss": 0.5697, + "step": 3556 + }, + { + "epoch": 0.577060350421804, + "grad_norm": 0.5583161547279986, + "learning_rate": 4.576655244887258e-06, + "loss": 0.5809, + "step": 3557 + }, + { + "epoch": 0.5772225827384815, + "grad_norm": 0.5835241806577606, + "learning_rate": 4.576417452779031e-06, + "loss": 0.5681, + "step": 3558 + }, + { + "epoch": 0.577384815055159, + "grad_norm": 0.5744081965743323, + "learning_rate": 4.576179600087005e-06, + "loss": 0.5887, + "step": 3559 + }, + { + "epoch": 0.5775470473718365, + "grad_norm": 0.5843861752047494, + "learning_rate": 4.575941686818121e-06, + "loss": 0.5715, + "step": 3560 + }, + { + "epoch": 0.5777092796885139, + "grad_norm": 0.605482135959413, + "learning_rate": 4.575703712979319e-06, + "loss": 0.555, + "step": 3561 + }, + { + "epoch": 0.5778715120051914, + "grad_norm": 0.5844007776064003, + "learning_rate": 4.575465678577544e-06, + "loss": 0.5446, + "step": 3562 + }, + { + "epoch": 0.5780337443218689, + "grad_norm": 0.6212830588919156, + "learning_rate": 4.5752275836197414e-06, + "loss": 0.5667, + "step": 3563 + }, + { + "epoch": 0.5781959766385464, + "grad_norm": 0.569841182883579, + "learning_rate": 4.574989428112857e-06, + "loss": 0.5618, + "step": 3564 + }, + { + "epoch": 0.5783582089552238, + "grad_norm": 0.5776330396658464, + "learning_rate": 4.5747512120638395e-06, + "loss": 0.57, + "step": 3565 + }, + { + "epoch": 0.5785204412719014, + "grad_norm": 0.5987193377448328, + "learning_rate": 4.57451293547964e-06, + "loss": 0.5571, + "step": 3566 + }, + { + "epoch": 0.5786826735885788, + "grad_norm": 0.5891438998424757, + "learning_rate": 4.57427459836721e-06, + "loss": 0.6005, + "step": 3567 + }, + { + "epoch": 0.5788449059052563, + "grad_norm": 0.5753378079633882, + "learning_rate": 4.574036200733504e-06, + "loss": 0.5615, + "step": 3568 + }, + { + "epoch": 0.5790071382219338, + "grad_norm": 0.6056401463699019, + "learning_rate": 4.573797742585478e-06, + "loss": 0.577, + "step": 3569 + }, + { + "epoch": 0.5791693705386113, + "grad_norm": 0.5868702783032349, + "learning_rate": 4.5735592239300885e-06, + "loss": 0.5993, + "step": 3570 + }, + { + "epoch": 0.5793316028552887, + "grad_norm": 0.5586851165681249, + "learning_rate": 4.573320644774296e-06, + "loss": 0.5597, + "step": 3571 + }, + { + "epoch": 0.5794938351719663, + "grad_norm": 0.5753258545596656, + "learning_rate": 4.57308200512506e-06, + "loss": 0.5432, + "step": 3572 + }, + { + "epoch": 0.5796560674886437, + "grad_norm": 0.6066104559763088, + "learning_rate": 4.572843304989346e-06, + "loss": 0.5798, + "step": 3573 + }, + { + "epoch": 0.5798182998053212, + "grad_norm": 0.6015257804446422, + "learning_rate": 4.572604544374115e-06, + "loss": 0.5889, + "step": 3574 + }, + { + "epoch": 0.5799805321219987, + "grad_norm": 0.5687803810913685, + "learning_rate": 4.572365723286337e-06, + "loss": 0.5461, + "step": 3575 + }, + { + "epoch": 0.5801427644386762, + "grad_norm": 0.5853427917665743, + "learning_rate": 4.572126841732977e-06, + "loss": 0.5431, + "step": 3576 + }, + { + "epoch": 0.5803049967553536, + "grad_norm": 0.5709990959909238, + "learning_rate": 4.571887899721006e-06, + "loss": 0.5942, + "step": 3577 + }, + { + "epoch": 0.5804672290720312, + "grad_norm": 0.6012793835384791, + "learning_rate": 4.571648897257397e-06, + "loss": 0.5526, + "step": 3578 + }, + { + "epoch": 0.5806294613887086, + "grad_norm": 0.5514606731369887, + "learning_rate": 4.571409834349121e-06, + "loss": 0.5523, + "step": 3579 + }, + { + "epoch": 0.5807916937053861, + "grad_norm": 0.5966789640611424, + "learning_rate": 4.571170711003154e-06, + "loss": 0.5919, + "step": 3580 + }, + { + "epoch": 0.5809539260220636, + "grad_norm": 0.579565637646998, + "learning_rate": 4.570931527226474e-06, + "loss": 0.5755, + "step": 3581 + }, + { + "epoch": 0.5811161583387411, + "grad_norm": 0.5950262584622035, + "learning_rate": 4.570692283026059e-06, + "loss": 0.5512, + "step": 3582 + }, + { + "epoch": 0.5812783906554185, + "grad_norm": 0.5812301492127439, + "learning_rate": 4.570452978408889e-06, + "loss": 0.543, + "step": 3583 + }, + { + "epoch": 0.5814406229720961, + "grad_norm": 0.60152493035143, + "learning_rate": 4.5702136133819475e-06, + "loss": 0.5554, + "step": 3584 + }, + { + "epoch": 0.5816028552887735, + "grad_norm": 0.5854019460430373, + "learning_rate": 4.569974187952217e-06, + "loss": 0.5693, + "step": 3585 + }, + { + "epoch": 0.581765087605451, + "grad_norm": 0.5609114786473127, + "learning_rate": 4.569734702126683e-06, + "loss": 0.55, + "step": 3586 + }, + { + "epoch": 0.5819273199221285, + "grad_norm": 0.5929933725181863, + "learning_rate": 4.5694951559123345e-06, + "loss": 0.5834, + "step": 3587 + }, + { + "epoch": 0.582089552238806, + "grad_norm": 0.5857001751413783, + "learning_rate": 4.56925554931616e-06, + "loss": 0.5339, + "step": 3588 + }, + { + "epoch": 0.5822517845554834, + "grad_norm": 0.5888797827134689, + "learning_rate": 4.569015882345151e-06, + "loss": 0.5612, + "step": 3589 + }, + { + "epoch": 0.582414016872161, + "grad_norm": 0.5543025043327294, + "learning_rate": 4.5687761550063e-06, + "loss": 0.5241, + "step": 3590 + }, + { + "epoch": 0.5825762491888384, + "grad_norm": 0.5700615250897342, + "learning_rate": 4.5685363673065995e-06, + "loss": 0.572, + "step": 3591 + }, + { + "epoch": 0.5827384815055159, + "grad_norm": 0.6010085618079019, + "learning_rate": 4.568296519253049e-06, + "loss": 0.5954, + "step": 3592 + }, + { + "epoch": 0.5829007138221933, + "grad_norm": 0.5410121234543409, + "learning_rate": 4.568056610852646e-06, + "loss": 0.5884, + "step": 3593 + }, + { + "epoch": 0.5830629461388709, + "grad_norm": 0.5593951658557373, + "learning_rate": 4.567816642112388e-06, + "loss": 0.5441, + "step": 3594 + }, + { + "epoch": 0.5832251784555483, + "grad_norm": 0.5671360317312684, + "learning_rate": 4.567576613039279e-06, + "loss": 0.5827, + "step": 3595 + }, + { + "epoch": 0.5833874107722258, + "grad_norm": 0.5574193825174573, + "learning_rate": 4.567336523640322e-06, + "loss": 0.5807, + "step": 3596 + }, + { + "epoch": 0.5835496430889033, + "grad_norm": 0.577512876015635, + "learning_rate": 4.5670963739225215e-06, + "loss": 0.561, + "step": 3597 + }, + { + "epoch": 0.5837118754055808, + "grad_norm": 0.6015112077882884, + "learning_rate": 4.566856163892884e-06, + "loss": 0.5386, + "step": 3598 + }, + { + "epoch": 0.5838741077222582, + "grad_norm": 0.6265554251215021, + "learning_rate": 4.56661589355842e-06, + "loss": 0.5508, + "step": 3599 + }, + { + "epoch": 0.5840363400389358, + "grad_norm": 0.5867640427471723, + "learning_rate": 4.566375562926137e-06, + "loss": 0.5558, + "step": 3600 + }, + { + "epoch": 0.5841985723556132, + "grad_norm": 0.606190297706498, + "learning_rate": 4.566135172003048e-06, + "loss": 0.5635, + "step": 3601 + }, + { + "epoch": 0.5843608046722907, + "grad_norm": 0.5955753875564498, + "learning_rate": 4.565894720796169e-06, + "loss": 0.5847, + "step": 3602 + }, + { + "epoch": 0.5845230369889682, + "grad_norm": 0.5882483023716906, + "learning_rate": 4.565654209312513e-06, + "loss": 0.5446, + "step": 3603 + }, + { + "epoch": 0.5846852693056457, + "grad_norm": 0.5620030228139692, + "learning_rate": 4.5654136375591e-06, + "loss": 0.5496, + "step": 3604 + }, + { + "epoch": 0.5848475016223231, + "grad_norm": 0.5630694267271573, + "learning_rate": 4.5651730055429474e-06, + "loss": 0.5489, + "step": 3605 + }, + { + "epoch": 0.5850097339390007, + "grad_norm": 0.5838731535014609, + "learning_rate": 4.564932313271077e-06, + "loss": 0.5464, + "step": 3606 + }, + { + "epoch": 0.5851719662556781, + "grad_norm": 0.5753437424575688, + "learning_rate": 4.56469156075051e-06, + "loss": 0.5737, + "step": 3607 + }, + { + "epoch": 0.5853341985723556, + "grad_norm": 0.5807370865776127, + "learning_rate": 4.5644507479882726e-06, + "loss": 0.5604, + "step": 3608 + }, + { + "epoch": 0.5854964308890331, + "grad_norm": 0.570019595632526, + "learning_rate": 4.56420987499139e-06, + "loss": 0.5824, + "step": 3609 + }, + { + "epoch": 0.5856586632057106, + "grad_norm": 0.581463442913514, + "learning_rate": 4.563968941766891e-06, + "loss": 0.5757, + "step": 3610 + }, + { + "epoch": 0.585820895522388, + "grad_norm": 0.6108367551550862, + "learning_rate": 4.563727948321804e-06, + "loss": 0.5662, + "step": 3611 + }, + { + "epoch": 0.5859831278390656, + "grad_norm": 0.5687651375178321, + "learning_rate": 4.563486894663162e-06, + "loss": 0.5823, + "step": 3612 + }, + { + "epoch": 0.586145360155743, + "grad_norm": 0.597664673598419, + "learning_rate": 4.563245780797998e-06, + "loss": 0.587, + "step": 3613 + }, + { + "epoch": 0.5863075924724205, + "grad_norm": 0.6029871744952721, + "learning_rate": 4.563004606733345e-06, + "loss": 0.5906, + "step": 3614 + }, + { + "epoch": 0.586469824789098, + "grad_norm": 0.5948950844299464, + "learning_rate": 4.562763372476243e-06, + "loss": 0.5748, + "step": 3615 + }, + { + "epoch": 0.5866320571057755, + "grad_norm": 0.5800932682305342, + "learning_rate": 4.562522078033728e-06, + "loss": 0.5678, + "step": 3616 + }, + { + "epoch": 0.5867942894224529, + "grad_norm": 0.5752835618788968, + "learning_rate": 4.562280723412842e-06, + "loss": 0.5497, + "step": 3617 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 0.5679735734497358, + "learning_rate": 4.562039308620625e-06, + "loss": 0.5479, + "step": 3618 + }, + { + "epoch": 0.5871187540558079, + "grad_norm": 0.5485904669635132, + "learning_rate": 4.561797833664122e-06, + "loss": 0.586, + "step": 3619 + }, + { + "epoch": 0.5872809863724854, + "grad_norm": 0.5816354244396618, + "learning_rate": 4.561556298550379e-06, + "loss": 0.5874, + "step": 3620 + }, + { + "epoch": 0.5874432186891628, + "grad_norm": 0.5589140304646601, + "learning_rate": 4.561314703286442e-06, + "loss": 0.5609, + "step": 3621 + }, + { + "epoch": 0.5876054510058404, + "grad_norm": 0.5717146040716711, + "learning_rate": 4.561073047879362e-06, + "loss": 0.5248, + "step": 3622 + }, + { + "epoch": 0.5877676833225178, + "grad_norm": 0.5720646856156711, + "learning_rate": 4.5608313323361876e-06, + "loss": 0.5649, + "step": 3623 + }, + { + "epoch": 0.5879299156391953, + "grad_norm": 0.5884019087062397, + "learning_rate": 4.560589556663972e-06, + "loss": 0.5951, + "step": 3624 + }, + { + "epoch": 0.5880921479558728, + "grad_norm": 0.56660928651163, + "learning_rate": 4.560347720869771e-06, + "loss": 0.5345, + "step": 3625 + }, + { + "epoch": 0.5882543802725503, + "grad_norm": 0.5663705055105897, + "learning_rate": 4.560105824960639e-06, + "loss": 0.5671, + "step": 3626 + }, + { + "epoch": 0.5884166125892277, + "grad_norm": 0.5653296338901896, + "learning_rate": 4.559863868943634e-06, + "loss": 0.5596, + "step": 3627 + }, + { + "epoch": 0.5885788449059053, + "grad_norm": 0.5792101535101396, + "learning_rate": 4.559621852825816e-06, + "loss": 0.5625, + "step": 3628 + }, + { + "epoch": 0.5887410772225827, + "grad_norm": 0.5718632515204485, + "learning_rate": 4.5593797766142465e-06, + "loss": 0.5597, + "step": 3629 + }, + { + "epoch": 0.5889033095392602, + "grad_norm": 0.5923445687719068, + "learning_rate": 4.559137640315988e-06, + "loss": 0.5702, + "step": 3630 + }, + { + "epoch": 0.5890655418559377, + "grad_norm": 0.599257233384417, + "learning_rate": 4.558895443938105e-06, + "loss": 0.5464, + "step": 3631 + }, + { + "epoch": 0.5892277741726152, + "grad_norm": 0.6167800848738102, + "learning_rate": 4.5586531874876654e-06, + "loss": 0.5745, + "step": 3632 + }, + { + "epoch": 0.5893900064892926, + "grad_norm": 0.5959867304347733, + "learning_rate": 4.558410870971737e-06, + "loss": 0.565, + "step": 3633 + }, + { + "epoch": 0.5895522388059702, + "grad_norm": 0.608514211544976, + "learning_rate": 4.558168494397389e-06, + "loss": 0.5792, + "step": 3634 + }, + { + "epoch": 0.5897144711226476, + "grad_norm": 0.5792748261492694, + "learning_rate": 4.557926057771695e-06, + "loss": 0.5668, + "step": 3635 + }, + { + "epoch": 0.5898767034393251, + "grad_norm": 0.6305789518475089, + "learning_rate": 4.5576835611017265e-06, + "loss": 0.5678, + "step": 3636 + }, + { + "epoch": 0.5900389357560026, + "grad_norm": 0.5690424137260944, + "learning_rate": 4.5574410043945605e-06, + "loss": 0.5641, + "step": 3637 + }, + { + "epoch": 0.5902011680726801, + "grad_norm": 0.6178215881893474, + "learning_rate": 4.557198387657273e-06, + "loss": 0.5454, + "step": 3638 + }, + { + "epoch": 0.5903634003893575, + "grad_norm": 0.6028743948871434, + "learning_rate": 4.556955710896945e-06, + "loss": 0.6052, + "step": 3639 + }, + { + "epoch": 0.5905256327060351, + "grad_norm": 0.591865541939852, + "learning_rate": 4.556712974120654e-06, + "loss": 0.5409, + "step": 3640 + }, + { + "epoch": 0.5906878650227125, + "grad_norm": 0.5894159659821561, + "learning_rate": 4.5564701773354835e-06, + "loss": 0.5603, + "step": 3641 + }, + { + "epoch": 0.59085009733939, + "grad_norm": 0.6190973900443139, + "learning_rate": 4.556227320548519e-06, + "loss": 0.5816, + "step": 3642 + }, + { + "epoch": 0.5910123296560675, + "grad_norm": 0.5991307287978875, + "learning_rate": 4.555984403766844e-06, + "loss": 0.5676, + "step": 3643 + }, + { + "epoch": 0.591174561972745, + "grad_norm": 0.595041602126515, + "learning_rate": 4.555741426997548e-06, + "loss": 0.548, + "step": 3644 + }, + { + "epoch": 0.5913367942894224, + "grad_norm": 0.6024710073238306, + "learning_rate": 4.555498390247719e-06, + "loss": 0.5737, + "step": 3645 + }, + { + "epoch": 0.5914990266061, + "grad_norm": 0.5687595995115649, + "learning_rate": 4.5552552935244495e-06, + "loss": 0.5577, + "step": 3646 + }, + { + "epoch": 0.5916612589227774, + "grad_norm": 0.5513738325028983, + "learning_rate": 4.555012136834832e-06, + "loss": 0.5415, + "step": 3647 + }, + { + "epoch": 0.5918234912394549, + "grad_norm": 0.6236444287227938, + "learning_rate": 4.55476892018596e-06, + "loss": 0.5393, + "step": 3648 + }, + { + "epoch": 0.5919857235561323, + "grad_norm": 0.6216813908972775, + "learning_rate": 4.5545256435849314e-06, + "loss": 0.5812, + "step": 3649 + }, + { + "epoch": 0.5921479558728099, + "grad_norm": 0.5596665406204845, + "learning_rate": 4.554282307038843e-06, + "loss": 0.5285, + "step": 3650 + }, + { + "epoch": 0.5923101881894873, + "grad_norm": 0.5876132669153744, + "learning_rate": 4.5540389105547946e-06, + "loss": 0.5816, + "step": 3651 + }, + { + "epoch": 0.5924724205061648, + "grad_norm": 0.5773085249645601, + "learning_rate": 4.553795454139889e-06, + "loss": 0.5909, + "step": 3652 + }, + { + "epoch": 0.5926346528228423, + "grad_norm": 0.6066512591186924, + "learning_rate": 4.553551937801229e-06, + "loss": 0.5701, + "step": 3653 + }, + { + "epoch": 0.5927968851395198, + "grad_norm": 0.5950635703300539, + "learning_rate": 4.55330836154592e-06, + "loss": 0.5443, + "step": 3654 + }, + { + "epoch": 0.5929591174561972, + "grad_norm": 0.5709972960158871, + "learning_rate": 4.553064725381068e-06, + "loss": 0.5541, + "step": 3655 + }, + { + "epoch": 0.5931213497728748, + "grad_norm": 0.5881795255248359, + "learning_rate": 4.552821029313782e-06, + "loss": 0.5325, + "step": 3656 + }, + { + "epoch": 0.5932835820895522, + "grad_norm": 0.6091258561079683, + "learning_rate": 4.552577273351172e-06, + "loss": 0.5564, + "step": 3657 + }, + { + "epoch": 0.5934458144062297, + "grad_norm": 0.552267957279247, + "learning_rate": 4.552333457500351e-06, + "loss": 0.5534, + "step": 3658 + }, + { + "epoch": 0.5936080467229072, + "grad_norm": 0.5878194212142777, + "learning_rate": 4.552089581768432e-06, + "loss": 0.5484, + "step": 3659 + }, + { + "epoch": 0.5937702790395847, + "grad_norm": 0.6014904487107059, + "learning_rate": 4.55184564616253e-06, + "loss": 0.5771, + "step": 3660 + }, + { + "epoch": 0.5939325113562621, + "grad_norm": 0.5733941929985301, + "learning_rate": 4.551601650689766e-06, + "loss": 0.6017, + "step": 3661 + }, + { + "epoch": 0.5940947436729397, + "grad_norm": 0.6261039824356108, + "learning_rate": 4.551357595357253e-06, + "loss": 0.5437, + "step": 3662 + }, + { + "epoch": 0.5942569759896171, + "grad_norm": 0.5982071438875203, + "learning_rate": 4.551113480172117e-06, + "loss": 0.6003, + "step": 3663 + }, + { + "epoch": 0.5944192083062946, + "grad_norm": 0.6305348403353385, + "learning_rate": 4.550869305141478e-06, + "loss": 0.5576, + "step": 3664 + }, + { + "epoch": 0.5945814406229721, + "grad_norm": 0.5738710187141051, + "learning_rate": 4.5506250702724615e-06, + "loss": 0.5662, + "step": 3665 + }, + { + "epoch": 0.5947436729396496, + "grad_norm": 0.5629740412019492, + "learning_rate": 4.550380775572193e-06, + "loss": 0.5632, + "step": 3666 + }, + { + "epoch": 0.594905905256327, + "grad_norm": 0.5811417576935622, + "learning_rate": 4.5501364210478e-06, + "loss": 0.5689, + "step": 3667 + }, + { + "epoch": 0.5950681375730046, + "grad_norm": 0.6218156330174958, + "learning_rate": 4.549892006706412e-06, + "loss": 0.5517, + "step": 3668 + }, + { + "epoch": 0.595230369889682, + "grad_norm": 0.573420604588384, + "learning_rate": 4.549647532555161e-06, + "loss": 0.559, + "step": 3669 + }, + { + "epoch": 0.5953926022063595, + "grad_norm": 0.6120804540302445, + "learning_rate": 4.549402998601181e-06, + "loss": 0.5274, + "step": 3670 + }, + { + "epoch": 0.595554834523037, + "grad_norm": 0.627958064662776, + "learning_rate": 4.549158404851604e-06, + "loss": 0.5467, + "step": 3671 + }, + { + "epoch": 0.5957170668397145, + "grad_norm": 0.5900008750631794, + "learning_rate": 4.548913751313568e-06, + "loss": 0.5575, + "step": 3672 + }, + { + "epoch": 0.5958792991563919, + "grad_norm": 0.5534514626078217, + "learning_rate": 4.548669037994212e-06, + "loss": 0.5447, + "step": 3673 + }, + { + "epoch": 0.5960415314730695, + "grad_norm": 0.5730105262501372, + "learning_rate": 4.548424264900676e-06, + "loss": 0.5412, + "step": 3674 + }, + { + "epoch": 0.5962037637897469, + "grad_norm": 0.5594329655210089, + "learning_rate": 4.548179432040101e-06, + "loss": 0.5697, + "step": 3675 + }, + { + "epoch": 0.5963659961064244, + "grad_norm": 0.6057338112120317, + "learning_rate": 4.54793453941963e-06, + "loss": 0.5637, + "step": 3676 + }, + { + "epoch": 0.5965282284231019, + "grad_norm": 0.5703186708772767, + "learning_rate": 4.547689587046408e-06, + "loss": 0.5699, + "step": 3677 + }, + { + "epoch": 0.5966904607397794, + "grad_norm": 0.5629405122236428, + "learning_rate": 4.547444574927584e-06, + "loss": 0.5525, + "step": 3678 + }, + { + "epoch": 0.5968526930564568, + "grad_norm": 0.581490742126108, + "learning_rate": 4.547199503070305e-06, + "loss": 0.5604, + "step": 3679 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.5859968421303736, + "learning_rate": 4.546954371481723e-06, + "loss": 0.559, + "step": 3680 + }, + { + "epoch": 0.5971771576898118, + "grad_norm": 0.5881218950541244, + "learning_rate": 4.546709180168989e-06, + "loss": 0.5822, + "step": 3681 + }, + { + "epoch": 0.5973393900064893, + "grad_norm": 0.5518923974891633, + "learning_rate": 4.546463929139257e-06, + "loss": 0.5549, + "step": 3682 + }, + { + "epoch": 0.5975016223231667, + "grad_norm": 0.5680366525520615, + "learning_rate": 4.546218618399683e-06, + "loss": 0.5382, + "step": 3683 + }, + { + "epoch": 0.5976638546398443, + "grad_norm": 0.566384630327208, + "learning_rate": 4.5459732479574244e-06, + "loss": 0.55, + "step": 3684 + }, + { + "epoch": 0.5978260869565217, + "grad_norm": 0.6705417134258538, + "learning_rate": 4.545727817819641e-06, + "loss": 0.5804, + "step": 3685 + }, + { + "epoch": 0.5979883192731992, + "grad_norm": 0.5919738623307713, + "learning_rate": 4.5454823279934924e-06, + "loss": 0.5372, + "step": 3686 + }, + { + "epoch": 0.5981505515898767, + "grad_norm": 0.566903545383741, + "learning_rate": 4.545236778486143e-06, + "loss": 0.5347, + "step": 3687 + }, + { + "epoch": 0.5983127839065542, + "grad_norm": 0.5688692105007518, + "learning_rate": 4.544991169304756e-06, + "loss": 0.6094, + "step": 3688 + }, + { + "epoch": 0.5984750162232316, + "grad_norm": 0.5762162162141623, + "learning_rate": 4.544745500456497e-06, + "loss": 0.6037, + "step": 3689 + }, + { + "epoch": 0.5986372485399092, + "grad_norm": 0.5718470417790793, + "learning_rate": 4.544499771948535e-06, + "loss": 0.5595, + "step": 3690 + }, + { + "epoch": 0.5987994808565866, + "grad_norm": 0.5733221704088198, + "learning_rate": 4.544253983788039e-06, + "loss": 0.5638, + "step": 3691 + }, + { + "epoch": 0.5989617131732641, + "grad_norm": 0.6135672770530947, + "learning_rate": 4.544008135982182e-06, + "loss": 0.5962, + "step": 3692 + }, + { + "epoch": 0.5991239454899416, + "grad_norm": 0.6184338777877052, + "learning_rate": 4.5437622285381355e-06, + "loss": 0.6003, + "step": 3693 + }, + { + "epoch": 0.5992861778066191, + "grad_norm": 0.632791130341155, + "learning_rate": 4.543516261463075e-06, + "loss": 0.5452, + "step": 3694 + }, + { + "epoch": 0.5994484101232965, + "grad_norm": 0.6236427664762998, + "learning_rate": 4.543270234764176e-06, + "loss": 0.566, + "step": 3695 + }, + { + "epoch": 0.5996106424399741, + "grad_norm": 0.5734780326640643, + "learning_rate": 4.543024148448618e-06, + "loss": 0.5821, + "step": 3696 + }, + { + "epoch": 0.5997728747566515, + "grad_norm": 0.566245681570067, + "learning_rate": 4.54277800252358e-06, + "loss": 0.584, + "step": 3697 + }, + { + "epoch": 0.599935107073329, + "grad_norm": 0.5780557452507968, + "learning_rate": 4.542531796996245e-06, + "loss": 0.5791, + "step": 3698 + }, + { + "epoch": 0.6000973393900065, + "grad_norm": 0.5918067011806585, + "learning_rate": 4.5422855318737965e-06, + "loss": 0.5551, + "step": 3699 + }, + { + "epoch": 0.600259571706684, + "grad_norm": 0.5906638736648878, + "learning_rate": 4.54203920716342e-06, + "loss": 0.5242, + "step": 3700 + }, + { + "epoch": 0.6004218040233614, + "grad_norm": 0.5908845299967636, + "learning_rate": 4.541792822872301e-06, + "loss": 0.5296, + "step": 3701 + }, + { + "epoch": 0.600584036340039, + "grad_norm": 0.5613155773772116, + "learning_rate": 4.5415463790076295e-06, + "loss": 0.5302, + "step": 3702 + }, + { + "epoch": 0.6007462686567164, + "grad_norm": 0.5770973495009487, + "learning_rate": 4.541299875576596e-06, + "loss": 0.5301, + "step": 3703 + }, + { + "epoch": 0.6009085009733939, + "grad_norm": 0.5943481328284752, + "learning_rate": 4.541053312586392e-06, + "loss": 0.5589, + "step": 3704 + }, + { + "epoch": 0.6010707332900714, + "grad_norm": 0.6068813273773649, + "learning_rate": 4.540806690044213e-06, + "loss": 0.5778, + "step": 3705 + }, + { + "epoch": 0.6012329656067489, + "grad_norm": 0.5999228074871797, + "learning_rate": 4.540560007957253e-06, + "loss": 0.5776, + "step": 3706 + }, + { + "epoch": 0.6013951979234263, + "grad_norm": 0.6301807465904776, + "learning_rate": 4.5403132663327095e-06, + "loss": 0.5651, + "step": 3707 + }, + { + "epoch": 0.6015574302401038, + "grad_norm": 0.5998355440574337, + "learning_rate": 4.5400664651777835e-06, + "loss": 0.5763, + "step": 3708 + }, + { + "epoch": 0.6017196625567813, + "grad_norm": 0.5623103928961748, + "learning_rate": 4.539819604499675e-06, + "loss": 0.5935, + "step": 3709 + }, + { + "epoch": 0.6018818948734588, + "grad_norm": 0.5870774524965962, + "learning_rate": 4.539572684305585e-06, + "loss": 0.5608, + "step": 3710 + }, + { + "epoch": 0.6020441271901362, + "grad_norm": 0.5771182852430111, + "learning_rate": 4.539325704602721e-06, + "loss": 0.555, + "step": 3711 + }, + { + "epoch": 0.6022063595068138, + "grad_norm": 0.5895607216494928, + "learning_rate": 4.539078665398286e-06, + "loss": 0.5584, + "step": 3712 + }, + { + "epoch": 0.6023685918234912, + "grad_norm": 0.603481951211896, + "learning_rate": 4.538831566699491e-06, + "loss": 0.5581, + "step": 3713 + }, + { + "epoch": 0.6025308241401687, + "grad_norm": 0.5869782952233494, + "learning_rate": 4.538584408513544e-06, + "loss": 0.5813, + "step": 3714 + }, + { + "epoch": 0.6026930564568462, + "grad_norm": 0.5870242025309882, + "learning_rate": 4.5383371908476545e-06, + "loss": 0.5428, + "step": 3715 + }, + { + "epoch": 0.6028552887735237, + "grad_norm": 0.5901627503477177, + "learning_rate": 4.5380899137090385e-06, + "loss": 0.5607, + "step": 3716 + }, + { + "epoch": 0.6030175210902011, + "grad_norm": 0.6030787875997834, + "learning_rate": 4.537842577104911e-06, + "loss": 0.5698, + "step": 3717 + }, + { + "epoch": 0.6031797534068787, + "grad_norm": 0.5864750680049988, + "learning_rate": 4.537595181042485e-06, + "loss": 0.5696, + "step": 3718 + }, + { + "epoch": 0.6033419857235561, + "grad_norm": 0.571678926756924, + "learning_rate": 4.537347725528983e-06, + "loss": 0.5796, + "step": 3719 + }, + { + "epoch": 0.6035042180402336, + "grad_norm": 0.6355587638542651, + "learning_rate": 4.5371002105716234e-06, + "loss": 0.5729, + "step": 3720 + }, + { + "epoch": 0.6036664503569111, + "grad_norm": 0.5565604912785865, + "learning_rate": 4.5368526361776265e-06, + "loss": 0.5333, + "step": 3721 + }, + { + "epoch": 0.6038286826735886, + "grad_norm": 0.6236942011875752, + "learning_rate": 4.536605002354218e-06, + "loss": 0.5926, + "step": 3722 + }, + { + "epoch": 0.603990914990266, + "grad_norm": 0.5761986283708511, + "learning_rate": 4.536357309108622e-06, + "loss": 0.5637, + "step": 3723 + }, + { + "epoch": 0.6041531473069436, + "grad_norm": 0.5865691404020724, + "learning_rate": 4.536109556448065e-06, + "loss": 0.5706, + "step": 3724 + }, + { + "epoch": 0.604315379623621, + "grad_norm": 0.5814573135075498, + "learning_rate": 4.535861744379777e-06, + "loss": 0.5563, + "step": 3725 + }, + { + "epoch": 0.6044776119402985, + "grad_norm": 0.5847614148099379, + "learning_rate": 4.535613872910988e-06, + "loss": 0.5791, + "step": 3726 + }, + { + "epoch": 0.604639844256976, + "grad_norm": 0.6162898156658063, + "learning_rate": 4.535365942048929e-06, + "loss": 0.5555, + "step": 3727 + }, + { + "epoch": 0.6048020765736535, + "grad_norm": 0.5742195099647744, + "learning_rate": 4.535117951800836e-06, + "loss": 0.5556, + "step": 3728 + }, + { + "epoch": 0.6049643088903309, + "grad_norm": 0.5940768690763835, + "learning_rate": 4.534869902173943e-06, + "loss": 0.5676, + "step": 3729 + }, + { + "epoch": 0.6051265412070085, + "grad_norm": 0.5977169767040748, + "learning_rate": 4.534621793175488e-06, + "loss": 0.5492, + "step": 3730 + }, + { + "epoch": 0.6052887735236859, + "grad_norm": 0.5921040734219309, + "learning_rate": 4.53437362481271e-06, + "loss": 0.5545, + "step": 3731 + }, + { + "epoch": 0.6054510058403634, + "grad_norm": 0.551142782677142, + "learning_rate": 4.534125397092849e-06, + "loss": 0.5748, + "step": 3732 + }, + { + "epoch": 0.6056132381570409, + "grad_norm": 0.5935560398436104, + "learning_rate": 4.533877110023149e-06, + "loss": 0.5635, + "step": 3733 + }, + { + "epoch": 0.6057754704737184, + "grad_norm": 0.6073749789904405, + "learning_rate": 4.533628763610853e-06, + "loss": 0.5898, + "step": 3734 + }, + { + "epoch": 0.6059377027903958, + "grad_norm": 0.5766204435500262, + "learning_rate": 4.533380357863208e-06, + "loss": 0.5457, + "step": 3735 + }, + { + "epoch": 0.6060999351070734, + "grad_norm": 0.5984460942194271, + "learning_rate": 4.533131892787463e-06, + "loss": 0.5871, + "step": 3736 + }, + { + "epoch": 0.6062621674237508, + "grad_norm": 0.5736434205343801, + "learning_rate": 4.532883368390863e-06, + "loss": 0.5787, + "step": 3737 + }, + { + "epoch": 0.6064243997404283, + "grad_norm": 0.5945943072315863, + "learning_rate": 4.532634784680664e-06, + "loss": 0.5884, + "step": 3738 + }, + { + "epoch": 0.6065866320571057, + "grad_norm": 0.6200813234151388, + "learning_rate": 4.532386141664117e-06, + "loss": 0.5725, + "step": 3739 + }, + { + "epoch": 0.6067488643737833, + "grad_norm": 0.5895990063228731, + "learning_rate": 4.532137439348476e-06, + "loss": 0.5629, + "step": 3740 + }, + { + "epoch": 0.6069110966904607, + "grad_norm": 0.5759276836601375, + "learning_rate": 4.531888677740999e-06, + "loss": 0.5823, + "step": 3741 + }, + { + "epoch": 0.6070733290071382, + "grad_norm": 0.5928711443377066, + "learning_rate": 4.531639856848943e-06, + "loss": 0.5758, + "step": 3742 + }, + { + "epoch": 0.6072355613238157, + "grad_norm": 0.5774212660608816, + "learning_rate": 4.5313909766795676e-06, + "loss": 0.5224, + "step": 3743 + }, + { + "epoch": 0.6073977936404932, + "grad_norm": 0.5634096172489227, + "learning_rate": 4.531142037240135e-06, + "loss": 0.553, + "step": 3744 + }, + { + "epoch": 0.6075600259571706, + "grad_norm": 0.5863110803913377, + "learning_rate": 4.530893038537909e-06, + "loss": 0.5932, + "step": 3745 + }, + { + "epoch": 0.6077222582738482, + "grad_norm": 0.5585000552236123, + "learning_rate": 4.530643980580154e-06, + "loss": 0.5702, + "step": 3746 + }, + { + "epoch": 0.6078844905905256, + "grad_norm": 0.6022140806410555, + "learning_rate": 4.5303948633741365e-06, + "loss": 0.5566, + "step": 3747 + }, + { + "epoch": 0.6080467229072031, + "grad_norm": 0.5643043589575893, + "learning_rate": 4.530145686927126e-06, + "loss": 0.5777, + "step": 3748 + }, + { + "epoch": 0.6082089552238806, + "grad_norm": 0.569313016272824, + "learning_rate": 4.529896451246391e-06, + "loss": 0.5429, + "step": 3749 + }, + { + "epoch": 0.6083711875405581, + "grad_norm": 0.586247834083435, + "learning_rate": 4.5296471563392055e-06, + "loss": 0.5201, + "step": 3750 + }, + { + "epoch": 0.6085334198572355, + "grad_norm": 0.7221842851304894, + "learning_rate": 4.529397802212843e-06, + "loss": 0.5847, + "step": 3751 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.5764638477249663, + "learning_rate": 4.529148388874577e-06, + "loss": 0.5577, + "step": 3752 + }, + { + "epoch": 0.6088578844905905, + "grad_norm": 0.5668882499709922, + "learning_rate": 4.528898916331687e-06, + "loss": 0.5548, + "step": 3753 + }, + { + "epoch": 0.609020116807268, + "grad_norm": 0.6205170990327358, + "learning_rate": 4.528649384591449e-06, + "loss": 0.5775, + "step": 3754 + }, + { + "epoch": 0.6091823491239455, + "grad_norm": 0.6102994569899332, + "learning_rate": 4.528399793661147e-06, + "loss": 0.5611, + "step": 3755 + }, + { + "epoch": 0.609344581440623, + "grad_norm": 0.5757936334788989, + "learning_rate": 4.528150143548061e-06, + "loss": 0.5765, + "step": 3756 + }, + { + "epoch": 0.6095068137573004, + "grad_norm": 0.5803468447548372, + "learning_rate": 4.527900434259476e-06, + "loss": 0.5611, + "step": 3757 + }, + { + "epoch": 0.609669046073978, + "grad_norm": 0.5974774420962773, + "learning_rate": 4.527650665802678e-06, + "loss": 0.5364, + "step": 3758 + }, + { + "epoch": 0.6098312783906554, + "grad_norm": 0.5779910076065201, + "learning_rate": 4.527400838184953e-06, + "loss": 0.5765, + "step": 3759 + }, + { + "epoch": 0.6099935107073329, + "grad_norm": 0.554777058757224, + "learning_rate": 4.527150951413593e-06, + "loss": 0.6152, + "step": 3760 + }, + { + "epoch": 0.6101557430240104, + "grad_norm": 0.598481140327435, + "learning_rate": 4.526901005495886e-06, + "loss": 0.5556, + "step": 3761 + }, + { + "epoch": 0.6103179753406879, + "grad_norm": 0.5967596356235008, + "learning_rate": 4.526651000439126e-06, + "loss": 0.5628, + "step": 3762 + }, + { + "epoch": 0.6104802076573653, + "grad_norm": 0.5900013180456928, + "learning_rate": 4.526400936250608e-06, + "loss": 0.5992, + "step": 3763 + }, + { + "epoch": 0.6106424399740429, + "grad_norm": 0.6016486620143782, + "learning_rate": 4.526150812937627e-06, + "loss": 0.5349, + "step": 3764 + }, + { + "epoch": 0.6108046722907203, + "grad_norm": 0.5777382804454378, + "learning_rate": 4.525900630507483e-06, + "loss": 0.5405, + "step": 3765 + }, + { + "epoch": 0.6109669046073978, + "grad_norm": 0.6244929728632853, + "learning_rate": 4.525650388967472e-06, + "loss": 0.5801, + "step": 3766 + }, + { + "epoch": 0.6111291369240752, + "grad_norm": 0.5675902538886437, + "learning_rate": 4.525400088324898e-06, + "loss": 0.5895, + "step": 3767 + }, + { + "epoch": 0.6112913692407528, + "grad_norm": 0.5943538176039297, + "learning_rate": 4.525149728587064e-06, + "loss": 0.5713, + "step": 3768 + }, + { + "epoch": 0.6114536015574302, + "grad_norm": 0.5697977014481316, + "learning_rate": 4.524899309761272e-06, + "loss": 0.5615, + "step": 3769 + }, + { + "epoch": 0.6116158338741077, + "grad_norm": 0.61898219483377, + "learning_rate": 4.524648831854833e-06, + "loss": 0.5925, + "step": 3770 + }, + { + "epoch": 0.6117780661907852, + "grad_norm": 0.5945150903680767, + "learning_rate": 4.524398294875052e-06, + "loss": 0.5545, + "step": 3771 + }, + { + "epoch": 0.6119402985074627, + "grad_norm": 0.5780124499089726, + "learning_rate": 4.52414769882924e-06, + "loss": 0.5714, + "step": 3772 + }, + { + "epoch": 0.6121025308241401, + "grad_norm": 0.5984547732912895, + "learning_rate": 4.523897043724708e-06, + "loss": 0.5545, + "step": 3773 + }, + { + "epoch": 0.6122647631408177, + "grad_norm": 0.6067186584882108, + "learning_rate": 4.523646329568771e-06, + "loss": 0.6062, + "step": 3774 + }, + { + "epoch": 0.6124269954574951, + "grad_norm": 0.579585523962557, + "learning_rate": 4.5233955563687415e-06, + "loss": 0.5632, + "step": 3775 + }, + { + "epoch": 0.6125892277741726, + "grad_norm": 0.5798414225005487, + "learning_rate": 4.523144724131938e-06, + "loss": 0.5414, + "step": 3776 + }, + { + "epoch": 0.6127514600908501, + "grad_norm": 0.6171220346632341, + "learning_rate": 4.522893832865679e-06, + "loss": 0.5554, + "step": 3777 + }, + { + "epoch": 0.6129136924075276, + "grad_norm": 0.5947703464347038, + "learning_rate": 4.522642882577284e-06, + "loss": 0.5535, + "step": 3778 + }, + { + "epoch": 0.613075924724205, + "grad_norm": 0.5823017459765177, + "learning_rate": 4.522391873274077e-06, + "loss": 0.5608, + "step": 3779 + }, + { + "epoch": 0.6132381570408826, + "grad_norm": 0.5840215550243306, + "learning_rate": 4.522140804963379e-06, + "loss": 0.5459, + "step": 3780 + }, + { + "epoch": 0.61340038935756, + "grad_norm": 0.6068073107932908, + "learning_rate": 4.521889677652518e-06, + "loss": 0.5584, + "step": 3781 + }, + { + "epoch": 0.6135626216742375, + "grad_norm": 0.5912454507152111, + "learning_rate": 4.521638491348818e-06, + "loss": 0.6052, + "step": 3782 + }, + { + "epoch": 0.613724853990915, + "grad_norm": 0.5842626667145217, + "learning_rate": 4.521387246059612e-06, + "loss": 0.5891, + "step": 3783 + }, + { + "epoch": 0.6138870863075925, + "grad_norm": 0.5657765953474002, + "learning_rate": 4.521135941792227e-06, + "loss": 0.5964, + "step": 3784 + }, + { + "epoch": 0.6140493186242699, + "grad_norm": 0.587838843758767, + "learning_rate": 4.520884578553997e-06, + "loss": 0.5522, + "step": 3785 + }, + { + "epoch": 0.6142115509409475, + "grad_norm": 0.5971513317911865, + "learning_rate": 4.5206331563522556e-06, + "loss": 0.5677, + "step": 3786 + }, + { + "epoch": 0.6143737832576249, + "grad_norm": 0.6385516696885263, + "learning_rate": 4.520381675194339e-06, + "loss": 0.5632, + "step": 3787 + }, + { + "epoch": 0.6145360155743024, + "grad_norm": 0.5944244709106439, + "learning_rate": 4.520130135087584e-06, + "loss": 0.5343, + "step": 3788 + }, + { + "epoch": 0.6146982478909799, + "grad_norm": 0.6495456292833226, + "learning_rate": 4.51987853603933e-06, + "loss": 0.5642, + "step": 3789 + }, + { + "epoch": 0.6148604802076574, + "grad_norm": 0.5753178243005087, + "learning_rate": 4.51962687805692e-06, + "loss": 0.5519, + "step": 3790 + }, + { + "epoch": 0.6150227125243348, + "grad_norm": 0.5562542416227727, + "learning_rate": 4.519375161147693e-06, + "loss": 0.5426, + "step": 3791 + }, + { + "epoch": 0.6151849448410124, + "grad_norm": 0.5651521389177345, + "learning_rate": 4.519123385318995e-06, + "loss": 0.5525, + "step": 3792 + }, + { + "epoch": 0.6153471771576898, + "grad_norm": 0.6273562020457695, + "learning_rate": 4.518871550578173e-06, + "loss": 0.5624, + "step": 3793 + }, + { + "epoch": 0.6155094094743673, + "grad_norm": 0.8135220429394854, + "learning_rate": 4.518619656932574e-06, + "loss": 0.5548, + "step": 3794 + }, + { + "epoch": 0.6156716417910447, + "grad_norm": 0.588826175574517, + "learning_rate": 4.518367704389548e-06, + "loss": 0.5563, + "step": 3795 + }, + { + "epoch": 0.6158338741077223, + "grad_norm": 0.5758890257637558, + "learning_rate": 4.518115692956445e-06, + "loss": 0.5646, + "step": 3796 + }, + { + "epoch": 0.6159961064243997, + "grad_norm": 0.6194824818260248, + "learning_rate": 4.5178636226406194e-06, + "loss": 0.5911, + "step": 3797 + }, + { + "epoch": 0.6161583387410772, + "grad_norm": 0.5993027410861484, + "learning_rate": 4.5176114934494245e-06, + "loss": 0.555, + "step": 3798 + }, + { + "epoch": 0.6163205710577547, + "grad_norm": 0.5901729913474163, + "learning_rate": 4.517359305390217e-06, + "loss": 0.5392, + "step": 3799 + }, + { + "epoch": 0.6164828033744322, + "grad_norm": 0.5730644925808979, + "learning_rate": 4.517107058470357e-06, + "loss": 0.5701, + "step": 3800 + }, + { + "epoch": 0.6166450356911096, + "grad_norm": 0.5896077743719592, + "learning_rate": 4.516854752697201e-06, + "loss": 0.5794, + "step": 3801 + }, + { + "epoch": 0.6168072680077872, + "grad_norm": 0.598592749805591, + "learning_rate": 4.516602388078114e-06, + "loss": 0.5714, + "step": 3802 + }, + { + "epoch": 0.6169695003244646, + "grad_norm": 0.5833638451060604, + "learning_rate": 4.5163499646204566e-06, + "loss": 0.567, + "step": 3803 + }, + { + "epoch": 0.6171317326411421, + "grad_norm": 0.5638023444183023, + "learning_rate": 4.5160974823315954e-06, + "loss": 0.5502, + "step": 3804 + }, + { + "epoch": 0.6172939649578196, + "grad_norm": 0.6147841237131323, + "learning_rate": 4.515844941218896e-06, + "loss": 0.5842, + "step": 3805 + }, + { + "epoch": 0.6174561972744971, + "grad_norm": 0.601980313006092, + "learning_rate": 4.515592341289727e-06, + "loss": 0.5319, + "step": 3806 + }, + { + "epoch": 0.6176184295911745, + "grad_norm": 0.6051247978121932, + "learning_rate": 4.515339682551459e-06, + "loss": 0.5711, + "step": 3807 + }, + { + "epoch": 0.6177806619078521, + "grad_norm": 0.5768352220021216, + "learning_rate": 4.515086965011464e-06, + "loss": 0.5852, + "step": 3808 + }, + { + "epoch": 0.6179428942245295, + "grad_norm": 0.6300980567234487, + "learning_rate": 4.514834188677115e-06, + "loss": 0.5875, + "step": 3809 + }, + { + "epoch": 0.618105126541207, + "grad_norm": 0.5793847409335994, + "learning_rate": 4.514581353555787e-06, + "loss": 0.5376, + "step": 3810 + }, + { + "epoch": 0.6182673588578845, + "grad_norm": 0.5686926470790324, + "learning_rate": 4.514328459654858e-06, + "loss": 0.5589, + "step": 3811 + }, + { + "epoch": 0.618429591174562, + "grad_norm": 0.5618461104651046, + "learning_rate": 4.514075506981706e-06, + "loss": 0.5798, + "step": 3812 + }, + { + "epoch": 0.6185918234912394, + "grad_norm": 0.5571161522188557, + "learning_rate": 4.513822495543711e-06, + "loss": 0.5986, + "step": 3813 + }, + { + "epoch": 0.618754055807917, + "grad_norm": 0.5685243577784977, + "learning_rate": 4.5135694253482566e-06, + "loss": 0.5814, + "step": 3814 + }, + { + "epoch": 0.6189162881245944, + "grad_norm": 0.6223062411387364, + "learning_rate": 4.5133162964027255e-06, + "loss": 0.5712, + "step": 3815 + }, + { + "epoch": 0.6190785204412719, + "grad_norm": 0.5508173204017552, + "learning_rate": 4.513063108714504e-06, + "loss": 0.5336, + "step": 3816 + }, + { + "epoch": 0.6192407527579494, + "grad_norm": 0.6029929031335947, + "learning_rate": 4.512809862290978e-06, + "loss": 0.556, + "step": 3817 + }, + { + "epoch": 0.6194029850746269, + "grad_norm": 0.5493747973947737, + "learning_rate": 4.512556557139538e-06, + "loss": 0.5435, + "step": 3818 + }, + { + "epoch": 0.6195652173913043, + "grad_norm": 0.5573707672633318, + "learning_rate": 4.512303193267574e-06, + "loss": 0.5895, + "step": 3819 + }, + { + "epoch": 0.6197274497079819, + "grad_norm": 0.5583117861591199, + "learning_rate": 4.512049770682478e-06, + "loss": 0.5291, + "step": 3820 + }, + { + "epoch": 0.6198896820246593, + "grad_norm": 0.5990810163969801, + "learning_rate": 4.511796289391645e-06, + "loss": 0.542, + "step": 3821 + }, + { + "epoch": 0.6200519143413368, + "grad_norm": 0.5910883943335786, + "learning_rate": 4.511542749402471e-06, + "loss": 0.5543, + "step": 3822 + }, + { + "epoch": 0.6202141466580143, + "grad_norm": 0.6043555554377625, + "learning_rate": 4.511289150722352e-06, + "loss": 0.5752, + "step": 3823 + }, + { + "epoch": 0.6203763789746918, + "grad_norm": 0.6025455152169406, + "learning_rate": 4.51103549335869e-06, + "loss": 0.5629, + "step": 3824 + }, + { + "epoch": 0.6205386112913692, + "grad_norm": 0.6062468195575306, + "learning_rate": 4.510781777318883e-06, + "loss": 0.5969, + "step": 3825 + }, + { + "epoch": 0.6207008436080467, + "grad_norm": 0.5916903601443654, + "learning_rate": 4.510528002610335e-06, + "loss": 0.573, + "step": 3826 + }, + { + "epoch": 0.6208630759247242, + "grad_norm": 0.5852673348533071, + "learning_rate": 4.51027416924045e-06, + "loss": 0.5576, + "step": 3827 + }, + { + "epoch": 0.6210253082414017, + "grad_norm": 0.5659521249521717, + "learning_rate": 4.510020277216636e-06, + "loss": 0.5527, + "step": 3828 + }, + { + "epoch": 0.6211875405580791, + "grad_norm": 0.5607444218890483, + "learning_rate": 4.509766326546299e-06, + "loss": 0.5588, + "step": 3829 + }, + { + "epoch": 0.6213497728747567, + "grad_norm": 0.5554861981751524, + "learning_rate": 4.5095123172368485e-06, + "loss": 0.5502, + "step": 3830 + }, + { + "epoch": 0.6215120051914341, + "grad_norm": 0.5765695674285695, + "learning_rate": 4.509258249295696e-06, + "loss": 0.5827, + "step": 3831 + }, + { + "epoch": 0.6216742375081116, + "grad_norm": 0.627875682884427, + "learning_rate": 4.509004122730254e-06, + "loss": 0.5701, + "step": 3832 + }, + { + "epoch": 0.6218364698247891, + "grad_norm": 0.5946134440474994, + "learning_rate": 4.508749937547938e-06, + "loss": 0.5364, + "step": 3833 + }, + { + "epoch": 0.6219987021414666, + "grad_norm": 0.5895573233547989, + "learning_rate": 4.5084956937561654e-06, + "loss": 0.5115, + "step": 3834 + }, + { + "epoch": 0.622160934458144, + "grad_norm": 0.5616825953009418, + "learning_rate": 4.508241391362352e-06, + "loss": 0.5786, + "step": 3835 + }, + { + "epoch": 0.6223231667748216, + "grad_norm": 0.6170595078096228, + "learning_rate": 4.507987030373919e-06, + "loss": 0.5519, + "step": 3836 + }, + { + "epoch": 0.622485399091499, + "grad_norm": 0.5888364526140639, + "learning_rate": 4.5077326107982874e-06, + "loss": 0.5521, + "step": 3837 + }, + { + "epoch": 0.6226476314081765, + "grad_norm": 0.6421310933053721, + "learning_rate": 4.5074781326428806e-06, + "loss": 0.578, + "step": 3838 + }, + { + "epoch": 0.622809863724854, + "grad_norm": 0.5558084006114492, + "learning_rate": 4.507223595915123e-06, + "loss": 0.5604, + "step": 3839 + }, + { + "epoch": 0.6229720960415315, + "grad_norm": 0.6197089089408702, + "learning_rate": 4.506969000622443e-06, + "loss": 0.5854, + "step": 3840 + }, + { + "epoch": 0.6231343283582089, + "grad_norm": 0.5754790208111407, + "learning_rate": 4.506714346772267e-06, + "loss": 0.5734, + "step": 3841 + }, + { + "epoch": 0.6232965606748865, + "grad_norm": 0.5777756611737793, + "learning_rate": 4.506459634372025e-06, + "loss": 0.554, + "step": 3842 + }, + { + "epoch": 0.6234587929915639, + "grad_norm": 0.5571783343756842, + "learning_rate": 4.506204863429151e-06, + "loss": 0.5702, + "step": 3843 + }, + { + "epoch": 0.6236210253082414, + "grad_norm": 0.5755062465685489, + "learning_rate": 4.505950033951076e-06, + "loss": 0.5831, + "step": 3844 + }, + { + "epoch": 0.6237832576249189, + "grad_norm": 0.6035142277847867, + "learning_rate": 4.505695145945236e-06, + "loss": 0.5442, + "step": 3845 + }, + { + "epoch": 0.6239454899415964, + "grad_norm": 0.5927644371651685, + "learning_rate": 4.505440199419069e-06, + "loss": 0.5642, + "step": 3846 + }, + { + "epoch": 0.6241077222582738, + "grad_norm": 0.5771854842480705, + "learning_rate": 4.505185194380012e-06, + "loss": 0.567, + "step": 3847 + }, + { + "epoch": 0.6242699545749514, + "grad_norm": 0.5848856877783377, + "learning_rate": 4.504930130835505e-06, + "loss": 0.5687, + "step": 3848 + }, + { + "epoch": 0.6244321868916288, + "grad_norm": 0.6155072503070934, + "learning_rate": 4.5046750087929925e-06, + "loss": 0.5909, + "step": 3849 + }, + { + "epoch": 0.6245944192083063, + "grad_norm": 0.556725948558279, + "learning_rate": 4.504419828259916e-06, + "loss": 0.5487, + "step": 3850 + }, + { + "epoch": 0.6247566515249838, + "grad_norm": 0.6164271721676329, + "learning_rate": 4.504164589243721e-06, + "loss": 0.5724, + "step": 3851 + }, + { + "epoch": 0.6249188838416613, + "grad_norm": 0.5932013715411457, + "learning_rate": 4.503909291751857e-06, + "loss": 0.5512, + "step": 3852 + }, + { + "epoch": 0.6250811161583387, + "grad_norm": 0.5826234188174819, + "learning_rate": 4.50365393579177e-06, + "loss": 0.5467, + "step": 3853 + }, + { + "epoch": 0.6252433484750162, + "grad_norm": 0.5808470713613852, + "learning_rate": 4.503398521370911e-06, + "loss": 0.565, + "step": 3854 + }, + { + "epoch": 0.6254055807916937, + "grad_norm": 0.5741224832354447, + "learning_rate": 4.503143048496734e-06, + "loss": 0.5401, + "step": 3855 + }, + { + "epoch": 0.6255678131083712, + "grad_norm": 0.5668816419622646, + "learning_rate": 4.50288751717669e-06, + "loss": 0.558, + "step": 3856 + }, + { + "epoch": 0.6257300454250486, + "grad_norm": 0.6663017823741519, + "learning_rate": 4.502631927418239e-06, + "loss": 0.5544, + "step": 3857 + }, + { + "epoch": 0.6258922777417262, + "grad_norm": 0.6017403050046929, + "learning_rate": 4.502376279228834e-06, + "loss": 0.5504, + "step": 3858 + }, + { + "epoch": 0.6260545100584036, + "grad_norm": 0.6162629587110205, + "learning_rate": 4.502120572615937e-06, + "loss": 0.5934, + "step": 3859 + }, + { + "epoch": 0.6262167423750811, + "grad_norm": 0.5859842402225415, + "learning_rate": 4.5018648075870065e-06, + "loss": 0.576, + "step": 3860 + }, + { + "epoch": 0.6263789746917586, + "grad_norm": 0.6309236242316093, + "learning_rate": 4.501608984149507e-06, + "loss": 0.5361, + "step": 3861 + }, + { + "epoch": 0.6265412070084361, + "grad_norm": 0.5969834092219317, + "learning_rate": 4.501353102310901e-06, + "loss": 0.5847, + "step": 3862 + }, + { + "epoch": 0.6267034393251135, + "grad_norm": 0.5679778431924113, + "learning_rate": 4.501097162078656e-06, + "loss": 0.5502, + "step": 3863 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 0.5706577532370386, + "learning_rate": 4.500841163460239e-06, + "loss": 0.5314, + "step": 3864 + }, + { + "epoch": 0.6270279039584685, + "grad_norm": 0.602915722022351, + "learning_rate": 4.500585106463118e-06, + "loss": 0.5466, + "step": 3865 + }, + { + "epoch": 0.627190136275146, + "grad_norm": 0.5764282595837634, + "learning_rate": 4.500328991094766e-06, + "loss": 0.5582, + "step": 3866 + }, + { + "epoch": 0.6273523685918235, + "grad_norm": 0.5848993596044034, + "learning_rate": 4.500072817362655e-06, + "loss": 0.577, + "step": 3867 + }, + { + "epoch": 0.627514600908501, + "grad_norm": 0.5647840423129384, + "learning_rate": 4.499816585274258e-06, + "loss": 0.55, + "step": 3868 + }, + { + "epoch": 0.6276768332251784, + "grad_norm": 0.6398034765531153, + "learning_rate": 4.4995602948370535e-06, + "loss": 0.523, + "step": 3869 + }, + { + "epoch": 0.627839065541856, + "grad_norm": 0.5711551410276189, + "learning_rate": 4.499303946058517e-06, + "loss": 0.5658, + "step": 3870 + }, + { + "epoch": 0.6280012978585334, + "grad_norm": 0.5581383362565772, + "learning_rate": 4.49904753894613e-06, + "loss": 0.579, + "step": 3871 + }, + { + "epoch": 0.6281635301752109, + "grad_norm": 0.5958923577797676, + "learning_rate": 4.498791073507373e-06, + "loss": 0.6081, + "step": 3872 + }, + { + "epoch": 0.6283257624918884, + "grad_norm": 0.5839886990610155, + "learning_rate": 4.4985345497497276e-06, + "loss": 0.5781, + "step": 3873 + }, + { + "epoch": 0.6284879948085659, + "grad_norm": 0.5704264032192945, + "learning_rate": 4.49827796768068e-06, + "loss": 0.5634, + "step": 3874 + }, + { + "epoch": 0.6286502271252433, + "grad_norm": 0.5668777788878779, + "learning_rate": 4.498021327307716e-06, + "loss": 0.5594, + "step": 3875 + }, + { + "epoch": 0.6288124594419209, + "grad_norm": 0.6024651127388515, + "learning_rate": 4.4977646286383236e-06, + "loss": 0.5783, + "step": 3876 + }, + { + "epoch": 0.6289746917585983, + "grad_norm": 0.5634459010169622, + "learning_rate": 4.497507871679994e-06, + "loss": 0.5477, + "step": 3877 + }, + { + "epoch": 0.6291369240752758, + "grad_norm": 0.58942056647558, + "learning_rate": 4.497251056440215e-06, + "loss": 0.5685, + "step": 3878 + }, + { + "epoch": 0.6292991563919533, + "grad_norm": 0.6053838357273265, + "learning_rate": 4.496994182926483e-06, + "loss": 0.5877, + "step": 3879 + }, + { + "epoch": 0.6294613887086308, + "grad_norm": 0.5887838968776185, + "learning_rate": 4.496737251146292e-06, + "loss": 0.573, + "step": 3880 + }, + { + "epoch": 0.6296236210253082, + "grad_norm": 0.556678293357553, + "learning_rate": 4.496480261107138e-06, + "loss": 0.57, + "step": 3881 + }, + { + "epoch": 0.6297858533419857, + "grad_norm": 0.5806129529368453, + "learning_rate": 4.49622321281652e-06, + "loss": 0.5195, + "step": 3882 + }, + { + "epoch": 0.6299480856586632, + "grad_norm": 0.5465326422285639, + "learning_rate": 4.495966106281936e-06, + "loss": 0.5751, + "step": 3883 + }, + { + "epoch": 0.6301103179753407, + "grad_norm": 0.5700132255826603, + "learning_rate": 4.49570894151089e-06, + "loss": 0.5376, + "step": 3884 + }, + { + "epoch": 0.6302725502920181, + "grad_norm": 0.5983682940167766, + "learning_rate": 4.4954517185108845e-06, + "loss": 0.5956, + "step": 3885 + }, + { + "epoch": 0.6304347826086957, + "grad_norm": 0.5908927010770928, + "learning_rate": 4.495194437289424e-06, + "loss": 0.5852, + "step": 3886 + }, + { + "epoch": 0.6305970149253731, + "grad_norm": 0.6046439981269989, + "learning_rate": 4.494937097854016e-06, + "loss": 0.5461, + "step": 3887 + }, + { + "epoch": 0.6307592472420506, + "grad_norm": 0.5905764380587116, + "learning_rate": 4.494679700212168e-06, + "loss": 0.554, + "step": 3888 + }, + { + "epoch": 0.6309214795587281, + "grad_norm": 0.612425873392299, + "learning_rate": 4.49442224437139e-06, + "loss": 0.5837, + "step": 3889 + }, + { + "epoch": 0.6310837118754056, + "grad_norm": 0.5809576173642093, + "learning_rate": 4.494164730339196e-06, + "loss": 0.5863, + "step": 3890 + }, + { + "epoch": 0.631245944192083, + "grad_norm": 0.5897449005186486, + "learning_rate": 4.493907158123096e-06, + "loss": 0.5648, + "step": 3891 + }, + { + "epoch": 0.6314081765087606, + "grad_norm": 0.5404797493010302, + "learning_rate": 4.4936495277306085e-06, + "loss": 0.5328, + "step": 3892 + }, + { + "epoch": 0.631570408825438, + "grad_norm": 0.5863824163719745, + "learning_rate": 4.493391839169249e-06, + "loss": 0.5662, + "step": 3893 + }, + { + "epoch": 0.6317326411421155, + "grad_norm": 0.592969404014914, + "learning_rate": 4.493134092446536e-06, + "loss": 0.5856, + "step": 3894 + }, + { + "epoch": 0.631894873458793, + "grad_norm": 0.6112172850950797, + "learning_rate": 4.4928762875699895e-06, + "loss": 0.535, + "step": 3895 + }, + { + "epoch": 0.6320571057754705, + "grad_norm": 0.6081244376366742, + "learning_rate": 4.4926184245471315e-06, + "loss": 0.5626, + "step": 3896 + }, + { + "epoch": 0.6322193380921479, + "grad_norm": 0.5838498309457824, + "learning_rate": 4.492360503385486e-06, + "loss": 0.5547, + "step": 3897 + }, + { + "epoch": 0.6323815704088255, + "grad_norm": 0.5986889230124234, + "learning_rate": 4.492102524092581e-06, + "loss": 0.5452, + "step": 3898 + }, + { + "epoch": 0.6325438027255029, + "grad_norm": 0.6092130328725177, + "learning_rate": 4.491844486675938e-06, + "loss": 0.5699, + "step": 3899 + }, + { + "epoch": 0.6327060350421804, + "grad_norm": 0.5688997992050765, + "learning_rate": 4.49158639114309e-06, + "loss": 0.5774, + "step": 3900 + }, + { + "epoch": 0.6328682673588579, + "grad_norm": 0.6024818111342244, + "learning_rate": 4.4913282375015675e-06, + "loss": 0.5505, + "step": 3901 + }, + { + "epoch": 0.6330304996755354, + "grad_norm": 0.5695322194770709, + "learning_rate": 4.4910700257589e-06, + "loss": 0.541, + "step": 3902 + }, + { + "epoch": 0.6331927319922128, + "grad_norm": 0.5585871125498649, + "learning_rate": 4.490811755922624e-06, + "loss": 0.5766, + "step": 3903 + }, + { + "epoch": 0.6333549643088904, + "grad_norm": 0.585599712342205, + "learning_rate": 4.490553428000273e-06, + "loss": 0.5753, + "step": 3904 + }, + { + "epoch": 0.6335171966255678, + "grad_norm": 0.5970285233509779, + "learning_rate": 4.490295041999385e-06, + "loss": 0.5375, + "step": 3905 + }, + { + "epoch": 0.6336794289422453, + "grad_norm": 0.575494394095075, + "learning_rate": 4.490036597927499e-06, + "loss": 0.5773, + "step": 3906 + }, + { + "epoch": 0.6338416612589228, + "grad_norm": 0.5670717009672633, + "learning_rate": 4.489778095792157e-06, + "loss": 0.5389, + "step": 3907 + }, + { + "epoch": 0.6340038935756003, + "grad_norm": 0.5924568718953598, + "learning_rate": 4.489519535600899e-06, + "loss": 0.5456, + "step": 3908 + }, + { + "epoch": 0.6341661258922777, + "grad_norm": 0.5939599505995332, + "learning_rate": 4.48926091736127e-06, + "loss": 0.5532, + "step": 3909 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.6091341695623561, + "learning_rate": 4.489002241080817e-06, + "loss": 0.5646, + "step": 3910 + }, + { + "epoch": 0.6344905905256327, + "grad_norm": 0.5620453631156401, + "learning_rate": 4.488743506767085e-06, + "loss": 0.5681, + "step": 3911 + }, + { + "epoch": 0.6346528228423102, + "grad_norm": 0.6040199807119242, + "learning_rate": 4.488484714427625e-06, + "loss": 0.5531, + "step": 3912 + }, + { + "epoch": 0.6348150551589876, + "grad_norm": 0.5551568628733574, + "learning_rate": 4.488225864069987e-06, + "loss": 0.5798, + "step": 3913 + }, + { + "epoch": 0.6349772874756652, + "grad_norm": 0.5840653254598726, + "learning_rate": 4.487966955701723e-06, + "loss": 0.5637, + "step": 3914 + }, + { + "epoch": 0.6351395197923426, + "grad_norm": 0.5292104028364717, + "learning_rate": 4.487707989330389e-06, + "loss": 0.5655, + "step": 3915 + }, + { + "epoch": 0.6353017521090201, + "grad_norm": 0.5793028648392715, + "learning_rate": 4.48744896496354e-06, + "loss": 0.5332, + "step": 3916 + }, + { + "epoch": 0.6354639844256976, + "grad_norm": 0.5866618202089323, + "learning_rate": 4.4871898826087324e-06, + "loss": 0.5454, + "step": 3917 + }, + { + "epoch": 0.6356262167423751, + "grad_norm": 0.5684305456374359, + "learning_rate": 4.486930742273527e-06, + "loss": 0.5658, + "step": 3918 + }, + { + "epoch": 0.6357884490590525, + "grad_norm": 0.5709681564249935, + "learning_rate": 4.486671543965484e-06, + "loss": 0.5608, + "step": 3919 + }, + { + "epoch": 0.6359506813757301, + "grad_norm": 0.6207551625396543, + "learning_rate": 4.486412287692166e-06, + "loss": 0.5936, + "step": 3920 + }, + { + "epoch": 0.6361129136924075, + "grad_norm": 0.5778594581005329, + "learning_rate": 4.486152973461138e-06, + "loss": 0.5694, + "step": 3921 + }, + { + "epoch": 0.636275146009085, + "grad_norm": 0.5719874311612593, + "learning_rate": 4.485893601279964e-06, + "loss": 0.5716, + "step": 3922 + }, + { + "epoch": 0.6364373783257625, + "grad_norm": 0.5388769321698026, + "learning_rate": 4.485634171156216e-06, + "loss": 0.5605, + "step": 3923 + }, + { + "epoch": 0.63659961064244, + "grad_norm": 0.5600706204408352, + "learning_rate": 4.48537468309746e-06, + "loss": 0.561, + "step": 3924 + }, + { + "epoch": 0.6367618429591174, + "grad_norm": 0.5735400546192616, + "learning_rate": 4.485115137111266e-06, + "loss": 0.5548, + "step": 3925 + }, + { + "epoch": 0.636924075275795, + "grad_norm": 0.5576315368859436, + "learning_rate": 4.48485553320521e-06, + "loss": 0.5428, + "step": 3926 + }, + { + "epoch": 0.6370863075924724, + "grad_norm": 0.5703752720599998, + "learning_rate": 4.484595871386865e-06, + "loss": 0.5715, + "step": 3927 + }, + { + "epoch": 0.6372485399091499, + "grad_norm": 0.6044075145127233, + "learning_rate": 4.484336151663807e-06, + "loss": 0.5667, + "step": 3928 + }, + { + "epoch": 0.6374107722258274, + "grad_norm": 0.5864053919267685, + "learning_rate": 4.484076374043615e-06, + "loss": 0.5465, + "step": 3929 + }, + { + "epoch": 0.6375730045425049, + "grad_norm": 0.5752421416334357, + "learning_rate": 4.483816538533866e-06, + "loss": 0.5622, + "step": 3930 + }, + { + "epoch": 0.6377352368591823, + "grad_norm": 0.5673044636957414, + "learning_rate": 4.483556645142144e-06, + "loss": 0.5668, + "step": 3931 + }, + { + "epoch": 0.6378974691758599, + "grad_norm": 0.5917391788871733, + "learning_rate": 4.483296693876031e-06, + "loss": 0.5483, + "step": 3932 + }, + { + "epoch": 0.6380597014925373, + "grad_norm": 0.5589934890264929, + "learning_rate": 4.483036684743111e-06, + "loss": 0.5448, + "step": 3933 + }, + { + "epoch": 0.6382219338092148, + "grad_norm": 0.6092309222410087, + "learning_rate": 4.482776617750972e-06, + "loss": 0.5386, + "step": 3934 + }, + { + "epoch": 0.6383841661258923, + "grad_norm": 0.5749363730898503, + "learning_rate": 4.4825164929071985e-06, + "loss": 0.5648, + "step": 3935 + }, + { + "epoch": 0.6385463984425698, + "grad_norm": 0.5828814003430219, + "learning_rate": 4.482256310219384e-06, + "loss": 0.5937, + "step": 3936 + }, + { + "epoch": 0.6387086307592472, + "grad_norm": 0.5951013833663735, + "learning_rate": 4.481996069695118e-06, + "loss": 0.5687, + "step": 3937 + }, + { + "epoch": 0.6388708630759248, + "grad_norm": 0.5810557241573583, + "learning_rate": 4.481735771341994e-06, + "loss": 0.5639, + "step": 3938 + }, + { + "epoch": 0.6390330953926022, + "grad_norm": 0.5721736969334078, + "learning_rate": 4.481475415167606e-06, + "loss": 0.5573, + "step": 3939 + }, + { + "epoch": 0.6391953277092797, + "grad_norm": 0.5900781558239244, + "learning_rate": 4.481215001179553e-06, + "loss": 0.5814, + "step": 3940 + }, + { + "epoch": 0.6393575600259571, + "grad_norm": 0.6201909664240322, + "learning_rate": 4.480954529385429e-06, + "loss": 0.5735, + "step": 3941 + }, + { + "epoch": 0.6395197923426347, + "grad_norm": 0.5530629988311806, + "learning_rate": 4.480693999792838e-06, + "loss": 0.565, + "step": 3942 + }, + { + "epoch": 0.6396820246593121, + "grad_norm": 0.584370587303702, + "learning_rate": 4.480433412409377e-06, + "loss": 0.589, + "step": 3943 + }, + { + "epoch": 0.6398442569759896, + "grad_norm": 0.5980670297194468, + "learning_rate": 4.4801727672426545e-06, + "loss": 0.578, + "step": 3944 + }, + { + "epoch": 0.6400064892926671, + "grad_norm": 0.5490401652437291, + "learning_rate": 4.4799120643002704e-06, + "loss": 0.5762, + "step": 3945 + }, + { + "epoch": 0.6401687216093446, + "grad_norm": 0.5798193713791163, + "learning_rate": 4.4796513035898345e-06, + "loss": 0.5658, + "step": 3946 + }, + { + "epoch": 0.640330953926022, + "grad_norm": 0.5726812289349132, + "learning_rate": 4.479390485118953e-06, + "loss": 0.5743, + "step": 3947 + }, + { + "epoch": 0.6404931862426996, + "grad_norm": 0.5838668356871884, + "learning_rate": 4.4791296088952374e-06, + "loss": 0.5656, + "step": 3948 + }, + { + "epoch": 0.640655418559377, + "grad_norm": 0.5380042417167512, + "learning_rate": 4.478868674926299e-06, + "loss": 0.5429, + "step": 3949 + }, + { + "epoch": 0.6408176508760545, + "grad_norm": 0.5698241243267436, + "learning_rate": 4.47860768321975e-06, + "loss": 0.5586, + "step": 3950 + }, + { + "epoch": 0.640979883192732, + "grad_norm": 0.5906251645609066, + "learning_rate": 4.478346633783206e-06, + "loss": 0.55, + "step": 3951 + }, + { + "epoch": 0.6411421155094095, + "grad_norm": 0.5580029602952289, + "learning_rate": 4.478085526624284e-06, + "loss": 0.5154, + "step": 3952 + }, + { + "epoch": 0.6413043478260869, + "grad_norm": 0.5926386288165586, + "learning_rate": 4.477824361750602e-06, + "loss": 0.5664, + "step": 3953 + }, + { + "epoch": 0.6414665801427645, + "grad_norm": 0.6155268723381258, + "learning_rate": 4.477563139169781e-06, + "loss": 0.558, + "step": 3954 + }, + { + "epoch": 0.6416288124594419, + "grad_norm": 0.637481886255728, + "learning_rate": 4.477301858889441e-06, + "loss": 0.5783, + "step": 3955 + }, + { + "epoch": 0.6417910447761194, + "grad_norm": 0.5940312864972518, + "learning_rate": 4.477040520917207e-06, + "loss": 0.5698, + "step": 3956 + }, + { + "epoch": 0.6419532770927969, + "grad_norm": 0.566937697714781, + "learning_rate": 4.476779125260703e-06, + "loss": 0.5581, + "step": 3957 + }, + { + "epoch": 0.6421155094094744, + "grad_norm": 0.606264809020774, + "learning_rate": 4.476517671927556e-06, + "loss": 0.5346, + "step": 3958 + }, + { + "epoch": 0.6422777417261518, + "grad_norm": 0.5979149758929344, + "learning_rate": 4.4762561609253945e-06, + "loss": 0.5454, + "step": 3959 + }, + { + "epoch": 0.6424399740428294, + "grad_norm": 0.5770496757658686, + "learning_rate": 4.475994592261848e-06, + "loss": 0.5429, + "step": 3960 + }, + { + "epoch": 0.6426022063595068, + "grad_norm": 0.5676170558630049, + "learning_rate": 4.47573296594455e-06, + "loss": 0.5753, + "step": 3961 + }, + { + "epoch": 0.6427644386761843, + "grad_norm": 0.5741293959590067, + "learning_rate": 4.475471281981133e-06, + "loss": 0.567, + "step": 3962 + }, + { + "epoch": 0.6429266709928618, + "grad_norm": 0.5738551692170937, + "learning_rate": 4.475209540379232e-06, + "loss": 0.539, + "step": 3963 + }, + { + "epoch": 0.6430889033095393, + "grad_norm": 0.5697238248495918, + "learning_rate": 4.474947741146484e-06, + "loss": 0.5658, + "step": 3964 + }, + { + "epoch": 0.6432511356262167, + "grad_norm": 0.5730823443187626, + "learning_rate": 4.474685884290527e-06, + "loss": 0.5482, + "step": 3965 + }, + { + "epoch": 0.6434133679428943, + "grad_norm": 0.5773297771210387, + "learning_rate": 4.474423969819003e-06, + "loss": 0.5432, + "step": 3966 + }, + { + "epoch": 0.6435756002595717, + "grad_norm": 0.5590306680465768, + "learning_rate": 4.474161997739551e-06, + "loss": 0.5809, + "step": 3967 + }, + { + "epoch": 0.6437378325762492, + "grad_norm": 0.5699092074498396, + "learning_rate": 4.473899968059817e-06, + "loss": 0.5251, + "step": 3968 + }, + { + "epoch": 0.6439000648929266, + "grad_norm": 0.5684729968164144, + "learning_rate": 4.473637880787446e-06, + "loss": 0.576, + "step": 3969 + }, + { + "epoch": 0.6440622972096042, + "grad_norm": 0.6187695105163438, + "learning_rate": 4.473375735930084e-06, + "loss": 0.5955, + "step": 3970 + }, + { + "epoch": 0.6442245295262816, + "grad_norm": 0.6079553932386204, + "learning_rate": 4.47311353349538e-06, + "loss": 0.5895, + "step": 3971 + }, + { + "epoch": 0.6443867618429591, + "grad_norm": 0.6063929200984215, + "learning_rate": 4.472851273490985e-06, + "loss": 0.5556, + "step": 3972 + }, + { + "epoch": 0.6445489941596366, + "grad_norm": 0.6033882793382436, + "learning_rate": 4.472588955924549e-06, + "loss": 0.5529, + "step": 3973 + }, + { + "epoch": 0.6447112264763141, + "grad_norm": 0.613452219995702, + "learning_rate": 4.472326580803728e-06, + "loss": 0.5253, + "step": 3974 + }, + { + "epoch": 0.6448734587929915, + "grad_norm": 0.61992767258225, + "learning_rate": 4.472064148136176e-06, + "loss": 0.5851, + "step": 3975 + }, + { + "epoch": 0.6450356911096691, + "grad_norm": 0.5837517518851293, + "learning_rate": 4.4718016579295506e-06, + "loss": 0.573, + "step": 3976 + }, + { + "epoch": 0.6451979234263465, + "grad_norm": 0.6076192788391649, + "learning_rate": 4.47153911019151e-06, + "loss": 0.5634, + "step": 3977 + }, + { + "epoch": 0.645360155743024, + "grad_norm": 0.5994224867742942, + "learning_rate": 4.471276504929715e-06, + "loss": 0.6039, + "step": 3978 + }, + { + "epoch": 0.6455223880597015, + "grad_norm": 0.5706488652582625, + "learning_rate": 4.471013842151828e-06, + "loss": 0.5908, + "step": 3979 + }, + { + "epoch": 0.645684620376379, + "grad_norm": 0.5637060795068811, + "learning_rate": 4.4707511218655115e-06, + "loss": 0.5622, + "step": 3980 + }, + { + "epoch": 0.6458468526930564, + "grad_norm": 0.600905747567664, + "learning_rate": 4.470488344078432e-06, + "loss": 0.5956, + "step": 3981 + }, + { + "epoch": 0.646009085009734, + "grad_norm": 0.5730413675058308, + "learning_rate": 4.470225508798256e-06, + "loss": 0.549, + "step": 3982 + }, + { + "epoch": 0.6461713173264114, + "grad_norm": 0.5844997567973371, + "learning_rate": 4.469962616032653e-06, + "loss": 0.5381, + "step": 3983 + }, + { + "epoch": 0.6463335496430889, + "grad_norm": 0.6194716316884957, + "learning_rate": 4.469699665789292e-06, + "loss": 0.5815, + "step": 3984 + }, + { + "epoch": 0.6464957819597664, + "grad_norm": 0.6514860352630123, + "learning_rate": 4.4694366580758464e-06, + "loss": 0.5553, + "step": 3985 + }, + { + "epoch": 0.6466580142764439, + "grad_norm": 0.6343878429417661, + "learning_rate": 4.4691735928999906e-06, + "loss": 0.5813, + "step": 3986 + }, + { + "epoch": 0.6468202465931213, + "grad_norm": 0.5864896884456676, + "learning_rate": 4.468910470269398e-06, + "loss": 0.5676, + "step": 3987 + }, + { + "epoch": 0.6469824789097989, + "grad_norm": 0.5983475496486174, + "learning_rate": 4.468647290191747e-06, + "loss": 0.5493, + "step": 3988 + }, + { + "epoch": 0.6471447112264763, + "grad_norm": 0.6051005150540889, + "learning_rate": 4.468384052674717e-06, + "loss": 0.5385, + "step": 3989 + }, + { + "epoch": 0.6473069435431538, + "grad_norm": 0.5903234405544039, + "learning_rate": 4.468120757725987e-06, + "loss": 0.5383, + "step": 3990 + }, + { + "epoch": 0.6474691758598313, + "grad_norm": 0.5918761836610207, + "learning_rate": 4.46785740535324e-06, + "loss": 0.5654, + "step": 3991 + }, + { + "epoch": 0.6476314081765088, + "grad_norm": 0.6337750812932152, + "learning_rate": 4.467593995564161e-06, + "loss": 0.5485, + "step": 3992 + }, + { + "epoch": 0.6477936404931862, + "grad_norm": 0.5821069682604961, + "learning_rate": 4.467330528366433e-06, + "loss": 0.5468, + "step": 3993 + }, + { + "epoch": 0.6479558728098638, + "grad_norm": 0.5966715532827132, + "learning_rate": 4.467067003767745e-06, + "loss": 0.5913, + "step": 3994 + }, + { + "epoch": 0.6481181051265412, + "grad_norm": 0.570277224852801, + "learning_rate": 4.466803421775786e-06, + "loss": 0.5813, + "step": 3995 + }, + { + "epoch": 0.6482803374432187, + "grad_norm": 0.6230494307296233, + "learning_rate": 4.466539782398246e-06, + "loss": 0.5557, + "step": 3996 + }, + { + "epoch": 0.6484425697598962, + "grad_norm": 0.5876333492479732, + "learning_rate": 4.466276085642817e-06, + "loss": 0.5838, + "step": 3997 + }, + { + "epoch": 0.6486048020765737, + "grad_norm": 0.6197235046612865, + "learning_rate": 4.4660123315171924e-06, + "loss": 0.5796, + "step": 3998 + }, + { + "epoch": 0.6487670343932511, + "grad_norm": 0.5979714298385005, + "learning_rate": 4.465748520029069e-06, + "loss": 0.5692, + "step": 3999 + }, + { + "epoch": 0.6489292667099286, + "grad_norm": 0.5980364268149152, + "learning_rate": 4.465484651186144e-06, + "loss": 0.5062, + "step": 4000 + }, + { + "epoch": 0.6490914990266061, + "grad_norm": 0.5903296683504144, + "learning_rate": 4.465220724996115e-06, + "loss": 0.562, + "step": 4001 + }, + { + "epoch": 0.6492537313432836, + "grad_norm": 0.5734743444851506, + "learning_rate": 4.464956741466684e-06, + "loss": 0.5573, + "step": 4002 + }, + { + "epoch": 0.649415963659961, + "grad_norm": 0.6087086600664873, + "learning_rate": 4.464692700605553e-06, + "loss": 0.5575, + "step": 4003 + }, + { + "epoch": 0.6495781959766386, + "grad_norm": 0.5682125308187953, + "learning_rate": 4.464428602420425e-06, + "loss": 0.5739, + "step": 4004 + }, + { + "epoch": 0.649740428293316, + "grad_norm": 0.5978769479205198, + "learning_rate": 4.464164446919007e-06, + "loss": 0.5757, + "step": 4005 + }, + { + "epoch": 0.6499026606099935, + "grad_norm": 0.6596256936230762, + "learning_rate": 4.463900234109005e-06, + "loss": 0.5898, + "step": 4006 + }, + { + "epoch": 0.650064892926671, + "grad_norm": 0.6096357450890507, + "learning_rate": 4.463635963998128e-06, + "loss": 0.5327, + "step": 4007 + }, + { + "epoch": 0.6502271252433485, + "grad_norm": 0.6163135614380844, + "learning_rate": 4.463371636594089e-06, + "loss": 0.5538, + "step": 4008 + }, + { + "epoch": 0.6503893575600259, + "grad_norm": 0.6005479217487023, + "learning_rate": 4.463107251904597e-06, + "loss": 0.5749, + "step": 4009 + }, + { + "epoch": 0.6505515898767035, + "grad_norm": 0.6038243604388431, + "learning_rate": 4.462842809937368e-06, + "loss": 0.524, + "step": 4010 + }, + { + "epoch": 0.6507138221933809, + "grad_norm": 0.5726278920739377, + "learning_rate": 4.462578310700117e-06, + "loss": 0.56, + "step": 4011 + }, + { + "epoch": 0.6508760545100584, + "grad_norm": 0.592279154725207, + "learning_rate": 4.4623137542005615e-06, + "loss": 0.5689, + "step": 4012 + }, + { + "epoch": 0.6510382868267359, + "grad_norm": 0.6049374919137783, + "learning_rate": 4.462049140446421e-06, + "loss": 0.5654, + "step": 4013 + }, + { + "epoch": 0.6512005191434134, + "grad_norm": 0.5854092585635061, + "learning_rate": 4.461784469445414e-06, + "loss": 0.6162, + "step": 4014 + }, + { + "epoch": 0.6513627514600908, + "grad_norm": 0.5669401090203143, + "learning_rate": 4.461519741205265e-06, + "loss": 0.5551, + "step": 4015 + }, + { + "epoch": 0.6515249837767684, + "grad_norm": 0.6116460548513868, + "learning_rate": 4.4612549557336975e-06, + "loss": 0.5732, + "step": 4016 + }, + { + "epoch": 0.6516872160934458, + "grad_norm": 0.6191491217712182, + "learning_rate": 4.460990113038437e-06, + "loss": 0.5557, + "step": 4017 + }, + { + "epoch": 0.6518494484101233, + "grad_norm": 0.6342056516338237, + "learning_rate": 4.460725213127212e-06, + "loss": 0.5872, + "step": 4018 + }, + { + "epoch": 0.6520116807268008, + "grad_norm": 0.6077407041420267, + "learning_rate": 4.460460256007748e-06, + "loss": 0.5951, + "step": 4019 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.579923295252956, + "learning_rate": 4.460195241687781e-06, + "loss": 0.5522, + "step": 4020 + }, + { + "epoch": 0.6523361453601557, + "grad_norm": 0.6128134554852166, + "learning_rate": 4.459930170175038e-06, + "loss": 0.5695, + "step": 4021 + }, + { + "epoch": 0.6524983776768333, + "grad_norm": 0.6141846025543237, + "learning_rate": 4.459665041477256e-06, + "loss": 0.5786, + "step": 4022 + }, + { + "epoch": 0.6526606099935107, + "grad_norm": 0.6229757505767738, + "learning_rate": 4.459399855602171e-06, + "loss": 0.6011, + "step": 4023 + }, + { + "epoch": 0.6528228423101882, + "grad_norm": 0.5892434490760825, + "learning_rate": 4.459134612557519e-06, + "loss": 0.5878, + "step": 4024 + }, + { + "epoch": 0.6529850746268657, + "grad_norm": 0.6009703199856641, + "learning_rate": 4.458869312351039e-06, + "loss": 0.6063, + "step": 4025 + }, + { + "epoch": 0.6531473069435432, + "grad_norm": 0.5992997358377434, + "learning_rate": 4.458603954990472e-06, + "loss": 0.584, + "step": 4026 + }, + { + "epoch": 0.6533095392602206, + "grad_norm": 0.5902820674619714, + "learning_rate": 4.458338540483561e-06, + "loss": 0.5774, + "step": 4027 + }, + { + "epoch": 0.6534717715768981, + "grad_norm": 0.5886637330307539, + "learning_rate": 4.458073068838049e-06, + "loss": 0.5647, + "step": 4028 + }, + { + "epoch": 0.6536340038935756, + "grad_norm": 0.5918322657246954, + "learning_rate": 4.457807540061682e-06, + "loss": 0.5361, + "step": 4029 + }, + { + "epoch": 0.6537962362102531, + "grad_norm": 0.6313149478402867, + "learning_rate": 4.457541954162209e-06, + "loss": 0.5446, + "step": 4030 + }, + { + "epoch": 0.6539584685269305, + "grad_norm": 0.5879057107897464, + "learning_rate": 4.457276311147377e-06, + "loss": 0.5493, + "step": 4031 + }, + { + "epoch": 0.6541207008436081, + "grad_norm": 0.5714506533306678, + "learning_rate": 4.457010611024937e-06, + "loss": 0.5719, + "step": 4032 + }, + { + "epoch": 0.6542829331602855, + "grad_norm": 0.5844824019853508, + "learning_rate": 4.456744853802642e-06, + "loss": 0.5616, + "step": 4033 + }, + { + "epoch": 0.654445165476963, + "grad_norm": 0.5963311406541559, + "learning_rate": 4.456479039488245e-06, + "loss": 0.5471, + "step": 4034 + }, + { + "epoch": 0.6546073977936405, + "grad_norm": 0.5916608289729901, + "learning_rate": 4.456213168089503e-06, + "loss": 0.5393, + "step": 4035 + }, + { + "epoch": 0.654769630110318, + "grad_norm": 0.5683142083241286, + "learning_rate": 4.4559472396141725e-06, + "loss": 0.5709, + "step": 4036 + }, + { + "epoch": 0.6549318624269954, + "grad_norm": 0.6126008597320421, + "learning_rate": 4.455681254070012e-06, + "loss": 0.5666, + "step": 4037 + }, + { + "epoch": 0.655094094743673, + "grad_norm": 0.5662359828540181, + "learning_rate": 4.455415211464783e-06, + "loss": 0.5726, + "step": 4038 + }, + { + "epoch": 0.6552563270603504, + "grad_norm": 0.589408596143994, + "learning_rate": 4.455149111806248e-06, + "loss": 0.5545, + "step": 4039 + }, + { + "epoch": 0.6554185593770279, + "grad_norm": 0.5540971429730508, + "learning_rate": 4.454882955102171e-06, + "loss": 0.5385, + "step": 4040 + }, + { + "epoch": 0.6555807916937054, + "grad_norm": 0.5646921848191486, + "learning_rate": 4.454616741360318e-06, + "loss": 0.5595, + "step": 4041 + }, + { + "epoch": 0.6557430240103829, + "grad_norm": 0.6919206458629183, + "learning_rate": 4.454350470588454e-06, + "loss": 0.5437, + "step": 4042 + }, + { + "epoch": 0.6559052563270603, + "grad_norm": 0.5747214956719368, + "learning_rate": 4.4540841427943514e-06, + "loss": 0.5773, + "step": 4043 + }, + { + "epoch": 0.6560674886437379, + "grad_norm": 0.6024545117253116, + "learning_rate": 4.453817757985779e-06, + "loss": 0.5511, + "step": 4044 + }, + { + "epoch": 0.6562297209604153, + "grad_norm": 0.5642605171133716, + "learning_rate": 4.453551316170509e-06, + "loss": 0.5226, + "step": 4045 + }, + { + "epoch": 0.6563919532770928, + "grad_norm": 0.5981616976330757, + "learning_rate": 4.453284817356315e-06, + "loss": 0.5616, + "step": 4046 + }, + { + "epoch": 0.6565541855937703, + "grad_norm": 0.6117877149134916, + "learning_rate": 4.453018261550974e-06, + "loss": 0.5452, + "step": 4047 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 0.6104549735464923, + "learning_rate": 4.4527516487622635e-06, + "loss": 0.5765, + "step": 4048 + }, + { + "epoch": 0.6568786502271252, + "grad_norm": 0.5958883672980854, + "learning_rate": 4.45248497899796e-06, + "loss": 0.5769, + "step": 4049 + }, + { + "epoch": 0.6570408825438028, + "grad_norm": 0.6503120635486171, + "learning_rate": 4.452218252265847e-06, + "loss": 0.5565, + "step": 4050 + }, + { + "epoch": 0.6572031148604802, + "grad_norm": 0.6483056921670238, + "learning_rate": 4.451951468573706e-06, + "loss": 0.5851, + "step": 4051 + }, + { + "epoch": 0.6573653471771577, + "grad_norm": 0.595121431982105, + "learning_rate": 4.45168462792932e-06, + "loss": 0.5485, + "step": 4052 + }, + { + "epoch": 0.6575275794938352, + "grad_norm": 0.5881159354185992, + "learning_rate": 4.451417730340476e-06, + "loss": 0.5685, + "step": 4053 + }, + { + "epoch": 0.6576898118105127, + "grad_norm": 0.5528466235457566, + "learning_rate": 4.45115077581496e-06, + "loss": 0.5526, + "step": 4054 + }, + { + "epoch": 0.6578520441271901, + "grad_norm": 0.632436566667076, + "learning_rate": 4.450883764360562e-06, + "loss": 0.561, + "step": 4055 + }, + { + "epoch": 0.6580142764438677, + "grad_norm": 0.584506060012231, + "learning_rate": 4.450616695985071e-06, + "loss": 0.5635, + "step": 4056 + }, + { + "epoch": 0.6581765087605451, + "grad_norm": 0.6480182069076065, + "learning_rate": 4.450349570696281e-06, + "loss": 0.5565, + "step": 4057 + }, + { + "epoch": 0.6583387410772226, + "grad_norm": 0.5477935629618759, + "learning_rate": 4.450082388501986e-06, + "loss": 0.5831, + "step": 4058 + }, + { + "epoch": 0.6585009733939, + "grad_norm": 0.5729868339581765, + "learning_rate": 4.44981514940998e-06, + "loss": 0.5396, + "step": 4059 + }, + { + "epoch": 0.6586632057105776, + "grad_norm": 0.5801555761176675, + "learning_rate": 4.449547853428061e-06, + "loss": 0.5857, + "step": 4060 + }, + { + "epoch": 0.658825438027255, + "grad_norm": 0.5769717876739519, + "learning_rate": 4.449280500564029e-06, + "loss": 0.5752, + "step": 4061 + }, + { + "epoch": 0.6589876703439325, + "grad_norm": 0.5785872363382044, + "learning_rate": 4.449013090825683e-06, + "loss": 0.5588, + "step": 4062 + }, + { + "epoch": 0.65914990266061, + "grad_norm": 0.5857023054115947, + "learning_rate": 4.448745624220826e-06, + "loss": 0.5411, + "step": 4063 + }, + { + "epoch": 0.6593121349772875, + "grad_norm": 0.5968391527400715, + "learning_rate": 4.448478100757262e-06, + "loss": 0.5763, + "step": 4064 + }, + { + "epoch": 0.6594743672939649, + "grad_norm": 0.5857307718466283, + "learning_rate": 4.448210520442797e-06, + "loss": 0.5425, + "step": 4065 + }, + { + "epoch": 0.6596365996106425, + "grad_norm": 0.5946453710408495, + "learning_rate": 4.447942883285237e-06, + "loss": 0.5602, + "step": 4066 + }, + { + "epoch": 0.6597988319273199, + "grad_norm": 0.6013344133957906, + "learning_rate": 4.447675189292391e-06, + "loss": 0.5988, + "step": 4067 + }, + { + "epoch": 0.6599610642439974, + "grad_norm": 0.5778618923535318, + "learning_rate": 4.44740743847207e-06, + "loss": 0.5392, + "step": 4068 + }, + { + "epoch": 0.6601232965606749, + "grad_norm": 0.5718382470094155, + "learning_rate": 4.447139630832087e-06, + "loss": 0.5492, + "step": 4069 + }, + { + "epoch": 0.6602855288773524, + "grad_norm": 0.5730605039739034, + "learning_rate": 4.446871766380254e-06, + "loss": 0.5282, + "step": 4070 + }, + { + "epoch": 0.6604477611940298, + "grad_norm": 0.5772683360534282, + "learning_rate": 4.446603845124388e-06, + "loss": 0.5508, + "step": 4071 + }, + { + "epoch": 0.6606099935107074, + "grad_norm": 0.6333503074716669, + "learning_rate": 4.446335867072306e-06, + "loss": 0.5952, + "step": 4072 + }, + { + "epoch": 0.6607722258273848, + "grad_norm": 0.5992557060321838, + "learning_rate": 4.446067832231826e-06, + "loss": 0.5715, + "step": 4073 + }, + { + "epoch": 0.6609344581440623, + "grad_norm": 0.5694915768776007, + "learning_rate": 4.4457997406107685e-06, + "loss": 0.5212, + "step": 4074 + }, + { + "epoch": 0.6610966904607398, + "grad_norm": 0.5627838469113949, + "learning_rate": 4.445531592216958e-06, + "loss": 0.5269, + "step": 4075 + }, + { + "epoch": 0.6612589227774173, + "grad_norm": 0.5600064801835792, + "learning_rate": 4.445263387058215e-06, + "loss": 0.5753, + "step": 4076 + }, + { + "epoch": 0.6614211550940947, + "grad_norm": 0.6125703962238159, + "learning_rate": 4.4449951251423665e-06, + "loss": 0.6134, + "step": 4077 + }, + { + "epoch": 0.6615833874107723, + "grad_norm": 0.6246344537792694, + "learning_rate": 4.444726806477239e-06, + "loss": 0.5662, + "step": 4078 + }, + { + "epoch": 0.6617456197274497, + "grad_norm": 0.5864855804045398, + "learning_rate": 4.444458431070662e-06, + "loss": 0.5363, + "step": 4079 + }, + { + "epoch": 0.6619078520441272, + "grad_norm": 0.5889202692867613, + "learning_rate": 4.444189998930466e-06, + "loss": 0.5714, + "step": 4080 + }, + { + "epoch": 0.6620700843608047, + "grad_norm": 0.5671410370511737, + "learning_rate": 4.4439215100644815e-06, + "loss": 0.5846, + "step": 4081 + }, + { + "epoch": 0.6622323166774822, + "grad_norm": 0.5959406150330612, + "learning_rate": 4.443652964480544e-06, + "loss": 0.5689, + "step": 4082 + }, + { + "epoch": 0.6623945489941596, + "grad_norm": 0.5803019299384988, + "learning_rate": 4.443384362186488e-06, + "loss": 0.5698, + "step": 4083 + }, + { + "epoch": 0.6625567813108372, + "grad_norm": 0.5899551168338967, + "learning_rate": 4.443115703190152e-06, + "loss": 0.5454, + "step": 4084 + }, + { + "epoch": 0.6627190136275146, + "grad_norm": 0.5656250477476502, + "learning_rate": 4.442846987499372e-06, + "loss": 0.5438, + "step": 4085 + }, + { + "epoch": 0.6628812459441921, + "grad_norm": 0.6282459825555264, + "learning_rate": 4.442578215121991e-06, + "loss": 0.5957, + "step": 4086 + }, + { + "epoch": 0.6630434782608695, + "grad_norm": 0.5994873071232584, + "learning_rate": 4.44230938606585e-06, + "loss": 0.5627, + "step": 4087 + }, + { + "epoch": 0.6632057105775471, + "grad_norm": 0.6118364784192912, + "learning_rate": 4.44204050033879e-06, + "loss": 0.5615, + "step": 4088 + }, + { + "epoch": 0.6633679428942245, + "grad_norm": 0.5823122873344412, + "learning_rate": 4.441771557948661e-06, + "loss": 0.5895, + "step": 4089 + }, + { + "epoch": 0.663530175210902, + "grad_norm": 0.5671719402585672, + "learning_rate": 4.441502558903306e-06, + "loss": 0.5588, + "step": 4090 + }, + { + "epoch": 0.6636924075275795, + "grad_norm": 0.5889729731388615, + "learning_rate": 4.4412335032105775e-06, + "loss": 0.5355, + "step": 4091 + }, + { + "epoch": 0.663854639844257, + "grad_norm": 0.5857042352292406, + "learning_rate": 4.4409643908783225e-06, + "loss": 0.5457, + "step": 4092 + }, + { + "epoch": 0.6640168721609344, + "grad_norm": 0.6175443293916745, + "learning_rate": 4.440695221914394e-06, + "loss": 0.5437, + "step": 4093 + }, + { + "epoch": 0.664179104477612, + "grad_norm": 0.6099108327415569, + "learning_rate": 4.440425996326645e-06, + "loss": 0.5972, + "step": 4094 + }, + { + "epoch": 0.6643413367942894, + "grad_norm": 0.5790068474934279, + "learning_rate": 4.440156714122932e-06, + "loss": 0.5569, + "step": 4095 + }, + { + "epoch": 0.6645035691109669, + "grad_norm": 0.5901170869247644, + "learning_rate": 4.439887375311111e-06, + "loss": 0.54, + "step": 4096 + }, + { + "epoch": 0.6646658014276444, + "grad_norm": 0.6040158838308639, + "learning_rate": 4.43961797989904e-06, + "loss": 0.5465, + "step": 4097 + }, + { + "epoch": 0.6648280337443219, + "grad_norm": 0.6359681810797428, + "learning_rate": 4.439348527894581e-06, + "loss": 0.556, + "step": 4098 + }, + { + "epoch": 0.6649902660609993, + "grad_norm": 0.6001626495109187, + "learning_rate": 4.439079019305594e-06, + "loss": 0.5571, + "step": 4099 + }, + { + "epoch": 0.6651524983776769, + "grad_norm": 0.5758245153050808, + "learning_rate": 4.438809454139943e-06, + "loss": 0.5773, + "step": 4100 + }, + { + "epoch": 0.6653147306943543, + "grad_norm": 0.5980215386815843, + "learning_rate": 4.438539832405493e-06, + "loss": 0.5646, + "step": 4101 + }, + { + "epoch": 0.6654769630110318, + "grad_norm": 0.6179849720559127, + "learning_rate": 4.438270154110111e-06, + "loss": 0.5558, + "step": 4102 + }, + { + "epoch": 0.6656391953277093, + "grad_norm": 0.577120185921494, + "learning_rate": 4.438000419261667e-06, + "loss": 0.5718, + "step": 4103 + }, + { + "epoch": 0.6658014276443868, + "grad_norm": 0.5628813920966041, + "learning_rate": 4.437730627868028e-06, + "loss": 0.5597, + "step": 4104 + }, + { + "epoch": 0.6659636599610642, + "grad_norm": 0.6036315449582623, + "learning_rate": 4.437460779937067e-06, + "loss": 0.565, + "step": 4105 + }, + { + "epoch": 0.6661258922777418, + "grad_norm": 0.5838244389606696, + "learning_rate": 4.437190875476658e-06, + "loss": 0.6079, + "step": 4106 + }, + { + "epoch": 0.6662881245944192, + "grad_norm": 1.2544813024381896, + "learning_rate": 4.436920914494676e-06, + "loss": 0.5212, + "step": 4107 + }, + { + "epoch": 0.6664503569110967, + "grad_norm": 0.5812922283910306, + "learning_rate": 4.4366508969989966e-06, + "loss": 0.5866, + "step": 4108 + }, + { + "epoch": 0.6666125892277742, + "grad_norm": 0.5756564869445463, + "learning_rate": 4.4363808229974996e-06, + "loss": 0.5424, + "step": 4109 + }, + { + "epoch": 0.6667748215444517, + "grad_norm": 0.6531426465758808, + "learning_rate": 4.436110692498064e-06, + "loss": 0.5788, + "step": 4110 + }, + { + "epoch": 0.6669370538611291, + "grad_norm": 0.5881231902150217, + "learning_rate": 4.435840505508572e-06, + "loss": 0.5126, + "step": 4111 + }, + { + "epoch": 0.6670992861778067, + "grad_norm": 0.602575361803439, + "learning_rate": 4.4355702620369055e-06, + "loss": 0.5457, + "step": 4112 + }, + { + "epoch": 0.6672615184944841, + "grad_norm": 0.5780187510049262, + "learning_rate": 4.435299962090951e-06, + "loss": 0.5263, + "step": 4113 + }, + { + "epoch": 0.6674237508111616, + "grad_norm": 0.598599240468464, + "learning_rate": 4.435029605678595e-06, + "loss": 0.5481, + "step": 4114 + }, + { + "epoch": 0.667585983127839, + "grad_norm": 0.5841252713041035, + "learning_rate": 4.4347591928077235e-06, + "loss": 0.581, + "step": 4115 + }, + { + "epoch": 0.6677482154445166, + "grad_norm": 0.5634532639542523, + "learning_rate": 4.43448872348623e-06, + "loss": 0.5461, + "step": 4116 + }, + { + "epoch": 0.667910447761194, + "grad_norm": 0.6022356948801457, + "learning_rate": 4.434218197722003e-06, + "loss": 0.5476, + "step": 4117 + }, + { + "epoch": 0.6680726800778715, + "grad_norm": 0.5614194369054668, + "learning_rate": 4.433947615522937e-06, + "loss": 0.5388, + "step": 4118 + }, + { + "epoch": 0.668234912394549, + "grad_norm": 0.6004046201661066, + "learning_rate": 4.433676976896926e-06, + "loss": 0.5897, + "step": 4119 + }, + { + "epoch": 0.6683971447112265, + "grad_norm": 0.6002052595509201, + "learning_rate": 4.433406281851868e-06, + "loss": 0.5362, + "step": 4120 + }, + { + "epoch": 0.6685593770279039, + "grad_norm": 0.5835760546385056, + "learning_rate": 4.433135530395658e-06, + "loss": 0.4961, + "step": 4121 + }, + { + "epoch": 0.6687216093445815, + "grad_norm": 0.6084717324300561, + "learning_rate": 4.4328647225362e-06, + "loss": 0.5485, + "step": 4122 + }, + { + "epoch": 0.6688838416612589, + "grad_norm": 0.5813252705732912, + "learning_rate": 4.432593858281392e-06, + "loss": 0.5454, + "step": 4123 + }, + { + "epoch": 0.6690460739779364, + "grad_norm": 0.6000423310469781, + "learning_rate": 4.4323229376391384e-06, + "loss": 0.5701, + "step": 4124 + }, + { + "epoch": 0.6692083062946139, + "grad_norm": 0.6072088259557408, + "learning_rate": 4.4320519606173436e-06, + "loss": 0.5407, + "step": 4125 + }, + { + "epoch": 0.6693705386112914, + "grad_norm": 0.5625827390149928, + "learning_rate": 4.4317809272239145e-06, + "loss": 0.5689, + "step": 4126 + }, + { + "epoch": 0.6695327709279688, + "grad_norm": 0.6064794230558532, + "learning_rate": 4.431509837466757e-06, + "loss": 0.5445, + "step": 4127 + }, + { + "epoch": 0.6696950032446464, + "grad_norm": 0.6481029964693066, + "learning_rate": 4.431238691353784e-06, + "loss": 0.5684, + "step": 4128 + }, + { + "epoch": 0.6698572355613238, + "grad_norm": 0.538484621879351, + "learning_rate": 4.430967488892904e-06, + "loss": 0.5416, + "step": 4129 + }, + { + "epoch": 0.6700194678780013, + "grad_norm": 0.563455406485709, + "learning_rate": 4.430696230092031e-06, + "loss": 0.5797, + "step": 4130 + }, + { + "epoch": 0.6701817001946788, + "grad_norm": 0.5878772893461728, + "learning_rate": 4.43042491495908e-06, + "loss": 0.5422, + "step": 4131 + }, + { + "epoch": 0.6703439325113563, + "grad_norm": 0.5619219728277566, + "learning_rate": 4.430153543501966e-06, + "loss": 0.5341, + "step": 4132 + }, + { + "epoch": 0.6705061648280337, + "grad_norm": 0.5416975104276928, + "learning_rate": 4.429882115728608e-06, + "loss": 0.5374, + "step": 4133 + }, + { + "epoch": 0.6706683971447113, + "grad_norm": 0.5706872161578231, + "learning_rate": 4.429610631646925e-06, + "loss": 0.5805, + "step": 4134 + }, + { + "epoch": 0.6708306294613887, + "grad_norm": 0.5850925423036275, + "learning_rate": 4.429339091264837e-06, + "loss": 0.5701, + "step": 4135 + }, + { + "epoch": 0.6709928617780662, + "grad_norm": 0.6164560916234346, + "learning_rate": 4.429067494590268e-06, + "loss": 0.5638, + "step": 4136 + }, + { + "epoch": 0.6711550940947437, + "grad_norm": 0.555201683133707, + "learning_rate": 4.428795841631142e-06, + "loss": 0.5588, + "step": 4137 + }, + { + "epoch": 0.6713173264114212, + "grad_norm": 0.5909298240031688, + "learning_rate": 4.428524132395386e-06, + "loss": 0.5703, + "step": 4138 + }, + { + "epoch": 0.6714795587280986, + "grad_norm": 0.5575454029395585, + "learning_rate": 4.428252366890927e-06, + "loss": 0.5607, + "step": 4139 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.6131224874066712, + "learning_rate": 4.4279805451256944e-06, + "loss": 0.5716, + "step": 4140 + }, + { + "epoch": 0.6718040233614536, + "grad_norm": 0.5872655549274518, + "learning_rate": 4.427708667107617e-06, + "loss": 0.538, + "step": 4141 + }, + { + "epoch": 0.6719662556781311, + "grad_norm": 0.5988175116256141, + "learning_rate": 4.427436732844632e-06, + "loss": 0.5362, + "step": 4142 + }, + { + "epoch": 0.6721284879948086, + "grad_norm": 0.5612910565548525, + "learning_rate": 4.4271647423446695e-06, + "loss": 0.5783, + "step": 4143 + }, + { + "epoch": 0.6722907203114861, + "grad_norm": 0.5841456360632774, + "learning_rate": 4.426892695615668e-06, + "loss": 0.5738, + "step": 4144 + }, + { + "epoch": 0.6724529526281635, + "grad_norm": 0.5599361237398691, + "learning_rate": 4.426620592665562e-06, + "loss": 0.5454, + "step": 4145 + }, + { + "epoch": 0.672615184944841, + "grad_norm": 0.5589002650702346, + "learning_rate": 4.426348433502294e-06, + "loss": 0.5368, + "step": 4146 + }, + { + "epoch": 0.6727774172615185, + "grad_norm": 0.5765845053847966, + "learning_rate": 4.426076218133804e-06, + "loss": 0.5486, + "step": 4147 + }, + { + "epoch": 0.672939649578196, + "grad_norm": 0.5735928724989197, + "learning_rate": 4.425803946568033e-06, + "loss": 0.5631, + "step": 4148 + }, + { + "epoch": 0.6731018818948734, + "grad_norm": 0.583488883473101, + "learning_rate": 4.425531618812926e-06, + "loss": 0.56, + "step": 4149 + }, + { + "epoch": 0.673264114211551, + "grad_norm": 0.5730533867754445, + "learning_rate": 4.425259234876429e-06, + "loss": 0.5903, + "step": 4150 + }, + { + "epoch": 0.6734263465282284, + "grad_norm": 0.5735177759314899, + "learning_rate": 4.424986794766489e-06, + "loss": 0.5361, + "step": 4151 + }, + { + "epoch": 0.6735885788449059, + "grad_norm": 0.6017359381810288, + "learning_rate": 4.424714298491055e-06, + "loss": 0.5622, + "step": 4152 + }, + { + "epoch": 0.6737508111615834, + "grad_norm": 0.5866965077977516, + "learning_rate": 4.424441746058078e-06, + "loss": 0.5486, + "step": 4153 + }, + { + "epoch": 0.6739130434782609, + "grad_norm": 0.615693981259116, + "learning_rate": 4.424169137475509e-06, + "loss": 0.5884, + "step": 4154 + }, + { + "epoch": 0.6740752757949383, + "grad_norm": 0.5887644540282881, + "learning_rate": 4.423896472751303e-06, + "loss": 0.5298, + "step": 4155 + }, + { + "epoch": 0.6742375081116159, + "grad_norm": 0.5904055224287589, + "learning_rate": 4.423623751893417e-06, + "loss": 0.5975, + "step": 4156 + }, + { + "epoch": 0.6743997404282933, + "grad_norm": 0.6148894331600848, + "learning_rate": 4.423350974909806e-06, + "loss": 0.5951, + "step": 4157 + }, + { + "epoch": 0.6745619727449708, + "grad_norm": 0.5733473290348068, + "learning_rate": 4.423078141808429e-06, + "loss": 0.564, + "step": 4158 + }, + { + "epoch": 0.6747242050616483, + "grad_norm": 0.6119773578553355, + "learning_rate": 4.422805252597247e-06, + "loss": 0.524, + "step": 4159 + }, + { + "epoch": 0.6748864373783258, + "grad_norm": 0.559250941180738, + "learning_rate": 4.422532307284223e-06, + "loss": 0.5642, + "step": 4160 + }, + { + "epoch": 0.6750486696950032, + "grad_norm": 0.5764392101714075, + "learning_rate": 4.4222593058773195e-06, + "loss": 0.5625, + "step": 4161 + }, + { + "epoch": 0.6752109020116808, + "grad_norm": 0.5854832148608288, + "learning_rate": 4.421986248384502e-06, + "loss": 0.5429, + "step": 4162 + }, + { + "epoch": 0.6753731343283582, + "grad_norm": 0.5469539835937823, + "learning_rate": 4.421713134813738e-06, + "loss": 0.5882, + "step": 4163 + }, + { + "epoch": 0.6755353666450357, + "grad_norm": 0.6380390902588832, + "learning_rate": 4.421439965172995e-06, + "loss": 0.5446, + "step": 4164 + }, + { + "epoch": 0.6756975989617132, + "grad_norm": 0.5601221566914051, + "learning_rate": 4.4211667394702454e-06, + "loss": 0.5618, + "step": 4165 + }, + { + "epoch": 0.6758598312783907, + "grad_norm": 0.6381394881906596, + "learning_rate": 4.420893457713459e-06, + "loss": 0.5939, + "step": 4166 + }, + { + "epoch": 0.6760220635950681, + "grad_norm": 0.5891349807367816, + "learning_rate": 4.4206201199106116e-06, + "loss": 0.5314, + "step": 4167 + }, + { + "epoch": 0.6761842959117457, + "grad_norm": 0.5720415853670233, + "learning_rate": 4.420346726069676e-06, + "loss": 0.5474, + "step": 4168 + }, + { + "epoch": 0.6763465282284231, + "grad_norm": 0.5632759631971104, + "learning_rate": 4.420073276198631e-06, + "loss": 0.5243, + "step": 4169 + }, + { + "epoch": 0.6765087605451006, + "grad_norm": 0.56761335711302, + "learning_rate": 4.419799770305453e-06, + "loss": 0.5332, + "step": 4170 + }, + { + "epoch": 0.6766709928617781, + "grad_norm": 0.5725846744596695, + "learning_rate": 4.419526208398124e-06, + "loss": 0.5934, + "step": 4171 + }, + { + "epoch": 0.6768332251784556, + "grad_norm": 0.5864265234797601, + "learning_rate": 4.419252590484625e-06, + "loss": 0.5541, + "step": 4172 + }, + { + "epoch": 0.676995457495133, + "grad_norm": 0.5787732136300118, + "learning_rate": 4.418978916572939e-06, + "loss": 0.541, + "step": 4173 + }, + { + "epoch": 0.6771576898118105, + "grad_norm": 0.5949926954375334, + "learning_rate": 4.418705186671052e-06, + "loss": 0.5068, + "step": 4174 + }, + { + "epoch": 0.677319922128488, + "grad_norm": 0.5893438889964232, + "learning_rate": 4.41843140078695e-06, + "loss": 0.5149, + "step": 4175 + }, + { + "epoch": 0.6774821544451655, + "grad_norm": 0.6295992246875678, + "learning_rate": 4.418157558928622e-06, + "loss": 0.578, + "step": 4176 + }, + { + "epoch": 0.6776443867618429, + "grad_norm": 0.5812948815232947, + "learning_rate": 4.417883661104055e-06, + "loss": 0.5771, + "step": 4177 + }, + { + "epoch": 0.6778066190785205, + "grad_norm": 0.5842285976387831, + "learning_rate": 4.417609707321245e-06, + "loss": 0.531, + "step": 4178 + }, + { + "epoch": 0.6779688513951979, + "grad_norm": 0.5688291405938809, + "learning_rate": 4.417335697588182e-06, + "loss": 0.5557, + "step": 4179 + }, + { + "epoch": 0.6781310837118754, + "grad_norm": 0.5912546738970351, + "learning_rate": 4.4170616319128625e-06, + "loss": 0.5794, + "step": 4180 + }, + { + "epoch": 0.6782933160285529, + "grad_norm": 0.6028153867241722, + "learning_rate": 4.416787510303281e-06, + "loss": 0.5472, + "step": 4181 + }, + { + "epoch": 0.6784555483452304, + "grad_norm": 0.5540280007762974, + "learning_rate": 4.416513332767438e-06, + "loss": 0.5519, + "step": 4182 + }, + { + "epoch": 0.6786177806619078, + "grad_norm": 0.6208228511658922, + "learning_rate": 4.416239099313331e-06, + "loss": 0.5791, + "step": 4183 + }, + { + "epoch": 0.6787800129785854, + "grad_norm": 0.6071303466761021, + "learning_rate": 4.415964809948963e-06, + "loss": 0.5752, + "step": 4184 + }, + { + "epoch": 0.6789422452952628, + "grad_norm": 0.5837905509509936, + "learning_rate": 4.415690464682335e-06, + "loss": 0.541, + "step": 4185 + }, + { + "epoch": 0.6791044776119403, + "grad_norm": 0.5898898680523826, + "learning_rate": 4.4154160635214534e-06, + "loss": 0.5602, + "step": 4186 + }, + { + "epoch": 0.6792667099286178, + "grad_norm": 0.5842380607978558, + "learning_rate": 4.415141606474325e-06, + "loss": 0.5821, + "step": 4187 + }, + { + "epoch": 0.6794289422452953, + "grad_norm": 0.589424915315123, + "learning_rate": 4.414867093548956e-06, + "loss": 0.5963, + "step": 4188 + }, + { + "epoch": 0.6795911745619727, + "grad_norm": 0.5674388932830516, + "learning_rate": 4.414592524753356e-06, + "loss": 0.5523, + "step": 4189 + }, + { + "epoch": 0.6797534068786503, + "grad_norm": 0.5921449589168544, + "learning_rate": 4.414317900095536e-06, + "loss": 0.5621, + "step": 4190 + }, + { + "epoch": 0.6799156391953277, + "grad_norm": 0.5959915766484256, + "learning_rate": 4.414043219583511e-06, + "loss": 0.5518, + "step": 4191 + }, + { + "epoch": 0.6800778715120052, + "grad_norm": 0.6148936143271517, + "learning_rate": 4.413768483225292e-06, + "loss": 0.4944, + "step": 4192 + }, + { + "epoch": 0.6802401038286827, + "grad_norm": 0.5992693587634483, + "learning_rate": 4.413493691028898e-06, + "loss": 0.5619, + "step": 4193 + }, + { + "epoch": 0.6804023361453602, + "grad_norm": 0.5852369484718499, + "learning_rate": 4.413218843002344e-06, + "loss": 0.5743, + "step": 4194 + }, + { + "epoch": 0.6805645684620376, + "grad_norm": 0.5931556793759093, + "learning_rate": 4.412943939153651e-06, + "loss": 0.6058, + "step": 4195 + }, + { + "epoch": 0.6807268007787152, + "grad_norm": 0.574936409992935, + "learning_rate": 4.41266897949084e-06, + "loss": 0.5117, + "step": 4196 + }, + { + "epoch": 0.6808890330953926, + "grad_norm": 0.5883823257762147, + "learning_rate": 4.412393964021931e-06, + "loss": 0.5443, + "step": 4197 + }, + { + "epoch": 0.6810512654120701, + "grad_norm": 0.5854452278107777, + "learning_rate": 4.412118892754952e-06, + "loss": 0.5127, + "step": 4198 + }, + { + "epoch": 0.6812134977287476, + "grad_norm": 0.579776626976124, + "learning_rate": 4.4118437656979264e-06, + "loss": 0.5317, + "step": 4199 + }, + { + "epoch": 0.6813757300454251, + "grad_norm": 0.5774525253386532, + "learning_rate": 4.411568582858882e-06, + "loss": 0.5336, + "step": 4200 + }, + { + "epoch": 0.6815379623621025, + "grad_norm": 0.5920571580223881, + "learning_rate": 4.411293344245848e-06, + "loss": 0.5676, + "step": 4201 + }, + { + "epoch": 0.68170019467878, + "grad_norm": 0.5962066365503192, + "learning_rate": 4.4110180498668556e-06, + "loss": 0.5185, + "step": 4202 + }, + { + "epoch": 0.6818624269954575, + "grad_norm": 0.5904492719643113, + "learning_rate": 4.410742699729936e-06, + "loss": 0.5967, + "step": 4203 + }, + { + "epoch": 0.682024659312135, + "grad_norm": 0.5567931189272874, + "learning_rate": 4.410467293843123e-06, + "loss": 0.5312, + "step": 4204 + }, + { + "epoch": 0.6821868916288124, + "grad_norm": 0.5927746744976038, + "learning_rate": 4.410191832214453e-06, + "loss": 0.5449, + "step": 4205 + }, + { + "epoch": 0.68234912394549, + "grad_norm": 0.5922836715899307, + "learning_rate": 4.409916314851964e-06, + "loss": 0.547, + "step": 4206 + }, + { + "epoch": 0.6825113562621674, + "grad_norm": 0.5613030756781984, + "learning_rate": 4.409640741763692e-06, + "loss": 0.578, + "step": 4207 + }, + { + "epoch": 0.6826735885788449, + "grad_norm": 0.5983533784223326, + "learning_rate": 4.40936511295768e-06, + "loss": 0.5715, + "step": 4208 + }, + { + "epoch": 0.6828358208955224, + "grad_norm": 0.6082093944334617, + "learning_rate": 4.409089428441969e-06, + "loss": 0.5506, + "step": 4209 + }, + { + "epoch": 0.6829980532121999, + "grad_norm": 0.5496529365627891, + "learning_rate": 4.408813688224603e-06, + "loss": 0.5506, + "step": 4210 + }, + { + "epoch": 0.6831602855288773, + "grad_norm": 0.5888449954330333, + "learning_rate": 4.408537892313627e-06, + "loss": 0.5665, + "step": 4211 + }, + { + "epoch": 0.6833225178455549, + "grad_norm": 0.5732345468094318, + "learning_rate": 4.408262040717088e-06, + "loss": 0.5372, + "step": 4212 + }, + { + "epoch": 0.6834847501622323, + "grad_norm": 0.5590833231216126, + "learning_rate": 4.407986133443034e-06, + "loss": 0.5722, + "step": 4213 + }, + { + "epoch": 0.6836469824789098, + "grad_norm": 0.579991508884245, + "learning_rate": 4.407710170499517e-06, + "loss": 0.5307, + "step": 4214 + }, + { + "epoch": 0.6838092147955873, + "grad_norm": 0.5906936721995465, + "learning_rate": 4.4074341518945865e-06, + "loss": 0.5917, + "step": 4215 + }, + { + "epoch": 0.6839714471122648, + "grad_norm": 0.5857391909012069, + "learning_rate": 4.407158077636297e-06, + "loss": 0.5624, + "step": 4216 + }, + { + "epoch": 0.6841336794289422, + "grad_norm": 0.6181755463586409, + "learning_rate": 4.406881947732704e-06, + "loss": 0.5737, + "step": 4217 + }, + { + "epoch": 0.6842959117456198, + "grad_norm": 0.5762707766348202, + "learning_rate": 4.406605762191864e-06, + "loss": 0.5364, + "step": 4218 + }, + { + "epoch": 0.6844581440622972, + "grad_norm": 0.5881451439277868, + "learning_rate": 4.4063295210218334e-06, + "loss": 0.5756, + "step": 4219 + }, + { + "epoch": 0.6846203763789747, + "grad_norm": 0.5759044189711043, + "learning_rate": 4.406053224230675e-06, + "loss": 0.5383, + "step": 4220 + }, + { + "epoch": 0.6847826086956522, + "grad_norm": 0.5789519432414206, + "learning_rate": 4.405776871826448e-06, + "loss": 0.5729, + "step": 4221 + }, + { + "epoch": 0.6849448410123297, + "grad_norm": 0.5593880555289035, + "learning_rate": 4.4055004638172175e-06, + "loss": 0.5683, + "step": 4222 + }, + { + "epoch": 0.6851070733290071, + "grad_norm": 0.5988563984610978, + "learning_rate": 4.405224000211047e-06, + "loss": 0.5885, + "step": 4223 + }, + { + "epoch": 0.6852693056456847, + "grad_norm": 0.5777348657775638, + "learning_rate": 4.404947481016003e-06, + "loss": 0.5579, + "step": 4224 + }, + { + "epoch": 0.6854315379623621, + "grad_norm": 0.6004376293462498, + "learning_rate": 4.404670906240154e-06, + "loss": 0.5862, + "step": 4225 + }, + { + "epoch": 0.6855937702790396, + "grad_norm": 0.5490266022301655, + "learning_rate": 4.4043942758915685e-06, + "loss": 0.5451, + "step": 4226 + }, + { + "epoch": 0.6857560025957171, + "grad_norm": 0.5775474534212058, + "learning_rate": 4.40411758997832e-06, + "loss": 0.5438, + "step": 4227 + }, + { + "epoch": 0.6859182349123946, + "grad_norm": 0.5988819585165883, + "learning_rate": 4.403840848508479e-06, + "loss": 0.5855, + "step": 4228 + }, + { + "epoch": 0.686080467229072, + "grad_norm": 0.6080124429573162, + "learning_rate": 4.403564051490121e-06, + "loss": 0.593, + "step": 4229 + }, + { + "epoch": 0.6862426995457496, + "grad_norm": 0.5922723321159706, + "learning_rate": 4.403287198931323e-06, + "loss": 0.5463, + "step": 4230 + }, + { + "epoch": 0.686404931862427, + "grad_norm": 0.5604355564591751, + "learning_rate": 4.403010290840162e-06, + "loss": 0.5432, + "step": 4231 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 0.5682058351773287, + "learning_rate": 4.402733327224717e-06, + "loss": 0.5294, + "step": 4232 + }, + { + "epoch": 0.6867293964957819, + "grad_norm": 0.6004652098051813, + "learning_rate": 4.402456308093069e-06, + "loss": 0.529, + "step": 4233 + }, + { + "epoch": 0.6868916288124595, + "grad_norm": 0.5616970261251808, + "learning_rate": 4.402179233453301e-06, + "loss": 0.557, + "step": 4234 + }, + { + "epoch": 0.6870538611291369, + "grad_norm": 0.5930650546762607, + "learning_rate": 4.401902103313497e-06, + "loss": 0.5329, + "step": 4235 + }, + { + "epoch": 0.6872160934458144, + "grad_norm": 0.5816457301233473, + "learning_rate": 4.401624917681743e-06, + "loss": 0.5675, + "step": 4236 + }, + { + "epoch": 0.6873783257624919, + "grad_norm": 0.5674244199938292, + "learning_rate": 4.401347676566127e-06, + "loss": 0.5312, + "step": 4237 + }, + { + "epoch": 0.6875405580791694, + "grad_norm": 0.606808311673381, + "learning_rate": 4.401070379974737e-06, + "loss": 0.5622, + "step": 4238 + }, + { + "epoch": 0.6877027903958468, + "grad_norm": 0.6109611584062973, + "learning_rate": 4.400793027915664e-06, + "loss": 0.5579, + "step": 4239 + }, + { + "epoch": 0.6878650227125244, + "grad_norm": 0.6130676751086148, + "learning_rate": 4.400515620397001e-06, + "loss": 0.5642, + "step": 4240 + }, + { + "epoch": 0.6880272550292018, + "grad_norm": 0.5880062455204784, + "learning_rate": 4.400238157426841e-06, + "loss": 0.5446, + "step": 4241 + }, + { + "epoch": 0.6881894873458793, + "grad_norm": 0.5979534238572917, + "learning_rate": 4.39996063901328e-06, + "loss": 0.5663, + "step": 4242 + }, + { + "epoch": 0.6883517196625568, + "grad_norm": 0.603402406272536, + "learning_rate": 4.399683065164416e-06, + "loss": 0.6087, + "step": 4243 + }, + { + "epoch": 0.6885139519792343, + "grad_norm": 0.5580724897659216, + "learning_rate": 4.399405435888346e-06, + "loss": 0.547, + "step": 4244 + }, + { + "epoch": 0.6886761842959117, + "grad_norm": 0.5936969091129174, + "learning_rate": 4.399127751193173e-06, + "loss": 0.5491, + "step": 4245 + }, + { + "epoch": 0.6888384166125893, + "grad_norm": 0.5697002353300203, + "learning_rate": 4.398850011086997e-06, + "loss": 0.5399, + "step": 4246 + }, + { + "epoch": 0.6890006489292667, + "grad_norm": 0.591790610230512, + "learning_rate": 4.398572215577921e-06, + "loss": 0.5554, + "step": 4247 + }, + { + "epoch": 0.6891628812459442, + "grad_norm": 0.5708532593123891, + "learning_rate": 4.398294364674052e-06, + "loss": 0.5658, + "step": 4248 + }, + { + "epoch": 0.6893251135626217, + "grad_norm": 0.56150379223069, + "learning_rate": 4.398016458383496e-06, + "loss": 0.5903, + "step": 4249 + }, + { + "epoch": 0.6894873458792992, + "grad_norm": 0.5679996140604165, + "learning_rate": 4.397738496714362e-06, + "loss": 0.5678, + "step": 4250 + }, + { + "epoch": 0.6896495781959766, + "grad_norm": 0.6543507784018541, + "learning_rate": 4.39746047967476e-06, + "loss": 0.5487, + "step": 4251 + }, + { + "epoch": 0.6898118105126542, + "grad_norm": 0.5625671816566624, + "learning_rate": 4.397182407272802e-06, + "loss": 0.5587, + "step": 4252 + }, + { + "epoch": 0.6899740428293316, + "grad_norm": 0.5872110492311138, + "learning_rate": 4.3969042795166e-06, + "loss": 0.5543, + "step": 4253 + }, + { + "epoch": 0.6901362751460091, + "grad_norm": 0.5884334919353604, + "learning_rate": 4.3966260964142704e-06, + "loss": 0.5789, + "step": 4254 + }, + { + "epoch": 0.6902985074626866, + "grad_norm": 0.599964380573669, + "learning_rate": 4.3963478579739295e-06, + "loss": 0.5421, + "step": 4255 + }, + { + "epoch": 0.6904607397793641, + "grad_norm": 0.5857913644824545, + "learning_rate": 4.396069564203695e-06, + "loss": 0.514, + "step": 4256 + }, + { + "epoch": 0.6906229720960415, + "grad_norm": 0.5719569829979274, + "learning_rate": 4.395791215111686e-06, + "loss": 0.5657, + "step": 4257 + }, + { + "epoch": 0.6907852044127191, + "grad_norm": 0.5932856543133063, + "learning_rate": 4.395512810706026e-06, + "loss": 0.5486, + "step": 4258 + }, + { + "epoch": 0.6909474367293965, + "grad_norm": 0.580568927549959, + "learning_rate": 4.395234350994836e-06, + "loss": 0.5712, + "step": 4259 + }, + { + "epoch": 0.691109669046074, + "grad_norm": 0.5875946545527031, + "learning_rate": 4.394955835986242e-06, + "loss": 0.5851, + "step": 4260 + }, + { + "epoch": 0.6912719013627514, + "grad_norm": 0.667660973964701, + "learning_rate": 4.39467726568837e-06, + "loss": 0.5545, + "step": 4261 + }, + { + "epoch": 0.691434133679429, + "grad_norm": 0.6002333040856431, + "learning_rate": 4.394398640109348e-06, + "loss": 0.5324, + "step": 4262 + }, + { + "epoch": 0.6915963659961064, + "grad_norm": 0.6192362021707313, + "learning_rate": 4.394119959257303e-06, + "loss": 0.5411, + "step": 4263 + }, + { + "epoch": 0.6917585983127839, + "grad_norm": 0.6015076810221996, + "learning_rate": 4.393841223140371e-06, + "loss": 0.5559, + "step": 4264 + }, + { + "epoch": 0.6919208306294614, + "grad_norm": 0.6019532334008498, + "learning_rate": 4.39356243176668e-06, + "loss": 0.5477, + "step": 4265 + }, + { + "epoch": 0.6920830629461389, + "grad_norm": 0.6050476936329673, + "learning_rate": 4.393283585144367e-06, + "loss": 0.5396, + "step": 4266 + }, + { + "epoch": 0.6922452952628163, + "grad_norm": 0.5603769724243514, + "learning_rate": 4.393004683281567e-06, + "loss": 0.5588, + "step": 4267 + }, + { + "epoch": 0.6924075275794939, + "grad_norm": 0.6206878002217485, + "learning_rate": 4.392725726186417e-06, + "loss": 0.5679, + "step": 4268 + }, + { + "epoch": 0.6925697598961713, + "grad_norm": 0.5789300708900765, + "learning_rate": 4.3924467138670585e-06, + "loss": 0.5675, + "step": 4269 + }, + { + "epoch": 0.6927319922128488, + "grad_norm": 0.6047939850674834, + "learning_rate": 4.39216764633163e-06, + "loss": 0.5746, + "step": 4270 + }, + { + "epoch": 0.6928942245295263, + "grad_norm": 0.5724651180300034, + "learning_rate": 4.391888523588274e-06, + "loss": 0.5337, + "step": 4271 + }, + { + "epoch": 0.6930564568462038, + "grad_norm": 0.5936168396912497, + "learning_rate": 4.391609345645135e-06, + "loss": 0.5734, + "step": 4272 + }, + { + "epoch": 0.6932186891628812, + "grad_norm": 0.6262600223537826, + "learning_rate": 4.391330112510359e-06, + "loss": 0.514, + "step": 4273 + }, + { + "epoch": 0.6933809214795588, + "grad_norm": 0.5550917990265427, + "learning_rate": 4.391050824192092e-06, + "loss": 0.5773, + "step": 4274 + }, + { + "epoch": 0.6935431537962362, + "grad_norm": 0.6107637919202795, + "learning_rate": 4.390771480698485e-06, + "loss": 0.5874, + "step": 4275 + }, + { + "epoch": 0.6937053861129137, + "grad_norm": 0.6111429503192419, + "learning_rate": 4.390492082037686e-06, + "loss": 0.5552, + "step": 4276 + }, + { + "epoch": 0.6938676184295912, + "grad_norm": 0.6104593116716281, + "learning_rate": 4.390212628217848e-06, + "loss": 0.5419, + "step": 4277 + }, + { + "epoch": 0.6940298507462687, + "grad_norm": 0.5699126319906245, + "learning_rate": 4.389933119247125e-06, + "loss": 0.5709, + "step": 4278 + }, + { + "epoch": 0.6941920830629461, + "grad_norm": 0.5956481349501638, + "learning_rate": 4.389653555133672e-06, + "loss": 0.5565, + "step": 4279 + }, + { + "epoch": 0.6943543153796237, + "grad_norm": 0.5875921231946729, + "learning_rate": 4.3893739358856465e-06, + "loss": 0.5366, + "step": 4280 + }, + { + "epoch": 0.6945165476963011, + "grad_norm": 0.5740503906530426, + "learning_rate": 4.389094261511205e-06, + "loss": 0.5308, + "step": 4281 + }, + { + "epoch": 0.6946787800129786, + "grad_norm": 0.5941665141757921, + "learning_rate": 4.388814532018509e-06, + "loss": 0.571, + "step": 4282 + }, + { + "epoch": 0.6948410123296561, + "grad_norm": 0.6170753497642412, + "learning_rate": 4.388534747415722e-06, + "loss": 0.5263, + "step": 4283 + }, + { + "epoch": 0.6950032446463336, + "grad_norm": 0.5999601007119998, + "learning_rate": 4.388254907711004e-06, + "loss": 0.5381, + "step": 4284 + }, + { + "epoch": 0.695165476963011, + "grad_norm": 0.6265480699153437, + "learning_rate": 4.387975012912521e-06, + "loss": 0.571, + "step": 4285 + }, + { + "epoch": 0.6953277092796886, + "grad_norm": 0.573747005905019, + "learning_rate": 4.38769506302844e-06, + "loss": 0.5771, + "step": 4286 + }, + { + "epoch": 0.695489941596366, + "grad_norm": 0.6142246533204678, + "learning_rate": 4.387415058066929e-06, + "loss": 0.5834, + "step": 4287 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.5972380601714105, + "learning_rate": 4.387134998036158e-06, + "loss": 0.5443, + "step": 4288 + }, + { + "epoch": 0.6958144062297209, + "grad_norm": 0.5790948885079978, + "learning_rate": 4.386854882944298e-06, + "loss": 0.5457, + "step": 4289 + }, + { + "epoch": 0.6959766385463985, + "grad_norm": 0.5783668246969499, + "learning_rate": 4.3865747127995206e-06, + "loss": 0.5701, + "step": 4290 + }, + { + "epoch": 0.6961388708630759, + "grad_norm": 0.5693918658655602, + "learning_rate": 4.386294487610003e-06, + "loss": 0.5689, + "step": 4291 + }, + { + "epoch": 0.6963011031797534, + "grad_norm": 0.5788527217906461, + "learning_rate": 4.38601420738392e-06, + "loss": 0.5595, + "step": 4292 + }, + { + "epoch": 0.6964633354964309, + "grad_norm": 0.6175647184699798, + "learning_rate": 4.385733872129448e-06, + "loss": 0.5326, + "step": 4293 + }, + { + "epoch": 0.6966255678131084, + "grad_norm": 0.5736339280133306, + "learning_rate": 4.38545348185477e-06, + "loss": 0.5753, + "step": 4294 + }, + { + "epoch": 0.6967878001297858, + "grad_norm": 0.6025738226945367, + "learning_rate": 4.3851730365680635e-06, + "loss": 0.5624, + "step": 4295 + }, + { + "epoch": 0.6969500324464634, + "grad_norm": 0.6028985857366956, + "learning_rate": 4.384892536277512e-06, + "loss": 0.5532, + "step": 4296 + }, + { + "epoch": 0.6971122647631408, + "grad_norm": 0.5575464041960545, + "learning_rate": 4.3846119809913e-06, + "loss": 0.5609, + "step": 4297 + }, + { + "epoch": 0.6972744970798183, + "grad_norm": 0.5852420595351892, + "learning_rate": 4.384331370717614e-06, + "loss": 0.5472, + "step": 4298 + }, + { + "epoch": 0.6974367293964958, + "grad_norm": 0.597758773718419, + "learning_rate": 4.384050705464639e-06, + "loss": 0.5369, + "step": 4299 + }, + { + "epoch": 0.6975989617131733, + "grad_norm": 0.6407742715686237, + "learning_rate": 4.383769985240568e-06, + "loss": 0.5276, + "step": 4300 + }, + { + "epoch": 0.6977611940298507, + "grad_norm": 0.5805235639011256, + "learning_rate": 4.383489210053588e-06, + "loss": 0.5747, + "step": 4301 + }, + { + "epoch": 0.6979234263465283, + "grad_norm": 0.6078529451027043, + "learning_rate": 4.383208379911893e-06, + "loss": 0.5727, + "step": 4302 + }, + { + "epoch": 0.6980856586632057, + "grad_norm": 0.609048904775105, + "learning_rate": 4.382927494823675e-06, + "loss": 0.5833, + "step": 4303 + }, + { + "epoch": 0.6982478909798832, + "grad_norm": 0.5954871379926221, + "learning_rate": 4.382646554797132e-06, + "loss": 0.5474, + "step": 4304 + }, + { + "epoch": 0.6984101232965607, + "grad_norm": 0.569336205391938, + "learning_rate": 4.382365559840458e-06, + "loss": 0.5572, + "step": 4305 + }, + { + "epoch": 0.6985723556132382, + "grad_norm": 0.6227738037387488, + "learning_rate": 4.382084509961855e-06, + "loss": 0.5741, + "step": 4306 + }, + { + "epoch": 0.6987345879299156, + "grad_norm": 0.631787095956244, + "learning_rate": 4.381803405169521e-06, + "loss": 0.5596, + "step": 4307 + }, + { + "epoch": 0.6988968202465932, + "grad_norm": 0.6042453750572652, + "learning_rate": 4.381522245471658e-06, + "loss": 0.5531, + "step": 4308 + }, + { + "epoch": 0.6990590525632706, + "grad_norm": 0.6012228206857162, + "learning_rate": 4.3812410308764695e-06, + "loss": 0.5363, + "step": 4309 + }, + { + "epoch": 0.6992212848799481, + "grad_norm": 0.5811450980434292, + "learning_rate": 4.38095976139216e-06, + "loss": 0.5682, + "step": 4310 + }, + { + "epoch": 0.6993835171966256, + "grad_norm": 0.6068826638685236, + "learning_rate": 4.380678437026938e-06, + "loss": 0.5148, + "step": 4311 + }, + { + "epoch": 0.6995457495133031, + "grad_norm": 0.5957621874559704, + "learning_rate": 4.380397057789011e-06, + "loss": 0.5032, + "step": 4312 + }, + { + "epoch": 0.6997079818299805, + "grad_norm": 0.5654735724833406, + "learning_rate": 4.380115623686588e-06, + "loss": 0.5752, + "step": 4313 + }, + { + "epoch": 0.6998702141466581, + "grad_norm": 0.5938662322798136, + "learning_rate": 4.37983413472788e-06, + "loss": 0.5642, + "step": 4314 + }, + { + "epoch": 0.7000324464633355, + "grad_norm": 0.6040229104104526, + "learning_rate": 4.379552590921102e-06, + "loss": 0.5735, + "step": 4315 + }, + { + "epoch": 0.700194678780013, + "grad_norm": 0.6052599808399488, + "learning_rate": 4.379270992274467e-06, + "loss": 0.5472, + "step": 4316 + }, + { + "epoch": 0.7003569110966905, + "grad_norm": 0.582475117790467, + "learning_rate": 4.378989338796192e-06, + "loss": 0.5515, + "step": 4317 + }, + { + "epoch": 0.700519143413368, + "grad_norm": 0.5826664784806789, + "learning_rate": 4.3787076304944945e-06, + "loss": 0.5678, + "step": 4318 + }, + { + "epoch": 0.7006813757300454, + "grad_norm": 0.5572705637685872, + "learning_rate": 4.378425867377594e-06, + "loss": 0.533, + "step": 4319 + }, + { + "epoch": 0.7008436080467229, + "grad_norm": 0.5920036314287314, + "learning_rate": 4.378144049453711e-06, + "loss": 0.5547, + "step": 4320 + }, + { + "epoch": 0.7010058403634004, + "grad_norm": 0.5855328104615675, + "learning_rate": 4.377862176731068e-06, + "loss": 0.5669, + "step": 4321 + }, + { + "epoch": 0.7011680726800779, + "grad_norm": 0.5771493014154353, + "learning_rate": 4.377580249217891e-06, + "loss": 0.5669, + "step": 4322 + }, + { + "epoch": 0.7013303049967553, + "grad_norm": 0.6298605124661448, + "learning_rate": 4.377298266922404e-06, + "loss": 0.5451, + "step": 4323 + }, + { + "epoch": 0.7014925373134329, + "grad_norm": 0.572471516635981, + "learning_rate": 4.377016229852836e-06, + "loss": 0.5692, + "step": 4324 + }, + { + "epoch": 0.7016547696301103, + "grad_norm": 0.5834588985770892, + "learning_rate": 4.376734138017414e-06, + "loss": 0.5556, + "step": 4325 + }, + { + "epoch": 0.7018170019467878, + "grad_norm": 0.6112325561678056, + "learning_rate": 4.376451991424371e-06, + "loss": 0.5429, + "step": 4326 + }, + { + "epoch": 0.7019792342634653, + "grad_norm": 0.5521654865137132, + "learning_rate": 4.3761697900819365e-06, + "loss": 0.5523, + "step": 4327 + }, + { + "epoch": 0.7021414665801428, + "grad_norm": 0.5960065619614485, + "learning_rate": 4.375887533998346e-06, + "loss": 0.5725, + "step": 4328 + }, + { + "epoch": 0.7023036988968202, + "grad_norm": 0.5688929874269585, + "learning_rate": 4.375605223181836e-06, + "loss": 0.5755, + "step": 4329 + }, + { + "epoch": 0.7024659312134978, + "grad_norm": 0.5837581025199245, + "learning_rate": 4.37532285764064e-06, + "loss": 0.5155, + "step": 4330 + }, + { + "epoch": 0.7026281635301752, + "grad_norm": 0.5868389958313083, + "learning_rate": 4.375040437383e-06, + "loss": 0.5495, + "step": 4331 + }, + { + "epoch": 0.7027903958468527, + "grad_norm": 0.6244312844171084, + "learning_rate": 4.374757962417155e-06, + "loss": 0.5691, + "step": 4332 + }, + { + "epoch": 0.7029526281635302, + "grad_norm": 0.605871983996882, + "learning_rate": 4.374475432751347e-06, + "loss": 0.5533, + "step": 4333 + }, + { + "epoch": 0.7031148604802077, + "grad_norm": 0.5608472472816014, + "learning_rate": 4.374192848393819e-06, + "loss": 0.5623, + "step": 4334 + }, + { + "epoch": 0.7032770927968851, + "grad_norm": 0.6025526597742671, + "learning_rate": 4.373910209352816e-06, + "loss": 0.5298, + "step": 4335 + }, + { + "epoch": 0.7034393251135627, + "grad_norm": 0.564460624449056, + "learning_rate": 4.373627515636584e-06, + "loss": 0.5637, + "step": 4336 + }, + { + "epoch": 0.7036015574302401, + "grad_norm": 0.5888218720574351, + "learning_rate": 4.3733447672533725e-06, + "loss": 0.5691, + "step": 4337 + }, + { + "epoch": 0.7037637897469176, + "grad_norm": 0.5849879588347234, + "learning_rate": 4.373061964211431e-06, + "loss": 0.5761, + "step": 4338 + }, + { + "epoch": 0.7039260220635951, + "grad_norm": 0.5870453321739296, + "learning_rate": 4.372779106519009e-06, + "loss": 0.5891, + "step": 4339 + }, + { + "epoch": 0.7040882543802726, + "grad_norm": 0.5682152456641729, + "learning_rate": 4.372496194184362e-06, + "loss": 0.6001, + "step": 4340 + }, + { + "epoch": 0.70425048669695, + "grad_norm": 0.6117450970791544, + "learning_rate": 4.3722132272157444e-06, + "loss": 0.5673, + "step": 4341 + }, + { + "epoch": 0.7044127190136276, + "grad_norm": 0.556843941352099, + "learning_rate": 4.371930205621411e-06, + "loss": 0.5188, + "step": 4342 + }, + { + "epoch": 0.704574951330305, + "grad_norm": 0.5874601949491948, + "learning_rate": 4.37164712940962e-06, + "loss": 0.548, + "step": 4343 + }, + { + "epoch": 0.7047371836469825, + "grad_norm": 0.5897700879944743, + "learning_rate": 4.3713639985886306e-06, + "loss": 0.5775, + "step": 4344 + }, + { + "epoch": 0.70489941596366, + "grad_norm": 0.577018492649131, + "learning_rate": 4.371080813166703e-06, + "loss": 0.5429, + "step": 4345 + }, + { + "epoch": 0.7050616482803375, + "grad_norm": 0.5905715303305774, + "learning_rate": 4.370797573152101e-06, + "loss": 0.5973, + "step": 4346 + }, + { + "epoch": 0.7052238805970149, + "grad_norm": 0.5617970787625416, + "learning_rate": 4.370514278553089e-06, + "loss": 0.5371, + "step": 4347 + }, + { + "epoch": 0.7053861129136924, + "grad_norm": 0.646764515994108, + "learning_rate": 4.3702309293779325e-06, + "loss": 0.534, + "step": 4348 + }, + { + "epoch": 0.7055483452303699, + "grad_norm": 0.5821280349355387, + "learning_rate": 4.369947525634897e-06, + "loss": 0.5591, + "step": 4349 + }, + { + "epoch": 0.7057105775470474, + "grad_norm": 0.6010877177321103, + "learning_rate": 4.369664067332253e-06, + "loss": 0.5052, + "step": 4350 + }, + { + "epoch": 0.7058728098637248, + "grad_norm": 0.6377781263506982, + "learning_rate": 4.369380554478272e-06, + "loss": 0.5565, + "step": 4351 + }, + { + "epoch": 0.7060350421804024, + "grad_norm": 0.5997151666915777, + "learning_rate": 4.369096987081223e-06, + "loss": 0.522, + "step": 4352 + }, + { + "epoch": 0.7061972744970798, + "grad_norm": 0.5801919158689367, + "learning_rate": 4.368813365149382e-06, + "loss": 0.555, + "step": 4353 + }, + { + "epoch": 0.7063595068137573, + "grad_norm": 0.602278325805147, + "learning_rate": 4.368529688691025e-06, + "loss": 0.5379, + "step": 4354 + }, + { + "epoch": 0.7065217391304348, + "grad_norm": 0.608708112671496, + "learning_rate": 4.368245957714426e-06, + "loss": 0.5403, + "step": 4355 + }, + { + "epoch": 0.7066839714471123, + "grad_norm": 0.6192732992486787, + "learning_rate": 4.367962172227866e-06, + "loss": 0.5567, + "step": 4356 + }, + { + "epoch": 0.7068462037637897, + "grad_norm": 0.5853888195651715, + "learning_rate": 4.367678332239624e-06, + "loss": 0.5905, + "step": 4357 + }, + { + "epoch": 0.7070084360804673, + "grad_norm": 0.5808265632044521, + "learning_rate": 4.367394437757981e-06, + "loss": 0.5783, + "step": 4358 + }, + { + "epoch": 0.7071706683971447, + "grad_norm": 0.5774991462478084, + "learning_rate": 4.367110488791222e-06, + "loss": 0.5548, + "step": 4359 + }, + { + "epoch": 0.7073329007138222, + "grad_norm": 0.6112325301984146, + "learning_rate": 4.36682648534763e-06, + "loss": 0.535, + "step": 4360 + }, + { + "epoch": 0.7074951330304997, + "grad_norm": 0.5623916325255053, + "learning_rate": 4.366542427435492e-06, + "loss": 0.5586, + "step": 4361 + }, + { + "epoch": 0.7076573653471772, + "grad_norm": 0.5780705064841025, + "learning_rate": 4.366258315063097e-06, + "loss": 0.5978, + "step": 4362 + }, + { + "epoch": 0.7078195976638546, + "grad_norm": 0.5662570428127655, + "learning_rate": 4.365974148238732e-06, + "loss": 0.5661, + "step": 4363 + }, + { + "epoch": 0.7079818299805322, + "grad_norm": 0.6041593722647687, + "learning_rate": 4.365689926970691e-06, + "loss": 0.5448, + "step": 4364 + }, + { + "epoch": 0.7081440622972096, + "grad_norm": 0.5340564735247245, + "learning_rate": 4.365405651267265e-06, + "loss": 0.5454, + "step": 4365 + }, + { + "epoch": 0.708306294613887, + "grad_norm": 0.5770982115343561, + "learning_rate": 4.36512132113675e-06, + "loss": 0.5396, + "step": 4366 + }, + { + "epoch": 0.7084685269305646, + "grad_norm": 0.5774368856795739, + "learning_rate": 4.364836936587439e-06, + "loss": 0.5838, + "step": 4367 + }, + { + "epoch": 0.7086307592472421, + "grad_norm": 0.5649035225559189, + "learning_rate": 4.364552497627632e-06, + "loss": 0.5292, + "step": 4368 + }, + { + "epoch": 0.7087929915639195, + "grad_norm": 0.6259528644735225, + "learning_rate": 4.364268004265628e-06, + "loss": 0.5951, + "step": 4369 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.5825599215573792, + "learning_rate": 4.363983456509726e-06, + "loss": 0.5701, + "step": 4370 + }, + { + "epoch": 0.7091174561972745, + "grad_norm": 0.5698235169841752, + "learning_rate": 4.36369885436823e-06, + "loss": 0.5645, + "step": 4371 + }, + { + "epoch": 0.709279688513952, + "grad_norm": 0.6302994854278128, + "learning_rate": 4.363414197849444e-06, + "loss": 0.5487, + "step": 4372 + }, + { + "epoch": 0.7094419208306295, + "grad_norm": 0.5552365704034761, + "learning_rate": 4.3631294869616715e-06, + "loss": 0.5469, + "step": 4373 + }, + { + "epoch": 0.709604153147307, + "grad_norm": 0.6025222614796174, + "learning_rate": 4.362844721713221e-06, + "loss": 0.5354, + "step": 4374 + }, + { + "epoch": 0.7097663854639844, + "grad_norm": 0.5703513127446335, + "learning_rate": 4.362559902112401e-06, + "loss": 0.5701, + "step": 4375 + }, + { + "epoch": 0.7099286177806619, + "grad_norm": 0.6079403372523585, + "learning_rate": 4.362275028167521e-06, + "loss": 0.5437, + "step": 4376 + }, + { + "epoch": 0.7100908500973394, + "grad_norm": 0.5954080857472579, + "learning_rate": 4.361990099886895e-06, + "loss": 0.5593, + "step": 4377 + }, + { + "epoch": 0.7102530824140169, + "grad_norm": 0.6664937691565314, + "learning_rate": 4.361705117278833e-06, + "loss": 0.5656, + "step": 4378 + }, + { + "epoch": 0.7104153147306943, + "grad_norm": 0.5829952190283142, + "learning_rate": 4.361420080351652e-06, + "loss": 0.5685, + "step": 4379 + }, + { + "epoch": 0.7105775470473719, + "grad_norm": 0.6014005770289631, + "learning_rate": 4.361134989113668e-06, + "loss": 0.5609, + "step": 4380 + }, + { + "epoch": 0.7107397793640493, + "grad_norm": 0.7026539434714469, + "learning_rate": 4.3608498435732e-06, + "loss": 0.5634, + "step": 4381 + }, + { + "epoch": 0.7109020116807268, + "grad_norm": 0.5629885960515889, + "learning_rate": 4.360564643738566e-06, + "loss": 0.5292, + "step": 4382 + }, + { + "epoch": 0.7110642439974043, + "grad_norm": 0.6160031169491359, + "learning_rate": 4.360279389618089e-06, + "loss": 0.5941, + "step": 4383 + }, + { + "epoch": 0.7112264763140818, + "grad_norm": 0.6291679273617474, + "learning_rate": 4.359994081220091e-06, + "loss": 0.5764, + "step": 4384 + }, + { + "epoch": 0.7113887086307592, + "grad_norm": 0.6232521197362308, + "learning_rate": 4.359708718552898e-06, + "loss": 0.5672, + "step": 4385 + }, + { + "epoch": 0.7115509409474368, + "grad_norm": 0.5595097856883505, + "learning_rate": 4.359423301624833e-06, + "loss": 0.5888, + "step": 4386 + }, + { + "epoch": 0.7117131732641142, + "grad_norm": 0.5872524575578099, + "learning_rate": 4.359137830444227e-06, + "loss": 0.5501, + "step": 4387 + }, + { + "epoch": 0.7118754055807917, + "grad_norm": 0.592954271308388, + "learning_rate": 4.3588523050194055e-06, + "loss": 0.5641, + "step": 4388 + }, + { + "epoch": 0.7120376378974692, + "grad_norm": 0.6083375563708918, + "learning_rate": 4.358566725358703e-06, + "loss": 0.5533, + "step": 4389 + }, + { + "epoch": 0.7121998702141467, + "grad_norm": 0.5747014664334015, + "learning_rate": 4.35828109147045e-06, + "loss": 0.5507, + "step": 4390 + }, + { + "epoch": 0.7123621025308241, + "grad_norm": 0.5987900921442174, + "learning_rate": 4.35799540336298e-06, + "loss": 0.5727, + "step": 4391 + }, + { + "epoch": 0.7125243348475017, + "grad_norm": 0.5898491240949701, + "learning_rate": 4.35770966104463e-06, + "loss": 0.5913, + "step": 4392 + }, + { + "epoch": 0.7126865671641791, + "grad_norm": 0.5631564350586394, + "learning_rate": 4.357423864523737e-06, + "loss": 0.5595, + "step": 4393 + }, + { + "epoch": 0.7128487994808566, + "grad_norm": 0.5716957495105632, + "learning_rate": 4.357138013808637e-06, + "loss": 0.5514, + "step": 4394 + }, + { + "epoch": 0.7130110317975341, + "grad_norm": 0.5930934595653357, + "learning_rate": 4.356852108907675e-06, + "loss": 0.5085, + "step": 4395 + }, + { + "epoch": 0.7131732641142116, + "grad_norm": 0.6081382560169311, + "learning_rate": 4.356566149829188e-06, + "loss": 0.5871, + "step": 4396 + }, + { + "epoch": 0.713335496430889, + "grad_norm": 0.6054371718083057, + "learning_rate": 4.356280136581523e-06, + "loss": 0.5297, + "step": 4397 + }, + { + "epoch": 0.7134977287475666, + "grad_norm": 0.5966651354271277, + "learning_rate": 4.355994069173023e-06, + "loss": 0.5504, + "step": 4398 + }, + { + "epoch": 0.713659961064244, + "grad_norm": 0.5914882703198426, + "learning_rate": 4.355707947612036e-06, + "loss": 0.558, + "step": 4399 + }, + { + "epoch": 0.7138221933809215, + "grad_norm": 0.5734300387876483, + "learning_rate": 4.355421771906909e-06, + "loss": 0.5575, + "step": 4400 + }, + { + "epoch": 0.713984425697599, + "grad_norm": 0.5983070338572583, + "learning_rate": 4.355135542065993e-06, + "loss": 0.5462, + "step": 4401 + }, + { + "epoch": 0.7141466580142765, + "grad_norm": 0.6134007034317771, + "learning_rate": 4.354849258097638e-06, + "loss": 0.5683, + "step": 4402 + }, + { + "epoch": 0.7143088903309539, + "grad_norm": 0.5673327271512552, + "learning_rate": 4.354562920010198e-06, + "loss": 0.5153, + "step": 4403 + }, + { + "epoch": 0.7144711226476315, + "grad_norm": 0.5971560338413452, + "learning_rate": 4.354276527812027e-06, + "loss": 0.5734, + "step": 4404 + }, + { + "epoch": 0.7146333549643089, + "grad_norm": 0.5992403448956491, + "learning_rate": 4.353990081511482e-06, + "loss": 0.5901, + "step": 4405 + }, + { + "epoch": 0.7147955872809864, + "grad_norm": 0.5474965917410138, + "learning_rate": 4.353703581116918e-06, + "loss": 0.5573, + "step": 4406 + }, + { + "epoch": 0.7149578195976638, + "grad_norm": 0.5904090029193214, + "learning_rate": 4.353417026636698e-06, + "loss": 0.5884, + "step": 4407 + }, + { + "epoch": 0.7151200519143414, + "grad_norm": 0.5504511998904899, + "learning_rate": 4.35313041807918e-06, + "loss": 0.6033, + "step": 4408 + }, + { + "epoch": 0.7152822842310188, + "grad_norm": 0.5677291905959384, + "learning_rate": 4.352843755452727e-06, + "loss": 0.5945, + "step": 4409 + }, + { + "epoch": 0.7154445165476963, + "grad_norm": 0.6031519140062368, + "learning_rate": 4.352557038765704e-06, + "loss": 0.5534, + "step": 4410 + }, + { + "epoch": 0.7156067488643738, + "grad_norm": 0.6175729967705923, + "learning_rate": 4.352270268026476e-06, + "loss": 0.5092, + "step": 4411 + }, + { + "epoch": 0.7157689811810513, + "grad_norm": 0.604275707138056, + "learning_rate": 4.3519834432434095e-06, + "loss": 0.5727, + "step": 4412 + }, + { + "epoch": 0.7159312134977287, + "grad_norm": 0.5588324917892928, + "learning_rate": 4.3516965644248734e-06, + "loss": 0.6018, + "step": 4413 + }, + { + "epoch": 0.7160934458144063, + "grad_norm": 0.5564109716863813, + "learning_rate": 4.3514096315792395e-06, + "loss": 0.5717, + "step": 4414 + }, + { + "epoch": 0.7162556781310837, + "grad_norm": 0.585424791023328, + "learning_rate": 4.351122644714877e-06, + "loss": 0.5506, + "step": 4415 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 0.5775558745079629, + "learning_rate": 4.350835603840162e-06, + "loss": 0.5479, + "step": 4416 + }, + { + "epoch": 0.7165801427644387, + "grad_norm": 0.5645281536238762, + "learning_rate": 4.350548508963468e-06, + "loss": 0.5412, + "step": 4417 + }, + { + "epoch": 0.7167423750811162, + "grad_norm": 0.6068298185300206, + "learning_rate": 4.350261360093172e-06, + "loss": 0.5584, + "step": 4418 + }, + { + "epoch": 0.7169046073977936, + "grad_norm": 0.5740749680771433, + "learning_rate": 4.349974157237651e-06, + "loss": 0.5531, + "step": 4419 + }, + { + "epoch": 0.7170668397144712, + "grad_norm": 0.589348622062804, + "learning_rate": 4.349686900405287e-06, + "loss": 0.5864, + "step": 4420 + }, + { + "epoch": 0.7172290720311486, + "grad_norm": 0.5828822815465374, + "learning_rate": 4.34939958960446e-06, + "loss": 0.534, + "step": 4421 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 0.5663195006973929, + "learning_rate": 4.349112224843552e-06, + "loss": 0.5601, + "step": 4422 + }, + { + "epoch": 0.7175535366645036, + "grad_norm": 0.5453057428302627, + "learning_rate": 4.34882480613095e-06, + "loss": 0.5372, + "step": 4423 + }, + { + "epoch": 0.7177157689811811, + "grad_norm": 0.5870185119056329, + "learning_rate": 4.348537333475038e-06, + "loss": 0.5619, + "step": 4424 + }, + { + "epoch": 0.7178780012978585, + "grad_norm": 0.5652428136964698, + "learning_rate": 4.348249806884204e-06, + "loss": 0.5545, + "step": 4425 + }, + { + "epoch": 0.7180402336145361, + "grad_norm": 0.5919987821763756, + "learning_rate": 4.347962226366837e-06, + "loss": 0.5573, + "step": 4426 + }, + { + "epoch": 0.7182024659312135, + "grad_norm": 0.5759306045178687, + "learning_rate": 4.3476745919313286e-06, + "loss": 0.5192, + "step": 4427 + }, + { + "epoch": 0.718364698247891, + "grad_norm": 0.5667686971666609, + "learning_rate": 4.34738690358607e-06, + "loss": 0.5644, + "step": 4428 + }, + { + "epoch": 0.7185269305645685, + "grad_norm": 0.5568687302073526, + "learning_rate": 4.347099161339456e-06, + "loss": 0.55, + "step": 4429 + }, + { + "epoch": 0.718689162881246, + "grad_norm": 0.5486154791025744, + "learning_rate": 4.346811365199882e-06, + "loss": 0.5517, + "step": 4430 + }, + { + "epoch": 0.7188513951979234, + "grad_norm": 0.5798242500548814, + "learning_rate": 4.346523515175745e-06, + "loss": 0.5659, + "step": 4431 + }, + { + "epoch": 0.719013627514601, + "grad_norm": 0.5778508728534952, + "learning_rate": 4.346235611275443e-06, + "loss": 0.5617, + "step": 4432 + }, + { + "epoch": 0.7191758598312784, + "grad_norm": 0.5965425082148129, + "learning_rate": 4.345947653507377e-06, + "loss": 0.572, + "step": 4433 + }, + { + "epoch": 0.7193380921479559, + "grad_norm": 0.5752877406906245, + "learning_rate": 4.345659641879948e-06, + "loss": 0.5582, + "step": 4434 + }, + { + "epoch": 0.7195003244646333, + "grad_norm": 0.5739961439242293, + "learning_rate": 4.34537157640156e-06, + "loss": 0.5854, + "step": 4435 + }, + { + "epoch": 0.7196625567813109, + "grad_norm": 0.5887977256006667, + "learning_rate": 4.345083457080618e-06, + "loss": 0.5604, + "step": 4436 + }, + { + "epoch": 0.7198247890979883, + "grad_norm": 0.5559246411948484, + "learning_rate": 4.344795283925528e-06, + "loss": 0.5532, + "step": 4437 + }, + { + "epoch": 0.7199870214146658, + "grad_norm": 0.5554232584374833, + "learning_rate": 4.344507056944698e-06, + "loss": 0.5867, + "step": 4438 + }, + { + "epoch": 0.7201492537313433, + "grad_norm": 0.5974744073967745, + "learning_rate": 4.344218776146539e-06, + "loss": 0.5758, + "step": 4439 + }, + { + "epoch": 0.7203114860480208, + "grad_norm": 0.5766374100974264, + "learning_rate": 4.343930441539459e-06, + "loss": 0.5514, + "step": 4440 + }, + { + "epoch": 0.7204737183646982, + "grad_norm": 0.5869894819024533, + "learning_rate": 4.343642053131873e-06, + "loss": 0.5323, + "step": 4441 + }, + { + "epoch": 0.7206359506813758, + "grad_norm": 0.5949402366754531, + "learning_rate": 4.343353610932197e-06, + "loss": 0.5535, + "step": 4442 + }, + { + "epoch": 0.7207981829980532, + "grad_norm": 0.6198480157873948, + "learning_rate": 4.343065114948843e-06, + "loss": 0.592, + "step": 4443 + }, + { + "epoch": 0.7209604153147307, + "grad_norm": 0.5687872407346164, + "learning_rate": 4.342776565190232e-06, + "loss": 0.5568, + "step": 4444 + }, + { + "epoch": 0.7211226476314082, + "grad_norm": 0.5963596781115263, + "learning_rate": 4.3424879616647805e-06, + "loss": 0.5618, + "step": 4445 + }, + { + "epoch": 0.7212848799480857, + "grad_norm": 0.5980914038250872, + "learning_rate": 4.3421993043809104e-06, + "loss": 0.5528, + "step": 4446 + }, + { + "epoch": 0.7214471122647631, + "grad_norm": 0.6003473344305037, + "learning_rate": 4.341910593347044e-06, + "loss": 0.5644, + "step": 4447 + }, + { + "epoch": 0.7216093445814407, + "grad_norm": 0.6197939508484234, + "learning_rate": 4.341621828571605e-06, + "loss": 0.5549, + "step": 4448 + }, + { + "epoch": 0.7217715768981181, + "grad_norm": 0.5938544059794834, + "learning_rate": 4.341333010063018e-06, + "loss": 0.5276, + "step": 4449 + }, + { + "epoch": 0.7219338092147956, + "grad_norm": 0.5687955670659426, + "learning_rate": 4.341044137829709e-06, + "loss": 0.5539, + "step": 4450 + }, + { + "epoch": 0.7220960415314731, + "grad_norm": 0.612781085987672, + "learning_rate": 4.34075521188011e-06, + "loss": 0.5547, + "step": 4451 + }, + { + "epoch": 0.7222582738481506, + "grad_norm": 0.5897792779802882, + "learning_rate": 4.340466232222647e-06, + "loss": 0.5427, + "step": 4452 + }, + { + "epoch": 0.722420506164828, + "grad_norm": 0.5992497825933988, + "learning_rate": 4.340177198865754e-06, + "loss": 0.593, + "step": 4453 + }, + { + "epoch": 0.7225827384815056, + "grad_norm": 0.5728076938413068, + "learning_rate": 4.339888111817863e-06, + "loss": 0.5327, + "step": 4454 + }, + { + "epoch": 0.722744970798183, + "grad_norm": 0.5803784694922165, + "learning_rate": 4.33959897108741e-06, + "loss": 0.5733, + "step": 4455 + }, + { + "epoch": 0.7229072031148605, + "grad_norm": 0.5922402239920576, + "learning_rate": 4.33930977668283e-06, + "loss": 0.5525, + "step": 4456 + }, + { + "epoch": 0.723069435431538, + "grad_norm": 0.6054231841536354, + "learning_rate": 4.339020528612561e-06, + "loss": 0.5443, + "step": 4457 + }, + { + "epoch": 0.7232316677482155, + "grad_norm": 0.5994249324408113, + "learning_rate": 4.338731226885043e-06, + "loss": 0.5844, + "step": 4458 + }, + { + "epoch": 0.7233939000648929, + "grad_norm": 0.6234703674643305, + "learning_rate": 4.338441871508716e-06, + "loss": 0.5735, + "step": 4459 + }, + { + "epoch": 0.7235561323815705, + "grad_norm": 0.58307262295413, + "learning_rate": 4.3381524624920245e-06, + "loss": 0.5359, + "step": 4460 + }, + { + "epoch": 0.7237183646982479, + "grad_norm": 0.5916416830663288, + "learning_rate": 4.33786299984341e-06, + "loss": 0.5643, + "step": 4461 + }, + { + "epoch": 0.7238805970149254, + "grad_norm": 0.5897809032764257, + "learning_rate": 4.33757348357132e-06, + "loss": 0.5238, + "step": 4462 + }, + { + "epoch": 0.7240428293316029, + "grad_norm": 0.5554479686074735, + "learning_rate": 4.337283913684201e-06, + "loss": 0.5134, + "step": 4463 + }, + { + "epoch": 0.7242050616482804, + "grad_norm": 0.5735842387116911, + "learning_rate": 4.336994290190504e-06, + "loss": 0.5175, + "step": 4464 + }, + { + "epoch": 0.7243672939649578, + "grad_norm": 0.5956713657333059, + "learning_rate": 4.336704613098676e-06, + "loss": 0.558, + "step": 4465 + }, + { + "epoch": 0.7245295262816352, + "grad_norm": 0.6569398472106646, + "learning_rate": 4.336414882417169e-06, + "loss": 0.5412, + "step": 4466 + }, + { + "epoch": 0.7246917585983128, + "grad_norm": 0.6200229537050219, + "learning_rate": 4.3361250981544404e-06, + "loss": 0.5569, + "step": 4467 + }, + { + "epoch": 0.7248539909149903, + "grad_norm": 0.5699136705185477, + "learning_rate": 4.335835260318941e-06, + "loss": 0.5368, + "step": 4468 + }, + { + "epoch": 0.7250162232316677, + "grad_norm": 0.5677744294840862, + "learning_rate": 4.3355453689191295e-06, + "loss": 0.563, + "step": 4469 + }, + { + "epoch": 0.7251784555483453, + "grad_norm": 0.5979716074040791, + "learning_rate": 4.335255423963464e-06, + "loss": 0.5592, + "step": 4470 + }, + { + "epoch": 0.7253406878650227, + "grad_norm": 0.6159792097117188, + "learning_rate": 4.334965425460403e-06, + "loss": 0.5435, + "step": 4471 + }, + { + "epoch": 0.7255029201817002, + "grad_norm": 0.5981870201668853, + "learning_rate": 4.33467537341841e-06, + "loss": 0.5558, + "step": 4472 + }, + { + "epoch": 0.7256651524983777, + "grad_norm": 0.5512953871944806, + "learning_rate": 4.334385267845947e-06, + "loss": 0.5756, + "step": 4473 + }, + { + "epoch": 0.7258273848150552, + "grad_norm": 0.581890948887106, + "learning_rate": 4.334095108751477e-06, + "loss": 0.5734, + "step": 4474 + }, + { + "epoch": 0.7259896171317326, + "grad_norm": 0.5942952404026807, + "learning_rate": 4.333804896143468e-06, + "loss": 0.5455, + "step": 4475 + }, + { + "epoch": 0.7261518494484102, + "grad_norm": 0.5826947917544913, + "learning_rate": 4.333514630030386e-06, + "loss": 0.5569, + "step": 4476 + }, + { + "epoch": 0.7263140817650876, + "grad_norm": 0.5764410750566932, + "learning_rate": 4.333224310420701e-06, + "loss": 0.5237, + "step": 4477 + }, + { + "epoch": 0.726476314081765, + "grad_norm": 0.593458640853347, + "learning_rate": 4.332933937322883e-06, + "loss": 0.5997, + "step": 4478 + }, + { + "epoch": 0.7266385463984426, + "grad_norm": 0.5556045211049437, + "learning_rate": 4.3326435107454046e-06, + "loss": 0.5239, + "step": 4479 + }, + { + "epoch": 0.72680077871512, + "grad_norm": 0.5610556561293599, + "learning_rate": 4.33235303069674e-06, + "loss": 0.5526, + "step": 4480 + }, + { + "epoch": 0.7269630110317975, + "grad_norm": 0.5662896605603825, + "learning_rate": 4.332062497185364e-06, + "loss": 0.5368, + "step": 4481 + }, + { + "epoch": 0.7271252433484751, + "grad_norm": 0.5631215110352598, + "learning_rate": 4.331771910219754e-06, + "loss": 0.5653, + "step": 4482 + }, + { + "epoch": 0.7272874756651525, + "grad_norm": 0.6135137159077372, + "learning_rate": 4.331481269808388e-06, + "loss": 0.5734, + "step": 4483 + }, + { + "epoch": 0.72744970798183, + "grad_norm": 0.6113262469649449, + "learning_rate": 4.331190575959746e-06, + "loss": 0.6, + "step": 4484 + }, + { + "epoch": 0.7276119402985075, + "grad_norm": 0.5855790133251463, + "learning_rate": 4.3308998286823114e-06, + "loss": 0.543, + "step": 4485 + }, + { + "epoch": 0.727774172615185, + "grad_norm": 0.5940177208842723, + "learning_rate": 4.330609027984564e-06, + "loss": 0.569, + "step": 4486 + }, + { + "epoch": 0.7279364049318624, + "grad_norm": 0.5734206807338166, + "learning_rate": 4.330318173874992e-06, + "loss": 0.5472, + "step": 4487 + }, + { + "epoch": 0.72809863724854, + "grad_norm": 0.5831123374289033, + "learning_rate": 4.330027266362079e-06, + "loss": 0.5502, + "step": 4488 + }, + { + "epoch": 0.7282608695652174, + "grad_norm": 0.5892921720775783, + "learning_rate": 4.329736305454314e-06, + "loss": 0.5533, + "step": 4489 + }, + { + "epoch": 0.7284231018818949, + "grad_norm": 0.547709401483974, + "learning_rate": 4.3294452911601854e-06, + "loss": 0.5367, + "step": 4490 + }, + { + "epoch": 0.7285853341985724, + "grad_norm": 0.5660050037640894, + "learning_rate": 4.329154223488187e-06, + "loss": 0.5787, + "step": 4491 + }, + { + "epoch": 0.7287475665152499, + "grad_norm": 0.5706203842766132, + "learning_rate": 4.328863102446808e-06, + "loss": 0.5915, + "step": 4492 + }, + { + "epoch": 0.7289097988319273, + "grad_norm": 0.5882138032840077, + "learning_rate": 4.328571928044544e-06, + "loss": 0.5388, + "step": 4493 + }, + { + "epoch": 0.7290720311486047, + "grad_norm": 0.5980518143341367, + "learning_rate": 4.32828070028989e-06, + "loss": 0.5583, + "step": 4494 + }, + { + "epoch": 0.7292342634652823, + "grad_norm": 0.6364377465833279, + "learning_rate": 4.327989419191344e-06, + "loss": 0.5866, + "step": 4495 + }, + { + "epoch": 0.7293964957819598, + "grad_norm": 0.5958337482622531, + "learning_rate": 4.327698084757404e-06, + "loss": 0.5711, + "step": 4496 + }, + { + "epoch": 0.7295587280986372, + "grad_norm": 0.5630187372054346, + "learning_rate": 4.32740669699657e-06, + "loss": 0.5428, + "step": 4497 + }, + { + "epoch": 0.7297209604153148, + "grad_norm": 0.5663266902584544, + "learning_rate": 4.327115255917346e-06, + "loss": 0.578, + "step": 4498 + }, + { + "epoch": 0.7298831927319922, + "grad_norm": 0.5844594939395891, + "learning_rate": 4.3268237615282325e-06, + "loss": 0.5646, + "step": 4499 + }, + { + "epoch": 0.7300454250486696, + "grad_norm": 0.5916310088751311, + "learning_rate": 4.326532213837735e-06, + "loss": 0.5432, + "step": 4500 + }, + { + "epoch": 0.7302076573653472, + "grad_norm": 0.5550961596299122, + "learning_rate": 4.326240612854362e-06, + "loss": 0.549, + "step": 4501 + }, + { + "epoch": 0.7303698896820247, + "grad_norm": 0.5456513471656955, + "learning_rate": 4.325948958586621e-06, + "loss": 0.5195, + "step": 4502 + }, + { + "epoch": 0.7305321219987021, + "grad_norm": 0.5796912855222103, + "learning_rate": 4.325657251043019e-06, + "loss": 0.5789, + "step": 4503 + }, + { + "epoch": 0.7306943543153797, + "grad_norm": 0.5739649605448488, + "learning_rate": 4.325365490232071e-06, + "loss": 0.5362, + "step": 4504 + }, + { + "epoch": 0.7308565866320571, + "grad_norm": 0.5573380529456818, + "learning_rate": 4.3250736761622865e-06, + "loss": 0.5602, + "step": 4505 + }, + { + "epoch": 0.7310188189487346, + "grad_norm": 0.5714506133641402, + "learning_rate": 4.3247818088421826e-06, + "loss": 0.5385, + "step": 4506 + }, + { + "epoch": 0.7311810512654121, + "grad_norm": 0.5598834970499721, + "learning_rate": 4.324489888280273e-06, + "loss": 0.5401, + "step": 4507 + }, + { + "epoch": 0.7313432835820896, + "grad_norm": 0.5655021214164205, + "learning_rate": 4.324197914485075e-06, + "loss": 0.5463, + "step": 4508 + }, + { + "epoch": 0.731505515898767, + "grad_norm": 0.5633187727745506, + "learning_rate": 4.323905887465109e-06, + "loss": 0.5346, + "step": 4509 + }, + { + "epoch": 0.7316677482154446, + "grad_norm": 0.5824612100275799, + "learning_rate": 4.323613807228895e-06, + "loss": 0.5896, + "step": 4510 + }, + { + "epoch": 0.731829980532122, + "grad_norm": 0.6223545678936295, + "learning_rate": 4.323321673784955e-06, + "loss": 0.5791, + "step": 4511 + }, + { + "epoch": 0.7319922128487995, + "grad_norm": 0.5683206547881207, + "learning_rate": 4.323029487141812e-06, + "loss": 0.5687, + "step": 4512 + }, + { + "epoch": 0.732154445165477, + "grad_norm": 0.5832864178047689, + "learning_rate": 4.322737247307991e-06, + "loss": 0.5553, + "step": 4513 + }, + { + "epoch": 0.7323166774821545, + "grad_norm": 0.5726009176589091, + "learning_rate": 4.32244495429202e-06, + "loss": 0.5529, + "step": 4514 + }, + { + "epoch": 0.7324789097988319, + "grad_norm": 0.587666951047904, + "learning_rate": 4.3221526081024265e-06, + "loss": 0.5632, + "step": 4515 + }, + { + "epoch": 0.7326411421155095, + "grad_norm": 0.5709209355761983, + "learning_rate": 4.321860208747741e-06, + "loss": 0.5862, + "step": 4516 + }, + { + "epoch": 0.7328033744321869, + "grad_norm": 0.6069914514414303, + "learning_rate": 4.321567756236493e-06, + "loss": 0.5521, + "step": 4517 + }, + { + "epoch": 0.7329656067488644, + "grad_norm": 0.6046388763123205, + "learning_rate": 4.3212752505772185e-06, + "loss": 0.556, + "step": 4518 + }, + { + "epoch": 0.7331278390655419, + "grad_norm": 0.59852262438596, + "learning_rate": 4.3209826917784485e-06, + "loss": 0.5912, + "step": 4519 + }, + { + "epoch": 0.7332900713822194, + "grad_norm": 0.5675179312997686, + "learning_rate": 4.32069007984872e-06, + "loss": 0.5654, + "step": 4520 + }, + { + "epoch": 0.7334523036988968, + "grad_norm": 0.5813641784424616, + "learning_rate": 4.320397414796573e-06, + "loss": 0.5611, + "step": 4521 + }, + { + "epoch": 0.7336145360155742, + "grad_norm": 0.5693913149829158, + "learning_rate": 4.320104696630544e-06, + "loss": 0.5587, + "step": 4522 + }, + { + "epoch": 0.7337767683322518, + "grad_norm": 0.5580454238571766, + "learning_rate": 4.319811925359175e-06, + "loss": 0.5524, + "step": 4523 + }, + { + "epoch": 0.7339390006489293, + "grad_norm": 0.5450198971259165, + "learning_rate": 4.319519100991007e-06, + "loss": 0.5648, + "step": 4524 + }, + { + "epoch": 0.7341012329656067, + "grad_norm": 0.5985799660809559, + "learning_rate": 4.319226223534585e-06, + "loss": 0.5542, + "step": 4525 + }, + { + "epoch": 0.7342634652822843, + "grad_norm": 0.6338717956746532, + "learning_rate": 4.318933292998453e-06, + "loss": 0.5658, + "step": 4526 + }, + { + "epoch": 0.7344256975989617, + "grad_norm": 0.5912906466845792, + "learning_rate": 4.318640309391159e-06, + "loss": 0.5276, + "step": 4527 + }, + { + "epoch": 0.7345879299156391, + "grad_norm": 0.5426076685973321, + "learning_rate": 4.3183472727212504e-06, + "loss": 0.5922, + "step": 4528 + }, + { + "epoch": 0.7347501622323167, + "grad_norm": 0.5598138950853785, + "learning_rate": 4.318054182997279e-06, + "loss": 0.526, + "step": 4529 + }, + { + "epoch": 0.7349123945489942, + "grad_norm": 0.5814151739898414, + "learning_rate": 4.317761040227794e-06, + "loss": 0.539, + "step": 4530 + }, + { + "epoch": 0.7350746268656716, + "grad_norm": 0.5985103978577387, + "learning_rate": 4.317467844421349e-06, + "loss": 0.5473, + "step": 4531 + }, + { + "epoch": 0.7352368591823492, + "grad_norm": 0.558317239513067, + "learning_rate": 4.317174595586501e-06, + "loss": 0.5575, + "step": 4532 + }, + { + "epoch": 0.7353990914990266, + "grad_norm": 0.5691460853750883, + "learning_rate": 4.3168812937318025e-06, + "loss": 0.5444, + "step": 4533 + }, + { + "epoch": 0.735561323815704, + "grad_norm": 0.60481448369792, + "learning_rate": 4.316587938865814e-06, + "loss": 0.553, + "step": 4534 + }, + { + "epoch": 0.7357235561323816, + "grad_norm": 0.6046921851201331, + "learning_rate": 4.316294530997093e-06, + "loss": 0.5672, + "step": 4535 + }, + { + "epoch": 0.735885788449059, + "grad_norm": 0.5676287316319519, + "learning_rate": 4.316001070134201e-06, + "loss": 0.5481, + "step": 4536 + }, + { + "epoch": 0.7360480207657365, + "grad_norm": 0.5710314312869152, + "learning_rate": 4.315707556285702e-06, + "loss": 0.5777, + "step": 4537 + }, + { + "epoch": 0.7362102530824141, + "grad_norm": 0.5694517985448265, + "learning_rate": 4.315413989460156e-06, + "loss": 0.5637, + "step": 4538 + }, + { + "epoch": 0.7363724853990915, + "grad_norm": 0.5814805581355098, + "learning_rate": 4.315120369666131e-06, + "loss": 0.532, + "step": 4539 + }, + { + "epoch": 0.736534717715769, + "grad_norm": 0.5680682850282345, + "learning_rate": 4.314826696912194e-06, + "loss": 0.5584, + "step": 4540 + }, + { + "epoch": 0.7366969500324465, + "grad_norm": 0.5699130545398072, + "learning_rate": 4.3145329712069134e-06, + "loss": 0.5626, + "step": 4541 + }, + { + "epoch": 0.736859182349124, + "grad_norm": 0.5713654828689009, + "learning_rate": 4.314239192558859e-06, + "loss": 0.5441, + "step": 4542 + }, + { + "epoch": 0.7370214146658014, + "grad_norm": 0.5567350713983548, + "learning_rate": 4.3139453609766016e-06, + "loss": 0.5503, + "step": 4543 + }, + { + "epoch": 0.737183646982479, + "grad_norm": 0.5574279635531564, + "learning_rate": 4.3136514764687155e-06, + "loss": 0.5413, + "step": 4544 + }, + { + "epoch": 0.7373458792991564, + "grad_norm": 0.5948043789594939, + "learning_rate": 4.313357539043774e-06, + "loss": 0.5541, + "step": 4545 + }, + { + "epoch": 0.7375081116158339, + "grad_norm": 0.5914142305862907, + "learning_rate": 4.3130635487103555e-06, + "loss": 0.5689, + "step": 4546 + }, + { + "epoch": 0.7376703439325114, + "grad_norm": 0.5720114474742947, + "learning_rate": 4.3127695054770365e-06, + "loss": 0.567, + "step": 4547 + }, + { + "epoch": 0.7378325762491889, + "grad_norm": 0.5891295069089791, + "learning_rate": 4.312475409352396e-06, + "loss": 0.5779, + "step": 4548 + }, + { + "epoch": 0.7379948085658663, + "grad_norm": 0.5561378042703293, + "learning_rate": 4.312181260345015e-06, + "loss": 0.5522, + "step": 4549 + }, + { + "epoch": 0.7381570408825439, + "grad_norm": 0.6312963226773645, + "learning_rate": 4.311887058463477e-06, + "loss": 0.5599, + "step": 4550 + }, + { + "epoch": 0.7383192731992213, + "grad_norm": 0.5502318571353514, + "learning_rate": 4.311592803716364e-06, + "loss": 0.568, + "step": 4551 + }, + { + "epoch": 0.7384815055158988, + "grad_norm": 0.6168477822526359, + "learning_rate": 4.311298496112264e-06, + "loss": 0.5883, + "step": 4552 + }, + { + "epoch": 0.7386437378325762, + "grad_norm": 0.5748335124111508, + "learning_rate": 4.311004135659762e-06, + "loss": 0.5517, + "step": 4553 + }, + { + "epoch": 0.7388059701492538, + "grad_norm": 0.6162560708255587, + "learning_rate": 4.3107097223674475e-06, + "loss": 0.5174, + "step": 4554 + }, + { + "epoch": 0.7389682024659312, + "grad_norm": 0.62133859906206, + "learning_rate": 4.3104152562439105e-06, + "loss": 0.516, + "step": 4555 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.5924144753375734, + "learning_rate": 4.310120737297742e-06, + "loss": 0.5394, + "step": 4556 + }, + { + "epoch": 0.7392926670992862, + "grad_norm": 0.6159032411082378, + "learning_rate": 4.309826165537536e-06, + "loss": 0.5593, + "step": 4557 + }, + { + "epoch": 0.7394548994159637, + "grad_norm": 0.5948835775634868, + "learning_rate": 4.309531540971888e-06, + "loss": 0.5375, + "step": 4558 + }, + { + "epoch": 0.7396171317326411, + "grad_norm": 0.5913312017967691, + "learning_rate": 4.309236863609392e-06, + "loss": 0.5632, + "step": 4559 + }, + { + "epoch": 0.7397793640493187, + "grad_norm": 0.5931725229013756, + "learning_rate": 4.308942133458647e-06, + "loss": 0.573, + "step": 4560 + }, + { + "epoch": 0.7399415963659961, + "grad_norm": 0.6133562756819684, + "learning_rate": 4.308647350528253e-06, + "loss": 0.5529, + "step": 4561 + }, + { + "epoch": 0.7401038286826735, + "grad_norm": 0.5773203913605681, + "learning_rate": 4.3083525148268105e-06, + "loss": 0.618, + "step": 4562 + }, + { + "epoch": 0.7402660609993511, + "grad_norm": 0.5918739647221527, + "learning_rate": 4.308057626362921e-06, + "loss": 0.5608, + "step": 4563 + }, + { + "epoch": 0.7404282933160286, + "grad_norm": 0.560632448622155, + "learning_rate": 4.30776268514519e-06, + "loss": 0.5929, + "step": 4564 + }, + { + "epoch": 0.740590525632706, + "grad_norm": 0.5762168335004144, + "learning_rate": 4.307467691182222e-06, + "loss": 0.5663, + "step": 4565 + }, + { + "epoch": 0.7407527579493836, + "grad_norm": 0.5772840545425729, + "learning_rate": 4.3071726444826244e-06, + "loss": 0.5182, + "step": 4566 + }, + { + "epoch": 0.740914990266061, + "grad_norm": 0.568058516619948, + "learning_rate": 4.306877545055005e-06, + "loss": 0.5696, + "step": 4567 + }, + { + "epoch": 0.7410772225827384, + "grad_norm": 0.5753095963815016, + "learning_rate": 4.306582392907976e-06, + "loss": 0.5645, + "step": 4568 + }, + { + "epoch": 0.741239454899416, + "grad_norm": 0.5408710026037661, + "learning_rate": 4.306287188050148e-06, + "loss": 0.5498, + "step": 4569 + }, + { + "epoch": 0.7414016872160935, + "grad_norm": 0.5795885898617835, + "learning_rate": 4.305991930490133e-06, + "loss": 0.5426, + "step": 4570 + }, + { + "epoch": 0.7415639195327709, + "grad_norm": 0.5692752045334573, + "learning_rate": 4.305696620236547e-06, + "loss": 0.565, + "step": 4571 + }, + { + "epoch": 0.7417261518494485, + "grad_norm": 0.5474662145570517, + "learning_rate": 4.305401257298007e-06, + "loss": 0.5733, + "step": 4572 + }, + { + "epoch": 0.7418883841661259, + "grad_norm": 0.5928823915283344, + "learning_rate": 4.305105841683128e-06, + "loss": 0.5558, + "step": 4573 + }, + { + "epoch": 0.7420506164828033, + "grad_norm": 0.5997385738007665, + "learning_rate": 4.304810373400533e-06, + "loss": 0.5685, + "step": 4574 + }, + { + "epoch": 0.7422128487994809, + "grad_norm": 0.5922453394484292, + "learning_rate": 4.3045148524588396e-06, + "loss": 0.5585, + "step": 4575 + }, + { + "epoch": 0.7423750811161584, + "grad_norm": 0.5975002899257156, + "learning_rate": 4.304219278866673e-06, + "loss": 0.5671, + "step": 4576 + }, + { + "epoch": 0.7425373134328358, + "grad_norm": 0.5943886570223508, + "learning_rate": 4.303923652632656e-06, + "loss": 0.5875, + "step": 4577 + }, + { + "epoch": 0.7426995457495134, + "grad_norm": 0.5997759741023743, + "learning_rate": 4.303627973765413e-06, + "loss": 0.5643, + "step": 4578 + }, + { + "epoch": 0.7428617780661908, + "grad_norm": 0.5755117007463948, + "learning_rate": 4.303332242273574e-06, + "loss": 0.581, + "step": 4579 + }, + { + "epoch": 0.7430240103828682, + "grad_norm": 0.5880719476232413, + "learning_rate": 4.303036458165764e-06, + "loss": 0.56, + "step": 4580 + }, + { + "epoch": 0.7431862426995457, + "grad_norm": 0.5745283846115702, + "learning_rate": 4.302740621450615e-06, + "loss": 0.5941, + "step": 4581 + }, + { + "epoch": 0.7433484750162233, + "grad_norm": 0.5980759544381306, + "learning_rate": 4.302444732136759e-06, + "loss": 0.5348, + "step": 4582 + }, + { + "epoch": 0.7435107073329007, + "grad_norm": 0.5976750881417853, + "learning_rate": 4.302148790232828e-06, + "loss": 0.5383, + "step": 4583 + }, + { + "epoch": 0.7436729396495781, + "grad_norm": 0.5895045573187855, + "learning_rate": 4.3018527957474585e-06, + "loss": 0.575, + "step": 4584 + }, + { + "epoch": 0.7438351719662557, + "grad_norm": 0.5775543571787819, + "learning_rate": 4.301556748689285e-06, + "loss": 0.5639, + "step": 4585 + }, + { + "epoch": 0.7439974042829332, + "grad_norm": 0.5959065050651745, + "learning_rate": 4.301260649066946e-06, + "loss": 0.5608, + "step": 4586 + }, + { + "epoch": 0.7441596365996106, + "grad_norm": 0.5750897265000017, + "learning_rate": 4.3009644968890805e-06, + "loss": 0.5853, + "step": 4587 + }, + { + "epoch": 0.7443218689162882, + "grad_norm": 0.6194767877168403, + "learning_rate": 4.300668292164329e-06, + "loss": 0.528, + "step": 4588 + }, + { + "epoch": 0.7444841012329656, + "grad_norm": 0.5637568838437583, + "learning_rate": 4.300372034901336e-06, + "loss": 0.5926, + "step": 4589 + }, + { + "epoch": 0.744646333549643, + "grad_norm": 0.5623206699654688, + "learning_rate": 4.300075725108743e-06, + "loss": 0.5686, + "step": 4590 + }, + { + "epoch": 0.7448085658663206, + "grad_norm": 0.6209457175539985, + "learning_rate": 4.2997793627951964e-06, + "loss": 0.5764, + "step": 4591 + }, + { + "epoch": 0.744970798182998, + "grad_norm": 0.5408165226106584, + "learning_rate": 4.299482947969344e-06, + "loss": 0.5805, + "step": 4592 + }, + { + "epoch": 0.7451330304996755, + "grad_norm": 0.5801575385340253, + "learning_rate": 4.299186480639832e-06, + "loss": 0.5719, + "step": 4593 + }, + { + "epoch": 0.745295262816353, + "grad_norm": 0.5727979464978119, + "learning_rate": 4.298889960815312e-06, + "loss": 0.5586, + "step": 4594 + }, + { + "epoch": 0.7454574951330305, + "grad_norm": 0.5720780001178826, + "learning_rate": 4.298593388504437e-06, + "loss": 0.5423, + "step": 4595 + }, + { + "epoch": 0.745619727449708, + "grad_norm": 0.588207993618563, + "learning_rate": 4.298296763715858e-06, + "loss": 0.5559, + "step": 4596 + }, + { + "epoch": 0.7457819597663855, + "grad_norm": 0.5741358477863177, + "learning_rate": 4.2980000864582294e-06, + "loss": 0.5741, + "step": 4597 + }, + { + "epoch": 0.745944192083063, + "grad_norm": 0.571318453486377, + "learning_rate": 4.29770335674021e-06, + "loss": 0.5487, + "step": 4598 + }, + { + "epoch": 0.7461064243997404, + "grad_norm": 0.6099524636353911, + "learning_rate": 4.297406574570454e-06, + "loss": 0.5514, + "step": 4599 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.557598918223511, + "learning_rate": 4.297109739957622e-06, + "loss": 0.5687, + "step": 4600 + }, + { + "epoch": 0.7464308890330954, + "grad_norm": 0.6092632185042226, + "learning_rate": 4.296812852910377e-06, + "loss": 0.5525, + "step": 4601 + }, + { + "epoch": 0.7465931213497728, + "grad_norm": 0.6085061919688352, + "learning_rate": 4.296515913437378e-06, + "loss": 0.5346, + "step": 4602 + }, + { + "epoch": 0.7467553536664504, + "grad_norm": 0.5535721230673459, + "learning_rate": 4.296218921547292e-06, + "loss": 0.5458, + "step": 4603 + }, + { + "epoch": 0.7469175859831279, + "grad_norm": 0.5745827618984388, + "learning_rate": 4.295921877248781e-06, + "loss": 0.5718, + "step": 4604 + }, + { + "epoch": 0.7470798182998053, + "grad_norm": 0.5952837126763255, + "learning_rate": 4.2956247805505135e-06, + "loss": 0.5411, + "step": 4605 + }, + { + "epoch": 0.7472420506164829, + "grad_norm": 0.6075234154440373, + "learning_rate": 4.295327631461159e-06, + "loss": 0.563, + "step": 4606 + }, + { + "epoch": 0.7474042829331603, + "grad_norm": 0.5834519580062579, + "learning_rate": 4.295030429989385e-06, + "loss": 0.5579, + "step": 4607 + }, + { + "epoch": 0.7475665152498377, + "grad_norm": 0.589480196956285, + "learning_rate": 4.294733176143866e-06, + "loss": 0.5414, + "step": 4608 + }, + { + "epoch": 0.7477287475665152, + "grad_norm": 0.5951185927220594, + "learning_rate": 4.294435869933272e-06, + "loss": 0.559, + "step": 4609 + }, + { + "epoch": 0.7478909798831928, + "grad_norm": 0.6233812290746213, + "learning_rate": 4.29413851136628e-06, + "loss": 0.5476, + "step": 4610 + }, + { + "epoch": 0.7480532121998702, + "grad_norm": 0.5875990591546125, + "learning_rate": 4.293841100451563e-06, + "loss": 0.5337, + "step": 4611 + }, + { + "epoch": 0.7482154445165476, + "grad_norm": 0.5385867782119862, + "learning_rate": 4.293543637197802e-06, + "loss": 0.5461, + "step": 4612 + }, + { + "epoch": 0.7483776768332252, + "grad_norm": 0.571896986282939, + "learning_rate": 4.293246121613674e-06, + "loss": 0.5894, + "step": 4613 + }, + { + "epoch": 0.7485399091499026, + "grad_norm": 0.6038809647379488, + "learning_rate": 4.292948553707862e-06, + "loss": 0.5997, + "step": 4614 + }, + { + "epoch": 0.7487021414665801, + "grad_norm": 0.6008560915346383, + "learning_rate": 4.292650933489045e-06, + "loss": 0.5551, + "step": 4615 + }, + { + "epoch": 0.7488643737832577, + "grad_norm": 0.5760272953469221, + "learning_rate": 4.292353260965908e-06, + "loss": 0.5396, + "step": 4616 + }, + { + "epoch": 0.7490266060999351, + "grad_norm": 0.5808022064119519, + "learning_rate": 4.292055536147137e-06, + "loss": 0.5385, + "step": 4617 + }, + { + "epoch": 0.7491888384166125, + "grad_norm": 0.6159802985536699, + "learning_rate": 4.291757759041417e-06, + "loss": 0.564, + "step": 4618 + }, + { + "epoch": 0.7493510707332901, + "grad_norm": 0.5774017994235912, + "learning_rate": 4.2914599296574375e-06, + "loss": 0.551, + "step": 4619 + }, + { + "epoch": 0.7495133030499675, + "grad_norm": 0.5745753497098041, + "learning_rate": 4.291162048003889e-06, + "loss": 0.5608, + "step": 4620 + }, + { + "epoch": 0.749675535366645, + "grad_norm": 0.5732536876450822, + "learning_rate": 4.290864114089461e-06, + "loss": 0.574, + "step": 4621 + }, + { + "epoch": 0.7498377676833226, + "grad_norm": 0.6502316979867219, + "learning_rate": 4.290566127922848e-06, + "loss": 0.5509, + "step": 4622 + }, + { + "epoch": 0.75, + "grad_norm": 0.5878803693571061, + "learning_rate": 4.290268089512744e-06, + "loss": 0.528, + "step": 4623 + }, + { + "epoch": 0.7501622323166774, + "grad_norm": 0.603454732258105, + "learning_rate": 4.289969998867843e-06, + "loss": 0.5132, + "step": 4624 + }, + { + "epoch": 0.750324464633355, + "grad_norm": 0.6188198583308108, + "learning_rate": 4.289671855996845e-06, + "loss": 0.5627, + "step": 4625 + }, + { + "epoch": 0.7504866969500325, + "grad_norm": 0.6120485762147484, + "learning_rate": 4.289373660908448e-06, + "loss": 0.5717, + "step": 4626 + }, + { + "epoch": 0.7506489292667099, + "grad_norm": 0.5845236093473386, + "learning_rate": 4.289075413611352e-06, + "loss": 0.5019, + "step": 4627 + }, + { + "epoch": 0.7508111615833875, + "grad_norm": 0.5940163653328138, + "learning_rate": 4.28877711411426e-06, + "loss": 0.5105, + "step": 4628 + }, + { + "epoch": 0.7509733939000649, + "grad_norm": 0.5673720421708163, + "learning_rate": 4.288478762425874e-06, + "loss": 0.5577, + "step": 4629 + }, + { + "epoch": 0.7511356262167423, + "grad_norm": 0.5879761938271557, + "learning_rate": 4.2881803585549e-06, + "loss": 0.5498, + "step": 4630 + }, + { + "epoch": 0.7512978585334199, + "grad_norm": 0.6120113958056391, + "learning_rate": 4.287881902510044e-06, + "loss": 0.5322, + "step": 4631 + }, + { + "epoch": 0.7514600908500974, + "grad_norm": 0.5921155708216465, + "learning_rate": 4.287583394300016e-06, + "loss": 0.6225, + "step": 4632 + }, + { + "epoch": 0.7516223231667748, + "grad_norm": 0.594208007000593, + "learning_rate": 4.287284833933523e-06, + "loss": 0.5546, + "step": 4633 + }, + { + "epoch": 0.7517845554834524, + "grad_norm": 0.6017804309265475, + "learning_rate": 4.286986221419277e-06, + "loss": 0.5655, + "step": 4634 + }, + { + "epoch": 0.7519467878001298, + "grad_norm": 0.5619455851946366, + "learning_rate": 4.2866875567659915e-06, + "loss": 0.5243, + "step": 4635 + }, + { + "epoch": 0.7521090201168072, + "grad_norm": 0.5502257031512532, + "learning_rate": 4.2863888399823795e-06, + "loss": 0.5589, + "step": 4636 + }, + { + "epoch": 0.7522712524334848, + "grad_norm": 0.5984621989712989, + "learning_rate": 4.2860900710771575e-06, + "loss": 0.5784, + "step": 4637 + }, + { + "epoch": 0.7524334847501623, + "grad_norm": 0.562195976795697, + "learning_rate": 4.285791250059042e-06, + "loss": 0.5586, + "step": 4638 + }, + { + "epoch": 0.7525957170668397, + "grad_norm": 0.5884490037522974, + "learning_rate": 4.2854923769367525e-06, + "loss": 0.5315, + "step": 4639 + }, + { + "epoch": 0.7527579493835171, + "grad_norm": 0.5970624176578152, + "learning_rate": 4.28519345171901e-06, + "loss": 0.5205, + "step": 4640 + }, + { + "epoch": 0.7529201817001947, + "grad_norm": 0.5673317190284112, + "learning_rate": 4.284894474414533e-06, + "loss": 0.5796, + "step": 4641 + }, + { + "epoch": 0.7530824140168721, + "grad_norm": 0.5973864014698258, + "learning_rate": 4.284595445032048e-06, + "loss": 0.5765, + "step": 4642 + }, + { + "epoch": 0.7532446463335496, + "grad_norm": 0.5707584483024967, + "learning_rate": 4.284296363580279e-06, + "loss": 0.5752, + "step": 4643 + }, + { + "epoch": 0.7534068786502272, + "grad_norm": 0.6099627861267453, + "learning_rate": 4.283997230067952e-06, + "loss": 0.5614, + "step": 4644 + }, + { + "epoch": 0.7535691109669046, + "grad_norm": 0.5987450130557574, + "learning_rate": 4.283698044503794e-06, + "loss": 0.5589, + "step": 4645 + }, + { + "epoch": 0.753731343283582, + "grad_norm": 0.5849041915226485, + "learning_rate": 4.2833988068965366e-06, + "loss": 0.5719, + "step": 4646 + }, + { + "epoch": 0.7538935756002596, + "grad_norm": 0.6378678485548589, + "learning_rate": 4.283099517254908e-06, + "loss": 0.5372, + "step": 4647 + }, + { + "epoch": 0.754055807916937, + "grad_norm": 0.5822088616956437, + "learning_rate": 4.282800175587643e-06, + "loss": 0.5826, + "step": 4648 + }, + { + "epoch": 0.7542180402336145, + "grad_norm": 0.5446959216010777, + "learning_rate": 4.282500781903474e-06, + "loss": 0.5542, + "step": 4649 + }, + { + "epoch": 0.754380272550292, + "grad_norm": 0.5952042500203495, + "learning_rate": 4.282201336211137e-06, + "loss": 0.5447, + "step": 4650 + }, + { + "epoch": 0.7545425048669695, + "grad_norm": 0.5418392638089268, + "learning_rate": 4.281901838519369e-06, + "loss": 0.5222, + "step": 4651 + }, + { + "epoch": 0.754704737183647, + "grad_norm": 0.5663138057046372, + "learning_rate": 4.281602288836908e-06, + "loss": 0.5418, + "step": 4652 + }, + { + "epoch": 0.7548669695003245, + "grad_norm": 0.5894355216826754, + "learning_rate": 4.281302687172495e-06, + "loss": 0.5548, + "step": 4653 + }, + { + "epoch": 0.755029201817002, + "grad_norm": 0.5941365422965429, + "learning_rate": 4.28100303353487e-06, + "loss": 0.5706, + "step": 4654 + }, + { + "epoch": 0.7551914341336794, + "grad_norm": 0.5661141751953317, + "learning_rate": 4.280703327932777e-06, + "loss": 0.5635, + "step": 4655 + }, + { + "epoch": 0.755353666450357, + "grad_norm": 0.5713239405495484, + "learning_rate": 4.28040357037496e-06, + "loss": 0.5371, + "step": 4656 + }, + { + "epoch": 0.7555158987670344, + "grad_norm": 0.574751540373081, + "learning_rate": 4.280103760870165e-06, + "loss": 0.6011, + "step": 4657 + }, + { + "epoch": 0.7556781310837118, + "grad_norm": 0.6956211639269169, + "learning_rate": 4.279803899427142e-06, + "loss": 0.5504, + "step": 4658 + }, + { + "epoch": 0.7558403634003894, + "grad_norm": 0.5757083416124176, + "learning_rate": 4.279503986054636e-06, + "loss": 0.5517, + "step": 4659 + }, + { + "epoch": 0.7560025957170668, + "grad_norm": 0.5945195853934105, + "learning_rate": 4.279204020761401e-06, + "loss": 0.5446, + "step": 4660 + }, + { + "epoch": 0.7561648280337443, + "grad_norm": 0.5756755411572656, + "learning_rate": 4.2789040035561864e-06, + "loss": 0.5543, + "step": 4661 + }, + { + "epoch": 0.7563270603504219, + "grad_norm": 0.5896578541314148, + "learning_rate": 4.2786039344477485e-06, + "loss": 0.5396, + "step": 4662 + }, + { + "epoch": 0.7564892926670993, + "grad_norm": 0.5618597312811616, + "learning_rate": 4.278303813444841e-06, + "loss": 0.5381, + "step": 4663 + }, + { + "epoch": 0.7566515249837767, + "grad_norm": 0.5973632578283427, + "learning_rate": 4.2780036405562195e-06, + "loss": 0.5471, + "step": 4664 + }, + { + "epoch": 0.7568137573004543, + "grad_norm": 0.5827821579076321, + "learning_rate": 4.277703415790645e-06, + "loss": 0.5603, + "step": 4665 + }, + { + "epoch": 0.7569759896171318, + "grad_norm": 0.587486265946789, + "learning_rate": 4.2774031391568734e-06, + "loss": 0.5579, + "step": 4666 + }, + { + "epoch": 0.7571382219338092, + "grad_norm": 0.6480046788518767, + "learning_rate": 4.2771028106636706e-06, + "loss": 0.5463, + "step": 4667 + }, + { + "epoch": 0.7573004542504866, + "grad_norm": 0.6009388612168848, + "learning_rate": 4.276802430319796e-06, + "loss": 0.5474, + "step": 4668 + }, + { + "epoch": 0.7574626865671642, + "grad_norm": 0.5974525237949876, + "learning_rate": 4.276501998134013e-06, + "loss": 0.5569, + "step": 4669 + }, + { + "epoch": 0.7576249188838416, + "grad_norm": 0.5842502240929323, + "learning_rate": 4.276201514115089e-06, + "loss": 0.5391, + "step": 4670 + }, + { + "epoch": 0.7577871512005191, + "grad_norm": 0.6002703194376028, + "learning_rate": 4.2759009782717925e-06, + "loss": 0.5934, + "step": 4671 + }, + { + "epoch": 0.7579493835171967, + "grad_norm": 0.5595617327113162, + "learning_rate": 4.27560039061289e-06, + "loss": 0.5747, + "step": 4672 + }, + { + "epoch": 0.7581116158338741, + "grad_norm": 0.558400212366824, + "learning_rate": 4.275299751147153e-06, + "loss": 0.5613, + "step": 4673 + }, + { + "epoch": 0.7582738481505515, + "grad_norm": 0.6161775424536404, + "learning_rate": 4.2749990598833535e-06, + "loss": 0.5356, + "step": 4674 + }, + { + "epoch": 0.7584360804672291, + "grad_norm": 0.5497810505179147, + "learning_rate": 4.2746983168302635e-06, + "loss": 0.5386, + "step": 4675 + }, + { + "epoch": 0.7585983127839065, + "grad_norm": 0.5815566090457635, + "learning_rate": 4.274397521996658e-06, + "loss": 0.5443, + "step": 4676 + }, + { + "epoch": 0.758760545100584, + "grad_norm": 0.572277937898079, + "learning_rate": 4.274096675391315e-06, + "loss": 0.5483, + "step": 4677 + }, + { + "epoch": 0.7589227774172616, + "grad_norm": 0.6132892384724391, + "learning_rate": 4.273795777023011e-06, + "loss": 0.5338, + "step": 4678 + }, + { + "epoch": 0.759085009733939, + "grad_norm": 0.5753156516098051, + "learning_rate": 4.273494826900525e-06, + "loss": 0.5292, + "step": 4679 + }, + { + "epoch": 0.7592472420506164, + "grad_norm": 0.6227054260282292, + "learning_rate": 4.273193825032639e-06, + "loss": 0.524, + "step": 4680 + }, + { + "epoch": 0.759409474367294, + "grad_norm": 0.5515089921779913, + "learning_rate": 4.272892771428134e-06, + "loss": 0.5457, + "step": 4681 + }, + { + "epoch": 0.7595717066839714, + "grad_norm": 0.5668436180709618, + "learning_rate": 4.272591666095795e-06, + "loss": 0.5264, + "step": 4682 + }, + { + "epoch": 0.7597339390006489, + "grad_norm": 0.5712346437868178, + "learning_rate": 4.2722905090444065e-06, + "loss": 0.5736, + "step": 4683 + }, + { + "epoch": 0.7598961713173265, + "grad_norm": 0.6371440447409443, + "learning_rate": 4.271989300282756e-06, + "loss": 0.5446, + "step": 4684 + }, + { + "epoch": 0.7600584036340039, + "grad_norm": 0.5974133456073659, + "learning_rate": 4.271688039819633e-06, + "loss": 0.585, + "step": 4685 + }, + { + "epoch": 0.7602206359506813, + "grad_norm": 0.568382055751949, + "learning_rate": 4.271386727663824e-06, + "loss": 0.5697, + "step": 4686 + }, + { + "epoch": 0.7603828682673589, + "grad_norm": 0.5614978192818649, + "learning_rate": 4.271085363824124e-06, + "loss": 0.5595, + "step": 4687 + }, + { + "epoch": 0.7605451005840363, + "grad_norm": 0.5660320865054179, + "learning_rate": 4.270783948309324e-06, + "loss": 0.5752, + "step": 4688 + }, + { + "epoch": 0.7607073329007138, + "grad_norm": 0.5945653682889754, + "learning_rate": 4.270482481128218e-06, + "loss": 0.558, + "step": 4689 + }, + { + "epoch": 0.7608695652173914, + "grad_norm": 0.5560404096812472, + "learning_rate": 4.270180962289605e-06, + "loss": 0.5526, + "step": 4690 + }, + { + "epoch": 0.7610317975340688, + "grad_norm": 0.5760561043306688, + "learning_rate": 4.269879391802278e-06, + "loss": 0.5531, + "step": 4691 + }, + { + "epoch": 0.7611940298507462, + "grad_norm": 0.593926448173001, + "learning_rate": 4.26957776967504e-06, + "loss": 0.5796, + "step": 4692 + }, + { + "epoch": 0.7613562621674238, + "grad_norm": 0.5605891798023295, + "learning_rate": 4.269276095916689e-06, + "loss": 0.5595, + "step": 4693 + }, + { + "epoch": 0.7615184944841012, + "grad_norm": 0.6094488196657196, + "learning_rate": 4.268974370536028e-06, + "loss": 0.5742, + "step": 4694 + }, + { + "epoch": 0.7616807268007787, + "grad_norm": 0.5647856761908013, + "learning_rate": 4.268672593541859e-06, + "loss": 0.5348, + "step": 4695 + }, + { + "epoch": 0.7618429591174561, + "grad_norm": 0.5824153986607783, + "learning_rate": 4.268370764942988e-06, + "loss": 0.5406, + "step": 4696 + }, + { + "epoch": 0.7620051914341337, + "grad_norm": 0.6363745170231748, + "learning_rate": 4.268068884748222e-06, + "loss": 0.5867, + "step": 4697 + }, + { + "epoch": 0.7621674237508111, + "grad_norm": 0.5840087701694677, + "learning_rate": 4.267766952966369e-06, + "loss": 0.5546, + "step": 4698 + }, + { + "epoch": 0.7623296560674886, + "grad_norm": 0.5663467244828343, + "learning_rate": 4.267464969606237e-06, + "loss": 0.5618, + "step": 4699 + }, + { + "epoch": 0.7624918883841661, + "grad_norm": 0.5777143645527308, + "learning_rate": 4.2671629346766396e-06, + "loss": 0.5647, + "step": 4700 + }, + { + "epoch": 0.7626541207008436, + "grad_norm": 0.6062147714254391, + "learning_rate": 4.266860848186386e-06, + "loss": 0.5434, + "step": 4701 + }, + { + "epoch": 0.762816353017521, + "grad_norm": 0.5720831288546023, + "learning_rate": 4.266558710144293e-06, + "loss": 0.538, + "step": 4702 + }, + { + "epoch": 0.7629785853341986, + "grad_norm": 0.5262148359662695, + "learning_rate": 4.266256520559175e-06, + "loss": 0.5414, + "step": 4703 + }, + { + "epoch": 0.763140817650876, + "grad_norm": 0.617778974529051, + "learning_rate": 4.2659542794398474e-06, + "loss": 0.5929, + "step": 4704 + }, + { + "epoch": 0.7633030499675535, + "grad_norm": 0.612484312835524, + "learning_rate": 4.265651986795133e-06, + "loss": 0.5853, + "step": 4705 + }, + { + "epoch": 0.763465282284231, + "grad_norm": 0.5961365116298394, + "learning_rate": 4.265349642633847e-06, + "loss": 0.5501, + "step": 4706 + }, + { + "epoch": 0.7636275146009085, + "grad_norm": 0.5674609879854311, + "learning_rate": 4.265047246964814e-06, + "loss": 0.5571, + "step": 4707 + }, + { + "epoch": 0.7637897469175859, + "grad_norm": 0.5944509397807495, + "learning_rate": 4.264744799796855e-06, + "loss": 0.5445, + "step": 4708 + }, + { + "epoch": 0.7639519792342635, + "grad_norm": 0.5860695301165939, + "learning_rate": 4.264442301138797e-06, + "loss": 0.5709, + "step": 4709 + }, + { + "epoch": 0.764114211550941, + "grad_norm": 0.6005256622274894, + "learning_rate": 4.264139750999464e-06, + "loss": 0.573, + "step": 4710 + }, + { + "epoch": 0.7642764438676184, + "grad_norm": 0.5685545424311892, + "learning_rate": 4.263837149387684e-06, + "loss": 0.5368, + "step": 4711 + }, + { + "epoch": 0.764438676184296, + "grad_norm": 0.6194546955077586, + "learning_rate": 4.263534496312286e-06, + "loss": 0.5508, + "step": 4712 + }, + { + "epoch": 0.7646009085009734, + "grad_norm": 0.5673123926363812, + "learning_rate": 4.263231791782102e-06, + "loss": 0.5753, + "step": 4713 + }, + { + "epoch": 0.7647631408176508, + "grad_norm": 0.5921993238850367, + "learning_rate": 4.262929035805962e-06, + "loss": 0.5959, + "step": 4714 + }, + { + "epoch": 0.7649253731343284, + "grad_norm": 0.5959277393401438, + "learning_rate": 4.2626262283927e-06, + "loss": 0.5031, + "step": 4715 + }, + { + "epoch": 0.7650876054510058, + "grad_norm": 0.5795444765742909, + "learning_rate": 4.262323369551151e-06, + "loss": 0.5444, + "step": 4716 + }, + { + "epoch": 0.7652498377676833, + "grad_norm": 0.5372313122948376, + "learning_rate": 4.262020459290152e-06, + "loss": 0.5704, + "step": 4717 + }, + { + "epoch": 0.7654120700843609, + "grad_norm": 0.5916294614032167, + "learning_rate": 4.261717497618541e-06, + "loss": 0.5774, + "step": 4718 + }, + { + "epoch": 0.7655743024010383, + "grad_norm": 0.5825415500232998, + "learning_rate": 4.261414484545158e-06, + "loss": 0.5694, + "step": 4719 + }, + { + "epoch": 0.7657365347177157, + "grad_norm": 0.5731343583113242, + "learning_rate": 4.261111420078844e-06, + "loss": 0.5698, + "step": 4720 + }, + { + "epoch": 0.7658987670343933, + "grad_norm": 0.6145255294948995, + "learning_rate": 4.26080830422844e-06, + "loss": 0.5389, + "step": 4721 + }, + { + "epoch": 0.7660609993510707, + "grad_norm": 0.56976046337148, + "learning_rate": 4.2605051370027906e-06, + "loss": 0.5457, + "step": 4722 + }, + { + "epoch": 0.7662232316677482, + "grad_norm": 0.6236320906545878, + "learning_rate": 4.260201918410742e-06, + "loss": 0.5553, + "step": 4723 + }, + { + "epoch": 0.7663854639844258, + "grad_norm": 0.5961586651745918, + "learning_rate": 4.2598986484611415e-06, + "loss": 0.5666, + "step": 4724 + }, + { + "epoch": 0.7665476963011032, + "grad_norm": 0.5946204593658444, + "learning_rate": 4.259595327162837e-06, + "loss": 0.546, + "step": 4725 + }, + { + "epoch": 0.7667099286177806, + "grad_norm": 0.5626711275385428, + "learning_rate": 4.259291954524679e-06, + "loss": 0.5416, + "step": 4726 + }, + { + "epoch": 0.7668721609344581, + "grad_norm": 0.6200730922590623, + "learning_rate": 4.258988530555519e-06, + "loss": 0.5609, + "step": 4727 + }, + { + "epoch": 0.7670343932511356, + "grad_norm": 0.5776622433562943, + "learning_rate": 4.258685055264209e-06, + "loss": 0.5692, + "step": 4728 + }, + { + "epoch": 0.7671966255678131, + "grad_norm": 0.5916319066161401, + "learning_rate": 4.2583815286596045e-06, + "loss": 0.56, + "step": 4729 + }, + { + "epoch": 0.7673588578844905, + "grad_norm": 0.5738504232592262, + "learning_rate": 4.258077950750561e-06, + "loss": 0.5255, + "step": 4730 + }, + { + "epoch": 0.7675210902011681, + "grad_norm": 0.593996495795399, + "learning_rate": 4.257774321545937e-06, + "loss": 0.5478, + "step": 4731 + }, + { + "epoch": 0.7676833225178455, + "grad_norm": 0.5569798565776938, + "learning_rate": 4.25747064105459e-06, + "loss": 0.5529, + "step": 4732 + }, + { + "epoch": 0.767845554834523, + "grad_norm": 0.5367203582407813, + "learning_rate": 4.257166909285382e-06, + "loss": 0.5241, + "step": 4733 + }, + { + "epoch": 0.7680077871512005, + "grad_norm": 0.6022652825820156, + "learning_rate": 4.256863126247173e-06, + "loss": 0.5641, + "step": 4734 + }, + { + "epoch": 0.768170019467878, + "grad_norm": 0.5521876160789593, + "learning_rate": 4.256559291948828e-06, + "loss": 0.5294, + "step": 4735 + }, + { + "epoch": 0.7683322517845554, + "grad_norm": 0.5938900507425383, + "learning_rate": 4.256255406399213e-06, + "loss": 0.5829, + "step": 4736 + }, + { + "epoch": 0.768494484101233, + "grad_norm": 0.5720659158977481, + "learning_rate": 4.2559514696071935e-06, + "loss": 0.5776, + "step": 4737 + }, + { + "epoch": 0.7686567164179104, + "grad_norm": 0.5725488677885773, + "learning_rate": 4.2556474815816365e-06, + "loss": 0.5764, + "step": 4738 + }, + { + "epoch": 0.7688189487345879, + "grad_norm": 0.5806370970895253, + "learning_rate": 4.255343442331413e-06, + "loss": 0.5387, + "step": 4739 + }, + { + "epoch": 0.7689811810512654, + "grad_norm": 0.5978309147968415, + "learning_rate": 4.2550393518653924e-06, + "loss": 0.564, + "step": 4740 + }, + { + "epoch": 0.7691434133679429, + "grad_norm": 0.5608326148671162, + "learning_rate": 4.254735210192449e-06, + "loss": 0.5547, + "step": 4741 + }, + { + "epoch": 0.7693056456846203, + "grad_norm": 0.5394640362994165, + "learning_rate": 4.2544310173214546e-06, + "loss": 0.5424, + "step": 4742 + }, + { + "epoch": 0.7694678780012979, + "grad_norm": 0.5770307952286524, + "learning_rate": 4.254126773261287e-06, + "loss": 0.5314, + "step": 4743 + }, + { + "epoch": 0.7696301103179753, + "grad_norm": 0.5624902505388915, + "learning_rate": 4.253822478020821e-06, + "loss": 0.5438, + "step": 4744 + }, + { + "epoch": 0.7697923426346528, + "grad_norm": 0.6334584699483832, + "learning_rate": 4.253518131608936e-06, + "loss": 0.5569, + "step": 4745 + }, + { + "epoch": 0.7699545749513304, + "grad_norm": 0.5801219234643701, + "learning_rate": 4.2532137340345135e-06, + "loss": 0.5661, + "step": 4746 + }, + { + "epoch": 0.7701168072680078, + "grad_norm": 0.5831285866680019, + "learning_rate": 4.252909285306432e-06, + "loss": 0.5243, + "step": 4747 + }, + { + "epoch": 0.7702790395846852, + "grad_norm": 0.5788860540221127, + "learning_rate": 4.252604785433576e-06, + "loss": 0.5525, + "step": 4748 + }, + { + "epoch": 0.7704412719013628, + "grad_norm": 0.6034962499917594, + "learning_rate": 4.25230023442483e-06, + "loss": 0.577, + "step": 4749 + }, + { + "epoch": 0.7706035042180402, + "grad_norm": 0.5623804452606848, + "learning_rate": 4.25199563228908e-06, + "loss": 0.5624, + "step": 4750 + }, + { + "epoch": 0.7707657365347177, + "grad_norm": 0.5791775065288534, + "learning_rate": 4.251690979035213e-06, + "loss": 0.5917, + "step": 4751 + }, + { + "epoch": 0.7709279688513953, + "grad_norm": 0.5961541945996296, + "learning_rate": 4.251386274672118e-06, + "loss": 0.5578, + "step": 4752 + }, + { + "epoch": 0.7710902011680727, + "grad_norm": 0.5680299523762498, + "learning_rate": 4.2510815192086854e-06, + "loss": 0.5687, + "step": 4753 + }, + { + "epoch": 0.7712524334847501, + "grad_norm": 0.5799010623207135, + "learning_rate": 4.250776712653806e-06, + "loss": 0.5689, + "step": 4754 + }, + { + "epoch": 0.7714146658014276, + "grad_norm": 0.5849308097291863, + "learning_rate": 4.250471855016375e-06, + "loss": 0.6081, + "step": 4755 + }, + { + "epoch": 0.7715768981181051, + "grad_norm": 0.6120057692890305, + "learning_rate": 4.250166946305287e-06, + "loss": 0.5446, + "step": 4756 + }, + { + "epoch": 0.7717391304347826, + "grad_norm": 0.6066306248713195, + "learning_rate": 4.249861986529437e-06, + "loss": 0.5438, + "step": 4757 + }, + { + "epoch": 0.77190136275146, + "grad_norm": 0.5751521906876398, + "learning_rate": 4.249556975697724e-06, + "loss": 0.5633, + "step": 4758 + }, + { + "epoch": 0.7720635950681376, + "grad_norm": 0.6257494990812763, + "learning_rate": 4.249251913819047e-06, + "loss": 0.5885, + "step": 4759 + }, + { + "epoch": 0.772225827384815, + "grad_norm": 0.5893263032119471, + "learning_rate": 4.248946800902306e-06, + "loss": 0.5572, + "step": 4760 + }, + { + "epoch": 0.7723880597014925, + "grad_norm": 0.6199142827020647, + "learning_rate": 4.248641636956405e-06, + "loss": 0.5606, + "step": 4761 + }, + { + "epoch": 0.77255029201817, + "grad_norm": 0.5658869387009042, + "learning_rate": 4.248336421990247e-06, + "loss": 0.569, + "step": 4762 + }, + { + "epoch": 0.7727125243348475, + "grad_norm": 0.5736853513327487, + "learning_rate": 4.248031156012737e-06, + "loss": 0.5575, + "step": 4763 + }, + { + "epoch": 0.7728747566515249, + "grad_norm": 0.5707863972276934, + "learning_rate": 4.247725839032781e-06, + "loss": 0.5394, + "step": 4764 + }, + { + "epoch": 0.7730369889682025, + "grad_norm": 0.5560741505967354, + "learning_rate": 4.2474204710592885e-06, + "loss": 0.5556, + "step": 4765 + }, + { + "epoch": 0.77319922128488, + "grad_norm": 0.6100052862566586, + "learning_rate": 4.247115052101169e-06, + "loss": 0.5489, + "step": 4766 + }, + { + "epoch": 0.7733614536015574, + "grad_norm": 0.5717557117188152, + "learning_rate": 4.246809582167335e-06, + "loss": 0.5527, + "step": 4767 + }, + { + "epoch": 0.773523685918235, + "grad_norm": 0.6824946133593973, + "learning_rate": 4.246504061266696e-06, + "loss": 0.5436, + "step": 4768 + }, + { + "epoch": 0.7736859182349124, + "grad_norm": 0.5795241642795992, + "learning_rate": 4.246198489408169e-06, + "loss": 0.5451, + "step": 4769 + }, + { + "epoch": 0.7738481505515898, + "grad_norm": 0.5576462636687062, + "learning_rate": 4.245892866600669e-06, + "loss": 0.5465, + "step": 4770 + }, + { + "epoch": 0.7740103828682674, + "grad_norm": 0.611032112242219, + "learning_rate": 4.245587192853113e-06, + "loss": 0.5324, + "step": 4771 + }, + { + "epoch": 0.7741726151849448, + "grad_norm": 0.5667025272305931, + "learning_rate": 4.245281468174419e-06, + "loss": 0.5708, + "step": 4772 + }, + { + "epoch": 0.7743348475016223, + "grad_norm": 0.5637272948958838, + "learning_rate": 4.244975692573508e-06, + "loss": 0.5762, + "step": 4773 + }, + { + "epoch": 0.7744970798182998, + "grad_norm": 0.5819963530017954, + "learning_rate": 4.244669866059302e-06, + "loss": 0.5428, + "step": 4774 + }, + { + "epoch": 0.7746593121349773, + "grad_norm": 0.5766392953451925, + "learning_rate": 4.244363988640723e-06, + "loss": 0.5151, + "step": 4775 + }, + { + "epoch": 0.7748215444516547, + "grad_norm": 0.5613738062307791, + "learning_rate": 4.244058060326696e-06, + "loss": 0.5611, + "step": 4776 + }, + { + "epoch": 0.7749837767683323, + "grad_norm": 0.5957993333516521, + "learning_rate": 4.243752081126147e-06, + "loss": 0.5412, + "step": 4777 + }, + { + "epoch": 0.7751460090850097, + "grad_norm": 0.5976020621950745, + "learning_rate": 4.243446051048005e-06, + "loss": 0.5525, + "step": 4778 + }, + { + "epoch": 0.7753082414016872, + "grad_norm": 0.6780196920048001, + "learning_rate": 4.243139970101197e-06, + "loss": 0.5489, + "step": 4779 + }, + { + "epoch": 0.7754704737183648, + "grad_norm": 0.5972962237177404, + "learning_rate": 4.242833838294654e-06, + "loss": 0.5503, + "step": 4780 + }, + { + "epoch": 0.7756327060350422, + "grad_norm": 0.5726359928803252, + "learning_rate": 4.24252765563731e-06, + "loss": 0.5399, + "step": 4781 + }, + { + "epoch": 0.7757949383517196, + "grad_norm": 0.6076598298275219, + "learning_rate": 4.242221422138097e-06, + "loss": 0.5608, + "step": 4782 + }, + { + "epoch": 0.7759571706683971, + "grad_norm": 0.6109106958830524, + "learning_rate": 4.241915137805948e-06, + "loss": 0.5159, + "step": 4783 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 0.5520584360781867, + "learning_rate": 4.241608802649803e-06, + "loss": 0.5355, + "step": 4784 + }, + { + "epoch": 0.7762816353017521, + "grad_norm": 0.5748336373689406, + "learning_rate": 4.241302416678598e-06, + "loss": 0.5657, + "step": 4785 + }, + { + "epoch": 0.7764438676184295, + "grad_norm": 0.6059328490640955, + "learning_rate": 4.240995979901273e-06, + "loss": 0.5352, + "step": 4786 + }, + { + "epoch": 0.7766060999351071, + "grad_norm": 0.5711248409884382, + "learning_rate": 4.240689492326769e-06, + "loss": 0.5906, + "step": 4787 + }, + { + "epoch": 0.7767683322517845, + "grad_norm": 0.5877232452692424, + "learning_rate": 4.240382953964027e-06, + "loss": 0.5305, + "step": 4788 + }, + { + "epoch": 0.776930564568462, + "grad_norm": 0.5517903865243587, + "learning_rate": 4.240076364821994e-06, + "loss": 0.548, + "step": 4789 + }, + { + "epoch": 0.7770927968851395, + "grad_norm": 0.5635208609689436, + "learning_rate": 4.239769724909613e-06, + "loss": 0.5846, + "step": 4790 + }, + { + "epoch": 0.777255029201817, + "grad_norm": 0.5761544609654943, + "learning_rate": 4.23946303423583e-06, + "loss": 0.5612, + "step": 4791 + }, + { + "epoch": 0.7774172615184944, + "grad_norm": 0.6032114790192676, + "learning_rate": 4.239156292809596e-06, + "loss": 0.5573, + "step": 4792 + }, + { + "epoch": 0.777579493835172, + "grad_norm": 0.589375868864188, + "learning_rate": 4.238849500639859e-06, + "loss": 0.5679, + "step": 4793 + }, + { + "epoch": 0.7777417261518494, + "grad_norm": 0.5970356442301886, + "learning_rate": 4.238542657735571e-06, + "loss": 0.5613, + "step": 4794 + }, + { + "epoch": 0.7779039584685269, + "grad_norm": 0.5701188466595742, + "learning_rate": 4.238235764105685e-06, + "loss": 0.5842, + "step": 4795 + }, + { + "epoch": 0.7780661907852044, + "grad_norm": 0.5992284016170675, + "learning_rate": 4.237928819759154e-06, + "loss": 0.594, + "step": 4796 + }, + { + "epoch": 0.7782284231018819, + "grad_norm": 0.5777137464992628, + "learning_rate": 4.237621824704936e-06, + "loss": 0.5244, + "step": 4797 + }, + { + "epoch": 0.7783906554185593, + "grad_norm": 0.5542247129405325, + "learning_rate": 4.2373147789519855e-06, + "loss": 0.5676, + "step": 4798 + }, + { + "epoch": 0.7785528877352369, + "grad_norm": 0.5702951063769904, + "learning_rate": 4.237007682509263e-06, + "loss": 0.5803, + "step": 4799 + }, + { + "epoch": 0.7787151200519143, + "grad_norm": 0.5714853757533236, + "learning_rate": 4.236700535385728e-06, + "loss": 0.5338, + "step": 4800 + }, + { + "epoch": 0.7788773523685918, + "grad_norm": 0.6131778615413481, + "learning_rate": 4.2363933375903435e-06, + "loss": 0.5412, + "step": 4801 + }, + { + "epoch": 0.7790395846852693, + "grad_norm": 0.5537704868160908, + "learning_rate": 4.23608608913207e-06, + "loss": 0.5392, + "step": 4802 + }, + { + "epoch": 0.7792018170019468, + "grad_norm": 0.5838953632822206, + "learning_rate": 4.2357787900198745e-06, + "loss": 0.5795, + "step": 4803 + }, + { + "epoch": 0.7793640493186242, + "grad_norm": 0.6117558292124513, + "learning_rate": 4.235471440262722e-06, + "loss": 0.5453, + "step": 4804 + }, + { + "epoch": 0.7795262816353018, + "grad_norm": 0.5979563770553012, + "learning_rate": 4.2351640398695795e-06, + "loss": 0.574, + "step": 4805 + }, + { + "epoch": 0.7796885139519792, + "grad_norm": 0.611342932699046, + "learning_rate": 4.234856588849418e-06, + "loss": 0.5611, + "step": 4806 + }, + { + "epoch": 0.7798507462686567, + "grad_norm": 0.5673200265948144, + "learning_rate": 4.234549087211206e-06, + "loss": 0.5362, + "step": 4807 + }, + { + "epoch": 0.7800129785853342, + "grad_norm": 0.5846512251756304, + "learning_rate": 4.234241534963916e-06, + "loss": 0.5688, + "step": 4808 + }, + { + "epoch": 0.7801752109020117, + "grad_norm": 0.5654982062381402, + "learning_rate": 4.233933932116522e-06, + "loss": 0.549, + "step": 4809 + }, + { + "epoch": 0.7803374432186891, + "grad_norm": 0.5801474917821617, + "learning_rate": 4.233626278677999e-06, + "loss": 0.5647, + "step": 4810 + }, + { + "epoch": 0.7804996755353667, + "grad_norm": 0.5738612580593206, + "learning_rate": 4.233318574657323e-06, + "loss": 0.54, + "step": 4811 + }, + { + "epoch": 0.7806619078520441, + "grad_norm": 0.5635736691363579, + "learning_rate": 4.233010820063473e-06, + "loss": 0.5344, + "step": 4812 + }, + { + "epoch": 0.7808241401687216, + "grad_norm": 0.6146208083875596, + "learning_rate": 4.232703014905427e-06, + "loss": 0.5361, + "step": 4813 + }, + { + "epoch": 0.780986372485399, + "grad_norm": 0.5617680014031395, + "learning_rate": 4.232395159192166e-06, + "loss": 0.5234, + "step": 4814 + }, + { + "epoch": 0.7811486048020766, + "grad_norm": 0.6114023649904302, + "learning_rate": 4.232087252932673e-06, + "loss": 0.5851, + "step": 4815 + }, + { + "epoch": 0.781310837118754, + "grad_norm": 0.5380999249273017, + "learning_rate": 4.23177929613593e-06, + "loss": 0.5215, + "step": 4816 + }, + { + "epoch": 0.7814730694354315, + "grad_norm": 0.5891785672419759, + "learning_rate": 4.231471288810926e-06, + "loss": 0.5403, + "step": 4817 + }, + { + "epoch": 0.781635301752109, + "grad_norm": 0.5833940569863524, + "learning_rate": 4.231163230966644e-06, + "loss": 0.5463, + "step": 4818 + }, + { + "epoch": 0.7817975340687865, + "grad_norm": 0.5936287351387198, + "learning_rate": 4.2308551226120745e-06, + "loss": 0.5529, + "step": 4819 + }, + { + "epoch": 0.7819597663854639, + "grad_norm": 0.5802204466424835, + "learning_rate": 4.230546963756207e-06, + "loss": 0.562, + "step": 4820 + }, + { + "epoch": 0.7821219987021415, + "grad_norm": 0.6012650218378793, + "learning_rate": 4.2302387544080305e-06, + "loss": 0.5273, + "step": 4821 + }, + { + "epoch": 0.7822842310188189, + "grad_norm": 0.5975547530716114, + "learning_rate": 4.22993049457654e-06, + "loss": 0.5859, + "step": 4822 + }, + { + "epoch": 0.7824464633354964, + "grad_norm": 0.5865557263838643, + "learning_rate": 4.229622184270729e-06, + "loss": 0.5305, + "step": 4823 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.5996884004544605, + "learning_rate": 4.229313823499592e-06, + "loss": 0.5655, + "step": 4824 + }, + { + "epoch": 0.7827709279688514, + "grad_norm": 0.5816449826613664, + "learning_rate": 4.229005412272129e-06, + "loss": 0.5246, + "step": 4825 + }, + { + "epoch": 0.7829331602855288, + "grad_norm": 0.612549814058638, + "learning_rate": 4.228696950597335e-06, + "loss": 0.5793, + "step": 4826 + }, + { + "epoch": 0.7830953926022064, + "grad_norm": 0.5805186212496078, + "learning_rate": 4.228388438484212e-06, + "loss": 0.5782, + "step": 4827 + }, + { + "epoch": 0.7832576249188838, + "grad_norm": 0.5760738205457931, + "learning_rate": 4.228079875941762e-06, + "loss": 0.5158, + "step": 4828 + }, + { + "epoch": 0.7834198572355613, + "grad_norm": 0.5821604183506219, + "learning_rate": 4.227771262978986e-06, + "loss": 0.5822, + "step": 4829 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.6052906918072953, + "learning_rate": 4.227462599604889e-06, + "loss": 0.5457, + "step": 4830 + }, + { + "epoch": 0.7837443218689163, + "grad_norm": 0.6030093620142605, + "learning_rate": 4.227153885828478e-06, + "loss": 0.5629, + "step": 4831 + }, + { + "epoch": 0.7839065541855937, + "grad_norm": 0.5762415756589048, + "learning_rate": 4.22684512165876e-06, + "loss": 0.558, + "step": 4832 + }, + { + "epoch": 0.7840687865022713, + "grad_norm": 0.6591385497090977, + "learning_rate": 4.226536307104743e-06, + "loss": 0.5405, + "step": 4833 + }, + { + "epoch": 0.7842310188189487, + "grad_norm": 0.6048490511897417, + "learning_rate": 4.226227442175438e-06, + "loss": 0.5547, + "step": 4834 + }, + { + "epoch": 0.7843932511356262, + "grad_norm": 0.5730334785776277, + "learning_rate": 4.2259185268798565e-06, + "loss": 0.5411, + "step": 4835 + }, + { + "epoch": 0.7845554834523037, + "grad_norm": 0.5899533571832523, + "learning_rate": 4.225609561227012e-06, + "loss": 0.5728, + "step": 4836 + }, + { + "epoch": 0.7847177157689812, + "grad_norm": 0.581893084326678, + "learning_rate": 4.225300545225919e-06, + "loss": 0.571, + "step": 4837 + }, + { + "epoch": 0.7848799480856586, + "grad_norm": 0.5956789656025151, + "learning_rate": 4.224991478885594e-06, + "loss": 0.5758, + "step": 4838 + }, + { + "epoch": 0.7850421804023362, + "grad_norm": 0.5491764206944819, + "learning_rate": 4.224682362215055e-06, + "loss": 0.5653, + "step": 4839 + }, + { + "epoch": 0.7852044127190136, + "grad_norm": 0.6138045580923623, + "learning_rate": 4.224373195223319e-06, + "loss": 0.546, + "step": 4840 + }, + { + "epoch": 0.7853666450356911, + "grad_norm": 0.5924348816534161, + "learning_rate": 4.224063977919409e-06, + "loss": 0.5605, + "step": 4841 + }, + { + "epoch": 0.7855288773523685, + "grad_norm": 0.5550580863262247, + "learning_rate": 4.223754710312346e-06, + "loss": 0.5143, + "step": 4842 + }, + { + "epoch": 0.7856911096690461, + "grad_norm": 0.5947652548047049, + "learning_rate": 4.223445392411155e-06, + "loss": 0.5543, + "step": 4843 + }, + { + "epoch": 0.7858533419857235, + "grad_norm": 0.583445566196183, + "learning_rate": 4.223136024224859e-06, + "loss": 0.5287, + "step": 4844 + }, + { + "epoch": 0.786015574302401, + "grad_norm": 0.5716358830527732, + "learning_rate": 4.222826605762484e-06, + "loss": 0.5265, + "step": 4845 + }, + { + "epoch": 0.7861778066190785, + "grad_norm": 0.5843340663518621, + "learning_rate": 4.22251713703306e-06, + "loss": 0.5183, + "step": 4846 + }, + { + "epoch": 0.786340038935756, + "grad_norm": 0.5910043075637864, + "learning_rate": 4.2222076180456164e-06, + "loss": 0.5603, + "step": 4847 + }, + { + "epoch": 0.7865022712524334, + "grad_norm": 0.5774379422577437, + "learning_rate": 4.221898048809182e-06, + "loss": 0.5522, + "step": 4848 + }, + { + "epoch": 0.786664503569111, + "grad_norm": 0.5891571330694393, + "learning_rate": 4.2215884293327906e-06, + "loss": 0.5587, + "step": 4849 + }, + { + "epoch": 0.7868267358857884, + "grad_norm": 0.6071013035368976, + "learning_rate": 4.221278759625476e-06, + "loss": 0.5678, + "step": 4850 + }, + { + "epoch": 0.7869889682024659, + "grad_norm": 0.6132799854521468, + "learning_rate": 4.220969039696272e-06, + "loss": 0.5661, + "step": 4851 + }, + { + "epoch": 0.7871512005191434, + "grad_norm": 0.5642140636005113, + "learning_rate": 4.220659269554217e-06, + "loss": 0.5566, + "step": 4852 + }, + { + "epoch": 0.7873134328358209, + "grad_norm": 0.5730209687768523, + "learning_rate": 4.220349449208349e-06, + "loss": 0.5884, + "step": 4853 + }, + { + "epoch": 0.7874756651524983, + "grad_norm": 0.6071160692923798, + "learning_rate": 4.220039578667707e-06, + "loss": 0.5906, + "step": 4854 + }, + { + "epoch": 0.7876378974691759, + "grad_norm": 0.6188203115273763, + "learning_rate": 4.219729657941333e-06, + "loss": 0.5535, + "step": 4855 + }, + { + "epoch": 0.7878001297858533, + "grad_norm": 0.600784430555616, + "learning_rate": 4.219419687038269e-06, + "loss": 0.5596, + "step": 4856 + }, + { + "epoch": 0.7879623621025308, + "grad_norm": 0.5612301227434895, + "learning_rate": 4.219109665967558e-06, + "loss": 0.4974, + "step": 4857 + }, + { + "epoch": 0.7881245944192083, + "grad_norm": 0.596082544361406, + "learning_rate": 4.218799594738248e-06, + "loss": 0.5569, + "step": 4858 + }, + { + "epoch": 0.7882868267358858, + "grad_norm": 0.6180204861939331, + "learning_rate": 4.218489473359383e-06, + "loss": 0.5602, + "step": 4859 + }, + { + "epoch": 0.7884490590525632, + "grad_norm": 0.5920194384447498, + "learning_rate": 4.2181793018400135e-06, + "loss": 0.5707, + "step": 4860 + }, + { + "epoch": 0.7886112913692408, + "grad_norm": 0.6009321900227721, + "learning_rate": 4.217869080189189e-06, + "loss": 0.5234, + "step": 4861 + }, + { + "epoch": 0.7887735236859182, + "grad_norm": 0.6253693481898445, + "learning_rate": 4.2175588084159615e-06, + "loss": 0.5594, + "step": 4862 + }, + { + "epoch": 0.7889357560025957, + "grad_norm": 0.5715747597899474, + "learning_rate": 4.217248486529382e-06, + "loss": 0.566, + "step": 4863 + }, + { + "epoch": 0.7890979883192732, + "grad_norm": 0.5688113290310718, + "learning_rate": 4.216938114538506e-06, + "loss": 0.5331, + "step": 4864 + }, + { + "epoch": 0.7892602206359507, + "grad_norm": 0.5750573948977373, + "learning_rate": 4.21662769245239e-06, + "loss": 0.5567, + "step": 4865 + }, + { + "epoch": 0.7894224529526281, + "grad_norm": 0.5861884505154218, + "learning_rate": 4.216317220280089e-06, + "loss": 0.5978, + "step": 4866 + }, + { + "epoch": 0.7895846852693057, + "grad_norm": 0.5710873383780726, + "learning_rate": 4.216006698030664e-06, + "loss": 0.502, + "step": 4867 + }, + { + "epoch": 0.7897469175859831, + "grad_norm": 0.5987359040426427, + "learning_rate": 4.215696125713173e-06, + "loss": 0.5809, + "step": 4868 + }, + { + "epoch": 0.7899091499026606, + "grad_norm": 0.5625490558274088, + "learning_rate": 4.2153855033366794e-06, + "loss": 0.5516, + "step": 4869 + }, + { + "epoch": 0.7900713822193381, + "grad_norm": 0.5733169007303036, + "learning_rate": 4.215074830910245e-06, + "loss": 0.5258, + "step": 4870 + }, + { + "epoch": 0.7902336145360156, + "grad_norm": 0.6025842712008952, + "learning_rate": 4.214764108442936e-06, + "loss": 0.5817, + "step": 4871 + }, + { + "epoch": 0.790395846852693, + "grad_norm": 0.5743633608631679, + "learning_rate": 4.2144533359438165e-06, + "loss": 0.5724, + "step": 4872 + }, + { + "epoch": 0.7905580791693705, + "grad_norm": 0.5820750795728294, + "learning_rate": 4.214142513421955e-06, + "loss": 0.5745, + "step": 4873 + }, + { + "epoch": 0.790720311486048, + "grad_norm": 0.5752948220777814, + "learning_rate": 4.21383164088642e-06, + "loss": 0.561, + "step": 4874 + }, + { + "epoch": 0.7908825438027255, + "grad_norm": 0.6012921983820652, + "learning_rate": 4.213520718346281e-06, + "loss": 0.5651, + "step": 4875 + }, + { + "epoch": 0.7910447761194029, + "grad_norm": 0.5728113069822678, + "learning_rate": 4.213209745810612e-06, + "loss": 0.5556, + "step": 4876 + }, + { + "epoch": 0.7912070084360805, + "grad_norm": 0.5767732885843793, + "learning_rate": 4.212898723288485e-06, + "loss": 0.5416, + "step": 4877 + }, + { + "epoch": 0.7913692407527579, + "grad_norm": 0.5735081384457632, + "learning_rate": 4.212587650788973e-06, + "loss": 0.5415, + "step": 4878 + }, + { + "epoch": 0.7915314730694354, + "grad_norm": 0.5882693134233369, + "learning_rate": 4.212276528321155e-06, + "loss": 0.541, + "step": 4879 + }, + { + "epoch": 0.791693705386113, + "grad_norm": 0.5617283153977901, + "learning_rate": 4.211965355894108e-06, + "loss": 0.5554, + "step": 4880 + }, + { + "epoch": 0.7918559377027904, + "grad_norm": 0.5505184461064906, + "learning_rate": 4.2116541335169105e-06, + "loss": 0.5545, + "step": 4881 + }, + { + "epoch": 0.7920181700194678, + "grad_norm": 0.5635377457354892, + "learning_rate": 4.211342861198643e-06, + "loss": 0.5411, + "step": 4882 + }, + { + "epoch": 0.7921804023361454, + "grad_norm": 0.6064871695637287, + "learning_rate": 4.211031538948388e-06, + "loss": 0.5329, + "step": 4883 + }, + { + "epoch": 0.7923426346528228, + "grad_norm": 0.6245649960369496, + "learning_rate": 4.210720166775228e-06, + "loss": 0.5422, + "step": 4884 + }, + { + "epoch": 0.7925048669695003, + "grad_norm": 0.6009115711117952, + "learning_rate": 4.21040874468825e-06, + "loss": 0.5807, + "step": 4885 + }, + { + "epoch": 0.7926670992861778, + "grad_norm": 0.5904968151477781, + "learning_rate": 4.210097272696538e-06, + "loss": 0.5795, + "step": 4886 + }, + { + "epoch": 0.7928293316028553, + "grad_norm": 0.6156947120212853, + "learning_rate": 4.209785750809181e-06, + "loss": 0.5362, + "step": 4887 + }, + { + "epoch": 0.7929915639195327, + "grad_norm": 0.6394965476797208, + "learning_rate": 4.209474179035268e-06, + "loss": 0.5761, + "step": 4888 + }, + { + "epoch": 0.7931537962362103, + "grad_norm": 0.5591757904333452, + "learning_rate": 4.2091625573838905e-06, + "loss": 0.5591, + "step": 4889 + }, + { + "epoch": 0.7933160285528877, + "grad_norm": 0.5941118186224786, + "learning_rate": 4.20885088586414e-06, + "loss": 0.5431, + "step": 4890 + }, + { + "epoch": 0.7934782608695652, + "grad_norm": 0.5768888680757818, + "learning_rate": 4.20853916448511e-06, + "loss": 0.5594, + "step": 4891 + }, + { + "epoch": 0.7936404931862427, + "grad_norm": 0.5982343388977694, + "learning_rate": 4.208227393255896e-06, + "loss": 0.5466, + "step": 4892 + }, + { + "epoch": 0.7938027255029202, + "grad_norm": 0.6116751355966381, + "learning_rate": 4.207915572185594e-06, + "loss": 0.5537, + "step": 4893 + }, + { + "epoch": 0.7939649578195976, + "grad_norm": 0.5917784876330983, + "learning_rate": 4.207603701283304e-06, + "loss": 0.5701, + "step": 4894 + }, + { + "epoch": 0.7941271901362752, + "grad_norm": 0.5718123554113944, + "learning_rate": 4.2072917805581225e-06, + "loss": 0.5185, + "step": 4895 + }, + { + "epoch": 0.7942894224529526, + "grad_norm": 0.5926605029045825, + "learning_rate": 4.206979810019153e-06, + "loss": 0.5361, + "step": 4896 + }, + { + "epoch": 0.7944516547696301, + "grad_norm": 0.571857313047413, + "learning_rate": 4.206667789675496e-06, + "loss": 0.5682, + "step": 4897 + }, + { + "epoch": 0.7946138870863076, + "grad_norm": 0.5957794251849259, + "learning_rate": 4.206355719536257e-06, + "loss": 0.5411, + "step": 4898 + }, + { + "epoch": 0.7947761194029851, + "grad_norm": 0.5623968317772218, + "learning_rate": 4.20604359961054e-06, + "loss": 0.5175, + "step": 4899 + }, + { + "epoch": 0.7949383517196625, + "grad_norm": 0.5716091893217974, + "learning_rate": 4.2057314299074524e-06, + "loss": 0.5292, + "step": 4900 + }, + { + "epoch": 0.79510058403634, + "grad_norm": 0.609565661147297, + "learning_rate": 4.205419210436102e-06, + "loss": 0.5627, + "step": 4901 + }, + { + "epoch": 0.7952628163530175, + "grad_norm": 0.5668231192386484, + "learning_rate": 4.205106941205599e-06, + "loss": 0.5463, + "step": 4902 + }, + { + "epoch": 0.795425048669695, + "grad_norm": 0.5775056642025799, + "learning_rate": 4.204794622225053e-06, + "loss": 0.5756, + "step": 4903 + }, + { + "epoch": 0.7955872809863724, + "grad_norm": 0.5670674609743406, + "learning_rate": 4.20448225350358e-06, + "loss": 0.5526, + "step": 4904 + }, + { + "epoch": 0.79574951330305, + "grad_norm": 0.600075513790744, + "learning_rate": 4.2041698350502895e-06, + "loss": 0.5314, + "step": 4905 + }, + { + "epoch": 0.7959117456197274, + "grad_norm": 0.5683547908742991, + "learning_rate": 4.2038573668743e-06, + "loss": 0.5429, + "step": 4906 + }, + { + "epoch": 0.7960739779364049, + "grad_norm": 0.5740171786848558, + "learning_rate": 4.203544848984729e-06, + "loss": 0.5531, + "step": 4907 + }, + { + "epoch": 0.7962362102530824, + "grad_norm": 0.6201964774594898, + "learning_rate": 4.203232281390691e-06, + "loss": 0.5427, + "step": 4908 + }, + { + "epoch": 0.7963984425697599, + "grad_norm": 0.6146498910804307, + "learning_rate": 4.2029196641013104e-06, + "loss": 0.5445, + "step": 4909 + }, + { + "epoch": 0.7965606748864373, + "grad_norm": 0.598994054383017, + "learning_rate": 4.202606997125705e-06, + "loss": 0.5536, + "step": 4910 + }, + { + "epoch": 0.7967229072031149, + "grad_norm": 0.587646778180401, + "learning_rate": 4.202294280472999e-06, + "loss": 0.5609, + "step": 4911 + }, + { + "epoch": 0.7968851395197923, + "grad_norm": 0.5827952764713319, + "learning_rate": 4.201981514152317e-06, + "loss": 0.5628, + "step": 4912 + }, + { + "epoch": 0.7970473718364698, + "grad_norm": 0.5807909755284733, + "learning_rate": 4.201668698172784e-06, + "loss": 0.5261, + "step": 4913 + }, + { + "epoch": 0.7972096041531473, + "grad_norm": 0.5833293543597022, + "learning_rate": 4.201355832543527e-06, + "loss": 0.5558, + "step": 4914 + }, + { + "epoch": 0.7973718364698248, + "grad_norm": 0.5259250956089784, + "learning_rate": 4.201042917273674e-06, + "loss": 0.5204, + "step": 4915 + }, + { + "epoch": 0.7975340687865022, + "grad_norm": 0.570989285551963, + "learning_rate": 4.200729952372355e-06, + "loss": 0.5457, + "step": 4916 + }, + { + "epoch": 0.7976963011031798, + "grad_norm": 0.5880531590175305, + "learning_rate": 4.200416937848704e-06, + "loss": 0.5331, + "step": 4917 + }, + { + "epoch": 0.7978585334198572, + "grad_norm": 0.58787939929013, + "learning_rate": 4.20010387371185e-06, + "loss": 0.5719, + "step": 4918 + }, + { + "epoch": 0.7980207657365347, + "grad_norm": 0.5917282617653883, + "learning_rate": 4.19979075997093e-06, + "loss": 0.5247, + "step": 4919 + }, + { + "epoch": 0.7981829980532122, + "grad_norm": 0.5894549607475732, + "learning_rate": 4.199477596635078e-06, + "loss": 0.5509, + "step": 4920 + }, + { + "epoch": 0.7983452303698897, + "grad_norm": 0.6069470981568331, + "learning_rate": 4.199164383713433e-06, + "loss": 0.5457, + "step": 4921 + }, + { + "epoch": 0.7985074626865671, + "grad_norm": 0.6374316635390956, + "learning_rate": 4.198851121215132e-06, + "loss": 0.5558, + "step": 4922 + }, + { + "epoch": 0.7986696950032447, + "grad_norm": 0.5861766354004145, + "learning_rate": 4.198537809149316e-06, + "loss": 0.5542, + "step": 4923 + }, + { + "epoch": 0.7988319273199221, + "grad_norm": 0.5740920346666091, + "learning_rate": 4.198224447525126e-06, + "loss": 0.5463, + "step": 4924 + }, + { + "epoch": 0.7989941596365996, + "grad_norm": 0.6231474035755334, + "learning_rate": 4.197911036351706e-06, + "loss": 0.5465, + "step": 4925 + }, + { + "epoch": 0.7991563919532771, + "grad_norm": 0.6309413293204604, + "learning_rate": 4.197597575638198e-06, + "loss": 0.5563, + "step": 4926 + }, + { + "epoch": 0.7993186242699546, + "grad_norm": 0.5772878739009035, + "learning_rate": 4.1972840653937515e-06, + "loss": 0.5645, + "step": 4927 + }, + { + "epoch": 0.799480856586632, + "grad_norm": 0.6130941115716242, + "learning_rate": 4.196970505627511e-06, + "loss": 0.5668, + "step": 4928 + }, + { + "epoch": 0.7996430889033095, + "grad_norm": 0.5999100848904767, + "learning_rate": 4.1966568963486255e-06, + "loss": 0.5222, + "step": 4929 + }, + { + "epoch": 0.799805321219987, + "grad_norm": 0.6548557622547966, + "learning_rate": 4.1963432375662474e-06, + "loss": 0.563, + "step": 4930 + }, + { + "epoch": 0.7999675535366645, + "grad_norm": 0.6800798444683008, + "learning_rate": 4.196029529289525e-06, + "loss": 0.5346, + "step": 4931 + }, + { + "epoch": 0.8001297858533419, + "grad_norm": 0.60688126690516, + "learning_rate": 4.195715771527614e-06, + "loss": 0.581, + "step": 4932 + }, + { + "epoch": 0.8002920181700195, + "grad_norm": 0.5549581438504704, + "learning_rate": 4.195401964289668e-06, + "loss": 0.5959, + "step": 4933 + }, + { + "epoch": 0.8004542504866969, + "grad_norm": 0.5743281456079335, + "learning_rate": 4.195088107584843e-06, + "loss": 0.5457, + "step": 4934 + }, + { + "epoch": 0.8006164828033744, + "grad_norm": 0.5879628560418647, + "learning_rate": 4.194774201422297e-06, + "loss": 0.5436, + "step": 4935 + }, + { + "epoch": 0.8007787151200519, + "grad_norm": 0.605244162737623, + "learning_rate": 4.194460245811188e-06, + "loss": 0.5698, + "step": 4936 + }, + { + "epoch": 0.8009409474367294, + "grad_norm": 0.5803463073475494, + "learning_rate": 4.194146240760677e-06, + "loss": 0.5176, + "step": 4937 + }, + { + "epoch": 0.8011031797534068, + "grad_norm": 0.6221180341365936, + "learning_rate": 4.193832186279925e-06, + "loss": 0.538, + "step": 4938 + }, + { + "epoch": 0.8012654120700844, + "grad_norm": 0.6183069802943443, + "learning_rate": 4.193518082378095e-06, + "loss": 0.5746, + "step": 4939 + }, + { + "epoch": 0.8014276443867618, + "grad_norm": 0.6086911369760674, + "learning_rate": 4.1932039290643534e-06, + "loss": 0.605, + "step": 4940 + }, + { + "epoch": 0.8015898767034393, + "grad_norm": 0.6150295367888226, + "learning_rate": 4.192889726347864e-06, + "loss": 0.5508, + "step": 4941 + }, + { + "epoch": 0.8017521090201168, + "grad_norm": 0.5726309555864545, + "learning_rate": 4.192575474237797e-06, + "loss": 0.5563, + "step": 4942 + }, + { + "epoch": 0.8019143413367943, + "grad_norm": 0.6272266974799767, + "learning_rate": 4.192261172743318e-06, + "loss": 0.5527, + "step": 4943 + }, + { + "epoch": 0.8020765736534717, + "grad_norm": 0.5822796724688352, + "learning_rate": 4.1919468218736e-06, + "loss": 0.5317, + "step": 4944 + }, + { + "epoch": 0.8022388059701493, + "grad_norm": 0.587139009925098, + "learning_rate": 4.1916324216378145e-06, + "loss": 0.5322, + "step": 4945 + }, + { + "epoch": 0.8024010382868267, + "grad_norm": 0.6196525762217092, + "learning_rate": 4.191317972045134e-06, + "loss": 0.5336, + "step": 4946 + }, + { + "epoch": 0.8025632706035042, + "grad_norm": 0.5608980100329727, + "learning_rate": 4.1910034731047335e-06, + "loss": 0.5686, + "step": 4947 + }, + { + "epoch": 0.8027255029201817, + "grad_norm": 0.5638167250648521, + "learning_rate": 4.190688924825789e-06, + "loss": 0.5023, + "step": 4948 + }, + { + "epoch": 0.8028877352368592, + "grad_norm": 0.6145974661319487, + "learning_rate": 4.190374327217478e-06, + "loss": 0.5428, + "step": 4949 + }, + { + "epoch": 0.8030499675535366, + "grad_norm": 0.5716035318938026, + "learning_rate": 4.190059680288981e-06, + "loss": 0.5415, + "step": 4950 + }, + { + "epoch": 0.8032121998702142, + "grad_norm": 0.5688547560740189, + "learning_rate": 4.189744984049476e-06, + "loss": 0.5696, + "step": 4951 + }, + { + "epoch": 0.8033744321868916, + "grad_norm": 0.611743011221152, + "learning_rate": 4.189430238508147e-06, + "loss": 0.5593, + "step": 4952 + }, + { + "epoch": 0.8035366645035691, + "grad_norm": 0.5996488012892072, + "learning_rate": 4.189115443674177e-06, + "loss": 0.5411, + "step": 4953 + }, + { + "epoch": 0.8036988968202466, + "grad_norm": 0.5600151802665135, + "learning_rate": 4.188800599556749e-06, + "loss": 0.5625, + "step": 4954 + }, + { + "epoch": 0.8038611291369241, + "grad_norm": 0.5766129403430842, + "learning_rate": 4.188485706165052e-06, + "loss": 0.5188, + "step": 4955 + }, + { + "epoch": 0.8040233614536015, + "grad_norm": 0.5597623859204697, + "learning_rate": 4.188170763508271e-06, + "loss": 0.5446, + "step": 4956 + }, + { + "epoch": 0.8041855937702791, + "grad_norm": 0.560844190757258, + "learning_rate": 4.187855771595597e-06, + "loss": 0.5505, + "step": 4957 + }, + { + "epoch": 0.8043478260869565, + "grad_norm": 0.5800703619033958, + "learning_rate": 4.1875407304362206e-06, + "loss": 0.5044, + "step": 4958 + }, + { + "epoch": 0.804510058403634, + "grad_norm": 0.5740064060369267, + "learning_rate": 4.187225640039332e-06, + "loss": 0.5146, + "step": 4959 + }, + { + "epoch": 0.8046722907203114, + "grad_norm": 0.5880532949936803, + "learning_rate": 4.186910500414126e-06, + "loss": 0.5498, + "step": 4960 + }, + { + "epoch": 0.804834523036989, + "grad_norm": 0.57531695088739, + "learning_rate": 4.186595311569798e-06, + "loss": 0.5551, + "step": 4961 + }, + { + "epoch": 0.8049967553536664, + "grad_norm": 0.5858723357591605, + "learning_rate": 4.186280073515543e-06, + "loss": 0.5889, + "step": 4962 + }, + { + "epoch": 0.8051589876703439, + "grad_norm": 0.5819132426305801, + "learning_rate": 4.185964786260559e-06, + "loss": 0.5031, + "step": 4963 + }, + { + "epoch": 0.8053212199870214, + "grad_norm": 0.5860090005562953, + "learning_rate": 4.185649449814046e-06, + "loss": 0.5483, + "step": 4964 + }, + { + "epoch": 0.8054834523036989, + "grad_norm": 0.5699133055286937, + "learning_rate": 4.185334064185203e-06, + "loss": 0.5238, + "step": 4965 + }, + { + "epoch": 0.8056456846203763, + "grad_norm": 0.6174468496484408, + "learning_rate": 4.185018629383234e-06, + "loss": 0.5268, + "step": 4966 + }, + { + "epoch": 0.8058079169370539, + "grad_norm": 0.5635147421472954, + "learning_rate": 4.1847031454173406e-06, + "loss": 0.5445, + "step": 4967 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 0.5878260434942105, + "learning_rate": 4.184387612296729e-06, + "loss": 0.5345, + "step": 4968 + }, + { + "epoch": 0.8061323815704088, + "grad_norm": 0.5719446382165142, + "learning_rate": 4.184072030030605e-06, + "loss": 0.5506, + "step": 4969 + }, + { + "epoch": 0.8062946138870863, + "grad_norm": 0.588230881834831, + "learning_rate": 4.183756398628176e-06, + "loss": 0.5436, + "step": 4970 + }, + { + "epoch": 0.8064568462037638, + "grad_norm": 0.5735299432936151, + "learning_rate": 4.183440718098652e-06, + "loss": 0.559, + "step": 4971 + }, + { + "epoch": 0.8066190785204412, + "grad_norm": 0.5843001809927173, + "learning_rate": 4.1831249884512435e-06, + "loss": 0.5731, + "step": 4972 + }, + { + "epoch": 0.8067813108371188, + "grad_norm": 0.582574481213294, + "learning_rate": 4.182809209695163e-06, + "loss": 0.5661, + "step": 4973 + }, + { + "epoch": 0.8069435431537962, + "grad_norm": 0.5832748449226096, + "learning_rate": 4.182493381839622e-06, + "loss": 0.5496, + "step": 4974 + }, + { + "epoch": 0.8071057754704737, + "grad_norm": 0.6046805308918332, + "learning_rate": 4.1821775048938375e-06, + "loss": 0.5328, + "step": 4975 + }, + { + "epoch": 0.8072680077871512, + "grad_norm": 0.5773047030917111, + "learning_rate": 4.181861578867026e-06, + "loss": 0.5916, + "step": 4976 + }, + { + "epoch": 0.8074302401038287, + "grad_norm": 0.6011998945627108, + "learning_rate": 4.181545603768403e-06, + "loss": 0.535, + "step": 4977 + }, + { + "epoch": 0.8075924724205061, + "grad_norm": 0.601598651687244, + "learning_rate": 4.18122957960719e-06, + "loss": 0.5743, + "step": 4978 + }, + { + "epoch": 0.8077547047371837, + "grad_norm": 0.5649073386615346, + "learning_rate": 4.180913506392606e-06, + "loss": 0.5545, + "step": 4979 + }, + { + "epoch": 0.8079169370538611, + "grad_norm": 0.6095178966770974, + "learning_rate": 4.180597384133875e-06, + "loss": 0.556, + "step": 4980 + }, + { + "epoch": 0.8080791693705386, + "grad_norm": 0.5556512996134363, + "learning_rate": 4.18028121284022e-06, + "loss": 0.5356, + "step": 4981 + }, + { + "epoch": 0.8082414016872161, + "grad_norm": 0.5703731749049906, + "learning_rate": 4.179964992520864e-06, + "loss": 0.5424, + "step": 4982 + }, + { + "epoch": 0.8084036340038936, + "grad_norm": 0.5878391526755999, + "learning_rate": 4.1796487231850365e-06, + "loss": 0.5582, + "step": 4983 + }, + { + "epoch": 0.808565866320571, + "grad_norm": 0.5644663109211654, + "learning_rate": 4.179332404841963e-06, + "loss": 0.5405, + "step": 4984 + }, + { + "epoch": 0.8087280986372486, + "grad_norm": 0.5726982010657444, + "learning_rate": 4.179016037500872e-06, + "loss": 0.5818, + "step": 4985 + }, + { + "epoch": 0.808890330953926, + "grad_norm": 0.5796447052790231, + "learning_rate": 4.178699621170997e-06, + "loss": 0.5528, + "step": 4986 + }, + { + "epoch": 0.8090525632706035, + "grad_norm": 0.5566254092534081, + "learning_rate": 4.1783831558615685e-06, + "loss": 0.547, + "step": 4987 + }, + { + "epoch": 0.8092147955872809, + "grad_norm": 0.6151791539919037, + "learning_rate": 4.17806664158182e-06, + "loss": 0.5248, + "step": 4988 + }, + { + "epoch": 0.8093770279039585, + "grad_norm": 0.5566689863314536, + "learning_rate": 4.177750078340986e-06, + "loss": 0.5657, + "step": 4989 + }, + { + "epoch": 0.8095392602206359, + "grad_norm": 0.5564652748794973, + "learning_rate": 4.177433466148304e-06, + "loss": 0.579, + "step": 4990 + }, + { + "epoch": 0.8097014925373134, + "grad_norm": 0.5848918532205446, + "learning_rate": 4.177116805013011e-06, + "loss": 0.5448, + "step": 4991 + }, + { + "epoch": 0.8098637248539909, + "grad_norm": 0.6078445641358949, + "learning_rate": 4.176800094944348e-06, + "loss": 0.5604, + "step": 4992 + }, + { + "epoch": 0.8100259571706684, + "grad_norm": 0.5959333368785331, + "learning_rate": 4.176483335951553e-06, + "loss": 0.5547, + "step": 4993 + }, + { + "epoch": 0.8101881894873458, + "grad_norm": 0.5758485407352306, + "learning_rate": 4.176166528043871e-06, + "loss": 0.5805, + "step": 4994 + }, + { + "epoch": 0.8103504218040234, + "grad_norm": 0.565117035899313, + "learning_rate": 4.175849671230542e-06, + "loss": 0.5405, + "step": 4995 + }, + { + "epoch": 0.8105126541207008, + "grad_norm": 0.5802182862944403, + "learning_rate": 4.1755327655208146e-06, + "loss": 0.5732, + "step": 4996 + }, + { + "epoch": 0.8106748864373783, + "grad_norm": 0.5577753001946287, + "learning_rate": 4.175215810923932e-06, + "loss": 0.5572, + "step": 4997 + }, + { + "epoch": 0.8108371187540558, + "grad_norm": 0.6009540467907196, + "learning_rate": 4.174898807449144e-06, + "loss": 0.5346, + "step": 4998 + }, + { + "epoch": 0.8109993510707333, + "grad_norm": 0.5912192201419263, + "learning_rate": 4.1745817551057e-06, + "loss": 0.5405, + "step": 4999 + }, + { + "epoch": 0.8111615833874107, + "grad_norm": 0.5641394795523567, + "learning_rate": 4.1742646539028494e-06, + "loss": 0.5015, + "step": 5000 + }, + { + "epoch": 0.8113238157040883, + "grad_norm": 0.5592353019655139, + "learning_rate": 4.173947503849844e-06, + "loss": 0.5798, + "step": 5001 + }, + { + "epoch": 0.8114860480207657, + "grad_norm": 0.5895705908065486, + "learning_rate": 4.173630304955939e-06, + "loss": 0.5566, + "step": 5002 + }, + { + "epoch": 0.8116482803374432, + "grad_norm": 0.6130577194073632, + "learning_rate": 4.173313057230389e-06, + "loss": 0.5399, + "step": 5003 + }, + { + "epoch": 0.8118105126541207, + "grad_norm": 0.589950152473626, + "learning_rate": 4.17299576068245e-06, + "loss": 0.5639, + "step": 5004 + }, + { + "epoch": 0.8119727449707982, + "grad_norm": 0.5575046165444573, + "learning_rate": 4.172678415321379e-06, + "loss": 0.552, + "step": 5005 + }, + { + "epoch": 0.8121349772874756, + "grad_norm": 0.581983174709885, + "learning_rate": 4.172361021156436e-06, + "loss": 0.5295, + "step": 5006 + }, + { + "epoch": 0.8122972096041532, + "grad_norm": 0.6091434844207466, + "learning_rate": 4.172043578196881e-06, + "loss": 0.5307, + "step": 5007 + }, + { + "epoch": 0.8124594419208306, + "grad_norm": 0.5958280517038501, + "learning_rate": 4.171726086451977e-06, + "loss": 0.5151, + "step": 5008 + }, + { + "epoch": 0.8126216742375081, + "grad_norm": 0.6140206621999461, + "learning_rate": 4.171408545930987e-06, + "loss": 0.5755, + "step": 5009 + }, + { + "epoch": 0.8127839065541856, + "grad_norm": 0.5958101963297623, + "learning_rate": 4.171090956643177e-06, + "loss": 0.5555, + "step": 5010 + }, + { + "epoch": 0.8129461388708631, + "grad_norm": 0.6105122944959434, + "learning_rate": 4.170773318597811e-06, + "loss": 0.5975, + "step": 5011 + }, + { + "epoch": 0.8131083711875405, + "grad_norm": 0.5899136694466258, + "learning_rate": 4.1704556318041585e-06, + "loss": 0.571, + "step": 5012 + }, + { + "epoch": 0.8132706035042181, + "grad_norm": 0.6237503960902323, + "learning_rate": 4.170137896271488e-06, + "loss": 0.4985, + "step": 5013 + }, + { + "epoch": 0.8134328358208955, + "grad_norm": 0.5865143212611821, + "learning_rate": 4.1698201120090704e-06, + "loss": 0.5217, + "step": 5014 + }, + { + "epoch": 0.813595068137573, + "grad_norm": 0.5989414004303195, + "learning_rate": 4.1695022790261775e-06, + "loss": 0.5644, + "step": 5015 + }, + { + "epoch": 0.8137573004542504, + "grad_norm": 0.6916868726383832, + "learning_rate": 4.1691843973320825e-06, + "loss": 0.5682, + "step": 5016 + }, + { + "epoch": 0.813919532770928, + "grad_norm": 0.6067048504221128, + "learning_rate": 4.168866466936061e-06, + "loss": 0.5396, + "step": 5017 + }, + { + "epoch": 0.8140817650876054, + "grad_norm": 0.5970815953431803, + "learning_rate": 4.168548487847389e-06, + "loss": 0.577, + "step": 5018 + }, + { + "epoch": 0.8142439974042829, + "grad_norm": 0.6087788448174839, + "learning_rate": 4.1682304600753446e-06, + "loss": 0.549, + "step": 5019 + }, + { + "epoch": 0.8144062297209604, + "grad_norm": 0.5903825465260628, + "learning_rate": 4.167912383629206e-06, + "loss": 0.5555, + "step": 5020 + }, + { + "epoch": 0.8145684620376379, + "grad_norm": 0.5780594847429642, + "learning_rate": 4.1675942585182536e-06, + "loss": 0.5688, + "step": 5021 + }, + { + "epoch": 0.8147306943543153, + "grad_norm": 0.6031277237947625, + "learning_rate": 4.167276084751771e-06, + "loss": 0.5558, + "step": 5022 + }, + { + "epoch": 0.8148929266709929, + "grad_norm": 0.5535222675490482, + "learning_rate": 4.16695786233904e-06, + "loss": 0.5429, + "step": 5023 + }, + { + "epoch": 0.8150551589876703, + "grad_norm": 0.5649842294974055, + "learning_rate": 4.166639591289346e-06, + "loss": 0.5935, + "step": 5024 + }, + { + "epoch": 0.8152173913043478, + "grad_norm": 0.6050465916423209, + "learning_rate": 4.166321271611975e-06, + "loss": 0.5225, + "step": 5025 + }, + { + "epoch": 0.8153796236210253, + "grad_norm": 0.6160541188768902, + "learning_rate": 4.166002903316214e-06, + "loss": 0.5805, + "step": 5026 + }, + { + "epoch": 0.8155418559377028, + "grad_norm": 0.6066720224588715, + "learning_rate": 4.165684486411354e-06, + "loss": 0.5648, + "step": 5027 + }, + { + "epoch": 0.8157040882543802, + "grad_norm": 0.5558293725035601, + "learning_rate": 4.1653660209066835e-06, + "loss": 0.5755, + "step": 5028 + }, + { + "epoch": 0.8158663205710578, + "grad_norm": 0.5988537729711488, + "learning_rate": 4.165047506811496e-06, + "loss": 0.5842, + "step": 5029 + }, + { + "epoch": 0.8160285528877352, + "grad_norm": 0.5774749959358948, + "learning_rate": 4.164728944135083e-06, + "loss": 0.5826, + "step": 5030 + }, + { + "epoch": 0.8161907852044127, + "grad_norm": 0.5497809634180599, + "learning_rate": 4.164410332886741e-06, + "loss": 0.5278, + "step": 5031 + }, + { + "epoch": 0.8163530175210902, + "grad_norm": 0.6159354455040159, + "learning_rate": 4.164091673075766e-06, + "loss": 0.5697, + "step": 5032 + }, + { + "epoch": 0.8165152498377677, + "grad_norm": 0.5560208058651851, + "learning_rate": 4.163772964711453e-06, + "loss": 0.5753, + "step": 5033 + }, + { + "epoch": 0.8166774821544451, + "grad_norm": 0.5976556708802722, + "learning_rate": 4.163454207803105e-06, + "loss": 0.5114, + "step": 5034 + }, + { + "epoch": 0.8168397144711227, + "grad_norm": 0.5887010110887005, + "learning_rate": 4.163135402360019e-06, + "loss": 0.5956, + "step": 5035 + }, + { + "epoch": 0.8170019467878001, + "grad_norm": 0.5504780180048958, + "learning_rate": 4.162816548391498e-06, + "loss": 0.5455, + "step": 5036 + }, + { + "epoch": 0.8171641791044776, + "grad_norm": 0.5995941401492192, + "learning_rate": 4.162497645906846e-06, + "loss": 0.5514, + "step": 5037 + }, + { + "epoch": 0.8173264114211551, + "grad_norm": 0.5946300684129933, + "learning_rate": 4.162178694915368e-06, + "loss": 0.5442, + "step": 5038 + }, + { + "epoch": 0.8174886437378326, + "grad_norm": 0.5840627421162083, + "learning_rate": 4.1618596954263675e-06, + "loss": 0.551, + "step": 5039 + }, + { + "epoch": 0.81765087605451, + "grad_norm": 0.574610745884106, + "learning_rate": 4.161540647449154e-06, + "loss": 0.5344, + "step": 5040 + }, + { + "epoch": 0.8178131083711876, + "grad_norm": 0.5905755059386828, + "learning_rate": 4.161221550993036e-06, + "loss": 0.5719, + "step": 5041 + }, + { + "epoch": 0.817975340687865, + "grad_norm": 0.5807365500007333, + "learning_rate": 4.160902406067324e-06, + "loss": 0.55, + "step": 5042 + }, + { + "epoch": 0.8181375730045425, + "grad_norm": 0.6190436815707026, + "learning_rate": 4.160583212681328e-06, + "loss": 0.5848, + "step": 5043 + }, + { + "epoch": 0.81829980532122, + "grad_norm": 0.5558727886898365, + "learning_rate": 4.160263970844364e-06, + "loss": 0.5508, + "step": 5044 + }, + { + "epoch": 0.8184620376378975, + "grad_norm": 0.5658295191845131, + "learning_rate": 4.159944680565745e-06, + "loss": 0.5302, + "step": 5045 + }, + { + "epoch": 0.8186242699545749, + "grad_norm": 0.6096411541723843, + "learning_rate": 4.159625341854787e-06, + "loss": 0.6014, + "step": 5046 + }, + { + "epoch": 0.8187865022712524, + "grad_norm": 0.5838392631155799, + "learning_rate": 4.159305954720807e-06, + "loss": 0.5337, + "step": 5047 + }, + { + "epoch": 0.8189487345879299, + "grad_norm": 0.6013960884332239, + "learning_rate": 4.158986519173125e-06, + "loss": 0.5606, + "step": 5048 + }, + { + "epoch": 0.8191109669046074, + "grad_norm": 0.5681227112118862, + "learning_rate": 4.158667035221059e-06, + "loss": 0.5608, + "step": 5049 + }, + { + "epoch": 0.8192731992212848, + "grad_norm": 0.5800190512484037, + "learning_rate": 4.158347502873933e-06, + "loss": 0.5295, + "step": 5050 + }, + { + "epoch": 0.8194354315379624, + "grad_norm": 0.549020343110051, + "learning_rate": 4.15802792214107e-06, + "loss": 0.5489, + "step": 5051 + }, + { + "epoch": 0.8195976638546398, + "grad_norm": 0.5853721475649402, + "learning_rate": 4.157708293031793e-06, + "loss": 0.5799, + "step": 5052 + }, + { + "epoch": 0.8197598961713173, + "grad_norm": 0.5761723964137646, + "learning_rate": 4.157388615555427e-06, + "loss": 0.5771, + "step": 5053 + }, + { + "epoch": 0.8199221284879948, + "grad_norm": 0.5910940667482505, + "learning_rate": 4.157068889721302e-06, + "loss": 0.5661, + "step": 5054 + }, + { + "epoch": 0.8200843608046723, + "grad_norm": 0.6104926509022194, + "learning_rate": 4.156749115538744e-06, + "loss": 0.5438, + "step": 5055 + }, + { + "epoch": 0.8202465931213497, + "grad_norm": 0.5783394525848148, + "learning_rate": 4.1564292930170855e-06, + "loss": 0.5623, + "step": 5056 + }, + { + "epoch": 0.8204088254380273, + "grad_norm": 0.5588066259408059, + "learning_rate": 4.156109422165656e-06, + "loss": 0.5311, + "step": 5057 + }, + { + "epoch": 0.8205710577547047, + "grad_norm": 0.5752455963771981, + "learning_rate": 4.15578950299379e-06, + "loss": 0.5531, + "step": 5058 + }, + { + "epoch": 0.8207332900713822, + "grad_norm": 0.5674842145115904, + "learning_rate": 4.15546953551082e-06, + "loss": 0.5249, + "step": 5059 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.5852876528516482, + "learning_rate": 4.155149519726083e-06, + "loss": 0.5855, + "step": 5060 + }, + { + "epoch": 0.8210577547047372, + "grad_norm": 0.5640331330642898, + "learning_rate": 4.154829455648916e-06, + "loss": 0.5332, + "step": 5061 + }, + { + "epoch": 0.8212199870214146, + "grad_norm": 0.5748633065658687, + "learning_rate": 4.154509343288658e-06, + "loss": 0.5252, + "step": 5062 + }, + { + "epoch": 0.8213822193380922, + "grad_norm": 0.5927565407116256, + "learning_rate": 4.154189182654648e-06, + "loss": 0.5929, + "step": 5063 + }, + { + "epoch": 0.8215444516547696, + "grad_norm": 0.5914353549116154, + "learning_rate": 4.153868973756228e-06, + "loss": 0.5582, + "step": 5064 + }, + { + "epoch": 0.8217066839714471, + "grad_norm": 0.612704654416121, + "learning_rate": 4.1535487166027396e-06, + "loss": 0.584, + "step": 5065 + }, + { + "epoch": 0.8218689162881246, + "grad_norm": 0.5903724580337185, + "learning_rate": 4.1532284112035295e-06, + "loss": 0.5339, + "step": 5066 + }, + { + "epoch": 0.8220311486048021, + "grad_norm": 0.5978612062214901, + "learning_rate": 4.152908057567939e-06, + "loss": 0.5564, + "step": 5067 + }, + { + "epoch": 0.8221933809214795, + "grad_norm": 0.5608350576640598, + "learning_rate": 4.152587655705321e-06, + "loss": 0.5645, + "step": 5068 + }, + { + "epoch": 0.8223556132381571, + "grad_norm": 0.576945619272994, + "learning_rate": 4.1522672056250175e-06, + "loss": 0.5452, + "step": 5069 + }, + { + "epoch": 0.8225178455548345, + "grad_norm": 0.6173332556732631, + "learning_rate": 4.151946707336383e-06, + "loss": 0.5931, + "step": 5070 + }, + { + "epoch": 0.822680077871512, + "grad_norm": 0.6339415140640194, + "learning_rate": 4.151626160848766e-06, + "loss": 0.5486, + "step": 5071 + }, + { + "epoch": 0.8228423101881895, + "grad_norm": 0.6113708842027008, + "learning_rate": 4.151305566171521e-06, + "loss": 0.5589, + "step": 5072 + }, + { + "epoch": 0.823004542504867, + "grad_norm": 0.5908188318612407, + "learning_rate": 4.150984923314001e-06, + "loss": 0.558, + "step": 5073 + }, + { + "epoch": 0.8231667748215444, + "grad_norm": 0.5771481225903476, + "learning_rate": 4.1506642322855625e-06, + "loss": 0.5077, + "step": 5074 + }, + { + "epoch": 0.8233290071382219, + "grad_norm": 0.5987248008235827, + "learning_rate": 4.1503434930955606e-06, + "loss": 0.5646, + "step": 5075 + }, + { + "epoch": 0.8234912394548994, + "grad_norm": 0.5677921720461782, + "learning_rate": 4.150022705753354e-06, + "loss": 0.5387, + "step": 5076 + }, + { + "epoch": 0.8236534717715769, + "grad_norm": 0.6516303788042174, + "learning_rate": 4.149701870268303e-06, + "loss": 0.5672, + "step": 5077 + }, + { + "epoch": 0.8238157040882543, + "grad_norm": 0.6162344475926133, + "learning_rate": 4.149380986649769e-06, + "loss": 0.5741, + "step": 5078 + }, + { + "epoch": 0.8239779364049319, + "grad_norm": 0.6033258122407364, + "learning_rate": 4.149060054907114e-06, + "loss": 0.5358, + "step": 5079 + }, + { + "epoch": 0.8241401687216093, + "grad_norm": 0.5904869244669894, + "learning_rate": 4.148739075049701e-06, + "loss": 0.5615, + "step": 5080 + }, + { + "epoch": 0.8243024010382868, + "grad_norm": 0.5802821882551789, + "learning_rate": 4.148418047086895e-06, + "loss": 0.5602, + "step": 5081 + }, + { + "epoch": 0.8244646333549643, + "grad_norm": 0.5906795396416477, + "learning_rate": 4.148096971028065e-06, + "loss": 0.5623, + "step": 5082 + }, + { + "epoch": 0.8246268656716418, + "grad_norm": 0.6273919553612645, + "learning_rate": 4.147775846882577e-06, + "loss": 0.5692, + "step": 5083 + }, + { + "epoch": 0.8247890979883192, + "grad_norm": 0.6081289584309849, + "learning_rate": 4.147454674659802e-06, + "loss": 0.5553, + "step": 5084 + }, + { + "epoch": 0.8249513303049968, + "grad_norm": 0.5841996826774862, + "learning_rate": 4.147133454369109e-06, + "loss": 0.5662, + "step": 5085 + }, + { + "epoch": 0.8251135626216742, + "grad_norm": 0.6045136631615593, + "learning_rate": 4.146812186019872e-06, + "loss": 0.5308, + "step": 5086 + }, + { + "epoch": 0.8252757949383517, + "grad_norm": 0.5625774322395996, + "learning_rate": 4.146490869621464e-06, + "loss": 0.5712, + "step": 5087 + }, + { + "epoch": 0.8254380272550292, + "grad_norm": 0.5755307179561704, + "learning_rate": 4.146169505183258e-06, + "loss": 0.5448, + "step": 5088 + }, + { + "epoch": 0.8256002595717067, + "grad_norm": 0.5857720550133262, + "learning_rate": 4.145848092714635e-06, + "loss": 0.5497, + "step": 5089 + }, + { + "epoch": 0.8257624918883841, + "grad_norm": 0.5648161341650896, + "learning_rate": 4.145526632224969e-06, + "loss": 0.5597, + "step": 5090 + }, + { + "epoch": 0.8259247242050617, + "grad_norm": 0.5618862068482894, + "learning_rate": 4.14520512372364e-06, + "loss": 0.5533, + "step": 5091 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.5891424467011679, + "learning_rate": 4.1448835672200305e-06, + "loss": 0.5411, + "step": 5092 + }, + { + "epoch": 0.8262491888384166, + "grad_norm": 0.5755141929552391, + "learning_rate": 4.144561962723522e-06, + "loss": 0.5546, + "step": 5093 + }, + { + "epoch": 0.8264114211550941, + "grad_norm": 0.5637556239573321, + "learning_rate": 4.144240310243496e-06, + "loss": 0.5575, + "step": 5094 + }, + { + "epoch": 0.8265736534717716, + "grad_norm": 0.5911303932882812, + "learning_rate": 4.143918609789339e-06, + "loss": 0.5564, + "step": 5095 + }, + { + "epoch": 0.826735885788449, + "grad_norm": 0.5722220218369182, + "learning_rate": 4.143596861370437e-06, + "loss": 0.5431, + "step": 5096 + }, + { + "epoch": 0.8268981181051266, + "grad_norm": 0.5566375083847993, + "learning_rate": 4.1432750649961785e-06, + "loss": 0.5235, + "step": 5097 + }, + { + "epoch": 0.827060350421804, + "grad_norm": 0.5410677083565537, + "learning_rate": 4.142953220675952e-06, + "loss": 0.5484, + "step": 5098 + }, + { + "epoch": 0.8272225827384815, + "grad_norm": 0.5776084592571143, + "learning_rate": 4.142631328419147e-06, + "loss": 0.5539, + "step": 5099 + }, + { + "epoch": 0.827384815055159, + "grad_norm": 0.5836978329591285, + "learning_rate": 4.142309388235155e-06, + "loss": 0.5679, + "step": 5100 + }, + { + "epoch": 0.8275470473718365, + "grad_norm": 0.5955530822710127, + "learning_rate": 4.1419874001333725e-06, + "loss": 0.5368, + "step": 5101 + }, + { + "epoch": 0.8277092796885139, + "grad_norm": 0.609124010047205, + "learning_rate": 4.141665364123191e-06, + "loss": 0.5369, + "step": 5102 + }, + { + "epoch": 0.8278715120051914, + "grad_norm": 0.6302320017850094, + "learning_rate": 4.141343280214008e-06, + "loss": 0.5696, + "step": 5103 + }, + { + "epoch": 0.8280337443218689, + "grad_norm": 0.6018980575451341, + "learning_rate": 4.1410211484152205e-06, + "loss": 0.5273, + "step": 5104 + }, + { + "epoch": 0.8281959766385464, + "grad_norm": 0.6030937256565551, + "learning_rate": 4.140698968736227e-06, + "loss": 0.5591, + "step": 5105 + }, + { + "epoch": 0.8283582089552238, + "grad_norm": 0.5865848101413912, + "learning_rate": 4.140376741186429e-06, + "loss": 0.5257, + "step": 5106 + }, + { + "epoch": 0.8285204412719014, + "grad_norm": 0.5999373333700004, + "learning_rate": 4.1400544657752264e-06, + "loss": 0.5641, + "step": 5107 + }, + { + "epoch": 0.8286826735885788, + "grad_norm": 0.5979057223829326, + "learning_rate": 4.139732142512025e-06, + "loss": 0.5177, + "step": 5108 + }, + { + "epoch": 0.8288449059052563, + "grad_norm": 0.5826110749033921, + "learning_rate": 4.139409771406226e-06, + "loss": 0.5436, + "step": 5109 + }, + { + "epoch": 0.8290071382219338, + "grad_norm": 0.5558738439559913, + "learning_rate": 4.139087352467237e-06, + "loss": 0.5654, + "step": 5110 + }, + { + "epoch": 0.8291693705386113, + "grad_norm": 0.5625904193533383, + "learning_rate": 4.138764885704466e-06, + "loss": 0.5392, + "step": 5111 + }, + { + "epoch": 0.8293316028552887, + "grad_norm": 0.59419203552535, + "learning_rate": 4.138442371127319e-06, + "loss": 0.5653, + "step": 5112 + }, + { + "epoch": 0.8294938351719663, + "grad_norm": 0.547577944178211, + "learning_rate": 4.138119808745209e-06, + "loss": 0.5665, + "step": 5113 + }, + { + "epoch": 0.8296560674886437, + "grad_norm": 0.582766910429329, + "learning_rate": 4.137797198567546e-06, + "loss": 0.5714, + "step": 5114 + }, + { + "epoch": 0.8298182998053212, + "grad_norm": 0.5966213694144928, + "learning_rate": 4.137474540603742e-06, + "loss": 0.5505, + "step": 5115 + }, + { + "epoch": 0.8299805321219987, + "grad_norm": 0.5668121622061558, + "learning_rate": 4.137151834863213e-06, + "loss": 0.5462, + "step": 5116 + }, + { + "epoch": 0.8301427644386762, + "grad_norm": 0.5718760959345959, + "learning_rate": 4.136829081355373e-06, + "loss": 0.5702, + "step": 5117 + }, + { + "epoch": 0.8303049967553536, + "grad_norm": 0.5552506913789691, + "learning_rate": 4.136506280089641e-06, + "loss": 0.5044, + "step": 5118 + }, + { + "epoch": 0.8304672290720312, + "grad_norm": 0.5930324663974121, + "learning_rate": 4.136183431075433e-06, + "loss": 0.5588, + "step": 5119 + }, + { + "epoch": 0.8306294613887086, + "grad_norm": 0.5595856692025891, + "learning_rate": 4.1358605343221705e-06, + "loss": 0.5745, + "step": 5120 + }, + { + "epoch": 0.8307916937053861, + "grad_norm": 0.5570557851711679, + "learning_rate": 4.1355375898392745e-06, + "loss": 0.5475, + "step": 5121 + }, + { + "epoch": 0.8309539260220636, + "grad_norm": 0.5515216769908843, + "learning_rate": 4.135214597636167e-06, + "loss": 0.5535, + "step": 5122 + }, + { + "epoch": 0.8311161583387411, + "grad_norm": 0.5894689450301841, + "learning_rate": 4.1348915577222715e-06, + "loss": 0.5553, + "step": 5123 + }, + { + "epoch": 0.8312783906554185, + "grad_norm": 0.56680030338578, + "learning_rate": 4.1345684701070145e-06, + "loss": 0.5376, + "step": 5124 + }, + { + "epoch": 0.8314406229720961, + "grad_norm": 0.9162112260692503, + "learning_rate": 4.134245334799822e-06, + "loss": 0.5521, + "step": 5125 + }, + { + "epoch": 0.8316028552887735, + "grad_norm": 0.5555939519389206, + "learning_rate": 4.133922151810123e-06, + "loss": 0.5072, + "step": 5126 + }, + { + "epoch": 0.831765087605451, + "grad_norm": 0.5687251282888679, + "learning_rate": 4.133598921147347e-06, + "loss": 0.5671, + "step": 5127 + }, + { + "epoch": 0.8319273199221285, + "grad_norm": 0.5881041716639265, + "learning_rate": 4.133275642820923e-06, + "loss": 0.5785, + "step": 5128 + }, + { + "epoch": 0.832089552238806, + "grad_norm": 0.5652021182622674, + "learning_rate": 4.132952316840286e-06, + "loss": 0.546, + "step": 5129 + }, + { + "epoch": 0.8322517845554834, + "grad_norm": 0.5656847573702611, + "learning_rate": 4.132628943214867e-06, + "loss": 0.527, + "step": 5130 + }, + { + "epoch": 0.832414016872161, + "grad_norm": 0.5675930925939137, + "learning_rate": 4.132305521954104e-06, + "loss": 0.5337, + "step": 5131 + }, + { + "epoch": 0.8325762491888384, + "grad_norm": 0.5756408266043742, + "learning_rate": 4.131982053067432e-06, + "loss": 0.5789, + "step": 5132 + }, + { + "epoch": 0.8327384815055159, + "grad_norm": 0.5951180270384807, + "learning_rate": 4.1316585365642875e-06, + "loss": 0.5734, + "step": 5133 + }, + { + "epoch": 0.8329007138221933, + "grad_norm": 0.5630893953237189, + "learning_rate": 4.131334972454112e-06, + "loss": 0.547, + "step": 5134 + }, + { + "epoch": 0.8330629461388709, + "grad_norm": 0.5816771771919225, + "learning_rate": 4.131011360746346e-06, + "loss": 0.5915, + "step": 5135 + }, + { + "epoch": 0.8332251784555483, + "grad_norm": 0.5549156448093719, + "learning_rate": 4.1306877014504295e-06, + "loss": 0.5627, + "step": 5136 + }, + { + "epoch": 0.8333874107722258, + "grad_norm": 0.5763267925098903, + "learning_rate": 4.130363994575808e-06, + "loss": 0.5339, + "step": 5137 + }, + { + "epoch": 0.8335496430889033, + "grad_norm": 0.6061431822149564, + "learning_rate": 4.130040240131925e-06, + "loss": 0.5477, + "step": 5138 + }, + { + "epoch": 0.8337118754055808, + "grad_norm": 0.6106515498258004, + "learning_rate": 4.129716438128228e-06, + "loss": 0.5785, + "step": 5139 + }, + { + "epoch": 0.8338741077222582, + "grad_norm": 0.5722478013624014, + "learning_rate": 4.129392588574164e-06, + "loss": 0.5638, + "step": 5140 + }, + { + "epoch": 0.8340363400389358, + "grad_norm": 0.5926334668076263, + "learning_rate": 4.129068691479182e-06, + "loss": 0.5049, + "step": 5141 + }, + { + "epoch": 0.8341985723556132, + "grad_norm": 0.5745967116869722, + "learning_rate": 4.128744746852732e-06, + "loss": 0.5453, + "step": 5142 + }, + { + "epoch": 0.8343608046722907, + "grad_norm": 0.5918324232230554, + "learning_rate": 4.128420754704266e-06, + "loss": 0.5801, + "step": 5143 + }, + { + "epoch": 0.8345230369889682, + "grad_norm": 0.5658324264328641, + "learning_rate": 4.128096715043237e-06, + "loss": 0.5606, + "step": 5144 + }, + { + "epoch": 0.8346852693056457, + "grad_norm": 0.5814656408793653, + "learning_rate": 4.1277726278790995e-06, + "loss": 0.5728, + "step": 5145 + }, + { + "epoch": 0.8348475016223231, + "grad_norm": 0.5687195667818555, + "learning_rate": 4.12744849322131e-06, + "loss": 0.5518, + "step": 5146 + }, + { + "epoch": 0.8350097339390007, + "grad_norm": 0.5608517758105639, + "learning_rate": 4.127124311079326e-06, + "loss": 0.5653, + "step": 5147 + }, + { + "epoch": 0.8351719662556781, + "grad_norm": 0.5960680369722253, + "learning_rate": 4.126800081462605e-06, + "loss": 0.5459, + "step": 5148 + }, + { + "epoch": 0.8353341985723556, + "grad_norm": 0.5904634753094128, + "learning_rate": 4.1264758043806084e-06, + "loss": 0.5499, + "step": 5149 + }, + { + "epoch": 0.8354964308890331, + "grad_norm": 0.6007712684865748, + "learning_rate": 4.126151479842796e-06, + "loss": 0.5275, + "step": 5150 + }, + { + "epoch": 0.8356586632057106, + "grad_norm": 0.5766675718107502, + "learning_rate": 4.125827107858632e-06, + "loss": 0.5431, + "step": 5151 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 0.6102255211315019, + "learning_rate": 4.125502688437581e-06, + "loss": 0.5519, + "step": 5152 + }, + { + "epoch": 0.8359831278390656, + "grad_norm": 0.5880541524932043, + "learning_rate": 4.125178221589107e-06, + "loss": 0.5497, + "step": 5153 + }, + { + "epoch": 0.836145360155743, + "grad_norm": 0.5952023471080838, + "learning_rate": 4.124853707322678e-06, + "loss": 0.5517, + "step": 5154 + }, + { + "epoch": 0.8363075924724205, + "grad_norm": 0.5673914836434715, + "learning_rate": 4.124529145647762e-06, + "loss": 0.5432, + "step": 5155 + }, + { + "epoch": 0.836469824789098, + "grad_norm": 0.60707597151807, + "learning_rate": 4.124204536573829e-06, + "loss": 0.5565, + "step": 5156 + }, + { + "epoch": 0.8366320571057755, + "grad_norm": 0.5756503522728085, + "learning_rate": 4.12387988011035e-06, + "loss": 0.555, + "step": 5157 + }, + { + "epoch": 0.8367942894224529, + "grad_norm": 0.5681273760864923, + "learning_rate": 4.123555176266798e-06, + "loss": 0.5077, + "step": 5158 + }, + { + "epoch": 0.8369565217391305, + "grad_norm": 0.5602105628497146, + "learning_rate": 4.123230425052647e-06, + "loss": 0.5307, + "step": 5159 + }, + { + "epoch": 0.8371187540558079, + "grad_norm": 0.557415978160736, + "learning_rate": 4.122905626477371e-06, + "loss": 0.5491, + "step": 5160 + }, + { + "epoch": 0.8372809863724854, + "grad_norm": 0.6056647930438709, + "learning_rate": 4.122580780550447e-06, + "loss": 0.5342, + "step": 5161 + }, + { + "epoch": 0.8374432186891628, + "grad_norm": 0.5999644125332066, + "learning_rate": 4.122255887281355e-06, + "loss": 0.5357, + "step": 5162 + }, + { + "epoch": 0.8376054510058404, + "grad_norm": 0.5848257527774096, + "learning_rate": 4.121930946679572e-06, + "loss": 0.5307, + "step": 5163 + }, + { + "epoch": 0.8377676833225178, + "grad_norm": 0.5957528796659728, + "learning_rate": 4.1216059587545804e-06, + "loss": 0.593, + "step": 5164 + }, + { + "epoch": 0.8379299156391953, + "grad_norm": 0.5849689420923863, + "learning_rate": 4.121280923515862e-06, + "loss": 0.5184, + "step": 5165 + }, + { + "epoch": 0.8380921479558728, + "grad_norm": 0.5655243347644752, + "learning_rate": 4.1209558409729e-06, + "loss": 0.5409, + "step": 5166 + }, + { + "epoch": 0.8382543802725503, + "grad_norm": 0.5612560728382381, + "learning_rate": 4.12063071113518e-06, + "loss": 0.566, + "step": 5167 + }, + { + "epoch": 0.8384166125892277, + "grad_norm": 0.5844572865869698, + "learning_rate": 4.120305534012186e-06, + "loss": 0.5412, + "step": 5168 + }, + { + "epoch": 0.8385788449059053, + "grad_norm": 0.5805122453398591, + "learning_rate": 4.119980309613409e-06, + "loss": 0.5575, + "step": 5169 + }, + { + "epoch": 0.8387410772225827, + "grad_norm": 0.5764004362175323, + "learning_rate": 4.119655037948337e-06, + "loss": 0.5897, + "step": 5170 + }, + { + "epoch": 0.8389033095392602, + "grad_norm": 0.5743528542343138, + "learning_rate": 4.11932971902646e-06, + "loss": 0.5304, + "step": 5171 + }, + { + "epoch": 0.8390655418559377, + "grad_norm": 0.5568695329080467, + "learning_rate": 4.119004352857271e-06, + "loss": 0.5532, + "step": 5172 + }, + { + "epoch": 0.8392277741726152, + "grad_norm": 0.5711951449395442, + "learning_rate": 4.118678939450261e-06, + "loss": 0.5302, + "step": 5173 + }, + { + "epoch": 0.8393900064892926, + "grad_norm": 0.6116513838818078, + "learning_rate": 4.1183534788149256e-06, + "loss": 0.5541, + "step": 5174 + }, + { + "epoch": 0.8395522388059702, + "grad_norm": 0.5748194635540887, + "learning_rate": 4.118027970960762e-06, + "loss": 0.5581, + "step": 5175 + }, + { + "epoch": 0.8397144711226476, + "grad_norm": 0.6033401193158905, + "learning_rate": 4.117702415897267e-06, + "loss": 0.5386, + "step": 5176 + }, + { + "epoch": 0.8398767034393251, + "grad_norm": 0.5875510451054182, + "learning_rate": 4.1173768136339375e-06, + "loss": 0.5496, + "step": 5177 + }, + { + "epoch": 0.8400389357560026, + "grad_norm": 0.5876368818182696, + "learning_rate": 4.117051164180277e-06, + "loss": 0.5713, + "step": 5178 + }, + { + "epoch": 0.8402011680726801, + "grad_norm": 0.5943318562047475, + "learning_rate": 4.116725467545783e-06, + "loss": 0.5499, + "step": 5179 + }, + { + "epoch": 0.8403634003893575, + "grad_norm": 0.5586508520798478, + "learning_rate": 4.116399723739962e-06, + "loss": 0.5732, + "step": 5180 + }, + { + "epoch": 0.8405256327060351, + "grad_norm": 0.5786613405434878, + "learning_rate": 4.116073932772316e-06, + "loss": 0.5436, + "step": 5181 + }, + { + "epoch": 0.8406878650227125, + "grad_norm": 0.5952493773100951, + "learning_rate": 4.115748094652352e-06, + "loss": 0.5512, + "step": 5182 + }, + { + "epoch": 0.84085009733939, + "grad_norm": 0.5496053719879003, + "learning_rate": 4.115422209389577e-06, + "loss": 0.5569, + "step": 5183 + }, + { + "epoch": 0.8410123296560675, + "grad_norm": 0.5776576618928561, + "learning_rate": 4.115096276993498e-06, + "loss": 0.5317, + "step": 5184 + }, + { + "epoch": 0.841174561972745, + "grad_norm": 0.555642485636945, + "learning_rate": 4.114770297473626e-06, + "loss": 0.5445, + "step": 5185 + }, + { + "epoch": 0.8413367942894224, + "grad_norm": 0.568037883270261, + "learning_rate": 4.1144442708394704e-06, + "loss": 0.5362, + "step": 5186 + }, + { + "epoch": 0.8414990266061, + "grad_norm": 0.5614193480220494, + "learning_rate": 4.114118197100546e-06, + "loss": 0.5664, + "step": 5187 + }, + { + "epoch": 0.8416612589227774, + "grad_norm": 0.5638205597892478, + "learning_rate": 4.1137920762663656e-06, + "loss": 0.5159, + "step": 5188 + }, + { + "epoch": 0.8418234912394549, + "grad_norm": 0.6162792972353333, + "learning_rate": 4.113465908346444e-06, + "loss": 0.575, + "step": 5189 + }, + { + "epoch": 0.8419857235561323, + "grad_norm": 0.5941194521572032, + "learning_rate": 4.1131396933503e-06, + "loss": 0.5604, + "step": 5190 + }, + { + "epoch": 0.8421479558728099, + "grad_norm": 0.5693708309482725, + "learning_rate": 4.112813431287448e-06, + "loss": 0.5612, + "step": 5191 + }, + { + "epoch": 0.8423101881894873, + "grad_norm": 0.571312801157387, + "learning_rate": 4.11248712216741e-06, + "loss": 0.5425, + "step": 5192 + }, + { + "epoch": 0.8424724205061648, + "grad_norm": 0.5794188396388538, + "learning_rate": 4.1121607659997065e-06, + "loss": 0.5495, + "step": 5193 + }, + { + "epoch": 0.8426346528228423, + "grad_norm": 0.5948479082275397, + "learning_rate": 4.111834362793858e-06, + "loss": 0.5589, + "step": 5194 + }, + { + "epoch": 0.8427968851395198, + "grad_norm": 0.5954250245123613, + "learning_rate": 4.11150791255939e-06, + "loss": 0.5434, + "step": 5195 + }, + { + "epoch": 0.8429591174561972, + "grad_norm": 0.5577022852477108, + "learning_rate": 4.111181415305827e-06, + "loss": 0.5514, + "step": 5196 + }, + { + "epoch": 0.8431213497728748, + "grad_norm": 0.5641691156011063, + "learning_rate": 4.110854871042694e-06, + "loss": 0.5698, + "step": 5197 + }, + { + "epoch": 0.8432835820895522, + "grad_norm": 0.5976572455281564, + "learning_rate": 4.11052827977952e-06, + "loss": 0.5703, + "step": 5198 + }, + { + "epoch": 0.8434458144062297, + "grad_norm": 0.5666199393294539, + "learning_rate": 4.1102016415258325e-06, + "loss": 0.5744, + "step": 5199 + }, + { + "epoch": 0.8436080467229072, + "grad_norm": 0.5481866749252751, + "learning_rate": 4.109874956291163e-06, + "loss": 0.5571, + "step": 5200 + }, + { + "epoch": 0.8437702790395847, + "grad_norm": 0.5797813973293564, + "learning_rate": 4.109548224085044e-06, + "loss": 0.5227, + "step": 5201 + }, + { + "epoch": 0.8439325113562621, + "grad_norm": 0.5732855126459803, + "learning_rate": 4.109221444917006e-06, + "loss": 0.5632, + "step": 5202 + }, + { + "epoch": 0.8440947436729397, + "grad_norm": 0.573397792594528, + "learning_rate": 4.1088946187965865e-06, + "loss": 0.5697, + "step": 5203 + }, + { + "epoch": 0.8442569759896171, + "grad_norm": 0.5831352312888456, + "learning_rate": 4.108567745733318e-06, + "loss": 0.5438, + "step": 5204 + }, + { + "epoch": 0.8444192083062946, + "grad_norm": 0.5769691984883719, + "learning_rate": 4.10824082573674e-06, + "loss": 0.5687, + "step": 5205 + }, + { + "epoch": 0.8445814406229721, + "grad_norm": 0.5952068608629482, + "learning_rate": 4.107913858816392e-06, + "loss": 0.5487, + "step": 5206 + }, + { + "epoch": 0.8447436729396496, + "grad_norm": 0.6259034407531548, + "learning_rate": 4.1075868449818115e-06, + "loss": 0.5411, + "step": 5207 + }, + { + "epoch": 0.844905905256327, + "grad_norm": 0.577410218539166, + "learning_rate": 4.107259784242541e-06, + "loss": 0.5762, + "step": 5208 + }, + { + "epoch": 0.8450681375730046, + "grad_norm": 0.5752360946057082, + "learning_rate": 4.106932676608122e-06, + "loss": 0.5916, + "step": 5209 + }, + { + "epoch": 0.845230369889682, + "grad_norm": 0.5691523893084213, + "learning_rate": 4.106605522088101e-06, + "loss": 0.516, + "step": 5210 + }, + { + "epoch": 0.8453926022063595, + "grad_norm": 0.5927636504823496, + "learning_rate": 4.106278320692022e-06, + "loss": 0.5643, + "step": 5211 + }, + { + "epoch": 0.845554834523037, + "grad_norm": 0.5782679223838638, + "learning_rate": 4.10595107242943e-06, + "loss": 0.5678, + "step": 5212 + }, + { + "epoch": 0.8457170668397145, + "grad_norm": 0.5790559737305727, + "learning_rate": 4.105623777309877e-06, + "loss": 0.5502, + "step": 5213 + }, + { + "epoch": 0.8458792991563919, + "grad_norm": 0.6066679394857519, + "learning_rate": 4.10529643534291e-06, + "loss": 0.554, + "step": 5214 + }, + { + "epoch": 0.8460415314730695, + "grad_norm": 0.5921314640013283, + "learning_rate": 4.10496904653808e-06, + "loss": 0.5754, + "step": 5215 + }, + { + "epoch": 0.8462037637897469, + "grad_norm": 0.5927841946418393, + "learning_rate": 4.104641610904939e-06, + "loss": 0.5248, + "step": 5216 + }, + { + "epoch": 0.8463659961064244, + "grad_norm": 0.6202686744941983, + "learning_rate": 4.104314128453042e-06, + "loss": 0.5211, + "step": 5217 + }, + { + "epoch": 0.8465282284231019, + "grad_norm": 0.5670698737757711, + "learning_rate": 4.103986599191943e-06, + "loss": 0.5735, + "step": 5218 + }, + { + "epoch": 0.8466904607397794, + "grad_norm": 0.5907370522889316, + "learning_rate": 4.103659023131197e-06, + "loss": 0.5413, + "step": 5219 + }, + { + "epoch": 0.8468526930564568, + "grad_norm": 0.5858090266888697, + "learning_rate": 4.103331400280365e-06, + "loss": 0.547, + "step": 5220 + }, + { + "epoch": 0.8470149253731343, + "grad_norm": 0.5616891106403228, + "learning_rate": 4.1030037306490035e-06, + "loss": 0.5533, + "step": 5221 + }, + { + "epoch": 0.8471771576898118, + "grad_norm": 0.5826265869390884, + "learning_rate": 4.102676014246674e-06, + "loss": 0.5543, + "step": 5222 + }, + { + "epoch": 0.8473393900064893, + "grad_norm": 0.6297190560047498, + "learning_rate": 4.102348251082938e-06, + "loss": 0.5883, + "step": 5223 + }, + { + "epoch": 0.8475016223231667, + "grad_norm": 0.5917503310160506, + "learning_rate": 4.102020441167359e-06, + "loss": 0.4927, + "step": 5224 + }, + { + "epoch": 0.8476638546398443, + "grad_norm": 0.5945386744333223, + "learning_rate": 4.101692584509502e-06, + "loss": 0.5445, + "step": 5225 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 0.5778641984534946, + "learning_rate": 4.10136468111893e-06, + "loss": 0.5852, + "step": 5226 + }, + { + "epoch": 0.8479883192731992, + "grad_norm": 0.6220383853645886, + "learning_rate": 4.101036731005213e-06, + "loss": 0.5604, + "step": 5227 + }, + { + "epoch": 0.8481505515898767, + "grad_norm": 0.5747902791423618, + "learning_rate": 4.10070873417792e-06, + "loss": 0.5766, + "step": 5228 + }, + { + "epoch": 0.8483127839065542, + "grad_norm": 0.5981225567437706, + "learning_rate": 4.10038069064662e-06, + "loss": 0.5362, + "step": 5229 + }, + { + "epoch": 0.8484750162232316, + "grad_norm": 0.5614982520938706, + "learning_rate": 4.100052600420884e-06, + "loss": 0.5547, + "step": 5230 + }, + { + "epoch": 0.8486372485399092, + "grad_norm": 0.5870301266133817, + "learning_rate": 4.099724463510285e-06, + "loss": 0.5861, + "step": 5231 + }, + { + "epoch": 0.8487994808565866, + "grad_norm": 0.6066233141866463, + "learning_rate": 4.0993962799243965e-06, + "loss": 0.5556, + "step": 5232 + }, + { + "epoch": 0.8489617131732641, + "grad_norm": 0.5470560656039457, + "learning_rate": 4.0990680496727965e-06, + "loss": 0.5076, + "step": 5233 + }, + { + "epoch": 0.8491239454899416, + "grad_norm": 0.5519869883569799, + "learning_rate": 4.098739772765058e-06, + "loss": 0.5419, + "step": 5234 + }, + { + "epoch": 0.8492861778066191, + "grad_norm": 0.5694311660375644, + "learning_rate": 4.098411449210762e-06, + "loss": 0.5631, + "step": 5235 + }, + { + "epoch": 0.8494484101232965, + "grad_norm": 0.5870899827230072, + "learning_rate": 4.098083079019487e-06, + "loss": 0.5812, + "step": 5236 + }, + { + "epoch": 0.8496106424399741, + "grad_norm": 0.562290828723328, + "learning_rate": 4.097754662200814e-06, + "loss": 0.5638, + "step": 5237 + }, + { + "epoch": 0.8497728747566515, + "grad_norm": 0.5904453154376786, + "learning_rate": 4.097426198764325e-06, + "loss": 0.5495, + "step": 5238 + }, + { + "epoch": 0.849935107073329, + "grad_norm": 0.5999613278429899, + "learning_rate": 4.097097688719605e-06, + "loss": 0.5354, + "step": 5239 + }, + { + "epoch": 0.8500973393900065, + "grad_norm": 0.5882524982812014, + "learning_rate": 4.096769132076236e-06, + "loss": 0.4742, + "step": 5240 + }, + { + "epoch": 0.850259571706684, + "grad_norm": 0.5707675497810646, + "learning_rate": 4.096440528843807e-06, + "loss": 0.5483, + "step": 5241 + }, + { + "epoch": 0.8504218040233614, + "grad_norm": 0.552862011226401, + "learning_rate": 4.096111879031905e-06, + "loss": 0.5779, + "step": 5242 + }, + { + "epoch": 0.850584036340039, + "grad_norm": 0.6044801282835663, + "learning_rate": 4.095783182650118e-06, + "loss": 0.5657, + "step": 5243 + }, + { + "epoch": 0.8507462686567164, + "grad_norm": 0.5677450471892679, + "learning_rate": 4.095454439708038e-06, + "loss": 0.545, + "step": 5244 + }, + { + "epoch": 0.8509085009733939, + "grad_norm": 0.5918176288071325, + "learning_rate": 4.095125650215256e-06, + "loss": 0.5588, + "step": 5245 + }, + { + "epoch": 0.8510707332900714, + "grad_norm": 0.6173411540485647, + "learning_rate": 4.094796814181366e-06, + "loss": 0.5505, + "step": 5246 + }, + { + "epoch": 0.8512329656067489, + "grad_norm": 0.6351090092832264, + "learning_rate": 4.09446793161596e-06, + "loss": 0.5142, + "step": 5247 + }, + { + "epoch": 0.8513951979234263, + "grad_norm": 0.5671349658188201, + "learning_rate": 4.094139002528635e-06, + "loss": 0.5527, + "step": 5248 + }, + { + "epoch": 0.8515574302401038, + "grad_norm": 0.594066147256548, + "learning_rate": 4.09381002692899e-06, + "loss": 0.5422, + "step": 5249 + }, + { + "epoch": 0.8517196625567813, + "grad_norm": 0.5976050059439615, + "learning_rate": 4.093481004826622e-06, + "loss": 0.5373, + "step": 5250 + }, + { + "epoch": 0.8518818948734588, + "grad_norm": 0.579190434334255, + "learning_rate": 4.09315193623113e-06, + "loss": 0.5318, + "step": 5251 + }, + { + "epoch": 0.8520441271901362, + "grad_norm": 0.5867445793740313, + "learning_rate": 4.092822821152117e-06, + "loss": 0.5596, + "step": 5252 + }, + { + "epoch": 0.8522063595068138, + "grad_norm": 0.576505920620384, + "learning_rate": 4.092493659599185e-06, + "loss": 0.5607, + "step": 5253 + }, + { + "epoch": 0.8523685918234912, + "grad_norm": 0.6011544023203932, + "learning_rate": 4.092164451581939e-06, + "loss": 0.5466, + "step": 5254 + }, + { + "epoch": 0.8525308241401687, + "grad_norm": 0.5794173282090512, + "learning_rate": 4.091835197109982e-06, + "loss": 0.5222, + "step": 5255 + }, + { + "epoch": 0.8526930564568462, + "grad_norm": 0.6132370184275542, + "learning_rate": 4.091505896192922e-06, + "loss": 0.5769, + "step": 5256 + }, + { + "epoch": 0.8528552887735237, + "grad_norm": 0.6284674057057988, + "learning_rate": 4.091176548840367e-06, + "loss": 0.5524, + "step": 5257 + }, + { + "epoch": 0.8530175210902011, + "grad_norm": 0.5840757849019635, + "learning_rate": 4.0908471550619265e-06, + "loss": 0.5589, + "step": 5258 + }, + { + "epoch": 0.8531797534068787, + "grad_norm": 0.5967855299781003, + "learning_rate": 4.0905177148672115e-06, + "loss": 0.5654, + "step": 5259 + }, + { + "epoch": 0.8533419857235561, + "grad_norm": 0.5731841178323659, + "learning_rate": 4.090188228265833e-06, + "loss": 0.5927, + "step": 5260 + }, + { + "epoch": 0.8535042180402336, + "grad_norm": 0.5748089293301907, + "learning_rate": 4.089858695267405e-06, + "loss": 0.5973, + "step": 5261 + }, + { + "epoch": 0.8536664503569111, + "grad_norm": 0.5932386048561458, + "learning_rate": 4.089529115881543e-06, + "loss": 0.5504, + "step": 5262 + }, + { + "epoch": 0.8538286826735886, + "grad_norm": 0.5661655129849135, + "learning_rate": 4.089199490117863e-06, + "loss": 0.5843, + "step": 5263 + }, + { + "epoch": 0.853990914990266, + "grad_norm": 0.5665327411544043, + "learning_rate": 4.088869817985982e-06, + "loss": 0.5728, + "step": 5264 + }, + { + "epoch": 0.8541531473069436, + "grad_norm": 0.5662340496644751, + "learning_rate": 4.088540099495518e-06, + "loss": 0.5412, + "step": 5265 + }, + { + "epoch": 0.854315379623621, + "grad_norm": 0.549701125061171, + "learning_rate": 4.088210334656094e-06, + "loss": 0.5374, + "step": 5266 + }, + { + "epoch": 0.8544776119402985, + "grad_norm": 0.5714931871252624, + "learning_rate": 4.0878805234773285e-06, + "loss": 0.5608, + "step": 5267 + }, + { + "epoch": 0.854639844256976, + "grad_norm": 0.5837051023890106, + "learning_rate": 4.087550665968846e-06, + "loss": 0.4994, + "step": 5268 + }, + { + "epoch": 0.8548020765736535, + "grad_norm": 0.568267014569209, + "learning_rate": 4.087220762140271e-06, + "loss": 0.602, + "step": 5269 + }, + { + "epoch": 0.8549643088903309, + "grad_norm": 0.5658977930431633, + "learning_rate": 4.086890812001228e-06, + "loss": 0.5397, + "step": 5270 + }, + { + "epoch": 0.8551265412070085, + "grad_norm": 0.5884100598882579, + "learning_rate": 4.0865608155613455e-06, + "loss": 0.5668, + "step": 5271 + }, + { + "epoch": 0.8552887735236859, + "grad_norm": 0.600478584584546, + "learning_rate": 4.086230772830251e-06, + "loss": 0.5386, + "step": 5272 + }, + { + "epoch": 0.8554510058403634, + "grad_norm": 0.5903053140225137, + "learning_rate": 4.085900683817573e-06, + "loss": 0.5337, + "step": 5273 + }, + { + "epoch": 0.8556132381570409, + "grad_norm": 0.5879473789660447, + "learning_rate": 4.0855705485329444e-06, + "loss": 0.5354, + "step": 5274 + }, + { + "epoch": 0.8557754704737184, + "grad_norm": 0.5751726525763465, + "learning_rate": 4.0852403669859976e-06, + "loss": 0.5682, + "step": 5275 + }, + { + "epoch": 0.8559377027903958, + "grad_norm": 0.5787959155633347, + "learning_rate": 4.084910139186364e-06, + "loss": 0.5673, + "step": 5276 + }, + { + "epoch": 0.8560999351070734, + "grad_norm": 0.5674115492415789, + "learning_rate": 4.084579865143682e-06, + "loss": 0.5567, + "step": 5277 + }, + { + "epoch": 0.8562621674237508, + "grad_norm": 0.6057491171711065, + "learning_rate": 4.084249544867585e-06, + "loss": 0.5616, + "step": 5278 + }, + { + "epoch": 0.8564243997404283, + "grad_norm": 0.5701485905249545, + "learning_rate": 4.083919178367712e-06, + "loss": 0.5679, + "step": 5279 + }, + { + "epoch": 0.8565866320571057, + "grad_norm": 0.6067624304204379, + "learning_rate": 4.0835887656537036e-06, + "loss": 0.5334, + "step": 5280 + }, + { + "epoch": 0.8567488643737833, + "grad_norm": 0.564039156299224, + "learning_rate": 4.083258306735198e-06, + "loss": 0.5691, + "step": 5281 + }, + { + "epoch": 0.8569110966904607, + "grad_norm": 0.5886319516158384, + "learning_rate": 4.082927801621839e-06, + "loss": 0.5641, + "step": 5282 + }, + { + "epoch": 0.8570733290071382, + "grad_norm": 0.5506735743443498, + "learning_rate": 4.082597250323267e-06, + "loss": 0.5373, + "step": 5283 + }, + { + "epoch": 0.8572355613238157, + "grad_norm": 0.5802724154409244, + "learning_rate": 4.08226665284913e-06, + "loss": 0.5508, + "step": 5284 + }, + { + "epoch": 0.8573977936404932, + "grad_norm": 0.5857927253386948, + "learning_rate": 4.081936009209071e-06, + "loss": 0.5115, + "step": 5285 + }, + { + "epoch": 0.8575600259571706, + "grad_norm": 0.5675479401395307, + "learning_rate": 4.081605319412738e-06, + "loss": 0.5686, + "step": 5286 + }, + { + "epoch": 0.8577222582738482, + "grad_norm": 0.5883052789419541, + "learning_rate": 4.081274583469781e-06, + "loss": 0.5196, + "step": 5287 + }, + { + "epoch": 0.8578844905905256, + "grad_norm": 0.5816881938992344, + "learning_rate": 4.080943801389849e-06, + "loss": 0.5565, + "step": 5288 + }, + { + "epoch": 0.8580467229072031, + "grad_norm": 0.5619062670270074, + "learning_rate": 4.080612973182591e-06, + "loss": 0.5528, + "step": 5289 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.5800551273720557, + "learning_rate": 4.080282098857663e-06, + "loss": 0.5446, + "step": 5290 + }, + { + "epoch": 0.8583711875405581, + "grad_norm": 0.5835300320042939, + "learning_rate": 4.079951178424718e-06, + "loss": 0.5371, + "step": 5291 + }, + { + "epoch": 0.8585334198572355, + "grad_norm": 0.5808256019042799, + "learning_rate": 4.07962021189341e-06, + "loss": 0.5808, + "step": 5292 + }, + { + "epoch": 0.8586956521739131, + "grad_norm": 0.5896244028405322, + "learning_rate": 4.079289199273397e-06, + "loss": 0.5032, + "step": 5293 + }, + { + "epoch": 0.8588578844905905, + "grad_norm": 0.5541551131705846, + "learning_rate": 4.078958140574337e-06, + "loss": 0.5407, + "step": 5294 + }, + { + "epoch": 0.859020116807268, + "grad_norm": 0.5818449062323701, + "learning_rate": 4.0786270358058885e-06, + "loss": 0.5294, + "step": 5295 + }, + { + "epoch": 0.8591823491239455, + "grad_norm": 0.58969854113, + "learning_rate": 4.078295884977712e-06, + "loss": 0.5225, + "step": 5296 + }, + { + "epoch": 0.859344581440623, + "grad_norm": 0.6046032204406047, + "learning_rate": 4.077964688099471e-06, + "loss": 0.5515, + "step": 5297 + }, + { + "epoch": 0.8595068137573004, + "grad_norm": 0.5751860692031712, + "learning_rate": 4.077633445180827e-06, + "loss": 0.5366, + "step": 5298 + }, + { + "epoch": 0.859669046073978, + "grad_norm": 0.5809020931942924, + "learning_rate": 4.0773021562314455e-06, + "loss": 0.5348, + "step": 5299 + }, + { + "epoch": 0.8598312783906554, + "grad_norm": 0.572769227705037, + "learning_rate": 4.076970821260994e-06, + "loss": 0.532, + "step": 5300 + }, + { + "epoch": 0.8599935107073329, + "grad_norm": 0.5898509128227449, + "learning_rate": 4.076639440279136e-06, + "loss": 0.5305, + "step": 5301 + }, + { + "epoch": 0.8601557430240104, + "grad_norm": 0.6106294418780106, + "learning_rate": 4.076308013295545e-06, + "loss": 0.5594, + "step": 5302 + }, + { + "epoch": 0.8603179753406879, + "grad_norm": 0.6039749847272402, + "learning_rate": 4.075976540319888e-06, + "loss": 0.553, + "step": 5303 + }, + { + "epoch": 0.8604802076573653, + "grad_norm": 0.6293283423501383, + "learning_rate": 4.075645021361837e-06, + "loss": 0.5507, + "step": 5304 + }, + { + "epoch": 0.8606424399740429, + "grad_norm": 0.5638036380233316, + "learning_rate": 4.0753134564310655e-06, + "loss": 0.5711, + "step": 5305 + }, + { + "epoch": 0.8608046722907203, + "grad_norm": 0.5711567704220293, + "learning_rate": 4.074981845537247e-06, + "loss": 0.578, + "step": 5306 + }, + { + "epoch": 0.8609669046073978, + "grad_norm": 0.5961878966572217, + "learning_rate": 4.074650188690057e-06, + "loss": 0.5834, + "step": 5307 + }, + { + "epoch": 0.8611291369240752, + "grad_norm": 0.6125141228535831, + "learning_rate": 4.074318485899172e-06, + "loss": 0.5631, + "step": 5308 + }, + { + "epoch": 0.8612913692407528, + "grad_norm": 0.6168042241033052, + "learning_rate": 4.073986737174271e-06, + "loss": 0.5353, + "step": 5309 + }, + { + "epoch": 0.8614536015574302, + "grad_norm": 0.5573275394710293, + "learning_rate": 4.073654942525032e-06, + "loss": 0.5327, + "step": 5310 + }, + { + "epoch": 0.8616158338741077, + "grad_norm": 0.565271746145609, + "learning_rate": 4.073323101961137e-06, + "loss": 0.555, + "step": 5311 + }, + { + "epoch": 0.8617780661907852, + "grad_norm": 0.6484549326619216, + "learning_rate": 4.072991215492268e-06, + "loss": 0.5496, + "step": 5312 + }, + { + "epoch": 0.8619402985074627, + "grad_norm": 0.6954984818137815, + "learning_rate": 4.072659283128109e-06, + "loss": 0.5261, + "step": 5313 + }, + { + "epoch": 0.8621025308241401, + "grad_norm": 0.6008248072627239, + "learning_rate": 4.0723273048783426e-06, + "loss": 0.5412, + "step": 5314 + }, + { + "epoch": 0.8622647631408177, + "grad_norm": 0.5826511039373922, + "learning_rate": 4.071995280752658e-06, + "loss": 0.535, + "step": 5315 + }, + { + "epoch": 0.8624269954574951, + "grad_norm": 0.6075315390093057, + "learning_rate": 4.07166321076074e-06, + "loss": 0.5519, + "step": 5316 + }, + { + "epoch": 0.8625892277741726, + "grad_norm": 0.5911761096864796, + "learning_rate": 4.07133109491228e-06, + "loss": 0.4844, + "step": 5317 + }, + { + "epoch": 0.8627514600908501, + "grad_norm": 0.5772897541028075, + "learning_rate": 4.070998933216965e-06, + "loss": 0.542, + "step": 5318 + }, + { + "epoch": 0.8629136924075276, + "grad_norm": 0.6168323090748801, + "learning_rate": 4.070666725684489e-06, + "loss": 0.5228, + "step": 5319 + }, + { + "epoch": 0.863075924724205, + "grad_norm": 0.5999657940353785, + "learning_rate": 4.070334472324544e-06, + "loss": 0.5391, + "step": 5320 + }, + { + "epoch": 0.8632381570408826, + "grad_norm": 0.5905298525273451, + "learning_rate": 4.070002173146826e-06, + "loss": 0.5622, + "step": 5321 + }, + { + "epoch": 0.86340038935756, + "grad_norm": 0.6004097333351469, + "learning_rate": 4.069669828161026e-06, + "loss": 0.5621, + "step": 5322 + }, + { + "epoch": 0.8635626216742375, + "grad_norm": 0.6052131642479395, + "learning_rate": 4.069337437376846e-06, + "loss": 0.5103, + "step": 5323 + }, + { + "epoch": 0.863724853990915, + "grad_norm": 0.5922234942999638, + "learning_rate": 4.069005000803981e-06, + "loss": 0.5421, + "step": 5324 + }, + { + "epoch": 0.8638870863075925, + "grad_norm": 0.5693965235415072, + "learning_rate": 4.06867251845213e-06, + "loss": 0.5567, + "step": 5325 + }, + { + "epoch": 0.8640493186242699, + "grad_norm": 0.5583608599806791, + "learning_rate": 4.068339990330997e-06, + "loss": 0.5594, + "step": 5326 + }, + { + "epoch": 0.8642115509409475, + "grad_norm": 0.5918799855339137, + "learning_rate": 4.0680074164502815e-06, + "loss": 0.5392, + "step": 5327 + }, + { + "epoch": 0.8643737832576249, + "grad_norm": 0.5916481852118664, + "learning_rate": 4.067674796819689e-06, + "loss": 0.5284, + "step": 5328 + }, + { + "epoch": 0.8645360155743024, + "grad_norm": 0.5672792294801579, + "learning_rate": 4.067342131448923e-06, + "loss": 0.5976, + "step": 5329 + }, + { + "epoch": 0.8646982478909799, + "grad_norm": 0.5702852187011387, + "learning_rate": 4.067009420347689e-06, + "loss": 0.5569, + "step": 5330 + }, + { + "epoch": 0.8648604802076574, + "grad_norm": 0.6016501257328839, + "learning_rate": 4.066676663525697e-06, + "loss": 0.5625, + "step": 5331 + }, + { + "epoch": 0.8650227125243348, + "grad_norm": 0.5796143324131192, + "learning_rate": 4.066343860992654e-06, + "loss": 0.5659, + "step": 5332 + }, + { + "epoch": 0.8651849448410124, + "grad_norm": 0.5845394936068984, + "learning_rate": 4.066011012758271e-06, + "loss": 0.5129, + "step": 5333 + }, + { + "epoch": 0.8653471771576898, + "grad_norm": 0.5808757344830688, + "learning_rate": 4.0656781188322585e-06, + "loss": 0.5735, + "step": 5334 + }, + { + "epoch": 0.8655094094743673, + "grad_norm": 0.599166357895603, + "learning_rate": 4.06534517922433e-06, + "loss": 0.581, + "step": 5335 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 0.5759883443449878, + "learning_rate": 4.065012193944201e-06, + "loss": 0.5496, + "step": 5336 + }, + { + "epoch": 0.8658338741077223, + "grad_norm": 0.5584121023163736, + "learning_rate": 4.064679163001585e-06, + "loss": 0.5562, + "step": 5337 + }, + { + "epoch": 0.8659961064243997, + "grad_norm": 0.5543156548529301, + "learning_rate": 4.0643460864062005e-06, + "loss": 0.5491, + "step": 5338 + }, + { + "epoch": 0.8661583387410772, + "grad_norm": 0.5791507616578996, + "learning_rate": 4.064012964167764e-06, + "loss": 0.5705, + "step": 5339 + }, + { + "epoch": 0.8663205710577547, + "grad_norm": 0.5955432595084313, + "learning_rate": 4.063679796295996e-06, + "loss": 0.5569, + "step": 5340 + }, + { + "epoch": 0.8664828033744322, + "grad_norm": 0.5992031971644947, + "learning_rate": 4.063346582800618e-06, + "loss": 0.5183, + "step": 5341 + }, + { + "epoch": 0.8666450356911096, + "grad_norm": 0.5919447131974884, + "learning_rate": 4.063013323691351e-06, + "loss": 0.5469, + "step": 5342 + }, + { + "epoch": 0.8668072680077872, + "grad_norm": 0.5473305077491204, + "learning_rate": 4.062680018977918e-06, + "loss": 0.5395, + "step": 5343 + }, + { + "epoch": 0.8669695003244646, + "grad_norm": 0.5695691434834815, + "learning_rate": 4.062346668670047e-06, + "loss": 0.5616, + "step": 5344 + }, + { + "epoch": 0.8671317326411421, + "grad_norm": 0.5844130842840638, + "learning_rate": 4.062013272777461e-06, + "loss": 0.5625, + "step": 5345 + }, + { + "epoch": 0.8672939649578196, + "grad_norm": 0.5806136566410655, + "learning_rate": 4.0616798313098885e-06, + "loss": 0.5818, + "step": 5346 + }, + { + "epoch": 0.8674561972744971, + "grad_norm": 0.572252320868347, + "learning_rate": 4.0613463442770585e-06, + "loss": 0.5228, + "step": 5347 + }, + { + "epoch": 0.8676184295911745, + "grad_norm": 0.5493782686153755, + "learning_rate": 4.061012811688702e-06, + "loss": 0.5718, + "step": 5348 + }, + { + "epoch": 0.8677806619078521, + "grad_norm": 0.5516229343580203, + "learning_rate": 4.0606792335545485e-06, + "loss": 0.5312, + "step": 5349 + }, + { + "epoch": 0.8679428942245295, + "grad_norm": 0.5490695378600913, + "learning_rate": 4.060345609884332e-06, + "loss": 0.5757, + "step": 5350 + }, + { + "epoch": 0.868105126541207, + "grad_norm": 0.6085697576093054, + "learning_rate": 4.060011940687787e-06, + "loss": 0.5108, + "step": 5351 + }, + { + "epoch": 0.8682673588578845, + "grad_norm": 0.6060289168645021, + "learning_rate": 4.059678225974649e-06, + "loss": 0.5508, + "step": 5352 + }, + { + "epoch": 0.868429591174562, + "grad_norm": 0.5799585281230872, + "learning_rate": 4.059344465754655e-06, + "loss": 0.5504, + "step": 5353 + }, + { + "epoch": 0.8685918234912394, + "grad_norm": 0.5803605045815834, + "learning_rate": 4.059010660037541e-06, + "loss": 0.5699, + "step": 5354 + }, + { + "epoch": 0.868754055807917, + "grad_norm": 0.5824428864628811, + "learning_rate": 4.058676808833049e-06, + "loss": 0.556, + "step": 5355 + }, + { + "epoch": 0.8689162881245944, + "grad_norm": 0.603199954382858, + "learning_rate": 4.058342912150919e-06, + "loss": 0.5655, + "step": 5356 + }, + { + "epoch": 0.8690785204412719, + "grad_norm": 0.5797574232445886, + "learning_rate": 4.058008970000892e-06, + "loss": 0.5054, + "step": 5357 + }, + { + "epoch": 0.8692407527579494, + "grad_norm": 0.5894474589851937, + "learning_rate": 4.057674982392713e-06, + "loss": 0.5549, + "step": 5358 + }, + { + "epoch": 0.8694029850746269, + "grad_norm": 0.5739012451982486, + "learning_rate": 4.057340949336127e-06, + "loss": 0.5162, + "step": 5359 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.58751061490638, + "learning_rate": 4.057006870840878e-06, + "loss": 0.5385, + "step": 5360 + }, + { + "epoch": 0.8697274497079819, + "grad_norm": 0.572067939025144, + "learning_rate": 4.056672746916716e-06, + "loss": 0.5569, + "step": 5361 + }, + { + "epoch": 0.8698896820246593, + "grad_norm": 0.5611766076055139, + "learning_rate": 4.056338577573388e-06, + "loss": 0.5337, + "step": 5362 + }, + { + "epoch": 0.8700519143413368, + "grad_norm": 0.6146737351523085, + "learning_rate": 4.056004362820645e-06, + "loss": 0.5342, + "step": 5363 + }, + { + "epoch": 0.8702141466580143, + "grad_norm": 0.5649495443496443, + "learning_rate": 4.055670102668238e-06, + "loss": 0.5836, + "step": 5364 + }, + { + "epoch": 0.8703763789746918, + "grad_norm": 0.5700701454247941, + "learning_rate": 4.055335797125919e-06, + "loss": 0.5627, + "step": 5365 + }, + { + "epoch": 0.8705386112913692, + "grad_norm": 0.5712781746120431, + "learning_rate": 4.055001446203444e-06, + "loss": 0.5398, + "step": 5366 + }, + { + "epoch": 0.8707008436080467, + "grad_norm": 0.5803820450687619, + "learning_rate": 4.054667049910567e-06, + "loss": 0.5421, + "step": 5367 + }, + { + "epoch": 0.8708630759247242, + "grad_norm": 0.568144797781329, + "learning_rate": 4.054332608257045e-06, + "loss": 0.5656, + "step": 5368 + }, + { + "epoch": 0.8710253082414017, + "grad_norm": 0.5894764678871537, + "learning_rate": 4.053998121252635e-06, + "loss": 0.5426, + "step": 5369 + }, + { + "epoch": 0.8711875405580791, + "grad_norm": 0.5631408817512662, + "learning_rate": 4.053663588907098e-06, + "loss": 0.5657, + "step": 5370 + }, + { + "epoch": 0.8713497728747567, + "grad_norm": 0.5821304555618502, + "learning_rate": 4.053329011230194e-06, + "loss": 0.545, + "step": 5371 + }, + { + "epoch": 0.8715120051914341, + "grad_norm": 0.5664397647761535, + "learning_rate": 4.052994388231685e-06, + "loss": 0.5246, + "step": 5372 + }, + { + "epoch": 0.8716742375081116, + "grad_norm": 0.5642480290037472, + "learning_rate": 4.052659719921334e-06, + "loss": 0.5521, + "step": 5373 + }, + { + "epoch": 0.8718364698247891, + "grad_norm": 0.5657205930857154, + "learning_rate": 4.0523250063089066e-06, + "loss": 0.5638, + "step": 5374 + }, + { + "epoch": 0.8719987021414666, + "grad_norm": 0.5755050598445486, + "learning_rate": 4.051990247404167e-06, + "loss": 0.5633, + "step": 5375 + }, + { + "epoch": 0.872160934458144, + "grad_norm": 0.5654130014133356, + "learning_rate": 4.051655443216884e-06, + "loss": 0.5526, + "step": 5376 + }, + { + "epoch": 0.8723231667748216, + "grad_norm": 0.5744402907436353, + "learning_rate": 4.0513205937568275e-06, + "loss": 0.548, + "step": 5377 + }, + { + "epoch": 0.872485399091499, + "grad_norm": 0.5844912897981286, + "learning_rate": 4.050985699033764e-06, + "loss": 0.5661, + "step": 5378 + }, + { + "epoch": 0.8726476314081765, + "grad_norm": 0.5942772501245, + "learning_rate": 4.050650759057468e-06, + "loss": 0.5148, + "step": 5379 + }, + { + "epoch": 0.872809863724854, + "grad_norm": 0.63527728648965, + "learning_rate": 4.050315773837708e-06, + "loss": 0.5817, + "step": 5380 + }, + { + "epoch": 0.8729720960415315, + "grad_norm": 0.5709716382386125, + "learning_rate": 4.049980743384263e-06, + "loss": 0.5321, + "step": 5381 + }, + { + "epoch": 0.8731343283582089, + "grad_norm": 0.5708118450400842, + "learning_rate": 4.049645667706905e-06, + "loss": 0.5565, + "step": 5382 + }, + { + "epoch": 0.8732965606748865, + "grad_norm": 0.5899158810113397, + "learning_rate": 4.04931054681541e-06, + "loss": 0.4667, + "step": 5383 + }, + { + "epoch": 0.8734587929915639, + "grad_norm": 0.5709490027693129, + "learning_rate": 4.0489753807195585e-06, + "loss": 0.5525, + "step": 5384 + }, + { + "epoch": 0.8736210253082414, + "grad_norm": 0.5636903004705703, + "learning_rate": 4.048640169429128e-06, + "loss": 0.5504, + "step": 5385 + }, + { + "epoch": 0.8737832576249189, + "grad_norm": 0.5478495585779265, + "learning_rate": 4.048304912953899e-06, + "loss": 0.5116, + "step": 5386 + }, + { + "epoch": 0.8739454899415964, + "grad_norm": 0.5704079995958133, + "learning_rate": 4.047969611303654e-06, + "loss": 0.5393, + "step": 5387 + }, + { + "epoch": 0.8741077222582738, + "grad_norm": 0.5848022287167292, + "learning_rate": 4.0476342644881745e-06, + "loss": 0.5003, + "step": 5388 + }, + { + "epoch": 0.8742699545749514, + "grad_norm": 0.5952405929373803, + "learning_rate": 4.047298872517247e-06, + "loss": 0.547, + "step": 5389 + }, + { + "epoch": 0.8744321868916288, + "grad_norm": 0.5821538154650582, + "learning_rate": 4.046963435400655e-06, + "loss": 0.5729, + "step": 5390 + }, + { + "epoch": 0.8745944192083063, + "grad_norm": 0.5580902758997298, + "learning_rate": 4.046627953148188e-06, + "loss": 0.5107, + "step": 5391 + }, + { + "epoch": 0.8747566515249838, + "grad_norm": 0.5664238209602728, + "learning_rate": 4.046292425769634e-06, + "loss": 0.5515, + "step": 5392 + }, + { + "epoch": 0.8749188838416613, + "grad_norm": 0.5953563436213163, + "learning_rate": 4.045956853274781e-06, + "loss": 0.5696, + "step": 5393 + }, + { + "epoch": 0.8750811161583387, + "grad_norm": 0.5822539161457938, + "learning_rate": 4.045621235673422e-06, + "loss": 0.5596, + "step": 5394 + }, + { + "epoch": 0.8752433484750162, + "grad_norm": 0.598923364620824, + "learning_rate": 4.045285572975347e-06, + "loss": 0.5682, + "step": 5395 + }, + { + "epoch": 0.8754055807916937, + "grad_norm": 0.6220707988364003, + "learning_rate": 4.044949865190352e-06, + "loss": 0.5834, + "step": 5396 + }, + { + "epoch": 0.8755678131083712, + "grad_norm": 0.5747053785918732, + "learning_rate": 4.044614112328231e-06, + "loss": 0.5631, + "step": 5397 + }, + { + "epoch": 0.8757300454250486, + "grad_norm": 0.5799357051834364, + "learning_rate": 4.04427831439878e-06, + "loss": 0.5701, + "step": 5398 + }, + { + "epoch": 0.8758922777417262, + "grad_norm": 0.5733252088259426, + "learning_rate": 4.043942471411797e-06, + "loss": 0.5272, + "step": 5399 + }, + { + "epoch": 0.8760545100584036, + "grad_norm": 0.5612564151882429, + "learning_rate": 4.043606583377081e-06, + "loss": 0.5331, + "step": 5400 + }, + { + "epoch": 0.8762167423750811, + "grad_norm": 0.5492346211888569, + "learning_rate": 4.043270650304432e-06, + "loss": 0.5385, + "step": 5401 + }, + { + "epoch": 0.8763789746917586, + "grad_norm": 0.6206909628024412, + "learning_rate": 4.042934672203651e-06, + "loss": 0.5676, + "step": 5402 + }, + { + "epoch": 0.8765412070084361, + "grad_norm": 0.5921706867923581, + "learning_rate": 4.042598649084541e-06, + "loss": 0.5626, + "step": 5403 + }, + { + "epoch": 0.8767034393251135, + "grad_norm": 0.5951126045575964, + "learning_rate": 4.042262580956908e-06, + "loss": 0.5766, + "step": 5404 + }, + { + "epoch": 0.8768656716417911, + "grad_norm": 0.5777756317554682, + "learning_rate": 4.041926467830556e-06, + "loss": 0.5541, + "step": 5405 + }, + { + "epoch": 0.8770279039584685, + "grad_norm": 0.6033542214232627, + "learning_rate": 4.041590309715292e-06, + "loss": 0.5474, + "step": 5406 + }, + { + "epoch": 0.877190136275146, + "grad_norm": 0.6046351448831684, + "learning_rate": 4.041254106620923e-06, + "loss": 0.5413, + "step": 5407 + }, + { + "epoch": 0.8773523685918235, + "grad_norm": 0.5911725314033326, + "learning_rate": 4.04091785855726e-06, + "loss": 0.5633, + "step": 5408 + }, + { + "epoch": 0.877514600908501, + "grad_norm": 0.592781518203692, + "learning_rate": 4.040581565534114e-06, + "loss": 0.5504, + "step": 5409 + }, + { + "epoch": 0.8776768332251784, + "grad_norm": 0.5975737511136758, + "learning_rate": 4.040245227561295e-06, + "loss": 0.5476, + "step": 5410 + }, + { + "epoch": 0.877839065541856, + "grad_norm": 0.6429879349335523, + "learning_rate": 4.039908844648618e-06, + "loss": 0.5405, + "step": 5411 + }, + { + "epoch": 0.8780012978585334, + "grad_norm": 0.6054075099034837, + "learning_rate": 4.039572416805898e-06, + "loss": 0.5301, + "step": 5412 + }, + { + "epoch": 0.8781635301752109, + "grad_norm": 0.5624197429616641, + "learning_rate": 4.03923594404295e-06, + "loss": 0.5184, + "step": 5413 + }, + { + "epoch": 0.8783257624918884, + "grad_norm": 0.5917230758815244, + "learning_rate": 4.038899426369592e-06, + "loss": 0.5497, + "step": 5414 + }, + { + "epoch": 0.8784879948085659, + "grad_norm": 0.5966098766398554, + "learning_rate": 4.038562863795641e-06, + "loss": 0.5658, + "step": 5415 + }, + { + "epoch": 0.8786502271252433, + "grad_norm": 0.5520996547885251, + "learning_rate": 4.038226256330918e-06, + "loss": 0.5445, + "step": 5416 + }, + { + "epoch": 0.8788124594419209, + "grad_norm": 0.5927967083090946, + "learning_rate": 4.037889603985245e-06, + "loss": 0.5437, + "step": 5417 + }, + { + "epoch": 0.8789746917585983, + "grad_norm": 0.5842762574146821, + "learning_rate": 4.037552906768443e-06, + "loss": 0.5476, + "step": 5418 + }, + { + "epoch": 0.8791369240752758, + "grad_norm": 0.5658841004064394, + "learning_rate": 4.037216164690338e-06, + "loss": 0.5852, + "step": 5419 + }, + { + "epoch": 0.8792991563919533, + "grad_norm": 0.5876769632151958, + "learning_rate": 4.036879377760753e-06, + "loss": 0.5413, + "step": 5420 + }, + { + "epoch": 0.8794613887086308, + "grad_norm": 0.5893767485701737, + "learning_rate": 4.036542545989515e-06, + "loss": 0.5291, + "step": 5421 + }, + { + "epoch": 0.8796236210253082, + "grad_norm": 0.5590990703318267, + "learning_rate": 4.0362056693864526e-06, + "loss": 0.532, + "step": 5422 + }, + { + "epoch": 0.8797858533419857, + "grad_norm": 0.5291862319166384, + "learning_rate": 4.035868747961394e-06, + "loss": 0.4964, + "step": 5423 + }, + { + "epoch": 0.8799480856586632, + "grad_norm": 0.5900982543094322, + "learning_rate": 4.0355317817241705e-06, + "loss": 0.583, + "step": 5424 + }, + { + "epoch": 0.8801103179753407, + "grad_norm": 0.5587784896192235, + "learning_rate": 4.035194770684612e-06, + "loss": 0.5441, + "step": 5425 + }, + { + "epoch": 0.8802725502920181, + "grad_norm": 0.6015082069878106, + "learning_rate": 4.034857714852554e-06, + "loss": 0.579, + "step": 5426 + }, + { + "epoch": 0.8804347826086957, + "grad_norm": 0.6028230211695745, + "learning_rate": 4.034520614237829e-06, + "loss": 0.532, + "step": 5427 + }, + { + "epoch": 0.8805970149253731, + "grad_norm": 0.6090662913679425, + "learning_rate": 4.034183468850272e-06, + "loss": 0.5493, + "step": 5428 + }, + { + "epoch": 0.8807592472420506, + "grad_norm": 0.5448285879992397, + "learning_rate": 4.033846278699723e-06, + "loss": 0.5165, + "step": 5429 + }, + { + "epoch": 0.8809214795587281, + "grad_norm": 0.5997056188506742, + "learning_rate": 4.0335090437960165e-06, + "loss": 0.5626, + "step": 5430 + }, + { + "epoch": 0.8810837118754056, + "grad_norm": 0.5911121341807068, + "learning_rate": 4.033171764148995e-06, + "loss": 0.5702, + "step": 5431 + }, + { + "epoch": 0.881245944192083, + "grad_norm": 0.6139895645398447, + "learning_rate": 4.032834439768497e-06, + "loss": 0.5235, + "step": 5432 + }, + { + "epoch": 0.8814081765087606, + "grad_norm": 0.5873360559400064, + "learning_rate": 4.0324970706643674e-06, + "loss": 0.5329, + "step": 5433 + }, + { + "epoch": 0.881570408825438, + "grad_norm": 0.5676338058893865, + "learning_rate": 4.032159656846446e-06, + "loss": 0.5405, + "step": 5434 + }, + { + "epoch": 0.8817326411421155, + "grad_norm": 0.6071141012916328, + "learning_rate": 4.031822198324581e-06, + "loss": 0.5336, + "step": 5435 + }, + { + "epoch": 0.881894873458793, + "grad_norm": 0.6125415005536459, + "learning_rate": 4.031484695108617e-06, + "loss": 0.5479, + "step": 5436 + }, + { + "epoch": 0.8820571057754705, + "grad_norm": 0.5683021575202994, + "learning_rate": 4.031147147208401e-06, + "loss": 0.5545, + "step": 5437 + }, + { + "epoch": 0.8822193380921479, + "grad_norm": 0.542370894977869, + "learning_rate": 4.0308095546337815e-06, + "loss": 0.5241, + "step": 5438 + }, + { + "epoch": 0.8823815704088255, + "grad_norm": 0.6128677420504808, + "learning_rate": 4.030471917394609e-06, + "loss": 0.5222, + "step": 5439 + }, + { + "epoch": 0.8825438027255029, + "grad_norm": 0.5778832776946902, + "learning_rate": 4.030134235500736e-06, + "loss": 0.5521, + "step": 5440 + }, + { + "epoch": 0.8827060350421804, + "grad_norm": 0.5715233389765899, + "learning_rate": 4.029796508962013e-06, + "loss": 0.5698, + "step": 5441 + }, + { + "epoch": 0.8828682673588579, + "grad_norm": 0.5575735444871791, + "learning_rate": 4.0294587377882944e-06, + "loss": 0.54, + "step": 5442 + }, + { + "epoch": 0.8830304996755354, + "grad_norm": 0.5715173287886194, + "learning_rate": 4.029120921989437e-06, + "loss": 0.5675, + "step": 5443 + }, + { + "epoch": 0.8831927319922128, + "grad_norm": 0.589492772685049, + "learning_rate": 4.028783061575294e-06, + "loss": 0.5437, + "step": 5444 + }, + { + "epoch": 0.8833549643088904, + "grad_norm": 0.5809369689984923, + "learning_rate": 4.028445156555727e-06, + "loss": 0.5693, + "step": 5445 + }, + { + "epoch": 0.8835171966255678, + "grad_norm": 0.5979885988751089, + "learning_rate": 4.028107206940592e-06, + "loss": 0.5775, + "step": 5446 + }, + { + "epoch": 0.8836794289422453, + "grad_norm": 0.5814972905727194, + "learning_rate": 4.027769212739751e-06, + "loss": 0.5408, + "step": 5447 + }, + { + "epoch": 0.8838416612589228, + "grad_norm": 0.573336219744911, + "learning_rate": 4.0274311739630655e-06, + "loss": 0.5484, + "step": 5448 + }, + { + "epoch": 0.8840038935756003, + "grad_norm": 0.5962717466116232, + "learning_rate": 4.0270930906203975e-06, + "loss": 0.5854, + "step": 5449 + }, + { + "epoch": 0.8841661258922777, + "grad_norm": 0.5718075375883949, + "learning_rate": 4.026754962721613e-06, + "loss": 0.537, + "step": 5450 + }, + { + "epoch": 0.8843283582089553, + "grad_norm": 0.5627114935676583, + "learning_rate": 4.026416790276576e-06, + "loss": 0.5639, + "step": 5451 + }, + { + "epoch": 0.8844905905256327, + "grad_norm": 0.563775981218277, + "learning_rate": 4.026078573295155e-06, + "loss": 0.5445, + "step": 5452 + }, + { + "epoch": 0.8846528228423102, + "grad_norm": 0.5602583597039659, + "learning_rate": 4.025740311787216e-06, + "loss": 0.5302, + "step": 5453 + }, + { + "epoch": 0.8848150551589876, + "grad_norm": 0.5783561360580703, + "learning_rate": 4.0254020057626295e-06, + "loss": 0.5546, + "step": 5454 + }, + { + "epoch": 0.8849772874756652, + "grad_norm": 0.5702091094804999, + "learning_rate": 4.025063655231267e-06, + "loss": 0.5177, + "step": 5455 + }, + { + "epoch": 0.8851395197923426, + "grad_norm": 0.5726963717212513, + "learning_rate": 4.024725260203001e-06, + "loss": 0.5499, + "step": 5456 + }, + { + "epoch": 0.8853017521090201, + "grad_norm": 0.5912518217100461, + "learning_rate": 4.024386820687703e-06, + "loss": 0.5719, + "step": 5457 + }, + { + "epoch": 0.8854639844256976, + "grad_norm": 0.5921400335047422, + "learning_rate": 4.024048336695249e-06, + "loss": 0.5467, + "step": 5458 + }, + { + "epoch": 0.8856262167423751, + "grad_norm": 0.5853634937186918, + "learning_rate": 4.023709808235514e-06, + "loss": 0.5362, + "step": 5459 + }, + { + "epoch": 0.8857884490590525, + "grad_norm": 0.5595486121535501, + "learning_rate": 4.023371235318376e-06, + "loss": 0.5697, + "step": 5460 + }, + { + "epoch": 0.8859506813757301, + "grad_norm": 0.6031146709716764, + "learning_rate": 4.023032617953714e-06, + "loss": 0.5266, + "step": 5461 + }, + { + "epoch": 0.8861129136924075, + "grad_norm": 0.5830269465662028, + "learning_rate": 4.0226939561514064e-06, + "loss": 0.5041, + "step": 5462 + }, + { + "epoch": 0.886275146009085, + "grad_norm": 0.5761488097627436, + "learning_rate": 4.022355249921336e-06, + "loss": 0.517, + "step": 5463 + }, + { + "epoch": 0.8864373783257625, + "grad_norm": 0.5635743054197416, + "learning_rate": 4.0220164992733846e-06, + "loss": 0.544, + "step": 5464 + }, + { + "epoch": 0.88659961064244, + "grad_norm": 0.578220582575355, + "learning_rate": 4.0216777042174355e-06, + "loss": 0.5564, + "step": 5465 + }, + { + "epoch": 0.8867618429591174, + "grad_norm": 0.5952230143350117, + "learning_rate": 4.021338864763374e-06, + "loss": 0.5518, + "step": 5466 + }, + { + "epoch": 0.886924075275795, + "grad_norm": 0.6131067153350194, + "learning_rate": 4.020999980921087e-06, + "loss": 0.5372, + "step": 5467 + }, + { + "epoch": 0.8870863075924724, + "grad_norm": 0.5740889282381334, + "learning_rate": 4.020661052700462e-06, + "loss": 0.5411, + "step": 5468 + }, + { + "epoch": 0.8872485399091499, + "grad_norm": 0.6462389264507132, + "learning_rate": 4.0203220801113864e-06, + "loss": 0.5488, + "step": 5469 + }, + { + "epoch": 0.8874107722258274, + "grad_norm": 0.5642473673596698, + "learning_rate": 4.019983063163752e-06, + "loss": 0.5254, + "step": 5470 + }, + { + "epoch": 0.8875730045425049, + "grad_norm": 0.5639390749579185, + "learning_rate": 4.019644001867451e-06, + "loss": 0.5448, + "step": 5471 + }, + { + "epoch": 0.8877352368591823, + "grad_norm": 0.5777373612308557, + "learning_rate": 4.019304896232375e-06, + "loss": 0.5218, + "step": 5472 + }, + { + "epoch": 0.8878974691758599, + "grad_norm": 0.5807229903890025, + "learning_rate": 4.018965746268417e-06, + "loss": 0.5471, + "step": 5473 + }, + { + "epoch": 0.8880597014925373, + "grad_norm": 0.5587400900079714, + "learning_rate": 4.018626551985475e-06, + "loss": 0.5609, + "step": 5474 + }, + { + "epoch": 0.8882219338092148, + "grad_norm": 0.5796482836646782, + "learning_rate": 4.0182873133934444e-06, + "loss": 0.5066, + "step": 5475 + }, + { + "epoch": 0.8883841661258923, + "grad_norm": 0.5629259747169924, + "learning_rate": 4.017948030502224e-06, + "loss": 0.5692, + "step": 5476 + }, + { + "epoch": 0.8885463984425698, + "grad_norm": 0.6099217410897168, + "learning_rate": 4.01760870332171e-06, + "loss": 0.5857, + "step": 5477 + }, + { + "epoch": 0.8887086307592472, + "grad_norm": 0.5946437128181477, + "learning_rate": 4.017269331861806e-06, + "loss": 0.5718, + "step": 5478 + }, + { + "epoch": 0.8888708630759248, + "grad_norm": 0.5729058414116333, + "learning_rate": 4.016929916132415e-06, + "loss": 0.5318, + "step": 5479 + }, + { + "epoch": 0.8890330953926022, + "grad_norm": 0.5487283815497758, + "learning_rate": 4.0165904561434365e-06, + "loss": 0.5392, + "step": 5480 + }, + { + "epoch": 0.8891953277092797, + "grad_norm": 0.5746075418832618, + "learning_rate": 4.016250951904777e-06, + "loss": 0.5274, + "step": 5481 + }, + { + "epoch": 0.8893575600259571, + "grad_norm": 0.5868196810076075, + "learning_rate": 4.015911403426342e-06, + "loss": 0.5352, + "step": 5482 + }, + { + "epoch": 0.8895197923426347, + "grad_norm": 0.5493362737577243, + "learning_rate": 4.015571810718039e-06, + "loss": 0.5369, + "step": 5483 + }, + { + "epoch": 0.8896820246593121, + "grad_norm": 0.5808357058460483, + "learning_rate": 4.015232173789776e-06, + "loss": 0.5399, + "step": 5484 + }, + { + "epoch": 0.8898442569759896, + "grad_norm": 0.5995131155397443, + "learning_rate": 4.014892492651462e-06, + "loss": 0.5358, + "step": 5485 + }, + { + "epoch": 0.8900064892926671, + "grad_norm": 0.6239050371144733, + "learning_rate": 4.014552767313008e-06, + "loss": 0.572, + "step": 5486 + }, + { + "epoch": 0.8901687216093446, + "grad_norm": 0.5536684107369879, + "learning_rate": 4.014212997784328e-06, + "loss": 0.5169, + "step": 5487 + }, + { + "epoch": 0.890330953926022, + "grad_norm": 0.5671102179397819, + "learning_rate": 4.013873184075333e-06, + "loss": 0.5769, + "step": 5488 + }, + { + "epoch": 0.8904931862426996, + "grad_norm": 0.5629631511699552, + "learning_rate": 4.013533326195939e-06, + "loss": 0.5598, + "step": 5489 + }, + { + "epoch": 0.890655418559377, + "grad_norm": 0.5814078606150711, + "learning_rate": 4.013193424156062e-06, + "loss": 0.5416, + "step": 5490 + }, + { + "epoch": 0.8908176508760545, + "grad_norm": 0.580951166130149, + "learning_rate": 4.012853477965619e-06, + "loss": 0.5567, + "step": 5491 + }, + { + "epoch": 0.890979883192732, + "grad_norm": 0.5725278168542367, + "learning_rate": 4.01251348763453e-06, + "loss": 0.5422, + "step": 5492 + }, + { + "epoch": 0.8911421155094095, + "grad_norm": 0.6486782648075349, + "learning_rate": 4.012173453172712e-06, + "loss": 0.5377, + "step": 5493 + }, + { + "epoch": 0.8913043478260869, + "grad_norm": 0.5607765383177911, + "learning_rate": 4.011833374590089e-06, + "loss": 0.5503, + "step": 5494 + }, + { + "epoch": 0.8914665801427645, + "grad_norm": 0.6243573702571933, + "learning_rate": 4.0114932518965825e-06, + "loss": 0.5577, + "step": 5495 + }, + { + "epoch": 0.8916288124594419, + "grad_norm": 0.5980040428867429, + "learning_rate": 4.011153085102116e-06, + "loss": 0.5659, + "step": 5496 + }, + { + "epoch": 0.8917910447761194, + "grad_norm": 0.5802299830570851, + "learning_rate": 4.010812874216616e-06, + "loss": 0.5497, + "step": 5497 + }, + { + "epoch": 0.8919532770927969, + "grad_norm": 0.5671703013245921, + "learning_rate": 4.010472619250006e-06, + "loss": 0.5571, + "step": 5498 + }, + { + "epoch": 0.8921155094094744, + "grad_norm": 0.5853133407591687, + "learning_rate": 4.010132320212216e-06, + "loss": 0.5511, + "step": 5499 + }, + { + "epoch": 0.8922777417261518, + "grad_norm": 0.5961100285776784, + "learning_rate": 4.009791977113175e-06, + "loss": 0.5412, + "step": 5500 + }, + { + "epoch": 0.8924399740428294, + "grad_norm": 0.5867061052649926, + "learning_rate": 4.009451589962811e-06, + "loss": 0.5694, + "step": 5501 + }, + { + "epoch": 0.8926022063595068, + "grad_norm": 0.5560982161882833, + "learning_rate": 4.009111158771059e-06, + "loss": 0.5559, + "step": 5502 + }, + { + "epoch": 0.8927644386761843, + "grad_norm": 0.5873402627181731, + "learning_rate": 4.008770683547848e-06, + "loss": 0.5574, + "step": 5503 + }, + { + "epoch": 0.8929266709928618, + "grad_norm": 0.5994181025894361, + "learning_rate": 4.008430164303115e-06, + "loss": 0.5854, + "step": 5504 + }, + { + "epoch": 0.8930889033095393, + "grad_norm": 0.5884135811377319, + "learning_rate": 4.008089601046795e-06, + "loss": 0.5326, + "step": 5505 + }, + { + "epoch": 0.8932511356262167, + "grad_norm": 0.5812088145151765, + "learning_rate": 4.007748993788822e-06, + "loss": 0.552, + "step": 5506 + }, + { + "epoch": 0.8934133679428943, + "grad_norm": 0.5844145635863932, + "learning_rate": 4.007408342539137e-06, + "loss": 0.5427, + "step": 5507 + }, + { + "epoch": 0.8935756002595717, + "grad_norm": 0.5928811071665491, + "learning_rate": 4.007067647307678e-06, + "loss": 0.5353, + "step": 5508 + }, + { + "epoch": 0.8937378325762492, + "grad_norm": 0.5695278636516693, + "learning_rate": 4.006726908104385e-06, + "loss": 0.5607, + "step": 5509 + }, + { + "epoch": 0.8939000648929266, + "grad_norm": 0.5829696458560939, + "learning_rate": 4.0063861249392015e-06, + "loss": 0.5421, + "step": 5510 + }, + { + "epoch": 0.8940622972096042, + "grad_norm": 0.5607684962464397, + "learning_rate": 4.006045297822067e-06, + "loss": 0.5513, + "step": 5511 + }, + { + "epoch": 0.8942245295262816, + "grad_norm": 0.5896758152713854, + "learning_rate": 4.00570442676293e-06, + "loss": 0.5613, + "step": 5512 + }, + { + "epoch": 0.8943867618429591, + "grad_norm": 0.5881802043878152, + "learning_rate": 4.005363511771735e-06, + "loss": 0.5551, + "step": 5513 + }, + { + "epoch": 0.8945489941596366, + "grad_norm": 0.5779391633714727, + "learning_rate": 4.005022552858427e-06, + "loss": 0.5697, + "step": 5514 + }, + { + "epoch": 0.8947112264763141, + "grad_norm": 0.5966490832163972, + "learning_rate": 4.004681550032956e-06, + "loss": 0.5161, + "step": 5515 + }, + { + "epoch": 0.8948734587929915, + "grad_norm": 0.6529713703640618, + "learning_rate": 4.00434050330527e-06, + "loss": 0.5906, + "step": 5516 + }, + { + "epoch": 0.8950356911096691, + "grad_norm": 0.5858149677968383, + "learning_rate": 4.003999412685321e-06, + "loss": 0.5733, + "step": 5517 + }, + { + "epoch": 0.8951979234263465, + "grad_norm": 0.587066690227359, + "learning_rate": 4.00365827818306e-06, + "loss": 0.5241, + "step": 5518 + }, + { + "epoch": 0.895360155743024, + "grad_norm": 0.5844432732375917, + "learning_rate": 4.003317099808443e-06, + "loss": 0.5684, + "step": 5519 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.6151483487847584, + "learning_rate": 4.002975877571421e-06, + "loss": 0.5102, + "step": 5520 + }, + { + "epoch": 0.895684620376379, + "grad_norm": 0.5766052174051763, + "learning_rate": 4.0026346114819515e-06, + "loss": 0.5451, + "step": 5521 + }, + { + "epoch": 0.8958468526930564, + "grad_norm": 0.5703197194891851, + "learning_rate": 4.002293301549992e-06, + "loss": 0.5352, + "step": 5522 + }, + { + "epoch": 0.896009085009734, + "grad_norm": 0.5524871196526844, + "learning_rate": 4.0019519477855e-06, + "loss": 0.5525, + "step": 5523 + }, + { + "epoch": 0.8961713173264114, + "grad_norm": 0.5961323125231184, + "learning_rate": 4.0016105501984356e-06, + "loss": 0.5446, + "step": 5524 + }, + { + "epoch": 0.8963335496430889, + "grad_norm": 0.5718406587896865, + "learning_rate": 4.001269108798761e-06, + "loss": 0.5687, + "step": 5525 + }, + { + "epoch": 0.8964957819597664, + "grad_norm": 0.5874875988535164, + "learning_rate": 4.000927623596436e-06, + "loss": 0.5536, + "step": 5526 + }, + { + "epoch": 0.8966580142764439, + "grad_norm": 0.6049510140526604, + "learning_rate": 4.0005860946014265e-06, + "loss": 0.5669, + "step": 5527 + }, + { + "epoch": 0.8968202465931213, + "grad_norm": 0.5701630830008412, + "learning_rate": 4.000244521823695e-06, + "loss": 0.5589, + "step": 5528 + }, + { + "epoch": 0.8969824789097989, + "grad_norm": 0.5612705182512753, + "learning_rate": 3.99990290527321e-06, + "loss": 0.5839, + "step": 5529 + }, + { + "epoch": 0.8971447112264763, + "grad_norm": 0.5735202391450253, + "learning_rate": 3.999561244959938e-06, + "loss": 0.5597, + "step": 5530 + }, + { + "epoch": 0.8973069435431538, + "grad_norm": 0.6092514803900878, + "learning_rate": 3.999219540893847e-06, + "loss": 0.5535, + "step": 5531 + }, + { + "epoch": 0.8974691758598313, + "grad_norm": 0.604056204538818, + "learning_rate": 3.998877793084907e-06, + "loss": 0.532, + "step": 5532 + }, + { + "epoch": 0.8976314081765088, + "grad_norm": 0.5956413584592666, + "learning_rate": 3.99853600154309e-06, + "loss": 0.5476, + "step": 5533 + }, + { + "epoch": 0.8977936404931862, + "grad_norm": 0.5876912011950761, + "learning_rate": 3.9981941662783675e-06, + "loss": 0.572, + "step": 5534 + }, + { + "epoch": 0.8979558728098638, + "grad_norm": 0.5753951148267704, + "learning_rate": 3.997852287300714e-06, + "loss": 0.5578, + "step": 5535 + }, + { + "epoch": 0.8981181051265412, + "grad_norm": 0.5680484824945213, + "learning_rate": 3.997510364620105e-06, + "loss": 0.5271, + "step": 5536 + }, + { + "epoch": 0.8982803374432187, + "grad_norm": 0.606283414567858, + "learning_rate": 3.997168398246516e-06, + "loss": 0.5622, + "step": 5537 + }, + { + "epoch": 0.8984425697598962, + "grad_norm": 0.5820703291932406, + "learning_rate": 3.996826388189924e-06, + "loss": 0.5483, + "step": 5538 + }, + { + "epoch": 0.8986048020765737, + "grad_norm": 0.5863486692657218, + "learning_rate": 3.996484334460309e-06, + "loss": 0.53, + "step": 5539 + }, + { + "epoch": 0.8987670343932511, + "grad_norm": 0.5750563898046173, + "learning_rate": 3.99614223706765e-06, + "loss": 0.5677, + "step": 5540 + }, + { + "epoch": 0.8989292667099286, + "grad_norm": 0.59364057412455, + "learning_rate": 3.99580009602193e-06, + "loss": 0.5829, + "step": 5541 + }, + { + "epoch": 0.8990914990266061, + "grad_norm": 0.5669580531517109, + "learning_rate": 3.99545791133313e-06, + "loss": 0.5675, + "step": 5542 + }, + { + "epoch": 0.8992537313432836, + "grad_norm": 0.5805321929015268, + "learning_rate": 3.995115683011236e-06, + "loss": 0.5656, + "step": 5543 + }, + { + "epoch": 0.899415963659961, + "grad_norm": 0.5840564484720268, + "learning_rate": 3.994773411066231e-06, + "loss": 0.5177, + "step": 5544 + }, + { + "epoch": 0.8995781959766386, + "grad_norm": 0.5752687131741453, + "learning_rate": 3.994431095508102e-06, + "loss": 0.572, + "step": 5545 + }, + { + "epoch": 0.899740428293316, + "grad_norm": 0.5584946781372313, + "learning_rate": 3.994088736346838e-06, + "loss": 0.554, + "step": 5546 + }, + { + "epoch": 0.8999026606099935, + "grad_norm": 0.5937216052523443, + "learning_rate": 3.993746333592427e-06, + "loss": 0.5586, + "step": 5547 + }, + { + "epoch": 0.900064892926671, + "grad_norm": 0.5816745212500714, + "learning_rate": 3.9934038872548595e-06, + "loss": 0.5983, + "step": 5548 + }, + { + "epoch": 0.9002271252433485, + "grad_norm": 0.5766633267531285, + "learning_rate": 3.993061397344127e-06, + "loss": 0.5583, + "step": 5549 + }, + { + "epoch": 0.9003893575600259, + "grad_norm": 0.5868125481014915, + "learning_rate": 3.992718863870223e-06, + "loss": 0.5419, + "step": 5550 + }, + { + "epoch": 0.9005515898767035, + "grad_norm": 0.5780833125638989, + "learning_rate": 3.9923762868431415e-06, + "loss": 0.556, + "step": 5551 + }, + { + "epoch": 0.9007138221933809, + "grad_norm": 0.5820784688403479, + "learning_rate": 3.992033666272877e-06, + "loss": 0.5752, + "step": 5552 + }, + { + "epoch": 0.9008760545100584, + "grad_norm": 0.5649190706656302, + "learning_rate": 3.991691002169426e-06, + "loss": 0.5819, + "step": 5553 + }, + { + "epoch": 0.9010382868267359, + "grad_norm": 0.5899030238877712, + "learning_rate": 3.9913482945427875e-06, + "loss": 0.5463, + "step": 5554 + }, + { + "epoch": 0.9012005191434134, + "grad_norm": 0.5915569697727435, + "learning_rate": 3.9910055434029606e-06, + "loss": 0.5274, + "step": 5555 + }, + { + "epoch": 0.9013627514600908, + "grad_norm": 0.5898022871230134, + "learning_rate": 3.990662748759946e-06, + "loss": 0.5675, + "step": 5556 + }, + { + "epoch": 0.9015249837767684, + "grad_norm": 0.5752419016724908, + "learning_rate": 3.990319910623743e-06, + "loss": 0.5456, + "step": 5557 + }, + { + "epoch": 0.9016872160934458, + "grad_norm": 0.5886824527617398, + "learning_rate": 3.989977029004359e-06, + "loss": 0.5428, + "step": 5558 + }, + { + "epoch": 0.9018494484101233, + "grad_norm": 0.5727367148090494, + "learning_rate": 3.989634103911795e-06, + "loss": 0.5637, + "step": 5559 + }, + { + "epoch": 0.9020116807268008, + "grad_norm": 0.5769690668932159, + "learning_rate": 3.989291135356057e-06, + "loss": 0.5523, + "step": 5560 + }, + { + "epoch": 0.9021739130434783, + "grad_norm": 0.5854271258977499, + "learning_rate": 3.9889481233471524e-06, + "loss": 0.5768, + "step": 5561 + }, + { + "epoch": 0.9023361453601557, + "grad_norm": 0.5599031363461041, + "learning_rate": 3.9886050678950894e-06, + "loss": 0.5272, + "step": 5562 + }, + { + "epoch": 0.9024983776768333, + "grad_norm": 0.5703978464934822, + "learning_rate": 3.988261969009876e-06, + "loss": 0.5657, + "step": 5563 + }, + { + "epoch": 0.9026606099935107, + "grad_norm": 0.5720041350012249, + "learning_rate": 3.987918826701525e-06, + "loss": 0.5717, + "step": 5564 + }, + { + "epoch": 0.9028228423101882, + "grad_norm": 0.6298015379153583, + "learning_rate": 3.987575640980048e-06, + "loss": 0.5448, + "step": 5565 + }, + { + "epoch": 0.9029850746268657, + "grad_norm": 0.5623900824339957, + "learning_rate": 3.987232411855456e-06, + "loss": 0.5286, + "step": 5566 + }, + { + "epoch": 0.9031473069435432, + "grad_norm": 0.5492575454481962, + "learning_rate": 3.986889139337765e-06, + "loss": 0.5371, + "step": 5567 + }, + { + "epoch": 0.9033095392602206, + "grad_norm": 0.5842330412856457, + "learning_rate": 3.986545823436991e-06, + "loss": 0.5501, + "step": 5568 + }, + { + "epoch": 0.9034717715768981, + "grad_norm": 0.6029686602381175, + "learning_rate": 3.98620246416315e-06, + "loss": 0.5721, + "step": 5569 + }, + { + "epoch": 0.9036340038935756, + "grad_norm": 0.5774856365840692, + "learning_rate": 3.985859061526261e-06, + "loss": 0.5523, + "step": 5570 + }, + { + "epoch": 0.9037962362102531, + "grad_norm": 0.5989033581182415, + "learning_rate": 3.985515615536342e-06, + "loss": 0.5783, + "step": 5571 + }, + { + "epoch": 0.9039584685269305, + "grad_norm": 0.7347356661505295, + "learning_rate": 3.985172126203416e-06, + "loss": 0.5545, + "step": 5572 + }, + { + "epoch": 0.9041207008436081, + "grad_norm": 0.5641930740386253, + "learning_rate": 3.984828593537504e-06, + "loss": 0.5697, + "step": 5573 + }, + { + "epoch": 0.9042829331602855, + "grad_norm": 0.5674982431008495, + "learning_rate": 3.984485017548628e-06, + "loss": 0.5187, + "step": 5574 + }, + { + "epoch": 0.904445165476963, + "grad_norm": 0.6290530402722229, + "learning_rate": 3.9841413982468145e-06, + "loss": 0.532, + "step": 5575 + }, + { + "epoch": 0.9046073977936405, + "grad_norm": 0.5741092269714577, + "learning_rate": 3.983797735642089e-06, + "loss": 0.5307, + "step": 5576 + }, + { + "epoch": 0.904769630110318, + "grad_norm": 0.6653218625020066, + "learning_rate": 3.983454029744477e-06, + "loss": 0.5373, + "step": 5577 + }, + { + "epoch": 0.9049318624269954, + "grad_norm": 0.6210982723332743, + "learning_rate": 3.983110280564009e-06, + "loss": 0.5597, + "step": 5578 + }, + { + "epoch": 0.905094094743673, + "grad_norm": 0.5958511822208313, + "learning_rate": 3.982766488110713e-06, + "loss": 0.5389, + "step": 5579 + }, + { + "epoch": 0.9052563270603504, + "grad_norm": 0.5749223932928652, + "learning_rate": 3.9824226523946206e-06, + "loss": 0.5327, + "step": 5580 + }, + { + "epoch": 0.9054185593770279, + "grad_norm": 0.5749783875162474, + "learning_rate": 3.982078773425763e-06, + "loss": 0.5616, + "step": 5581 + }, + { + "epoch": 0.9055807916937054, + "grad_norm": 0.5578338262962689, + "learning_rate": 3.9817348512141755e-06, + "loss": 0.524, + "step": 5582 + }, + { + "epoch": 0.9057430240103829, + "grad_norm": 0.5673965516880942, + "learning_rate": 3.98139088576989e-06, + "loss": 0.5598, + "step": 5583 + }, + { + "epoch": 0.9059052563270603, + "grad_norm": 0.6303274443099645, + "learning_rate": 3.981046877102945e-06, + "loss": 0.5701, + "step": 5584 + }, + { + "epoch": 0.9060674886437379, + "grad_norm": 0.5913810023811616, + "learning_rate": 3.980702825223377e-06, + "loss": 0.5071, + "step": 5585 + }, + { + "epoch": 0.9062297209604153, + "grad_norm": 0.5851097369725282, + "learning_rate": 3.980358730141224e-06, + "loss": 0.5418, + "step": 5586 + }, + { + "epoch": 0.9063919532770928, + "grad_norm": 0.5615473444013318, + "learning_rate": 3.980014591866524e-06, + "loss": 0.5565, + "step": 5587 + }, + { + "epoch": 0.9065541855937703, + "grad_norm": 0.5796464951175344, + "learning_rate": 3.979670410409321e-06, + "loss": 0.5566, + "step": 5588 + }, + { + "epoch": 0.9067164179104478, + "grad_norm": 0.5745735151642781, + "learning_rate": 3.979326185779656e-06, + "loss": 0.5674, + "step": 5589 + }, + { + "epoch": 0.9068786502271252, + "grad_norm": 0.5891722038519456, + "learning_rate": 3.9789819179875725e-06, + "loss": 0.5799, + "step": 5590 + }, + { + "epoch": 0.9070408825438028, + "grad_norm": 0.6181587359188979, + "learning_rate": 3.978637607043116e-06, + "loss": 0.5823, + "step": 5591 + }, + { + "epoch": 0.9072031148604802, + "grad_norm": 0.5874603621927631, + "learning_rate": 3.978293252956329e-06, + "loss": 0.5577, + "step": 5592 + }, + { + "epoch": 0.9073653471771577, + "grad_norm": 0.5714632357407923, + "learning_rate": 3.977948855737263e-06, + "loss": 0.5533, + "step": 5593 + }, + { + "epoch": 0.9075275794938352, + "grad_norm": 0.5723797696335523, + "learning_rate": 3.9776044153959645e-06, + "loss": 0.5536, + "step": 5594 + }, + { + "epoch": 0.9076898118105127, + "grad_norm": 0.5750624066690962, + "learning_rate": 3.977259931942484e-06, + "loss": 0.5804, + "step": 5595 + }, + { + "epoch": 0.9078520441271901, + "grad_norm": 0.5595787717490635, + "learning_rate": 3.976915405386872e-06, + "loss": 0.566, + "step": 5596 + }, + { + "epoch": 0.9080142764438677, + "grad_norm": 0.5490014376347522, + "learning_rate": 3.976570835739181e-06, + "loss": 0.545, + "step": 5597 + }, + { + "epoch": 0.9081765087605451, + "grad_norm": 0.5643626821616349, + "learning_rate": 3.976226223009463e-06, + "loss": 0.5235, + "step": 5598 + }, + { + "epoch": 0.9083387410772226, + "grad_norm": 0.6020907053628195, + "learning_rate": 3.975881567207776e-06, + "loss": 0.5038, + "step": 5599 + }, + { + "epoch": 0.9085009733939, + "grad_norm": 0.5753480905314756, + "learning_rate": 3.975536868344174e-06, + "loss": 0.5345, + "step": 5600 + }, + { + "epoch": 0.9086632057105776, + "grad_norm": 0.5986140177143239, + "learning_rate": 3.975192126428714e-06, + "loss": 0.5338, + "step": 5601 + }, + { + "epoch": 0.908825438027255, + "grad_norm": 0.5912610653580614, + "learning_rate": 3.974847341471455e-06, + "loss": 0.6076, + "step": 5602 + }, + { + "epoch": 0.9089876703439325, + "grad_norm": 0.5747561424998553, + "learning_rate": 3.974502513482457e-06, + "loss": 0.5253, + "step": 5603 + }, + { + "epoch": 0.90914990266061, + "grad_norm": 0.5788468747976936, + "learning_rate": 3.974157642471782e-06, + "loss": 0.532, + "step": 5604 + }, + { + "epoch": 0.9093121349772875, + "grad_norm": 0.5690956639394613, + "learning_rate": 3.973812728449491e-06, + "loss": 0.5658, + "step": 5605 + }, + { + "epoch": 0.9094743672939649, + "grad_norm": 0.5665329661168769, + "learning_rate": 3.973467771425647e-06, + "loss": 0.4968, + "step": 5606 + }, + { + "epoch": 0.9096365996106425, + "grad_norm": 0.6090901457797787, + "learning_rate": 3.973122771410317e-06, + "loss": 0.5344, + "step": 5607 + }, + { + "epoch": 0.9097988319273199, + "grad_norm": 0.543436322043561, + "learning_rate": 3.972777728413565e-06, + "loss": 0.53, + "step": 5608 + }, + { + "epoch": 0.9099610642439974, + "grad_norm": 0.6276733811473786, + "learning_rate": 3.97243264244546e-06, + "loss": 0.5865, + "step": 5609 + }, + { + "epoch": 0.9101232965606749, + "grad_norm": 0.6056027764149362, + "learning_rate": 3.972087513516069e-06, + "loss": 0.5715, + "step": 5610 + }, + { + "epoch": 0.9102855288773524, + "grad_norm": 0.5635298798946143, + "learning_rate": 3.9717423416354625e-06, + "loss": 0.5389, + "step": 5611 + }, + { + "epoch": 0.9104477611940298, + "grad_norm": 0.5727131421569698, + "learning_rate": 3.971397126813713e-06, + "loss": 0.5591, + "step": 5612 + }, + { + "epoch": 0.9106099935107074, + "grad_norm": 0.5538894058870545, + "learning_rate": 3.971051869060891e-06, + "loss": 0.5295, + "step": 5613 + }, + { + "epoch": 0.9107722258273848, + "grad_norm": 0.6260799571863158, + "learning_rate": 3.970706568387071e-06, + "loss": 0.5449, + "step": 5614 + }, + { + "epoch": 0.9109344581440623, + "grad_norm": 0.5733576210121655, + "learning_rate": 3.970361224802328e-06, + "loss": 0.5504, + "step": 5615 + }, + { + "epoch": 0.9110966904607398, + "grad_norm": 0.5985296336618351, + "learning_rate": 3.970015838316737e-06, + "loss": 0.5504, + "step": 5616 + }, + { + "epoch": 0.9112589227774173, + "grad_norm": 0.5680597767595053, + "learning_rate": 3.969670408940377e-06, + "loss": 0.5383, + "step": 5617 + }, + { + "epoch": 0.9114211550940947, + "grad_norm": 0.5919126432018732, + "learning_rate": 3.9693249366833244e-06, + "loss": 0.5485, + "step": 5618 + }, + { + "epoch": 0.9115833874107723, + "grad_norm": 0.5945235999358779, + "learning_rate": 3.9689794215556615e-06, + "loss": 0.5592, + "step": 5619 + }, + { + "epoch": 0.9117456197274497, + "grad_norm": 0.5782186911242632, + "learning_rate": 3.968633863567468e-06, + "loss": 0.5288, + "step": 5620 + }, + { + "epoch": 0.9119078520441272, + "grad_norm": 0.5885627650531727, + "learning_rate": 3.968288262728827e-06, + "loss": 0.5918, + "step": 5621 + }, + { + "epoch": 0.9120700843608047, + "grad_norm": 0.5669950988855772, + "learning_rate": 3.96794261904982e-06, + "loss": 0.5263, + "step": 5622 + }, + { + "epoch": 0.9122323166774822, + "grad_norm": 0.5967344333263223, + "learning_rate": 3.967596932540535e-06, + "loss": 0.5698, + "step": 5623 + }, + { + "epoch": 0.9123945489941596, + "grad_norm": 0.611668119515622, + "learning_rate": 3.967251203211058e-06, + "loss": 0.5716, + "step": 5624 + }, + { + "epoch": 0.9125567813108372, + "grad_norm": 0.5915553184407796, + "learning_rate": 3.966905431071473e-06, + "loss": 0.5418, + "step": 5625 + }, + { + "epoch": 0.9127190136275146, + "grad_norm": 0.5885009214276516, + "learning_rate": 3.9665596161318715e-06, + "loss": 0.5543, + "step": 5626 + }, + { + "epoch": 0.9128812459441921, + "grad_norm": 0.5921497103138842, + "learning_rate": 3.966213758402343e-06, + "loss": 0.4999, + "step": 5627 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.5811491397390184, + "learning_rate": 3.965867857892978e-06, + "loss": 0.579, + "step": 5628 + }, + { + "epoch": 0.9132057105775471, + "grad_norm": 0.6065281881202647, + "learning_rate": 3.965521914613868e-06, + "loss": 0.5371, + "step": 5629 + }, + { + "epoch": 0.9133679428942245, + "grad_norm": 0.5668633040752448, + "learning_rate": 3.965175928575109e-06, + "loss": 0.5708, + "step": 5630 + }, + { + "epoch": 0.913530175210902, + "grad_norm": 0.6091448252757994, + "learning_rate": 3.964829899786794e-06, + "loss": 0.5666, + "step": 5631 + }, + { + "epoch": 0.9136924075275795, + "grad_norm": 0.5890356195050246, + "learning_rate": 3.964483828259019e-06, + "loss": 0.5704, + "step": 5632 + }, + { + "epoch": 0.913854639844257, + "grad_norm": 0.5768178548503806, + "learning_rate": 3.964137714001883e-06, + "loss": 0.524, + "step": 5633 + }, + { + "epoch": 0.9140168721609344, + "grad_norm": 0.6131801106902213, + "learning_rate": 3.963791557025484e-06, + "loss": 0.545, + "step": 5634 + }, + { + "epoch": 0.914179104477612, + "grad_norm": 0.5746477997555296, + "learning_rate": 3.963445357339921e-06, + "loss": 0.5493, + "step": 5635 + }, + { + "epoch": 0.9143413367942894, + "grad_norm": 0.5812179132038683, + "learning_rate": 3.963099114955296e-06, + "loss": 0.5653, + "step": 5636 + }, + { + "epoch": 0.9145035691109669, + "grad_norm": 0.6034655391770195, + "learning_rate": 3.96275282988171e-06, + "loss": 0.5653, + "step": 5637 + }, + { + "epoch": 0.9146658014276444, + "grad_norm": 0.5775259060779144, + "learning_rate": 3.962406502129268e-06, + "loss": 0.5708, + "step": 5638 + }, + { + "epoch": 0.9148280337443219, + "grad_norm": 0.5854616338320306, + "learning_rate": 3.962060131708074e-06, + "loss": 0.5533, + "step": 5639 + }, + { + "epoch": 0.9149902660609993, + "grad_norm": 0.6323947137338921, + "learning_rate": 3.961713718628235e-06, + "loss": 0.5432, + "step": 5640 + }, + { + "epoch": 0.9151524983776769, + "grad_norm": 0.6161130027001087, + "learning_rate": 3.961367262899856e-06, + "loss": 0.5468, + "step": 5641 + }, + { + "epoch": 0.9153147306943543, + "grad_norm": 0.5955522807310785, + "learning_rate": 3.96102076453305e-06, + "loss": 0.5678, + "step": 5642 + }, + { + "epoch": 0.9154769630110318, + "grad_norm": 0.5847891586064149, + "learning_rate": 3.960674223537923e-06, + "loss": 0.5114, + "step": 5643 + }, + { + "epoch": 0.9156391953277093, + "grad_norm": 0.5853963052476623, + "learning_rate": 3.9603276399245864e-06, + "loss": 0.5325, + "step": 5644 + }, + { + "epoch": 0.9158014276443868, + "grad_norm": 0.5725509846424774, + "learning_rate": 3.959981013703153e-06, + "loss": 0.533, + "step": 5645 + }, + { + "epoch": 0.9159636599610642, + "grad_norm": 0.5768361973653706, + "learning_rate": 3.9596343448837385e-06, + "loss": 0.5175, + "step": 5646 + }, + { + "epoch": 0.9161258922777418, + "grad_norm": 0.574708790354445, + "learning_rate": 3.959287633476454e-06, + "loss": 0.5584, + "step": 5647 + }, + { + "epoch": 0.9162881245944192, + "grad_norm": 0.5773956460099662, + "learning_rate": 3.958940879491419e-06, + "loss": 0.5504, + "step": 5648 + }, + { + "epoch": 0.9164503569110967, + "grad_norm": 0.5438761706543822, + "learning_rate": 3.958594082938747e-06, + "loss": 0.5418, + "step": 5649 + }, + { + "epoch": 0.9166125892277742, + "grad_norm": 0.6231120778050813, + "learning_rate": 3.95824724382856e-06, + "loss": 0.5466, + "step": 5650 + }, + { + "epoch": 0.9167748215444517, + "grad_norm": 0.580885299893518, + "learning_rate": 3.957900362170976e-06, + "loss": 0.5448, + "step": 5651 + }, + { + "epoch": 0.9169370538611291, + "grad_norm": 0.588265067889545, + "learning_rate": 3.957553437976116e-06, + "loss": 0.5458, + "step": 5652 + }, + { + "epoch": 0.9170992861778067, + "grad_norm": 0.5825477940033955, + "learning_rate": 3.957206471254103e-06, + "loss": 0.5419, + "step": 5653 + }, + { + "epoch": 0.9172615184944841, + "grad_norm": 0.577930751206501, + "learning_rate": 3.95685946201506e-06, + "loss": 0.5574, + "step": 5654 + }, + { + "epoch": 0.9174237508111616, + "grad_norm": 0.5789453671012902, + "learning_rate": 3.956512410269112e-06, + "loss": 0.552, + "step": 5655 + }, + { + "epoch": 0.917585983127839, + "grad_norm": 0.5461625027720265, + "learning_rate": 3.9561653160263845e-06, + "loss": 0.5327, + "step": 5656 + }, + { + "epoch": 0.9177482154445166, + "grad_norm": 0.6072911118218945, + "learning_rate": 3.955818179297005e-06, + "loss": 0.5529, + "step": 5657 + }, + { + "epoch": 0.917910447761194, + "grad_norm": 0.5972530627155075, + "learning_rate": 3.955471000091102e-06, + "loss": 0.5699, + "step": 5658 + }, + { + "epoch": 0.9180726800778715, + "grad_norm": 0.6017786492603445, + "learning_rate": 3.9551237784188045e-06, + "loss": 0.5387, + "step": 5659 + }, + { + "epoch": 0.918234912394549, + "grad_norm": 0.5860563949608523, + "learning_rate": 3.954776514290245e-06, + "loss": 0.5496, + "step": 5660 + }, + { + "epoch": 0.9183971447112265, + "grad_norm": 0.5705919663897323, + "learning_rate": 3.9544292077155535e-06, + "loss": 0.5489, + "step": 5661 + }, + { + "epoch": 0.9185593770279039, + "grad_norm": 0.5863240113860274, + "learning_rate": 3.954081858704864e-06, + "loss": 0.5751, + "step": 5662 + }, + { + "epoch": 0.9187216093445815, + "grad_norm": 0.5662253847614404, + "learning_rate": 3.9537344672683135e-06, + "loss": 0.5781, + "step": 5663 + }, + { + "epoch": 0.9188838416612589, + "grad_norm": 0.5728073603471251, + "learning_rate": 3.953387033416035e-06, + "loss": 0.5617, + "step": 5664 + }, + { + "epoch": 0.9190460739779364, + "grad_norm": 0.5655378254920586, + "learning_rate": 3.953039557158166e-06, + "loss": 0.5032, + "step": 5665 + }, + { + "epoch": 0.9192083062946139, + "grad_norm": 0.5849051105248193, + "learning_rate": 3.9526920385048465e-06, + "loss": 0.5661, + "step": 5666 + }, + { + "epoch": 0.9193705386112914, + "grad_norm": 0.5805328027641907, + "learning_rate": 3.952344477466214e-06, + "loss": 0.5356, + "step": 5667 + }, + { + "epoch": 0.9195327709279688, + "grad_norm": 0.5811643630288554, + "learning_rate": 3.951996874052411e-06, + "loss": 0.5382, + "step": 5668 + }, + { + "epoch": 0.9196950032446464, + "grad_norm": 0.5774153242583209, + "learning_rate": 3.951649228273579e-06, + "loss": 0.5258, + "step": 5669 + }, + { + "epoch": 0.9198572355613238, + "grad_norm": 0.5713731292963659, + "learning_rate": 3.951301540139861e-06, + "loss": 0.5233, + "step": 5670 + }, + { + "epoch": 0.9200194678780013, + "grad_norm": 0.586303428498121, + "learning_rate": 3.950953809661401e-06, + "loss": 0.5469, + "step": 5671 + }, + { + "epoch": 0.9201817001946788, + "grad_norm": 0.5830078421120786, + "learning_rate": 3.950606036848346e-06, + "loss": 0.5326, + "step": 5672 + }, + { + "epoch": 0.9203439325113563, + "grad_norm": 0.5746774791674173, + "learning_rate": 3.950258221710843e-06, + "loss": 0.558, + "step": 5673 + }, + { + "epoch": 0.9205061648280337, + "grad_norm": 0.589600712956311, + "learning_rate": 3.949910364259039e-06, + "loss": 0.528, + "step": 5674 + }, + { + "epoch": 0.9206683971447113, + "grad_norm": 0.5951940785504014, + "learning_rate": 3.949562464503084e-06, + "loss": 0.5372, + "step": 5675 + }, + { + "epoch": 0.9208306294613887, + "grad_norm": 0.6076436834837241, + "learning_rate": 3.94921452245313e-06, + "loss": 0.5821, + "step": 5676 + }, + { + "epoch": 0.9209928617780662, + "grad_norm": 0.5626127550452293, + "learning_rate": 3.948866538119326e-06, + "loss": 0.57, + "step": 5677 + }, + { + "epoch": 0.9211550940947437, + "grad_norm": 0.6011035600401993, + "learning_rate": 3.9485185115118284e-06, + "loss": 0.5415, + "step": 5678 + }, + { + "epoch": 0.9213173264114212, + "grad_norm": 0.612172641898578, + "learning_rate": 3.94817044264079e-06, + "loss": 0.5544, + "step": 5679 + }, + { + "epoch": 0.9214795587280986, + "grad_norm": 0.5773375772204391, + "learning_rate": 3.947822331516365e-06, + "loss": 0.5739, + "step": 5680 + }, + { + "epoch": 0.9216417910447762, + "grad_norm": 0.5686191057047646, + "learning_rate": 3.9474741781487145e-06, + "loss": 0.5729, + "step": 5681 + }, + { + "epoch": 0.9218040233614536, + "grad_norm": 0.5761446408936093, + "learning_rate": 3.9471259825479925e-06, + "loss": 0.5464, + "step": 5682 + }, + { + "epoch": 0.9219662556781311, + "grad_norm": 0.5938626303427111, + "learning_rate": 3.9467777447243595e-06, + "loss": 0.5487, + "step": 5683 + }, + { + "epoch": 0.9221284879948086, + "grad_norm": 0.5800267168550552, + "learning_rate": 3.9464294646879764e-06, + "loss": 0.5367, + "step": 5684 + }, + { + "epoch": 0.9222907203114861, + "grad_norm": 0.5871857980110325, + "learning_rate": 3.946081142449005e-06, + "loss": 0.5347, + "step": 5685 + }, + { + "epoch": 0.9224529526281635, + "grad_norm": 0.6201989488977429, + "learning_rate": 3.945732778017609e-06, + "loss": 0.5611, + "step": 5686 + }, + { + "epoch": 0.922615184944841, + "grad_norm": 0.6015448362683976, + "learning_rate": 3.9453843714039506e-06, + "loss": 0.5669, + "step": 5687 + }, + { + "epoch": 0.9227774172615185, + "grad_norm": 0.5954631838235119, + "learning_rate": 3.945035922618198e-06, + "loss": 0.5424, + "step": 5688 + }, + { + "epoch": 0.922939649578196, + "grad_norm": 0.578333961239201, + "learning_rate": 3.944687431670516e-06, + "loss": 0.5387, + "step": 5689 + }, + { + "epoch": 0.9231018818948734, + "grad_norm": 0.592716116236683, + "learning_rate": 3.944338898571073e-06, + "loss": 0.5752, + "step": 5690 + }, + { + "epoch": 0.923264114211551, + "grad_norm": 0.5830174728645892, + "learning_rate": 3.943990323330038e-06, + "loss": 0.5744, + "step": 5691 + }, + { + "epoch": 0.9234263465282284, + "grad_norm": 0.5844390759395592, + "learning_rate": 3.9436417059575816e-06, + "loss": 0.5435, + "step": 5692 + }, + { + "epoch": 0.9235885788449059, + "grad_norm": 0.5565221626334114, + "learning_rate": 3.943293046463876e-06, + "loss": 0.5369, + "step": 5693 + }, + { + "epoch": 0.9237508111615834, + "grad_norm": 0.5689480432993852, + "learning_rate": 3.942944344859093e-06, + "loss": 0.5625, + "step": 5694 + }, + { + "epoch": 0.9239130434782609, + "grad_norm": 0.606901929954401, + "learning_rate": 3.942595601153408e-06, + "loss": 0.5533, + "step": 5695 + }, + { + "epoch": 0.9240752757949383, + "grad_norm": 0.5713836117031493, + "learning_rate": 3.942246815356995e-06, + "loss": 0.5576, + "step": 5696 + }, + { + "epoch": 0.9242375081116159, + "grad_norm": 0.5737173880208186, + "learning_rate": 3.941897987480031e-06, + "loss": 0.5227, + "step": 5697 + }, + { + "epoch": 0.9243997404282933, + "grad_norm": 0.6004605895738729, + "learning_rate": 3.941549117532694e-06, + "loss": 0.5749, + "step": 5698 + }, + { + "epoch": 0.9245619727449708, + "grad_norm": 0.6008891662619089, + "learning_rate": 3.941200205525164e-06, + "loss": 0.5678, + "step": 5699 + }, + { + "epoch": 0.9247242050616483, + "grad_norm": 0.5961575545601818, + "learning_rate": 3.940851251467619e-06, + "loss": 0.5393, + "step": 5700 + }, + { + "epoch": 0.9248864373783258, + "grad_norm": 0.5736413851563972, + "learning_rate": 3.940502255370242e-06, + "loss": 0.5275, + "step": 5701 + }, + { + "epoch": 0.9250486696950032, + "grad_norm": 0.5673214092825487, + "learning_rate": 3.940153217243215e-06, + "loss": 0.5494, + "step": 5702 + }, + { + "epoch": 0.9252109020116808, + "grad_norm": 0.5566399425847972, + "learning_rate": 3.939804137096722e-06, + "loss": 0.5721, + "step": 5703 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 0.5513043812615848, + "learning_rate": 3.939455014940949e-06, + "loss": 0.5535, + "step": 5704 + }, + { + "epoch": 0.9255353666450357, + "grad_norm": 0.5601781075876215, + "learning_rate": 3.939105850786081e-06, + "loss": 0.5577, + "step": 5705 + }, + { + "epoch": 0.9256975989617132, + "grad_norm": 0.6000150074678463, + "learning_rate": 3.938756644642308e-06, + "loss": 0.5453, + "step": 5706 + }, + { + "epoch": 0.9258598312783907, + "grad_norm": 0.5781642571117331, + "learning_rate": 3.938407396519815e-06, + "loss": 0.5506, + "step": 5707 + }, + { + "epoch": 0.9260220635950681, + "grad_norm": 0.6092567307967027, + "learning_rate": 3.938058106428795e-06, + "loss": 0.5817, + "step": 5708 + }, + { + "epoch": 0.9261842959117457, + "grad_norm": 0.5754895747085753, + "learning_rate": 3.937708774379439e-06, + "loss": 0.556, + "step": 5709 + }, + { + "epoch": 0.9263465282284231, + "grad_norm": 0.6771696814654296, + "learning_rate": 3.937359400381938e-06, + "loss": 0.4966, + "step": 5710 + }, + { + "epoch": 0.9265087605451006, + "grad_norm": 0.547738288499589, + "learning_rate": 3.937009984446487e-06, + "loss": 0.5391, + "step": 5711 + }, + { + "epoch": 0.9266709928617781, + "grad_norm": 0.5494249816359854, + "learning_rate": 3.9366605265832805e-06, + "loss": 0.5768, + "step": 5712 + }, + { + "epoch": 0.9268332251784556, + "grad_norm": 0.6019118593628442, + "learning_rate": 3.936311026802515e-06, + "loss": 0.5449, + "step": 5713 + }, + { + "epoch": 0.926995457495133, + "grad_norm": 0.5640289144687978, + "learning_rate": 3.935961485114388e-06, + "loss": 0.5402, + "step": 5714 + }, + { + "epoch": 0.9271576898118105, + "grad_norm": 0.5423130530095225, + "learning_rate": 3.9356119015290965e-06, + "loss": 0.5314, + "step": 5715 + }, + { + "epoch": 0.927319922128488, + "grad_norm": 0.5671374843212857, + "learning_rate": 3.935262276056843e-06, + "loss": 0.571, + "step": 5716 + }, + { + "epoch": 0.9274821544451655, + "grad_norm": 0.5686608630989358, + "learning_rate": 3.934912608707826e-06, + "loss": 0.5599, + "step": 5717 + }, + { + "epoch": 0.9276443867618429, + "grad_norm": 0.5707089417108117, + "learning_rate": 3.934562899492249e-06, + "loss": 0.5635, + "step": 5718 + }, + { + "epoch": 0.9278066190785205, + "grad_norm": 0.5923453372209282, + "learning_rate": 3.934213148420315e-06, + "loss": 0.5412, + "step": 5719 + }, + { + "epoch": 0.9279688513951979, + "grad_norm": 0.5832158933939124, + "learning_rate": 3.93386335550223e-06, + "loss": 0.565, + "step": 5720 + }, + { + "epoch": 0.9281310837118754, + "grad_norm": 0.580330675068676, + "learning_rate": 3.933513520748198e-06, + "loss": 0.5635, + "step": 5721 + }, + { + "epoch": 0.9282933160285529, + "grad_norm": 0.5996458384769567, + "learning_rate": 3.933163644168428e-06, + "loss": 0.5458, + "step": 5722 + }, + { + "epoch": 0.9284555483452304, + "grad_norm": 0.6102857378777499, + "learning_rate": 3.932813725773127e-06, + "loss": 0.5652, + "step": 5723 + }, + { + "epoch": 0.9286177806619078, + "grad_norm": 0.5591841369502003, + "learning_rate": 3.932463765572506e-06, + "loss": 0.5495, + "step": 5724 + }, + { + "epoch": 0.9287800129785854, + "grad_norm": 0.5537744517947926, + "learning_rate": 3.932113763576774e-06, + "loss": 0.5835, + "step": 5725 + }, + { + "epoch": 0.9289422452952628, + "grad_norm": 0.5799054832394495, + "learning_rate": 3.9317637197961445e-06, + "loss": 0.5709, + "step": 5726 + }, + { + "epoch": 0.9291044776119403, + "grad_norm": 0.5777878782575572, + "learning_rate": 3.9314136342408295e-06, + "loss": 0.5623, + "step": 5727 + }, + { + "epoch": 0.9292667099286178, + "grad_norm": 0.6661794411148771, + "learning_rate": 3.931063506921045e-06, + "loss": 0.5534, + "step": 5728 + }, + { + "epoch": 0.9294289422452953, + "grad_norm": 0.5504359047979153, + "learning_rate": 3.930713337847007e-06, + "loss": 0.508, + "step": 5729 + }, + { + "epoch": 0.9295911745619727, + "grad_norm": 0.5999492304280832, + "learning_rate": 3.930363127028929e-06, + "loss": 0.5411, + "step": 5730 + }, + { + "epoch": 0.9297534068786503, + "grad_norm": 0.5993150699876282, + "learning_rate": 3.930012874477032e-06, + "loss": 0.5603, + "step": 5731 + }, + { + "epoch": 0.9299156391953277, + "grad_norm": 0.5785397123712758, + "learning_rate": 3.929662580201536e-06, + "loss": 0.5523, + "step": 5732 + }, + { + "epoch": 0.9300778715120052, + "grad_norm": 0.6147375031682037, + "learning_rate": 3.92931224421266e-06, + "loss": 0.5481, + "step": 5733 + }, + { + "epoch": 0.9302401038286827, + "grad_norm": 0.5738363522129024, + "learning_rate": 3.928961866520625e-06, + "loss": 0.5699, + "step": 5734 + }, + { + "epoch": 0.9304023361453602, + "grad_norm": 0.5696591206744827, + "learning_rate": 3.928611447135656e-06, + "loss": 0.4977, + "step": 5735 + }, + { + "epoch": 0.9305645684620376, + "grad_norm": 0.6018562173600366, + "learning_rate": 3.928260986067977e-06, + "loss": 0.5306, + "step": 5736 + }, + { + "epoch": 0.9307268007787152, + "grad_norm": 0.5558589214019758, + "learning_rate": 3.927910483327811e-06, + "loss": 0.5395, + "step": 5737 + }, + { + "epoch": 0.9308890330953926, + "grad_norm": 0.577321258495538, + "learning_rate": 3.927559938925388e-06, + "loss": 0.5359, + "step": 5738 + }, + { + "epoch": 0.9310512654120701, + "grad_norm": 0.5617531174349911, + "learning_rate": 3.927209352870934e-06, + "loss": 0.5627, + "step": 5739 + }, + { + "epoch": 0.9312134977287476, + "grad_norm": 0.5494064611281899, + "learning_rate": 3.926858725174678e-06, + "loss": 0.5661, + "step": 5740 + }, + { + "epoch": 0.9313757300454251, + "grad_norm": 0.5704501266995218, + "learning_rate": 3.926508055846851e-06, + "loss": 0.5391, + "step": 5741 + }, + { + "epoch": 0.9315379623621025, + "grad_norm": 0.6177097667610392, + "learning_rate": 3.926157344897685e-06, + "loss": 0.557, + "step": 5742 + }, + { + "epoch": 0.93170019467878, + "grad_norm": 0.6026983904508331, + "learning_rate": 3.9258065923374104e-06, + "loss": 0.5557, + "step": 5743 + }, + { + "epoch": 0.9318624269954575, + "grad_norm": 0.5713443506697417, + "learning_rate": 3.925455798176263e-06, + "loss": 0.5051, + "step": 5744 + }, + { + "epoch": 0.932024659312135, + "grad_norm": 0.5717114488591069, + "learning_rate": 3.925104962424479e-06, + "loss": 0.5699, + "step": 5745 + }, + { + "epoch": 0.9321868916288124, + "grad_norm": 0.5968836922559008, + "learning_rate": 3.924754085092291e-06, + "loss": 0.5585, + "step": 5746 + }, + { + "epoch": 0.93234912394549, + "grad_norm": 0.5880411200773689, + "learning_rate": 3.92440316618994e-06, + "loss": 0.5285, + "step": 5747 + }, + { + "epoch": 0.9325113562621674, + "grad_norm": 0.5751086357142042, + "learning_rate": 3.924052205727664e-06, + "loss": 0.5526, + "step": 5748 + }, + { + "epoch": 0.9326735885788449, + "grad_norm": 0.613673322424379, + "learning_rate": 3.923701203715703e-06, + "loss": 0.5621, + "step": 5749 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.5867888947218218, + "learning_rate": 3.923350160164297e-06, + "loss": 0.5296, + "step": 5750 + }, + { + "epoch": 0.9329980532121999, + "grad_norm": 0.6090079257137851, + "learning_rate": 3.92299907508369e-06, + "loss": 0.57, + "step": 5751 + }, + { + "epoch": 0.9331602855288773, + "grad_norm": 0.632137767359036, + "learning_rate": 3.922647948484125e-06, + "loss": 0.5928, + "step": 5752 + }, + { + "epoch": 0.9333225178455549, + "grad_norm": 0.6291224391836424, + "learning_rate": 3.922296780375846e-06, + "loss": 0.5458, + "step": 5753 + }, + { + "epoch": 0.9334847501622323, + "grad_norm": 0.564970192564282, + "learning_rate": 3.9219455707691004e-06, + "loss": 0.5349, + "step": 5754 + }, + { + "epoch": 0.9336469824789098, + "grad_norm": 0.5946049198361384, + "learning_rate": 3.921594319674136e-06, + "loss": 0.5256, + "step": 5755 + }, + { + "epoch": 0.9338092147955873, + "grad_norm": 0.5850792528704616, + "learning_rate": 3.921243027101199e-06, + "loss": 0.5592, + "step": 5756 + }, + { + "epoch": 0.9339714471122648, + "grad_norm": 0.6496994592318416, + "learning_rate": 3.92089169306054e-06, + "loss": 0.5964, + "step": 5757 + }, + { + "epoch": 0.9341336794289422, + "grad_norm": 0.5836883524463055, + "learning_rate": 3.920540317562412e-06, + "loss": 0.5292, + "step": 5758 + }, + { + "epoch": 0.9342959117456198, + "grad_norm": 0.6127433290253648, + "learning_rate": 3.920188900617064e-06, + "loss": 0.5628, + "step": 5759 + }, + { + "epoch": 0.9344581440622972, + "grad_norm": 0.5828479404348794, + "learning_rate": 3.91983744223475e-06, + "loss": 0.5055, + "step": 5760 + }, + { + "epoch": 0.9346203763789747, + "grad_norm": 0.5984846071779022, + "learning_rate": 3.919485942425727e-06, + "loss": 0.5242, + "step": 5761 + }, + { + "epoch": 0.9347826086956522, + "grad_norm": 0.57777848446178, + "learning_rate": 3.919134401200248e-06, + "loss": 0.5558, + "step": 5762 + }, + { + "epoch": 0.9349448410123297, + "grad_norm": 0.5844406832894503, + "learning_rate": 3.918782818568571e-06, + "loss": 0.5554, + "step": 5763 + }, + { + "epoch": 0.9351070733290071, + "grad_norm": 0.5823696488549867, + "learning_rate": 3.918431194540953e-06, + "loss": 0.5048, + "step": 5764 + }, + { + "epoch": 0.9352693056456847, + "grad_norm": 0.5437718685923686, + "learning_rate": 3.918079529127657e-06, + "loss": 0.5271, + "step": 5765 + }, + { + "epoch": 0.9354315379623621, + "grad_norm": 0.6322416248937102, + "learning_rate": 3.917727822338938e-06, + "loss": 0.5361, + "step": 5766 + }, + { + "epoch": 0.9355937702790396, + "grad_norm": 0.6281512800773156, + "learning_rate": 3.9173760741850625e-06, + "loss": 0.5464, + "step": 5767 + }, + { + "epoch": 0.9357560025957171, + "grad_norm": 0.5812736028396585, + "learning_rate": 3.917024284676292e-06, + "loss": 0.5434, + "step": 5768 + }, + { + "epoch": 0.9359182349123946, + "grad_norm": 0.5927495056214589, + "learning_rate": 3.916672453822889e-06, + "loss": 0.5039, + "step": 5769 + }, + { + "epoch": 0.936080467229072, + "grad_norm": 0.5735823288777665, + "learning_rate": 3.9163205816351215e-06, + "loss": 0.4997, + "step": 5770 + }, + { + "epoch": 0.9362426995457496, + "grad_norm": 0.6041292484520342, + "learning_rate": 3.915968668123255e-06, + "loss": 0.5512, + "step": 5771 + }, + { + "epoch": 0.936404931862427, + "grad_norm": 0.5835327071211706, + "learning_rate": 3.915616713297556e-06, + "loss": 0.5488, + "step": 5772 + }, + { + "epoch": 0.9365671641791045, + "grad_norm": 0.5803110737981861, + "learning_rate": 3.915264717168296e-06, + "loss": 0.5043, + "step": 5773 + }, + { + "epoch": 0.9367293964957819, + "grad_norm": 0.5626373672360744, + "learning_rate": 3.914912679745743e-06, + "loss": 0.5637, + "step": 5774 + }, + { + "epoch": 0.9368916288124595, + "grad_norm": 0.5605801717280641, + "learning_rate": 3.914560601040171e-06, + "loss": 0.5464, + "step": 5775 + }, + { + "epoch": 0.9370538611291369, + "grad_norm": 0.5687977114425471, + "learning_rate": 3.91420848106185e-06, + "loss": 0.5678, + "step": 5776 + }, + { + "epoch": 0.9372160934458144, + "grad_norm": 0.5853231226339282, + "learning_rate": 3.913856319821054e-06, + "loss": 0.5894, + "step": 5777 + }, + { + "epoch": 0.9373783257624919, + "grad_norm": 0.5875621488239033, + "learning_rate": 3.9135041173280596e-06, + "loss": 0.562, + "step": 5778 + }, + { + "epoch": 0.9375405580791694, + "grad_norm": 0.6107210780657979, + "learning_rate": 3.913151873593143e-06, + "loss": 0.5214, + "step": 5779 + }, + { + "epoch": 0.9377027903958468, + "grad_norm": 0.5794190661288426, + "learning_rate": 3.91279958862658e-06, + "loss": 0.5515, + "step": 5780 + }, + { + "epoch": 0.9378650227125244, + "grad_norm": 0.573478990412596, + "learning_rate": 3.912447262438651e-06, + "loss": 0.569, + "step": 5781 + }, + { + "epoch": 0.9380272550292018, + "grad_norm": 0.5872363627102073, + "learning_rate": 3.912094895039634e-06, + "loss": 0.5459, + "step": 5782 + }, + { + "epoch": 0.9381894873458793, + "grad_norm": 0.566667126672781, + "learning_rate": 3.911742486439812e-06, + "loss": 0.5477, + "step": 5783 + }, + { + "epoch": 0.9383517196625568, + "grad_norm": 0.6088801224636179, + "learning_rate": 3.911390036649466e-06, + "loss": 0.5011, + "step": 5784 + }, + { + "epoch": 0.9385139519792343, + "grad_norm": 0.5641174209917452, + "learning_rate": 3.9110375456788804e-06, + "loss": 0.5289, + "step": 5785 + }, + { + "epoch": 0.9386761842959117, + "grad_norm": 0.5931784329823728, + "learning_rate": 3.910685013538339e-06, + "loss": 0.5361, + "step": 5786 + }, + { + "epoch": 0.9388384166125893, + "grad_norm": 0.5837059880455462, + "learning_rate": 3.9103324402381285e-06, + "loss": 0.5241, + "step": 5787 + }, + { + "epoch": 0.9390006489292667, + "grad_norm": 0.5896393239394072, + "learning_rate": 3.909979825788535e-06, + "loss": 0.5543, + "step": 5788 + }, + { + "epoch": 0.9391628812459442, + "grad_norm": 0.5630907875212756, + "learning_rate": 3.909627170199846e-06, + "loss": 0.5445, + "step": 5789 + }, + { + "epoch": 0.9393251135626217, + "grad_norm": 0.611606364002218, + "learning_rate": 3.909274473482353e-06, + "loss": 0.5761, + "step": 5790 + }, + { + "epoch": 0.9394873458792992, + "grad_norm": 0.573367486732002, + "learning_rate": 3.908921735646346e-06, + "loss": 0.5584, + "step": 5791 + }, + { + "epoch": 0.9396495781959766, + "grad_norm": 0.5711808498051636, + "learning_rate": 3.908568956702116e-06, + "loss": 0.5553, + "step": 5792 + }, + { + "epoch": 0.9398118105126542, + "grad_norm": 0.7884883927221025, + "learning_rate": 3.908216136659958e-06, + "loss": 0.5861, + "step": 5793 + }, + { + "epoch": 0.9399740428293316, + "grad_norm": 0.5998499991336496, + "learning_rate": 3.907863275530164e-06, + "loss": 0.5616, + "step": 5794 + }, + { + "epoch": 0.9401362751460091, + "grad_norm": 0.5832687272821471, + "learning_rate": 3.90751037332303e-06, + "loss": 0.5591, + "step": 5795 + }, + { + "epoch": 0.9402985074626866, + "grad_norm": 0.5854641399504055, + "learning_rate": 3.907157430048855e-06, + "loss": 0.5204, + "step": 5796 + }, + { + "epoch": 0.9404607397793641, + "grad_norm": 0.5539066147158667, + "learning_rate": 3.906804445717934e-06, + "loss": 0.5765, + "step": 5797 + }, + { + "epoch": 0.9406229720960415, + "grad_norm": 0.5519083344743103, + "learning_rate": 3.906451420340566e-06, + "loss": 0.5521, + "step": 5798 + }, + { + "epoch": 0.9407852044127191, + "grad_norm": 0.5854627781030999, + "learning_rate": 3.906098353927052e-06, + "loss": 0.5733, + "step": 5799 + }, + { + "epoch": 0.9409474367293965, + "grad_norm": 0.5886210466914779, + "learning_rate": 3.905745246487695e-06, + "loss": 0.5849, + "step": 5800 + }, + { + "epoch": 0.941109669046074, + "grad_norm": 0.5775027491031413, + "learning_rate": 3.905392098032796e-06, + "loss": 0.5364, + "step": 5801 + }, + { + "epoch": 0.9412719013627514, + "grad_norm": 0.6320535981118115, + "learning_rate": 3.9050389085726595e-06, + "loss": 0.5374, + "step": 5802 + }, + { + "epoch": 0.941434133679429, + "grad_norm": 0.6373557244953211, + "learning_rate": 3.904685678117589e-06, + "loss": 0.5294, + "step": 5803 + }, + { + "epoch": 0.9415963659961064, + "grad_norm": 0.5722876086427389, + "learning_rate": 3.904332406677893e-06, + "loss": 0.5637, + "step": 5804 + }, + { + "epoch": 0.9417585983127839, + "grad_norm": 0.5658276561807315, + "learning_rate": 3.903979094263878e-06, + "loss": 0.5156, + "step": 5805 + }, + { + "epoch": 0.9419208306294614, + "grad_norm": 0.5624379913512312, + "learning_rate": 3.903625740885852e-06, + "loss": 0.5261, + "step": 5806 + }, + { + "epoch": 0.9420830629461389, + "grad_norm": 0.5926458769133842, + "learning_rate": 3.903272346554125e-06, + "loss": 0.563, + "step": 5807 + }, + { + "epoch": 0.9422452952628163, + "grad_norm": 0.5614008177773512, + "learning_rate": 3.902918911279009e-06, + "loss": 0.5257, + "step": 5808 + }, + { + "epoch": 0.9424075275794939, + "grad_norm": 0.5683202918015606, + "learning_rate": 3.902565435070815e-06, + "loss": 0.5587, + "step": 5809 + }, + { + "epoch": 0.9425697598961713, + "grad_norm": 0.5887383839805703, + "learning_rate": 3.902211917939857e-06, + "loss": 0.5469, + "step": 5810 + }, + { + "epoch": 0.9427319922128488, + "grad_norm": 0.5888519205839803, + "learning_rate": 3.90185835989645e-06, + "loss": 0.5419, + "step": 5811 + }, + { + "epoch": 0.9428942245295263, + "grad_norm": 0.5724560648239465, + "learning_rate": 3.901504760950909e-06, + "loss": 0.5615, + "step": 5812 + }, + { + "epoch": 0.9430564568462038, + "grad_norm": 0.6198324408840948, + "learning_rate": 3.901151121113551e-06, + "loss": 0.5274, + "step": 5813 + }, + { + "epoch": 0.9432186891628812, + "grad_norm": 0.5807451272012768, + "learning_rate": 3.900797440394695e-06, + "loss": 0.5597, + "step": 5814 + }, + { + "epoch": 0.9433809214795588, + "grad_norm": 0.6153680536324605, + "learning_rate": 3.90044371880466e-06, + "loss": 0.5724, + "step": 5815 + }, + { + "epoch": 0.9435431537962362, + "grad_norm": 0.5927615038416845, + "learning_rate": 3.900089956353766e-06, + "loss": 0.5526, + "step": 5816 + }, + { + "epoch": 0.9437053861129137, + "grad_norm": 0.5840360750664656, + "learning_rate": 3.899736153052335e-06, + "loss": 0.5365, + "step": 5817 + }, + { + "epoch": 0.9438676184295912, + "grad_norm": 0.5878760900563462, + "learning_rate": 3.899382308910691e-06, + "loss": 0.5452, + "step": 5818 + }, + { + "epoch": 0.9440298507462687, + "grad_norm": 0.591054215889644, + "learning_rate": 3.899028423939156e-06, + "loss": 0.5072, + "step": 5819 + }, + { + "epoch": 0.9441920830629461, + "grad_norm": 0.6040658296093707, + "learning_rate": 3.898674498148058e-06, + "loss": 0.5602, + "step": 5820 + }, + { + "epoch": 0.9443543153796237, + "grad_norm": 0.5500967273588686, + "learning_rate": 3.898320531547722e-06, + "loss": 0.5088, + "step": 5821 + }, + { + "epoch": 0.9445165476963011, + "grad_norm": 0.5886651351311939, + "learning_rate": 3.897966524148475e-06, + "loss": 0.5774, + "step": 5822 + }, + { + "epoch": 0.9446787800129786, + "grad_norm": 0.6376088550085791, + "learning_rate": 3.897612475960646e-06, + "loss": 0.5359, + "step": 5823 + }, + { + "epoch": 0.9448410123296561, + "grad_norm": 0.5575212451125307, + "learning_rate": 3.897258386994567e-06, + "loss": 0.5331, + "step": 5824 + }, + { + "epoch": 0.9450032446463336, + "grad_norm": 0.5681056076025132, + "learning_rate": 3.896904257260568e-06, + "loss": 0.5346, + "step": 5825 + }, + { + "epoch": 0.945165476963011, + "grad_norm": 0.5654971324672214, + "learning_rate": 3.896550086768982e-06, + "loss": 0.5411, + "step": 5826 + }, + { + "epoch": 0.9453277092796886, + "grad_norm": 0.6253617689403131, + "learning_rate": 3.8961958755301414e-06, + "loss": 0.5402, + "step": 5827 + }, + { + "epoch": 0.945489941596366, + "grad_norm": 0.5610193794465579, + "learning_rate": 3.895841623554382e-06, + "loss": 0.5236, + "step": 5828 + }, + { + "epoch": 0.9456521739130435, + "grad_norm": 0.5765546519825874, + "learning_rate": 3.89548733085204e-06, + "loss": 0.5649, + "step": 5829 + }, + { + "epoch": 0.9458144062297209, + "grad_norm": 0.5902972957345678, + "learning_rate": 3.8951329974334525e-06, + "loss": 0.5387, + "step": 5830 + }, + { + "epoch": 0.9459766385463985, + "grad_norm": 0.5672448578810118, + "learning_rate": 3.894778623308958e-06, + "loss": 0.5425, + "step": 5831 + }, + { + "epoch": 0.9461388708630759, + "grad_norm": 0.5933447645686449, + "learning_rate": 3.894424208488895e-06, + "loss": 0.554, + "step": 5832 + }, + { + "epoch": 0.9463011031797534, + "grad_norm": 0.593751175415092, + "learning_rate": 3.894069752983606e-06, + "loss": 0.5892, + "step": 5833 + }, + { + "epoch": 0.9464633354964309, + "grad_norm": 0.5941057758118153, + "learning_rate": 3.893715256803432e-06, + "loss": 0.5626, + "step": 5834 + }, + { + "epoch": 0.9466255678131084, + "grad_norm": 0.6081011519259364, + "learning_rate": 3.893360719958715e-06, + "loss": 0.5616, + "step": 5835 + }, + { + "epoch": 0.9467878001297858, + "grad_norm": 0.6243603413105547, + "learning_rate": 3.893006142459802e-06, + "loss": 0.5238, + "step": 5836 + }, + { + "epoch": 0.9469500324464634, + "grad_norm": 0.5736827715910562, + "learning_rate": 3.892651524317037e-06, + "loss": 0.557, + "step": 5837 + }, + { + "epoch": 0.9471122647631408, + "grad_norm": 0.5994161472819605, + "learning_rate": 3.892296865540767e-06, + "loss": 0.5672, + "step": 5838 + }, + { + "epoch": 0.9472744970798183, + "grad_norm": 0.5866262551680421, + "learning_rate": 3.89194216614134e-06, + "loss": 0.5638, + "step": 5839 + }, + { + "epoch": 0.9474367293964958, + "grad_norm": 0.5785670117118105, + "learning_rate": 3.891587426129104e-06, + "loss": 0.5602, + "step": 5840 + }, + { + "epoch": 0.9475989617131733, + "grad_norm": 0.7040541689182754, + "learning_rate": 3.8912326455144114e-06, + "loss": 0.5164, + "step": 5841 + }, + { + "epoch": 0.9477611940298507, + "grad_norm": 0.6232744888156058, + "learning_rate": 3.890877824307611e-06, + "loss": 0.5413, + "step": 5842 + }, + { + "epoch": 0.9479234263465283, + "grad_norm": 0.5478893810317848, + "learning_rate": 3.890522962519058e-06, + "loss": 0.5445, + "step": 5843 + }, + { + "epoch": 0.9480856586632057, + "grad_norm": 0.7479240016535769, + "learning_rate": 3.890168060159105e-06, + "loss": 0.5192, + "step": 5844 + }, + { + "epoch": 0.9482478909798832, + "grad_norm": 0.6258033643616979, + "learning_rate": 3.889813117238107e-06, + "loss": 0.5574, + "step": 5845 + }, + { + "epoch": 0.9484101232965607, + "grad_norm": 0.5834221664662024, + "learning_rate": 3.88945813376642e-06, + "loss": 0.5659, + "step": 5846 + }, + { + "epoch": 0.9485723556132382, + "grad_norm": 0.552353407914602, + "learning_rate": 3.889103109754402e-06, + "loss": 0.5321, + "step": 5847 + }, + { + "epoch": 0.9487345879299156, + "grad_norm": 0.5777379210348003, + "learning_rate": 3.8887480452124116e-06, + "loss": 0.5512, + "step": 5848 + }, + { + "epoch": 0.9488968202465932, + "grad_norm": 0.5831772270928536, + "learning_rate": 3.888392940150808e-06, + "loss": 0.5522, + "step": 5849 + }, + { + "epoch": 0.9490590525632706, + "grad_norm": 0.6257236748613834, + "learning_rate": 3.888037794579952e-06, + "loss": 0.4974, + "step": 5850 + }, + { + "epoch": 0.9492212848799481, + "grad_norm": 0.622303813539185, + "learning_rate": 3.887682608510206e-06, + "loss": 0.5805, + "step": 5851 + }, + { + "epoch": 0.9493835171966256, + "grad_norm": 0.5799102279181317, + "learning_rate": 3.887327381951934e-06, + "loss": 0.5559, + "step": 5852 + }, + { + "epoch": 0.9495457495133031, + "grad_norm": 0.5794152291767001, + "learning_rate": 3.886972114915499e-06, + "loss": 0.5424, + "step": 5853 + }, + { + "epoch": 0.9497079818299805, + "grad_norm": 0.5666471837275526, + "learning_rate": 3.886616807411269e-06, + "loss": 0.546, + "step": 5854 + }, + { + "epoch": 0.9498702141466581, + "grad_norm": 0.6069668390107936, + "learning_rate": 3.886261459449608e-06, + "loss": 0.5665, + "step": 5855 + }, + { + "epoch": 0.9500324464633355, + "grad_norm": 0.6042157796672757, + "learning_rate": 3.8859060710408855e-06, + "loss": 0.5636, + "step": 5856 + }, + { + "epoch": 0.950194678780013, + "grad_norm": 0.5966934769886973, + "learning_rate": 3.8855506421954715e-06, + "loss": 0.5373, + "step": 5857 + }, + { + "epoch": 0.9503569110966905, + "grad_norm": 0.5511550067727318, + "learning_rate": 3.885195172923734e-06, + "loss": 0.5345, + "step": 5858 + }, + { + "epoch": 0.950519143413368, + "grad_norm": 0.5669162857824317, + "learning_rate": 3.884839663236048e-06, + "loss": 0.5264, + "step": 5859 + }, + { + "epoch": 0.9506813757300454, + "grad_norm": 0.5357486033964208, + "learning_rate": 3.884484113142782e-06, + "loss": 0.5059, + "step": 5860 + }, + { + "epoch": 0.9508436080467229, + "grad_norm": 0.5544018582955251, + "learning_rate": 3.884128522654312e-06, + "loss": 0.5711, + "step": 5861 + }, + { + "epoch": 0.9510058403634004, + "grad_norm": 0.6000037980411175, + "learning_rate": 3.883772891781015e-06, + "loss": 0.5659, + "step": 5862 + }, + { + "epoch": 0.9511680726800779, + "grad_norm": 0.5744622092137783, + "learning_rate": 3.883417220533264e-06, + "loss": 0.5431, + "step": 5863 + }, + { + "epoch": 0.9513303049967553, + "grad_norm": 0.5754743197806801, + "learning_rate": 3.883061508921439e-06, + "loss": 0.5549, + "step": 5864 + }, + { + "epoch": 0.9514925373134329, + "grad_norm": 0.5686410485634915, + "learning_rate": 3.882705756955918e-06, + "loss": 0.5628, + "step": 5865 + }, + { + "epoch": 0.9516547696301103, + "grad_norm": 0.5724595540679794, + "learning_rate": 3.882349964647079e-06, + "loss": 0.5557, + "step": 5866 + }, + { + "epoch": 0.9518170019467878, + "grad_norm": 0.61070495613498, + "learning_rate": 3.881994132005305e-06, + "loss": 0.5584, + "step": 5867 + }, + { + "epoch": 0.9519792342634653, + "grad_norm": 0.5979632841361726, + "learning_rate": 3.881638259040977e-06, + "loss": 0.5387, + "step": 5868 + }, + { + "epoch": 0.9521414665801428, + "grad_norm": 0.5820797079723513, + "learning_rate": 3.881282345764479e-06, + "loss": 0.519, + "step": 5869 + }, + { + "epoch": 0.9523036988968202, + "grad_norm": 0.5629270293791461, + "learning_rate": 3.880926392186196e-06, + "loss": 0.5328, + "step": 5870 + }, + { + "epoch": 0.9524659312134978, + "grad_norm": 0.5775508180140394, + "learning_rate": 3.880570398316512e-06, + "loss": 0.5412, + "step": 5871 + }, + { + "epoch": 0.9526281635301752, + "grad_norm": 0.594515068595814, + "learning_rate": 3.880214364165817e-06, + "loss": 0.5487, + "step": 5872 + }, + { + "epoch": 0.9527903958468527, + "grad_norm": 0.5695496364334035, + "learning_rate": 3.879858289744495e-06, + "loss": 0.5402, + "step": 5873 + }, + { + "epoch": 0.9529526281635302, + "grad_norm": 0.648057808265531, + "learning_rate": 3.879502175062937e-06, + "loss": 0.5515, + "step": 5874 + }, + { + "epoch": 0.9531148604802077, + "grad_norm": 0.548548788935216, + "learning_rate": 3.8791460201315335e-06, + "loss": 0.5423, + "step": 5875 + }, + { + "epoch": 0.9532770927968851, + "grad_norm": 0.607187839536779, + "learning_rate": 3.878789824960677e-06, + "loss": 0.5586, + "step": 5876 + }, + { + "epoch": 0.9534393251135627, + "grad_norm": 0.6242866454930339, + "learning_rate": 3.878433589560759e-06, + "loss": 0.5631, + "step": 5877 + }, + { + "epoch": 0.9536015574302401, + "grad_norm": 0.5746467179687922, + "learning_rate": 3.878077313942174e-06, + "loss": 0.5244, + "step": 5878 + }, + { + "epoch": 0.9537637897469176, + "grad_norm": 0.5655318282925239, + "learning_rate": 3.877720998115316e-06, + "loss": 0.548, + "step": 5879 + }, + { + "epoch": 0.9539260220635951, + "grad_norm": 0.5741797241016334, + "learning_rate": 3.877364642090583e-06, + "loss": 0.4912, + "step": 5880 + }, + { + "epoch": 0.9540882543802726, + "grad_norm": 0.5653449081128539, + "learning_rate": 3.877008245878371e-06, + "loss": 0.5292, + "step": 5881 + }, + { + "epoch": 0.95425048669695, + "grad_norm": 0.5903898803045223, + "learning_rate": 3.876651809489079e-06, + "loss": 0.5899, + "step": 5882 + }, + { + "epoch": 0.9544127190136276, + "grad_norm": 0.5840328562318815, + "learning_rate": 3.876295332933107e-06, + "loss": 0.5569, + "step": 5883 + }, + { + "epoch": 0.954574951330305, + "grad_norm": 0.5966195852049712, + "learning_rate": 3.875938816220855e-06, + "loss": 0.5271, + "step": 5884 + }, + { + "epoch": 0.9547371836469825, + "grad_norm": 0.5902316774056988, + "learning_rate": 3.875582259362728e-06, + "loss": 0.5572, + "step": 5885 + }, + { + "epoch": 0.95489941596366, + "grad_norm": 0.599889057278639, + "learning_rate": 3.875225662369125e-06, + "loss": 0.5835, + "step": 5886 + }, + { + "epoch": 0.9550616482803375, + "grad_norm": 0.6063009308209374, + "learning_rate": 3.874869025250454e-06, + "loss": 0.5284, + "step": 5887 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 0.6098172502685675, + "learning_rate": 3.87451234801712e-06, + "loss": 0.5428, + "step": 5888 + }, + { + "epoch": 0.9553861129136924, + "grad_norm": 0.6133559641145275, + "learning_rate": 3.8741556306795266e-06, + "loss": 0.5427, + "step": 5889 + }, + { + "epoch": 0.9555483452303699, + "grad_norm": 0.5863295671610643, + "learning_rate": 3.873798873248087e-06, + "loss": 0.5676, + "step": 5890 + }, + { + "epoch": 0.9557105775470474, + "grad_norm": 0.5656390276550044, + "learning_rate": 3.873442075733206e-06, + "loss": 0.5458, + "step": 5891 + }, + { + "epoch": 0.9558728098637248, + "grad_norm": 0.5412183941978496, + "learning_rate": 3.873085238145295e-06, + "loss": 0.5136, + "step": 5892 + }, + { + "epoch": 0.9560350421804024, + "grad_norm": 0.5822856388293696, + "learning_rate": 3.872728360494768e-06, + "loss": 0.5659, + "step": 5893 + }, + { + "epoch": 0.9561972744970798, + "grad_norm": 0.5923742638740433, + "learning_rate": 3.872371442792034e-06, + "loss": 0.5338, + "step": 5894 + }, + { + "epoch": 0.9563595068137573, + "grad_norm": 0.5746559399800434, + "learning_rate": 3.8720144850475095e-06, + "loss": 0.5426, + "step": 5895 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.5956625432861878, + "learning_rate": 3.871657487271608e-06, + "loss": 0.5431, + "step": 5896 + }, + { + "epoch": 0.9566839714471123, + "grad_norm": 0.589954714286503, + "learning_rate": 3.871300449474746e-06, + "loss": 0.576, + "step": 5897 + }, + { + "epoch": 0.9568462037637897, + "grad_norm": 0.5907114551886575, + "learning_rate": 3.870943371667341e-06, + "loss": 0.5343, + "step": 5898 + }, + { + "epoch": 0.9570084360804673, + "grad_norm": 0.5785664440977836, + "learning_rate": 3.870586253859812e-06, + "loss": 0.5466, + "step": 5899 + }, + { + "epoch": 0.9571706683971447, + "grad_norm": 0.5678668665880838, + "learning_rate": 3.870229096062577e-06, + "loss": 0.5326, + "step": 5900 + }, + { + "epoch": 0.9573329007138222, + "grad_norm": 0.5661194472048968, + "learning_rate": 3.869871898286059e-06, + "loss": 0.5228, + "step": 5901 + }, + { + "epoch": 0.9574951330304997, + "grad_norm": 0.5594250931955785, + "learning_rate": 3.8695146605406774e-06, + "loss": 0.5493, + "step": 5902 + }, + { + "epoch": 0.9576573653471772, + "grad_norm": 0.5693756697246655, + "learning_rate": 3.8691573828368575e-06, + "loss": 0.5743, + "step": 5903 + }, + { + "epoch": 0.9578195976638546, + "grad_norm": 0.6021154902265012, + "learning_rate": 3.868800065185024e-06, + "loss": 0.5615, + "step": 5904 + }, + { + "epoch": 0.9579818299805322, + "grad_norm": 0.606640238719913, + "learning_rate": 3.868442707595601e-06, + "loss": 0.5989, + "step": 5905 + }, + { + "epoch": 0.9581440622972096, + "grad_norm": 0.5721664240354664, + "learning_rate": 3.868085310079014e-06, + "loss": 0.5089, + "step": 5906 + }, + { + "epoch": 0.958306294613887, + "grad_norm": 0.5961342873148439, + "learning_rate": 3.867727872645694e-06, + "loss": 0.5575, + "step": 5907 + }, + { + "epoch": 0.9584685269305646, + "grad_norm": 0.5874483370440082, + "learning_rate": 3.8673703953060685e-06, + "loss": 0.5071, + "step": 5908 + }, + { + "epoch": 0.9586307592472421, + "grad_norm": 0.5735263621605953, + "learning_rate": 3.867012878070565e-06, + "loss": 0.5688, + "step": 5909 + }, + { + "epoch": 0.9587929915639195, + "grad_norm": 0.5536133667688411, + "learning_rate": 3.866655320949619e-06, + "loss": 0.5248, + "step": 5910 + }, + { + "epoch": 0.9589552238805971, + "grad_norm": 0.5823097470650714, + "learning_rate": 3.866297723953662e-06, + "loss": 0.5418, + "step": 5911 + }, + { + "epoch": 0.9591174561972745, + "grad_norm": 0.6090146827151386, + "learning_rate": 3.865940087093125e-06, + "loss": 0.5215, + "step": 5912 + }, + { + "epoch": 0.959279688513952, + "grad_norm": 0.5444114044785895, + "learning_rate": 3.865582410378446e-06, + "loss": 0.5008, + "step": 5913 + }, + { + "epoch": 0.9594419208306295, + "grad_norm": 0.5721131044543903, + "learning_rate": 3.865224693820059e-06, + "loss": 0.5091, + "step": 5914 + }, + { + "epoch": 0.959604153147307, + "grad_norm": 0.598802575842759, + "learning_rate": 3.864866937428402e-06, + "loss": 0.5395, + "step": 5915 + }, + { + "epoch": 0.9597663854639844, + "grad_norm": 0.5823342869443909, + "learning_rate": 3.864509141213913e-06, + "loss": 0.5307, + "step": 5916 + }, + { + "epoch": 0.9599286177806619, + "grad_norm": 0.5714505678152598, + "learning_rate": 3.864151305187032e-06, + "loss": 0.5191, + "step": 5917 + }, + { + "epoch": 0.9600908500973394, + "grad_norm": 0.5593192664201602, + "learning_rate": 3.863793429358198e-06, + "loss": 0.5283, + "step": 5918 + }, + { + "epoch": 0.9602530824140169, + "grad_norm": 0.5571374591724987, + "learning_rate": 3.863435513737854e-06, + "loss": 0.5353, + "step": 5919 + }, + { + "epoch": 0.9604153147306943, + "grad_norm": 0.5583994912372515, + "learning_rate": 3.863077558336444e-06, + "loss": 0.531, + "step": 5920 + }, + { + "epoch": 0.9605775470473719, + "grad_norm": 0.6234883823696258, + "learning_rate": 3.86271956316441e-06, + "loss": 0.5655, + "step": 5921 + }, + { + "epoch": 0.9607397793640493, + "grad_norm": 0.6252083657903543, + "learning_rate": 3.862361528232198e-06, + "loss": 0.5752, + "step": 5922 + }, + { + "epoch": 0.9609020116807268, + "grad_norm": 0.5600788519013405, + "learning_rate": 3.862003453550255e-06, + "loss": 0.5173, + "step": 5923 + }, + { + "epoch": 0.9610642439974043, + "grad_norm": 0.590325522364652, + "learning_rate": 3.8616453391290275e-06, + "loss": 0.5677, + "step": 5924 + }, + { + "epoch": 0.9612264763140818, + "grad_norm": 0.5735886530786681, + "learning_rate": 3.861287184978965e-06, + "loss": 0.5252, + "step": 5925 + }, + { + "epoch": 0.9613887086307592, + "grad_norm": 0.5460983442823911, + "learning_rate": 3.860928991110517e-06, + "loss": 0.5648, + "step": 5926 + }, + { + "epoch": 0.9615509409474368, + "grad_norm": 0.5655308816921484, + "learning_rate": 3.860570757534135e-06, + "loss": 0.5402, + "step": 5927 + }, + { + "epoch": 0.9617131732641142, + "grad_norm": 0.6132035857592495, + "learning_rate": 3.860212484260272e-06, + "loss": 0.5197, + "step": 5928 + }, + { + "epoch": 0.9618754055807917, + "grad_norm": 0.5704917177860879, + "learning_rate": 3.8598541712993795e-06, + "loss": 0.5784, + "step": 5929 + }, + { + "epoch": 0.9620376378974692, + "grad_norm": 0.6028444110903229, + "learning_rate": 3.859495818661914e-06, + "loss": 0.5282, + "step": 5930 + }, + { + "epoch": 0.9621998702141467, + "grad_norm": 0.5746643008113501, + "learning_rate": 3.859137426358328e-06, + "loss": 0.5375, + "step": 5931 + }, + { + "epoch": 0.9623621025308241, + "grad_norm": 0.6218828306916278, + "learning_rate": 3.858778994399081e-06, + "loss": 0.5627, + "step": 5932 + }, + { + "epoch": 0.9625243348475017, + "grad_norm": 0.5693874601018989, + "learning_rate": 3.858420522794631e-06, + "loss": 0.5444, + "step": 5933 + }, + { + "epoch": 0.9626865671641791, + "grad_norm": 0.5448611554193582, + "learning_rate": 3.8580620115554375e-06, + "loss": 0.5513, + "step": 5934 + }, + { + "epoch": 0.9628487994808566, + "grad_norm": 0.5935543546408398, + "learning_rate": 3.857703460691959e-06, + "loss": 0.5531, + "step": 5935 + }, + { + "epoch": 0.9630110317975341, + "grad_norm": 0.5558851083208552, + "learning_rate": 3.857344870214658e-06, + "loss": 0.5363, + "step": 5936 + }, + { + "epoch": 0.9631732641142116, + "grad_norm": 0.5676463218609278, + "learning_rate": 3.856986240133997e-06, + "loss": 0.5324, + "step": 5937 + }, + { + "epoch": 0.963335496430889, + "grad_norm": 0.5803003467341324, + "learning_rate": 3.85662757046044e-06, + "loss": 0.546, + "step": 5938 + }, + { + "epoch": 0.9634977287475666, + "grad_norm": 0.5615906058760344, + "learning_rate": 3.856268861204451e-06, + "loss": 0.5227, + "step": 5939 + }, + { + "epoch": 0.963659961064244, + "grad_norm": 0.577354748367859, + "learning_rate": 3.855910112376496e-06, + "loss": 0.538, + "step": 5940 + }, + { + "epoch": 0.9638221933809215, + "grad_norm": 0.6174728810823461, + "learning_rate": 3.855551323987045e-06, + "loss": 0.5326, + "step": 5941 + }, + { + "epoch": 0.963984425697599, + "grad_norm": 0.568796770562521, + "learning_rate": 3.855192496046564e-06, + "loss": 0.5228, + "step": 5942 + }, + { + "epoch": 0.9641466580142765, + "grad_norm": 0.5873267728462452, + "learning_rate": 3.854833628565522e-06, + "loss": 0.5464, + "step": 5943 + }, + { + "epoch": 0.9643088903309539, + "grad_norm": 0.6087769028323616, + "learning_rate": 3.854474721554391e-06, + "loss": 0.5504, + "step": 5944 + }, + { + "epoch": 0.9644711226476315, + "grad_norm": 0.5772134487884283, + "learning_rate": 3.854115775023643e-06, + "loss": 0.5598, + "step": 5945 + }, + { + "epoch": 0.9646333549643089, + "grad_norm": 0.5573735988114029, + "learning_rate": 3.85375678898375e-06, + "loss": 0.5758, + "step": 5946 + }, + { + "epoch": 0.9647955872809864, + "grad_norm": 0.584645460610426, + "learning_rate": 3.853397763445187e-06, + "loss": 0.5366, + "step": 5947 + }, + { + "epoch": 0.9649578195976638, + "grad_norm": 0.568785750399962, + "learning_rate": 3.853038698418429e-06, + "loss": 0.5536, + "step": 5948 + }, + { + "epoch": 0.9651200519143414, + "grad_norm": 0.5774076075749451, + "learning_rate": 3.852679593913952e-06, + "loss": 0.5584, + "step": 5949 + }, + { + "epoch": 0.9652822842310188, + "grad_norm": 0.5790348838525458, + "learning_rate": 3.852320449942235e-06, + "loss": 0.5402, + "step": 5950 + }, + { + "epoch": 0.9654445165476963, + "grad_norm": 0.6000554739442798, + "learning_rate": 3.851961266513756e-06, + "loss": 0.5293, + "step": 5951 + }, + { + "epoch": 0.9656067488643738, + "grad_norm": 0.5907105761205851, + "learning_rate": 3.8516020436389945e-06, + "loss": 0.555, + "step": 5952 + }, + { + "epoch": 0.9657689811810513, + "grad_norm": 0.5950613920674301, + "learning_rate": 3.851242781328432e-06, + "loss": 0.5554, + "step": 5953 + }, + { + "epoch": 0.9659312134977287, + "grad_norm": 0.6113843835457845, + "learning_rate": 3.85088347959255e-06, + "loss": 0.55, + "step": 5954 + }, + { + "epoch": 0.9660934458144063, + "grad_norm": 0.561537398794176, + "learning_rate": 3.850524138441833e-06, + "loss": 0.5602, + "step": 5955 + }, + { + "epoch": 0.9662556781310837, + "grad_norm": 0.5645605308441197, + "learning_rate": 3.850164757886765e-06, + "loss": 0.5595, + "step": 5956 + }, + { + "epoch": 0.9664179104477612, + "grad_norm": 0.5801623200513427, + "learning_rate": 3.849805337937832e-06, + "loss": 0.5107, + "step": 5957 + }, + { + "epoch": 0.9665801427644387, + "grad_norm": 0.6036270905437214, + "learning_rate": 3.84944587860552e-06, + "loss": 0.5329, + "step": 5958 + }, + { + "epoch": 0.9667423750811162, + "grad_norm": 0.5939237077609629, + "learning_rate": 3.849086379900317e-06, + "loss": 0.5512, + "step": 5959 + }, + { + "epoch": 0.9669046073977936, + "grad_norm": 0.5895324810083458, + "learning_rate": 3.848726841832713e-06, + "loss": 0.5278, + "step": 5960 + }, + { + "epoch": 0.9670668397144712, + "grad_norm": 0.5794682459525752, + "learning_rate": 3.848367264413199e-06, + "loss": 0.5431, + "step": 5961 + }, + { + "epoch": 0.9672290720311486, + "grad_norm": 0.6043425926267273, + "learning_rate": 3.848007647652264e-06, + "loss": 0.5215, + "step": 5962 + }, + { + "epoch": 0.967391304347826, + "grad_norm": 0.5660768287511043, + "learning_rate": 3.8476479915604024e-06, + "loss": 0.5574, + "step": 5963 + }, + { + "epoch": 0.9675535366645036, + "grad_norm": 0.5702350966878093, + "learning_rate": 3.847288296148107e-06, + "loss": 0.5035, + "step": 5964 + }, + { + "epoch": 0.9677157689811811, + "grad_norm": 0.5771474782584353, + "learning_rate": 3.846928561425873e-06, + "loss": 0.5552, + "step": 5965 + }, + { + "epoch": 0.9678780012978585, + "grad_norm": 0.5843956288856773, + "learning_rate": 3.8465687874041975e-06, + "loss": 0.5838, + "step": 5966 + }, + { + "epoch": 0.9680402336145361, + "grad_norm": 0.5774486619835879, + "learning_rate": 3.846208974093576e-06, + "loss": 0.5584, + "step": 5967 + }, + { + "epoch": 0.9682024659312135, + "grad_norm": 0.5879509767080717, + "learning_rate": 3.8458491215045075e-06, + "loss": 0.5611, + "step": 5968 + }, + { + "epoch": 0.968364698247891, + "grad_norm": 0.5777260391232004, + "learning_rate": 3.845489229647491e-06, + "loss": 0.5471, + "step": 5969 + }, + { + "epoch": 0.9685269305645685, + "grad_norm": 0.5856040293590833, + "learning_rate": 3.845129298533028e-06, + "loss": 0.5425, + "step": 5970 + }, + { + "epoch": 0.968689162881246, + "grad_norm": 0.6039447195047014, + "learning_rate": 3.84476932817162e-06, + "loss": 0.5631, + "step": 5971 + }, + { + "epoch": 0.9688513951979234, + "grad_norm": 0.5717101565553967, + "learning_rate": 3.844409318573768e-06, + "loss": 0.5419, + "step": 5972 + }, + { + "epoch": 0.969013627514601, + "grad_norm": 0.58092306013239, + "learning_rate": 3.844049269749979e-06, + "loss": 0.5545, + "step": 5973 + }, + { + "epoch": 0.9691758598312784, + "grad_norm": 0.5679297363801533, + "learning_rate": 3.843689181710756e-06, + "loss": 0.5409, + "step": 5974 + }, + { + "epoch": 0.9693380921479559, + "grad_norm": 0.5852230282740949, + "learning_rate": 3.8433290544666066e-06, + "loss": 0.528, + "step": 5975 + }, + { + "epoch": 0.9695003244646333, + "grad_norm": 0.5658106286265987, + "learning_rate": 3.842968888028038e-06, + "loss": 0.5703, + "step": 5976 + }, + { + "epoch": 0.9696625567813109, + "grad_norm": 0.5670512278955256, + "learning_rate": 3.842608682405558e-06, + "loss": 0.5613, + "step": 5977 + }, + { + "epoch": 0.9698247890979883, + "grad_norm": 0.5793800983020533, + "learning_rate": 3.842248437609677e-06, + "loss": 0.5407, + "step": 5978 + }, + { + "epoch": 0.9699870214146658, + "grad_norm": 0.6339969366390342, + "learning_rate": 3.841888153650906e-06, + "loss": 0.5604, + "step": 5979 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.5595947086543875, + "learning_rate": 3.8415278305397564e-06, + "loss": 0.5234, + "step": 5980 + }, + { + "epoch": 0.9703114860480208, + "grad_norm": 0.5694707573747567, + "learning_rate": 3.8411674682867426e-06, + "loss": 0.524, + "step": 5981 + }, + { + "epoch": 0.9704737183646982, + "grad_norm": 0.5723571387071476, + "learning_rate": 3.840807066902378e-06, + "loss": 0.5035, + "step": 5982 + }, + { + "epoch": 0.9706359506813758, + "grad_norm": 0.5911525456380801, + "learning_rate": 3.840446626397177e-06, + "loss": 0.5494, + "step": 5983 + }, + { + "epoch": 0.9707981829980532, + "grad_norm": 0.5975205184463259, + "learning_rate": 3.840086146781658e-06, + "loss": 0.5805, + "step": 5984 + }, + { + "epoch": 0.9709604153147307, + "grad_norm": 0.5827838912172215, + "learning_rate": 3.839725628066339e-06, + "loss": 0.551, + "step": 5985 + }, + { + "epoch": 0.9711226476314082, + "grad_norm": 0.5895728672674023, + "learning_rate": 3.839365070261738e-06, + "loss": 0.5416, + "step": 5986 + }, + { + "epoch": 0.9712848799480857, + "grad_norm": 0.5943741794976032, + "learning_rate": 3.839004473378374e-06, + "loss": 0.5239, + "step": 5987 + }, + { + "epoch": 0.9714471122647631, + "grad_norm": 0.5979859703555487, + "learning_rate": 3.8386438374267696e-06, + "loss": 0.5567, + "step": 5988 + }, + { + "epoch": 0.9716093445814407, + "grad_norm": 0.5517407387362742, + "learning_rate": 3.8382831624174465e-06, + "loss": 0.4908, + "step": 5989 + }, + { + "epoch": 0.9717715768981181, + "grad_norm": 0.5855189510052504, + "learning_rate": 3.837922448360929e-06, + "loss": 0.5399, + "step": 5990 + }, + { + "epoch": 0.9719338092147956, + "grad_norm": 0.5523078724122231, + "learning_rate": 3.837561695267741e-06, + "loss": 0.5669, + "step": 5991 + }, + { + "epoch": 0.9720960415314731, + "grad_norm": 0.5741848600446717, + "learning_rate": 3.8372009031484085e-06, + "loss": 0.5304, + "step": 5992 + }, + { + "epoch": 0.9722582738481506, + "grad_norm": 0.5600929542889919, + "learning_rate": 3.836840072013458e-06, + "loss": 0.5772, + "step": 5993 + }, + { + "epoch": 0.972420506164828, + "grad_norm": 0.5710150881417376, + "learning_rate": 3.836479201873416e-06, + "loss": 0.5579, + "step": 5994 + }, + { + "epoch": 0.9725827384815056, + "grad_norm": 0.5492337488116876, + "learning_rate": 3.8361182927388155e-06, + "loss": 0.5423, + "step": 5995 + }, + { + "epoch": 0.972744970798183, + "grad_norm": 0.592418887508651, + "learning_rate": 3.835757344620183e-06, + "loss": 0.5111, + "step": 5996 + }, + { + "epoch": 0.9729072031148605, + "grad_norm": 0.5747933959389278, + "learning_rate": 3.835396357528051e-06, + "loss": 0.5572, + "step": 5997 + }, + { + "epoch": 0.973069435431538, + "grad_norm": 0.5535950991019705, + "learning_rate": 3.835035331472955e-06, + "loss": 0.503, + "step": 5998 + }, + { + "epoch": 0.9732316677482155, + "grad_norm": 0.5957426812006622, + "learning_rate": 3.834674266465425e-06, + "loss": 0.5822, + "step": 5999 + }, + { + "epoch": 0.9733939000648929, + "grad_norm": 0.5857834885166288, + "learning_rate": 3.834313162515996e-06, + "loss": 0.5851, + "step": 6000 + }, + { + "epoch": 0.9735561323815705, + "grad_norm": 0.5718736691525177, + "learning_rate": 3.833952019635205e-06, + "loss": 0.5382, + "step": 6001 + }, + { + "epoch": 0.9737183646982479, + "grad_norm": 0.5689570913587383, + "learning_rate": 3.83359083783359e-06, + "loss": 0.5524, + "step": 6002 + }, + { + "epoch": 0.9738805970149254, + "grad_norm": 0.5788797654996086, + "learning_rate": 3.833229617121688e-06, + "loss": 0.5501, + "step": 6003 + }, + { + "epoch": 0.9740428293316029, + "grad_norm": 0.5777223353679123, + "learning_rate": 3.832868357510039e-06, + "loss": 0.5309, + "step": 6004 + }, + { + "epoch": 0.9742050616482804, + "grad_norm": 0.5779578952251281, + "learning_rate": 3.832507059009183e-06, + "loss": 0.5698, + "step": 6005 + }, + { + "epoch": 0.9743672939649578, + "grad_norm": 0.5814617000200619, + "learning_rate": 3.832145721629662e-06, + "loss": 0.5275, + "step": 6006 + }, + { + "epoch": 0.9745295262816352, + "grad_norm": 0.5672392326862858, + "learning_rate": 3.8317843453820175e-06, + "loss": 0.5052, + "step": 6007 + }, + { + "epoch": 0.9746917585983128, + "grad_norm": 0.5988104286924465, + "learning_rate": 3.831422930276795e-06, + "loss": 0.5525, + "step": 6008 + }, + { + "epoch": 0.9748539909149903, + "grad_norm": 0.5634990919880644, + "learning_rate": 3.83106147632454e-06, + "loss": 0.5464, + "step": 6009 + }, + { + "epoch": 0.9750162232316677, + "grad_norm": 0.5694445723445073, + "learning_rate": 3.830699983535796e-06, + "loss": 0.5314, + "step": 6010 + }, + { + "epoch": 0.9751784555483453, + "grad_norm": 0.6094397688647037, + "learning_rate": 3.830338451921114e-06, + "loss": 0.5734, + "step": 6011 + }, + { + "epoch": 0.9753406878650227, + "grad_norm": 0.566993017535106, + "learning_rate": 3.829976881491038e-06, + "loss": 0.5486, + "step": 6012 + }, + { + "epoch": 0.9755029201817002, + "grad_norm": 0.5896763784580604, + "learning_rate": 3.829615272256122e-06, + "loss": 0.5659, + "step": 6013 + }, + { + "epoch": 0.9756651524983777, + "grad_norm": 0.5863604176733793, + "learning_rate": 3.829253624226914e-06, + "loss": 0.5535, + "step": 6014 + }, + { + "epoch": 0.9758273848150552, + "grad_norm": 0.6173846820567984, + "learning_rate": 3.828891937413967e-06, + "loss": 0.5509, + "step": 6015 + }, + { + "epoch": 0.9759896171317326, + "grad_norm": 0.5607661224639967, + "learning_rate": 3.828530211827832e-06, + "loss": 0.5304, + "step": 6016 + }, + { + "epoch": 0.9761518494484102, + "grad_norm": 0.590797213997201, + "learning_rate": 3.828168447479065e-06, + "loss": 0.5261, + "step": 6017 + }, + { + "epoch": 0.9763140817650876, + "grad_norm": 0.6052858427242225, + "learning_rate": 3.827806644378221e-06, + "loss": 0.554, + "step": 6018 + }, + { + "epoch": 0.976476314081765, + "grad_norm": 0.6148401145488636, + "learning_rate": 3.827444802535856e-06, + "loss": 0.5645, + "step": 6019 + }, + { + "epoch": 0.9766385463984426, + "grad_norm": 0.5844360830982829, + "learning_rate": 3.82708292196253e-06, + "loss": 0.5686, + "step": 6020 + }, + { + "epoch": 0.97680077871512, + "grad_norm": 0.5493545536222271, + "learning_rate": 3.826721002668796e-06, + "loss": 0.5547, + "step": 6021 + }, + { + "epoch": 0.9769630110317975, + "grad_norm": 0.5701087617465687, + "learning_rate": 3.826359044665219e-06, + "loss": 0.5155, + "step": 6022 + }, + { + "epoch": 0.9771252433484751, + "grad_norm": 0.5661993797836163, + "learning_rate": 3.825997047962358e-06, + "loss": 0.5536, + "step": 6023 + }, + { + "epoch": 0.9772874756651525, + "grad_norm": 0.5520829556304423, + "learning_rate": 3.825635012570774e-06, + "loss": 0.5563, + "step": 6024 + }, + { + "epoch": 0.97744970798183, + "grad_norm": 0.5748277721307946, + "learning_rate": 3.8252729385010305e-06, + "loss": 0.5477, + "step": 6025 + }, + { + "epoch": 0.9776119402985075, + "grad_norm": 0.5819620822463669, + "learning_rate": 3.824910825763693e-06, + "loss": 0.5431, + "step": 6026 + }, + { + "epoch": 0.977774172615185, + "grad_norm": 0.6053837336076568, + "learning_rate": 3.824548674369326e-06, + "loss": 0.5217, + "step": 6027 + }, + { + "epoch": 0.9779364049318624, + "grad_norm": 0.5573882539425019, + "learning_rate": 3.8241864843284974e-06, + "loss": 0.5235, + "step": 6028 + }, + { + "epoch": 0.97809863724854, + "grad_norm": 0.5767050218525165, + "learning_rate": 3.8238242556517725e-06, + "loss": 0.5229, + "step": 6029 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 0.581658513585614, + "learning_rate": 3.823461988349721e-06, + "loss": 0.5796, + "step": 6030 + }, + { + "epoch": 0.9784231018818949, + "grad_norm": 0.5790504703527071, + "learning_rate": 3.823099682432914e-06, + "loss": 0.5247, + "step": 6031 + }, + { + "epoch": 0.9785853341985724, + "grad_norm": 0.6079575523501113, + "learning_rate": 3.82273733791192e-06, + "loss": 0.5376, + "step": 6032 + }, + { + "epoch": 0.9787475665152499, + "grad_norm": 0.5708149403205279, + "learning_rate": 3.822374954797315e-06, + "loss": 0.5375, + "step": 6033 + }, + { + "epoch": 0.9789097988319273, + "grad_norm": 0.5804355757973317, + "learning_rate": 3.822012533099668e-06, + "loss": 0.5653, + "step": 6034 + }, + { + "epoch": 0.9790720311486047, + "grad_norm": 0.5679296249830647, + "learning_rate": 3.821650072829557e-06, + "loss": 0.5508, + "step": 6035 + }, + { + "epoch": 0.9792342634652823, + "grad_norm": 0.5890791313802285, + "learning_rate": 3.821287573997556e-06, + "loss": 0.566, + "step": 6036 + }, + { + "epoch": 0.9793964957819598, + "grad_norm": 0.5815208735709835, + "learning_rate": 3.820925036614239e-06, + "loss": 0.5478, + "step": 6037 + }, + { + "epoch": 0.9795587280986372, + "grad_norm": 0.582078959607882, + "learning_rate": 3.820562460690189e-06, + "loss": 0.5114, + "step": 6038 + }, + { + "epoch": 0.9797209604153148, + "grad_norm": 0.5862513769162802, + "learning_rate": 3.820199846235982e-06, + "loss": 0.5717, + "step": 6039 + }, + { + "epoch": 0.9798831927319922, + "grad_norm": 0.5891207860733367, + "learning_rate": 3.819837193262197e-06, + "loss": 0.5768, + "step": 6040 + }, + { + "epoch": 0.9800454250486696, + "grad_norm": 0.5922422676261536, + "learning_rate": 3.819474501779417e-06, + "loss": 0.5724, + "step": 6041 + }, + { + "epoch": 0.9802076573653472, + "grad_norm": 0.5914514907002679, + "learning_rate": 3.819111771798224e-06, + "loss": 0.5228, + "step": 6042 + }, + { + "epoch": 0.9803698896820247, + "grad_norm": 0.6246477547217323, + "learning_rate": 3.818749003329201e-06, + "loss": 0.5835, + "step": 6043 + }, + { + "epoch": 0.9805321219987021, + "grad_norm": 0.5724685182087388, + "learning_rate": 3.8183861963829335e-06, + "loss": 0.5031, + "step": 6044 + }, + { + "epoch": 0.9806943543153797, + "grad_norm": 0.5760475528679759, + "learning_rate": 3.818023350970005e-06, + "loss": 0.5547, + "step": 6045 + }, + { + "epoch": 0.9808565866320571, + "grad_norm": 0.5945774213003511, + "learning_rate": 3.817660467101005e-06, + "loss": 0.5507, + "step": 6046 + }, + { + "epoch": 0.9810188189487346, + "grad_norm": 0.5580338794356648, + "learning_rate": 3.817297544786519e-06, + "loss": 0.5295, + "step": 6047 + }, + { + "epoch": 0.9811810512654121, + "grad_norm": 0.589166951856517, + "learning_rate": 3.816934584037137e-06, + "loss": 0.573, + "step": 6048 + }, + { + "epoch": 0.9813432835820896, + "grad_norm": 0.5939720653166491, + "learning_rate": 3.8165715848634496e-06, + "loss": 0.5789, + "step": 6049 + }, + { + "epoch": 0.981505515898767, + "grad_norm": 0.5667138684765494, + "learning_rate": 3.816208547276048e-06, + "loss": 0.5504, + "step": 6050 + }, + { + "epoch": 0.9816677482154446, + "grad_norm": 0.5975555222702994, + "learning_rate": 3.815845471285523e-06, + "loss": 0.5738, + "step": 6051 + }, + { + "epoch": 0.981829980532122, + "grad_norm": 0.5925835850213922, + "learning_rate": 3.8154823569024705e-06, + "loss": 0.5508, + "step": 6052 + }, + { + "epoch": 0.9819922128487995, + "grad_norm": 0.5644433085200812, + "learning_rate": 3.815119204137483e-06, + "loss": 0.582, + "step": 6053 + }, + { + "epoch": 0.982154445165477, + "grad_norm": 0.5775807455186684, + "learning_rate": 3.814756013001158e-06, + "loss": 0.5561, + "step": 6054 + }, + { + "epoch": 0.9823166774821545, + "grad_norm": 0.604759109676717, + "learning_rate": 3.81439278350409e-06, + "loss": 0.5386, + "step": 6055 + }, + { + "epoch": 0.9824789097988319, + "grad_norm": 0.5768896545595945, + "learning_rate": 3.81402951565688e-06, + "loss": 0.5036, + "step": 6056 + }, + { + "epoch": 0.9826411421155095, + "grad_norm": 0.5684809515364979, + "learning_rate": 3.813666209470124e-06, + "loss": 0.5258, + "step": 6057 + }, + { + "epoch": 0.9828033744321869, + "grad_norm": 0.5761776182592868, + "learning_rate": 3.813302864954425e-06, + "loss": 0.5614, + "step": 6058 + }, + { + "epoch": 0.9829656067488644, + "grad_norm": 0.6138147489909416, + "learning_rate": 3.812939482120383e-06, + "loss": 0.5321, + "step": 6059 + }, + { + "epoch": 0.9831278390655419, + "grad_norm": 0.5785878062182729, + "learning_rate": 3.8125760609786e-06, + "loss": 0.56, + "step": 6060 + }, + { + "epoch": 0.9832900713822194, + "grad_norm": 0.5937505979009049, + "learning_rate": 3.812212601539681e-06, + "loss": 0.5283, + "step": 6061 + }, + { + "epoch": 0.9834523036988968, + "grad_norm": 0.608811923923077, + "learning_rate": 3.811849103814229e-06, + "loss": 0.5384, + "step": 6062 + }, + { + "epoch": 0.9836145360155742, + "grad_norm": 0.5784061688699738, + "learning_rate": 3.81148556781285e-06, + "loss": 0.5276, + "step": 6063 + }, + { + "epoch": 0.9837767683322518, + "grad_norm": 0.61899910760352, + "learning_rate": 3.811121993546153e-06, + "loss": 0.5844, + "step": 6064 + }, + { + "epoch": 0.9839390006489293, + "grad_norm": 0.5488869426108708, + "learning_rate": 3.810758381024744e-06, + "loss": 0.539, + "step": 6065 + }, + { + "epoch": 0.9841012329656067, + "grad_norm": 0.5804295433846797, + "learning_rate": 3.8103947302592324e-06, + "loss": 0.5746, + "step": 6066 + }, + { + "epoch": 0.9842634652822843, + "grad_norm": 0.5553077289282228, + "learning_rate": 3.810031041260229e-06, + "loss": 0.5505, + "step": 6067 + }, + { + "epoch": 0.9844256975989617, + "grad_norm": 0.5744253266302443, + "learning_rate": 3.809667314038345e-06, + "loss": 0.5452, + "step": 6068 + }, + { + "epoch": 0.9845879299156391, + "grad_norm": 0.6186010371894006, + "learning_rate": 3.8093035486041935e-06, + "loss": 0.5557, + "step": 6069 + }, + { + "epoch": 0.9847501622323167, + "grad_norm": 0.5733073061358229, + "learning_rate": 3.8089397449683863e-06, + "loss": 0.5719, + "step": 6070 + }, + { + "epoch": 0.9849123945489942, + "grad_norm": 0.5858280040246754, + "learning_rate": 3.8085759031415403e-06, + "loss": 0.5371, + "step": 6071 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 0.5865584527834167, + "learning_rate": 3.8082120231342695e-06, + "loss": 0.5197, + "step": 6072 + }, + { + "epoch": 0.9852368591823492, + "grad_norm": 0.6160503597361996, + "learning_rate": 3.8078481049571915e-06, + "loss": 0.553, + "step": 6073 + }, + { + "epoch": 0.9853990914990266, + "grad_norm": 0.5622452825761797, + "learning_rate": 3.8074841486209256e-06, + "loss": 0.5575, + "step": 6074 + }, + { + "epoch": 0.985561323815704, + "grad_norm": 0.5795317231729884, + "learning_rate": 3.807120154136089e-06, + "loss": 0.5495, + "step": 6075 + }, + { + "epoch": 0.9857235561323816, + "grad_norm": 0.6326574507850653, + "learning_rate": 3.806756121513304e-06, + "loss": 0.5642, + "step": 6076 + }, + { + "epoch": 0.985885788449059, + "grad_norm": 0.6048584517964017, + "learning_rate": 3.80639205076319e-06, + "loss": 0.5683, + "step": 6077 + }, + { + "epoch": 0.9860480207657365, + "grad_norm": 0.5644463028676973, + "learning_rate": 3.8060279418963714e-06, + "loss": 0.5346, + "step": 6078 + }, + { + "epoch": 0.9862102530824141, + "grad_norm": 0.5687183514480363, + "learning_rate": 3.8056637949234705e-06, + "loss": 0.5296, + "step": 6079 + }, + { + "epoch": 0.9863724853990915, + "grad_norm": 0.580311126092485, + "learning_rate": 3.805299609855112e-06, + "loss": 0.5221, + "step": 6080 + }, + { + "epoch": 0.986534717715769, + "grad_norm": 0.5627298594212498, + "learning_rate": 3.8049353867019222e-06, + "loss": 0.5509, + "step": 6081 + }, + { + "epoch": 0.9866969500324465, + "grad_norm": 0.6034538319695402, + "learning_rate": 3.8045711254745288e-06, + "loss": 0.5694, + "step": 6082 + }, + { + "epoch": 0.986859182349124, + "grad_norm": 0.5611107472350629, + "learning_rate": 3.8042068261835583e-06, + "loss": 0.5155, + "step": 6083 + }, + { + "epoch": 0.9870214146658014, + "grad_norm": 0.5865267873438239, + "learning_rate": 3.803842488839642e-06, + "loss": 0.5181, + "step": 6084 + }, + { + "epoch": 0.987183646982479, + "grad_norm": 0.5876356520341428, + "learning_rate": 3.803478113453408e-06, + "loss": 0.5173, + "step": 6085 + }, + { + "epoch": 0.9873458792991564, + "grad_norm": 0.5547083191586815, + "learning_rate": 3.8031137000354888e-06, + "loss": 0.5111, + "step": 6086 + }, + { + "epoch": 0.9875081116158339, + "grad_norm": 0.6018354096457919, + "learning_rate": 3.8027492485965174e-06, + "loss": 0.5529, + "step": 6087 + }, + { + "epoch": 0.9876703439325114, + "grad_norm": 0.5830371753533254, + "learning_rate": 3.802384759147126e-06, + "loss": 0.5686, + "step": 6088 + }, + { + "epoch": 0.9878325762491889, + "grad_norm": 0.5905949750415085, + "learning_rate": 3.8020202316979505e-06, + "loss": 0.5458, + "step": 6089 + }, + { + "epoch": 0.9879948085658663, + "grad_norm": 0.5714901147465996, + "learning_rate": 3.801655666259626e-06, + "loss": 0.5363, + "step": 6090 + }, + { + "epoch": 0.9881570408825439, + "grad_norm": 0.5942061423528209, + "learning_rate": 3.80129106284279e-06, + "loss": 0.5466, + "step": 6091 + }, + { + "epoch": 0.9883192731992213, + "grad_norm": 0.5609296558848976, + "learning_rate": 3.800926421458081e-06, + "loss": 0.5633, + "step": 6092 + }, + { + "epoch": 0.9884815055158988, + "grad_norm": 0.5947956978349009, + "learning_rate": 3.8005617421161367e-06, + "loss": 0.5128, + "step": 6093 + }, + { + "epoch": 0.9886437378325762, + "grad_norm": 0.5615832599088559, + "learning_rate": 3.800197024827599e-06, + "loss": 0.5453, + "step": 6094 + }, + { + "epoch": 0.9888059701492538, + "grad_norm": 0.6100222623378798, + "learning_rate": 3.7998322696031083e-06, + "loss": 0.5594, + "step": 6095 + }, + { + "epoch": 0.9889682024659312, + "grad_norm": 0.5771895252372439, + "learning_rate": 3.799467476453307e-06, + "loss": 0.5432, + "step": 6096 + }, + { + "epoch": 0.9891304347826086, + "grad_norm": 0.5541984827995955, + "learning_rate": 3.7991026453888397e-06, + "loss": 0.544, + "step": 6097 + }, + { + "epoch": 0.9892926670992862, + "grad_norm": 0.5616568057507089, + "learning_rate": 3.7987377764203502e-06, + "loss": 0.5709, + "step": 6098 + }, + { + "epoch": 0.9894548994159637, + "grad_norm": 0.597364063117782, + "learning_rate": 3.7983728695584845e-06, + "loss": 0.5475, + "step": 6099 + }, + { + "epoch": 0.9896171317326411, + "grad_norm": 0.6579721011443328, + "learning_rate": 3.7980079248138892e-06, + "loss": 0.5746, + "step": 6100 + }, + { + "epoch": 0.9897793640493187, + "grad_norm": 0.5967432905565834, + "learning_rate": 3.7976429421972126e-06, + "loss": 0.5447, + "step": 6101 + }, + { + "epoch": 0.9899415963659961, + "grad_norm": 0.6038650457687659, + "learning_rate": 3.7972779217191046e-06, + "loss": 0.5497, + "step": 6102 + }, + { + "epoch": 0.9901038286826735, + "grad_norm": 0.6052518189398308, + "learning_rate": 3.7969128633902147e-06, + "loss": 0.5501, + "step": 6103 + }, + { + "epoch": 0.9902660609993511, + "grad_norm": 0.5636019062866495, + "learning_rate": 3.7965477672211935e-06, + "loss": 0.5395, + "step": 6104 + }, + { + "epoch": 0.9904282933160286, + "grad_norm": 0.5915362745801563, + "learning_rate": 3.7961826332226953e-06, + "loss": 0.5465, + "step": 6105 + }, + { + "epoch": 0.990590525632706, + "grad_norm": 0.5711206745108355, + "learning_rate": 3.795817461405372e-06, + "loss": 0.5665, + "step": 6106 + }, + { + "epoch": 0.9907527579493836, + "grad_norm": 0.5603996459665752, + "learning_rate": 3.7954522517798785e-06, + "loss": 0.5184, + "step": 6107 + }, + { + "epoch": 0.990914990266061, + "grad_norm": 0.5997765790905186, + "learning_rate": 3.7950870043568717e-06, + "loss": 0.5707, + "step": 6108 + }, + { + "epoch": 0.9910772225827384, + "grad_norm": 0.5892993459602742, + "learning_rate": 3.794721719147007e-06, + "loss": 0.5395, + "step": 6109 + }, + { + "epoch": 0.991239454899416, + "grad_norm": 0.5791325960562924, + "learning_rate": 3.7943563961609433e-06, + "loss": 0.5627, + "step": 6110 + }, + { + "epoch": 0.9914016872160935, + "grad_norm": 0.5653934324747472, + "learning_rate": 3.793991035409339e-06, + "loss": 0.5275, + "step": 6111 + }, + { + "epoch": 0.9915639195327709, + "grad_norm": 0.5712461195977552, + "learning_rate": 3.793625636902855e-06, + "loss": 0.5422, + "step": 6112 + }, + { + "epoch": 0.9917261518494485, + "grad_norm": 0.5765313117045797, + "learning_rate": 3.7932602006521516e-06, + "loss": 0.5546, + "step": 6113 + }, + { + "epoch": 0.9918883841661259, + "grad_norm": 0.5862059987877984, + "learning_rate": 3.792894726667892e-06, + "loss": 0.5676, + "step": 6114 + }, + { + "epoch": 0.9920506164828033, + "grad_norm": 0.5787444570757412, + "learning_rate": 3.7925292149607395e-06, + "loss": 0.5595, + "step": 6115 + }, + { + "epoch": 0.9922128487994809, + "grad_norm": 0.6006857873534167, + "learning_rate": 3.792163665541359e-06, + "loss": 0.5399, + "step": 6116 + }, + { + "epoch": 0.9923750811161584, + "grad_norm": 0.5668585506460353, + "learning_rate": 3.7917980784204158e-06, + "loss": 0.5238, + "step": 6117 + }, + { + "epoch": 0.9925373134328358, + "grad_norm": 0.598105298097587, + "learning_rate": 3.791432453608575e-06, + "loss": 0.5312, + "step": 6118 + }, + { + "epoch": 0.9926995457495134, + "grad_norm": 0.5650662796011923, + "learning_rate": 3.791066791116507e-06, + "loss": 0.5573, + "step": 6119 + }, + { + "epoch": 0.9928617780661908, + "grad_norm": 0.5913820489222938, + "learning_rate": 3.79070109095488e-06, + "loss": 0.5116, + "step": 6120 + }, + { + "epoch": 0.9930240103828682, + "grad_norm": 0.60973834050462, + "learning_rate": 3.790335353134364e-06, + "loss": 0.5614, + "step": 6121 + }, + { + "epoch": 0.9931862426995457, + "grad_norm": 0.595740971291725, + "learning_rate": 3.789969577665629e-06, + "loss": 0.5571, + "step": 6122 + }, + { + "epoch": 0.9933484750162233, + "grad_norm": 0.5880272522793002, + "learning_rate": 3.78960376455935e-06, + "loss": 0.5762, + "step": 6123 + }, + { + "epoch": 0.9935107073329007, + "grad_norm": 0.5810359645011789, + "learning_rate": 3.789237913826197e-06, + "loss": 0.5486, + "step": 6124 + }, + { + "epoch": 0.9936729396495781, + "grad_norm": 0.6104494521635323, + "learning_rate": 3.7888720254768473e-06, + "loss": 0.5415, + "step": 6125 + }, + { + "epoch": 0.9938351719662557, + "grad_norm": 0.5490008321339999, + "learning_rate": 3.7885060995219748e-06, + "loss": 0.5008, + "step": 6126 + }, + { + "epoch": 0.9939974042829332, + "grad_norm": 0.5587596628301635, + "learning_rate": 3.788140135972257e-06, + "loss": 0.5376, + "step": 6127 + }, + { + "epoch": 0.9941596365996106, + "grad_norm": 0.5722375194992576, + "learning_rate": 3.7877741348383703e-06, + "loss": 0.5684, + "step": 6128 + }, + { + "epoch": 0.9943218689162882, + "grad_norm": 0.5626647278449181, + "learning_rate": 3.7874080961309946e-06, + "loss": 0.5672, + "step": 6129 + }, + { + "epoch": 0.9944841012329656, + "grad_norm": 0.531348005741479, + "learning_rate": 3.787042019860811e-06, + "loss": 0.5558, + "step": 6130 + }, + { + "epoch": 0.994646333549643, + "grad_norm": 0.5717399901957539, + "learning_rate": 3.7866759060384982e-06, + "loss": 0.571, + "step": 6131 + }, + { + "epoch": 0.9948085658663206, + "grad_norm": 0.6019090911281794, + "learning_rate": 3.7863097546747398e-06, + "loss": 0.5558, + "step": 6132 + }, + { + "epoch": 0.994970798182998, + "grad_norm": 0.5777766585170577, + "learning_rate": 3.7859435657802186e-06, + "loss": 0.5755, + "step": 6133 + }, + { + "epoch": 0.9951330304996755, + "grad_norm": 0.58128779572777, + "learning_rate": 3.7855773393656187e-06, + "loss": 0.5367, + "step": 6134 + }, + { + "epoch": 0.995295262816353, + "grad_norm": 0.5962071353253445, + "learning_rate": 3.785211075441626e-06, + "loss": 0.5532, + "step": 6135 + }, + { + "epoch": 0.9954574951330305, + "grad_norm": 0.559542273478946, + "learning_rate": 3.7848447740189263e-06, + "loss": 0.5463, + "step": 6136 + }, + { + "epoch": 0.995619727449708, + "grad_norm": 0.6013532570234678, + "learning_rate": 3.7844784351082086e-06, + "loss": 0.5528, + "step": 6137 + }, + { + "epoch": 0.9957819597663855, + "grad_norm": 0.5686426317010495, + "learning_rate": 3.7841120587201604e-06, + "loss": 0.5258, + "step": 6138 + }, + { + "epoch": 0.995944192083063, + "grad_norm": 0.596874161162158, + "learning_rate": 3.783745644865472e-06, + "loss": 0.5138, + "step": 6139 + }, + { + "epoch": 0.9961064243997404, + "grad_norm": 0.5851889657512099, + "learning_rate": 3.7833791935548336e-06, + "loss": 0.538, + "step": 6140 + }, + { + "epoch": 0.996268656716418, + "grad_norm": 0.6059141296381768, + "learning_rate": 3.7830127047989375e-06, + "loss": 0.5644, + "step": 6141 + }, + { + "epoch": 0.9964308890330954, + "grad_norm": 0.6190947443739953, + "learning_rate": 3.7826461786084767e-06, + "loss": 0.5332, + "step": 6142 + }, + { + "epoch": 0.9965931213497728, + "grad_norm": 0.5553209903488016, + "learning_rate": 3.7822796149941466e-06, + "loss": 0.5183, + "step": 6143 + }, + { + "epoch": 0.9967553536664504, + "grad_norm": 0.578473696802547, + "learning_rate": 3.7819130139666406e-06, + "loss": 0.5322, + "step": 6144 + }, + { + "epoch": 0.9969175859831279, + "grad_norm": 0.6060237425937886, + "learning_rate": 3.7815463755366567e-06, + "loss": 0.5238, + "step": 6145 + }, + { + "epoch": 0.9970798182998053, + "grad_norm": 0.5877235326463873, + "learning_rate": 3.7811796997148908e-06, + "loss": 0.573, + "step": 6146 + }, + { + "epoch": 0.9972420506164829, + "grad_norm": 0.5952615340437679, + "learning_rate": 3.7808129865120426e-06, + "loss": 0.4901, + "step": 6147 + }, + { + "epoch": 0.9974042829331603, + "grad_norm": 0.5702278004221647, + "learning_rate": 3.7804462359388115e-06, + "loss": 0.5558, + "step": 6148 + }, + { + "epoch": 0.9975665152498377, + "grad_norm": 0.5978196950241899, + "learning_rate": 3.780079448005897e-06, + "loss": 0.5442, + "step": 6149 + }, + { + "epoch": 0.9977287475665152, + "grad_norm": 0.5626915401015069, + "learning_rate": 3.779712622724003e-06, + "loss": 0.5346, + "step": 6150 + }, + { + "epoch": 0.9978909798831928, + "grad_norm": 0.57064449341381, + "learning_rate": 3.779345760103831e-06, + "loss": 0.5274, + "step": 6151 + }, + { + "epoch": 0.9980532121998702, + "grad_norm": 0.5725185737049915, + "learning_rate": 3.778978860156085e-06, + "loss": 0.5215, + "step": 6152 + }, + { + "epoch": 0.9982154445165476, + "grad_norm": 0.5739971229727212, + "learning_rate": 3.778611922891471e-06, + "loss": 0.5392, + "step": 6153 + }, + { + "epoch": 0.9983776768332252, + "grad_norm": 0.6150261841588978, + "learning_rate": 3.7782449483206935e-06, + "loss": 0.5478, + "step": 6154 + }, + { + "epoch": 0.9985399091499026, + "grad_norm": 0.5689763422809184, + "learning_rate": 3.7778779364544606e-06, + "loss": 0.5463, + "step": 6155 + }, + { + "epoch": 0.9987021414665801, + "grad_norm": 0.5629210540903641, + "learning_rate": 3.7775108873034818e-06, + "loss": 0.4894, + "step": 6156 + }, + { + "epoch": 0.9988643737832577, + "grad_norm": 0.5799350713307965, + "learning_rate": 3.777143800878465e-06, + "loss": 0.5607, + "step": 6157 + }, + { + "epoch": 0.9990266060999351, + "grad_norm": 0.5974765435036504, + "learning_rate": 3.7767766771901216e-06, + "loss": 0.5593, + "step": 6158 + }, + { + "epoch": 0.9991888384166125, + "grad_norm": 0.5811801945966422, + "learning_rate": 3.776409516249162e-06, + "loss": 0.5302, + "step": 6159 + }, + { + "epoch": 0.9993510707332901, + "grad_norm": 0.5901729008566338, + "learning_rate": 3.7760423180662997e-06, + "loss": 0.5643, + "step": 6160 + }, + { + "epoch": 0.9995133030499675, + "grad_norm": 0.5967838058464779, + "learning_rate": 3.77567508265225e-06, + "loss": 0.5524, + "step": 6161 + }, + { + "epoch": 0.999675535366645, + "grad_norm": 0.5955892959624534, + "learning_rate": 3.7753078100177244e-06, + "loss": 0.5281, + "step": 6162 + }, + { + "epoch": 0.9998377676833226, + "grad_norm": 0.5764766645587274, + "learning_rate": 3.774940500173442e-06, + "loss": 0.556, + "step": 6163 + }, + { + "epoch": 1.0, + "grad_norm": 0.5455570437725655, + "learning_rate": 3.7745731531301176e-06, + "loss": 0.5549, + "step": 6164 + }, + { + "epoch": 1.0001622323166774, + "grad_norm": 0.5716042303335398, + "learning_rate": 3.7742057688984703e-06, + "loss": 0.5391, + "step": 6165 + }, + { + "epoch": 1.000324464633355, + "grad_norm": 0.5808118524501584, + "learning_rate": 3.773838347489219e-06, + "loss": 0.5301, + "step": 6166 + }, + { + "epoch": 1.0004866969500323, + "grad_norm": 0.5813587307208589, + "learning_rate": 3.773470888913085e-06, + "loss": 0.5292, + "step": 6167 + }, + { + "epoch": 1.00064892926671, + "grad_norm": 0.5716367880401602, + "learning_rate": 3.773103393180788e-06, + "loss": 0.532, + "step": 6168 + }, + { + "epoch": 1.0008111615833875, + "grad_norm": 0.5860741681190437, + "learning_rate": 3.7727358603030523e-06, + "loss": 0.509, + "step": 6169 + }, + { + "epoch": 1.000973393900065, + "grad_norm": 0.565458657068114, + "learning_rate": 3.7723682902906e-06, + "loss": 0.5546, + "step": 6170 + }, + { + "epoch": 1.0011356262167423, + "grad_norm": 0.5853631907562524, + "learning_rate": 3.7720006831541563e-06, + "loss": 0.5302, + "step": 6171 + }, + { + "epoch": 1.0012978585334198, + "grad_norm": 0.564229553311842, + "learning_rate": 3.7716330389044463e-06, + "loss": 0.5269, + "step": 6172 + }, + { + "epoch": 1.0014600908500972, + "grad_norm": 0.5822503359167219, + "learning_rate": 3.7712653575521975e-06, + "loss": 0.5415, + "step": 6173 + }, + { + "epoch": 1.001622323166775, + "grad_norm": 0.5869421852901614, + "learning_rate": 3.7708976391081375e-06, + "loss": 0.5036, + "step": 6174 + }, + { + "epoch": 1.0017845554834524, + "grad_norm": 0.5888747258832189, + "learning_rate": 3.7705298835829952e-06, + "loss": 0.5092, + "step": 6175 + }, + { + "epoch": 1.0019467878001298, + "grad_norm": 0.5788788605274997, + "learning_rate": 3.770162090987501e-06, + "loss": 0.533, + "step": 6176 + }, + { + "epoch": 1.0021090201168072, + "grad_norm": 0.5638256770123031, + "learning_rate": 3.7697942613323856e-06, + "loss": 0.5267, + "step": 6177 + }, + { + "epoch": 1.0022712524334847, + "grad_norm": 0.5861881407719451, + "learning_rate": 3.7694263946283804e-06, + "loss": 0.5155, + "step": 6178 + }, + { + "epoch": 1.0024334847501621, + "grad_norm": 0.5887210341868193, + "learning_rate": 3.769058490886221e-06, + "loss": 0.5478, + "step": 6179 + }, + { + "epoch": 1.0025957170668398, + "grad_norm": 0.6149966690350802, + "learning_rate": 3.7686905501166392e-06, + "loss": 0.5612, + "step": 6180 + }, + { + "epoch": 1.0027579493835173, + "grad_norm": 0.5875378196515918, + "learning_rate": 3.768322572330372e-06, + "loss": 0.5605, + "step": 6181 + }, + { + "epoch": 1.0029201817001947, + "grad_norm": 0.6618826962035653, + "learning_rate": 3.767954557538156e-06, + "loss": 0.5068, + "step": 6182 + }, + { + "epoch": 1.0030824140168721, + "grad_norm": 0.5682108953937736, + "learning_rate": 3.767586505750728e-06, + "loss": 0.5602, + "step": 6183 + }, + { + "epoch": 1.0032446463335496, + "grad_norm": 0.5577888227270975, + "learning_rate": 3.7672184169788275e-06, + "loss": 0.5095, + "step": 6184 + }, + { + "epoch": 1.003406878650227, + "grad_norm": 0.5401888056644312, + "learning_rate": 3.766850291233193e-06, + "loss": 0.5034, + "step": 6185 + }, + { + "epoch": 1.0035691109669047, + "grad_norm": 0.6166971528657308, + "learning_rate": 3.766482128524566e-06, + "loss": 0.514, + "step": 6186 + }, + { + "epoch": 1.0037313432835822, + "grad_norm": 0.5769170083811443, + "learning_rate": 3.7661139288636885e-06, + "loss": 0.5571, + "step": 6187 + }, + { + "epoch": 1.0038935756002596, + "grad_norm": 0.5842833400132407, + "learning_rate": 3.7657456922613033e-06, + "loss": 0.5075, + "step": 6188 + }, + { + "epoch": 1.004055807916937, + "grad_norm": 0.5634947526946001, + "learning_rate": 3.7653774187281557e-06, + "loss": 0.5395, + "step": 6189 + }, + { + "epoch": 1.0042180402336145, + "grad_norm": 0.6667718421165799, + "learning_rate": 3.765009108274989e-06, + "loss": 0.526, + "step": 6190 + }, + { + "epoch": 1.004380272550292, + "grad_norm": 0.5847682935748765, + "learning_rate": 3.76464076091255e-06, + "loss": 0.5375, + "step": 6191 + }, + { + "epoch": 1.0045425048669696, + "grad_norm": 0.6368292022158957, + "learning_rate": 3.7642723766515863e-06, + "loss": 0.4945, + "step": 6192 + }, + { + "epoch": 1.004704737183647, + "grad_norm": 0.5495105327955592, + "learning_rate": 3.7639039555028467e-06, + "loss": 0.5118, + "step": 6193 + }, + { + "epoch": 1.0048669695003245, + "grad_norm": 0.5721647892252102, + "learning_rate": 3.76353549747708e-06, + "loss": 0.5066, + "step": 6194 + }, + { + "epoch": 1.005029201817002, + "grad_norm": 0.5832083017142098, + "learning_rate": 3.7631670025850365e-06, + "loss": 0.5475, + "step": 6195 + }, + { + "epoch": 1.0051914341336794, + "grad_norm": 0.5583536640676788, + "learning_rate": 3.7627984708374678e-06, + "loss": 0.5354, + "step": 6196 + }, + { + "epoch": 1.0053536664503568, + "grad_norm": 0.6035200015771619, + "learning_rate": 3.7624299022451284e-06, + "loss": 0.4892, + "step": 6197 + }, + { + "epoch": 1.0055158987670343, + "grad_norm": 0.5785397094232414, + "learning_rate": 3.7620612968187693e-06, + "loss": 0.5362, + "step": 6198 + }, + { + "epoch": 1.005678131083712, + "grad_norm": 0.5869520385991294, + "learning_rate": 3.7616926545691466e-06, + "loss": 0.5046, + "step": 6199 + }, + { + "epoch": 1.0058403634003894, + "grad_norm": 0.582915644621895, + "learning_rate": 3.7613239755070175e-06, + "loss": 0.4992, + "step": 6200 + }, + { + "epoch": 1.0060025957170668, + "grad_norm": 0.5560274783761063, + "learning_rate": 3.7609552596431365e-06, + "loss": 0.5101, + "step": 6201 + }, + { + "epoch": 1.0061648280337443, + "grad_norm": 0.5521008669446303, + "learning_rate": 3.7605865069882634e-06, + "loss": 0.5022, + "step": 6202 + }, + { + "epoch": 1.0063270603504217, + "grad_norm": 0.5894924939568669, + "learning_rate": 3.7602177175531564e-06, + "loss": 0.4942, + "step": 6203 + }, + { + "epoch": 1.0064892926670992, + "grad_norm": 0.5939575836913457, + "learning_rate": 3.759848891348577e-06, + "loss": 0.5376, + "step": 6204 + }, + { + "epoch": 1.0066515249837769, + "grad_norm": 0.576257355056443, + "learning_rate": 3.7594800283852845e-06, + "loss": 0.5459, + "step": 6205 + }, + { + "epoch": 1.0068137573004543, + "grad_norm": 0.6089137363505378, + "learning_rate": 3.759111128674043e-06, + "loss": 0.553, + "step": 6206 + }, + { + "epoch": 1.0069759896171318, + "grad_norm": 0.5695715194790504, + "learning_rate": 3.758742192225615e-06, + "loss": 0.5389, + "step": 6207 + }, + { + "epoch": 1.0071382219338092, + "grad_norm": 0.5974048576788545, + "learning_rate": 3.758373219050765e-06, + "loss": 0.5391, + "step": 6208 + }, + { + "epoch": 1.0073004542504866, + "grad_norm": 0.5751022640333114, + "learning_rate": 3.7580042091602592e-06, + "loss": 0.5346, + "step": 6209 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.5700503238886845, + "learning_rate": 3.757635162564863e-06, + "loss": 0.5624, + "step": 6210 + }, + { + "epoch": 1.0076249188838418, + "grad_norm": 0.5934376825959475, + "learning_rate": 3.7572660792753457e-06, + "loss": 0.5566, + "step": 6211 + }, + { + "epoch": 1.0077871512005192, + "grad_norm": 0.5699787549638441, + "learning_rate": 3.756896959302475e-06, + "loss": 0.5028, + "step": 6212 + }, + { + "epoch": 1.0079493835171967, + "grad_norm": 0.5720719022138115, + "learning_rate": 3.7565278026570216e-06, + "loss": 0.5122, + "step": 6213 + }, + { + "epoch": 1.008111615833874, + "grad_norm": 0.6133294171257342, + "learning_rate": 3.756158609349755e-06, + "loss": 0.5301, + "step": 6214 + }, + { + "epoch": 1.0082738481505515, + "grad_norm": 0.6231054855626177, + "learning_rate": 3.755789379391449e-06, + "loss": 0.5365, + "step": 6215 + }, + { + "epoch": 1.008436080467229, + "grad_norm": 0.5559479909134906, + "learning_rate": 3.7554201127928747e-06, + "loss": 0.5231, + "step": 6216 + }, + { + "epoch": 1.0085983127839067, + "grad_norm": 0.577308303476747, + "learning_rate": 3.755050809564808e-06, + "loss": 0.52, + "step": 6217 + }, + { + "epoch": 1.008760545100584, + "grad_norm": 0.5775978746925399, + "learning_rate": 3.7546814697180225e-06, + "loss": 0.5369, + "step": 6218 + }, + { + "epoch": 1.0089227774172616, + "grad_norm": 0.5884799576489848, + "learning_rate": 3.754312093263296e-06, + "loss": 0.55, + "step": 6219 + }, + { + "epoch": 1.009085009733939, + "grad_norm": 0.5801936752272525, + "learning_rate": 3.753942680211404e-06, + "loss": 0.5316, + "step": 6220 + }, + { + "epoch": 1.0092472420506164, + "grad_norm": 0.5999890074776282, + "learning_rate": 3.7535732305731267e-06, + "loss": 0.5284, + "step": 6221 + }, + { + "epoch": 1.009409474367294, + "grad_norm": 0.6041366117493866, + "learning_rate": 3.7532037443592436e-06, + "loss": 0.5196, + "step": 6222 + }, + { + "epoch": 1.0095717066839713, + "grad_norm": 0.5812330990961478, + "learning_rate": 3.752834221580533e-06, + "loss": 0.5446, + "step": 6223 + }, + { + "epoch": 1.009733939000649, + "grad_norm": 0.5407365393903988, + "learning_rate": 3.7524646622477787e-06, + "loss": 0.5093, + "step": 6224 + }, + { + "epoch": 1.0098961713173265, + "grad_norm": 0.6095096784167069, + "learning_rate": 3.7520950663717625e-06, + "loss": 0.5328, + "step": 6225 + }, + { + "epoch": 1.010058403634004, + "grad_norm": 0.5857228373498189, + "learning_rate": 3.751725433963268e-06, + "loss": 0.5326, + "step": 6226 + }, + { + "epoch": 1.0102206359506813, + "grad_norm": 0.5976990238831699, + "learning_rate": 3.751355765033081e-06, + "loss": 0.516, + "step": 6227 + }, + { + "epoch": 1.0103828682673588, + "grad_norm": 0.5805794993019102, + "learning_rate": 3.7509860595919855e-06, + "loss": 0.4931, + "step": 6228 + }, + { + "epoch": 1.0105451005840362, + "grad_norm": 0.6102975408888135, + "learning_rate": 3.7506163176507703e-06, + "loss": 0.5654, + "step": 6229 + }, + { + "epoch": 1.010707332900714, + "grad_norm": 0.5873604419203613, + "learning_rate": 3.7502465392202227e-06, + "loss": 0.5334, + "step": 6230 + }, + { + "epoch": 1.0108695652173914, + "grad_norm": 0.6011128927619968, + "learning_rate": 3.749876724311131e-06, + "loss": 0.5158, + "step": 6231 + }, + { + "epoch": 1.0110317975340688, + "grad_norm": 0.570913414633691, + "learning_rate": 3.7495068729342864e-06, + "loss": 0.5028, + "step": 6232 + }, + { + "epoch": 1.0111940298507462, + "grad_norm": 0.5747928116213761, + "learning_rate": 3.74913698510048e-06, + "loss": 0.4875, + "step": 6233 + }, + { + "epoch": 1.0113562621674237, + "grad_norm": 0.6340973405225447, + "learning_rate": 3.748767060820503e-06, + "loss": 0.519, + "step": 6234 + }, + { + "epoch": 1.0115184944841011, + "grad_norm": 0.6241083040157728, + "learning_rate": 3.74839710010515e-06, + "loss": 0.5234, + "step": 6235 + }, + { + "epoch": 1.0116807268007788, + "grad_norm": 0.5629986427784393, + "learning_rate": 3.748027102965214e-06, + "loss": 0.5013, + "step": 6236 + }, + { + "epoch": 1.0118429591174563, + "grad_norm": 0.6034442707321648, + "learning_rate": 3.747657069411492e-06, + "loss": 0.5015, + "step": 6237 + }, + { + "epoch": 1.0120051914341337, + "grad_norm": 0.5899723415761525, + "learning_rate": 3.74728699945478e-06, + "loss": 0.55, + "step": 6238 + }, + { + "epoch": 1.0121674237508111, + "grad_norm": 0.6131828627946396, + "learning_rate": 3.7469168931058743e-06, + "loss": 0.5368, + "step": 6239 + }, + { + "epoch": 1.0123296560674886, + "grad_norm": 0.617485432365091, + "learning_rate": 3.746546750375576e-06, + "loss": 0.5613, + "step": 6240 + }, + { + "epoch": 1.012491888384166, + "grad_norm": 0.594543310291495, + "learning_rate": 3.746176571274682e-06, + "loss": 0.5633, + "step": 6241 + }, + { + "epoch": 1.0126541207008437, + "grad_norm": 0.56668217979918, + "learning_rate": 3.7458063558139945e-06, + "loss": 0.489, + "step": 6242 + }, + { + "epoch": 1.0128163530175212, + "grad_norm": 0.6187830430407898, + "learning_rate": 3.745436104004315e-06, + "loss": 0.5532, + "step": 6243 + }, + { + "epoch": 1.0129785853341986, + "grad_norm": 0.6070314734055081, + "learning_rate": 3.7450658158564474e-06, + "loss": 0.5198, + "step": 6244 + }, + { + "epoch": 1.013140817650876, + "grad_norm": 0.5992618476001256, + "learning_rate": 3.7446954913811946e-06, + "loss": 0.512, + "step": 6245 + }, + { + "epoch": 1.0133030499675535, + "grad_norm": 0.6033156710043552, + "learning_rate": 3.7443251305893606e-06, + "loss": 0.5384, + "step": 6246 + }, + { + "epoch": 1.013465282284231, + "grad_norm": 0.5774061555778468, + "learning_rate": 3.7439547334917535e-06, + "loss": 0.5102, + "step": 6247 + }, + { + "epoch": 1.0136275146009086, + "grad_norm": 0.60583201282164, + "learning_rate": 3.7435843000991793e-06, + "loss": 0.5088, + "step": 6248 + }, + { + "epoch": 1.013789746917586, + "grad_norm": 0.6094630292565889, + "learning_rate": 3.7432138304224462e-06, + "loss": 0.5082, + "step": 6249 + }, + { + "epoch": 1.0139519792342635, + "grad_norm": 0.6151635281384067, + "learning_rate": 3.7428433244723645e-06, + "loss": 0.522, + "step": 6250 + }, + { + "epoch": 1.014114211550941, + "grad_norm": 0.5944080648122887, + "learning_rate": 3.742472782259743e-06, + "loss": 0.5254, + "step": 6251 + }, + { + "epoch": 1.0142764438676184, + "grad_norm": 0.5837846959031118, + "learning_rate": 3.7421022037953925e-06, + "loss": 0.5156, + "step": 6252 + }, + { + "epoch": 1.0144386761842958, + "grad_norm": 0.5821543135589391, + "learning_rate": 3.7417315890901284e-06, + "loss": 0.5244, + "step": 6253 + }, + { + "epoch": 1.0146009085009733, + "grad_norm": 0.5981475879943396, + "learning_rate": 3.7413609381547604e-06, + "loss": 0.577, + "step": 6254 + }, + { + "epoch": 1.014763140817651, + "grad_norm": 0.5682735248930869, + "learning_rate": 3.740990251000106e-06, + "loss": 0.513, + "step": 6255 + }, + { + "epoch": 1.0149253731343284, + "grad_norm": 0.5774062364397559, + "learning_rate": 3.7406195276369794e-06, + "loss": 0.5267, + "step": 6256 + }, + { + "epoch": 1.0150876054510058, + "grad_norm": 0.5738620610811, + "learning_rate": 3.7402487680761975e-06, + "loss": 0.5233, + "step": 6257 + }, + { + "epoch": 1.0152498377676833, + "grad_norm": 0.5791292484956216, + "learning_rate": 3.739877972328578e-06, + "loss": 0.4951, + "step": 6258 + }, + { + "epoch": 1.0154120700843607, + "grad_norm": 0.6521659409013258, + "learning_rate": 3.739507140404939e-06, + "loss": 0.543, + "step": 6259 + }, + { + "epoch": 1.0155743024010382, + "grad_norm": 0.5946289271662971, + "learning_rate": 3.739136272316102e-06, + "loss": 0.5237, + "step": 6260 + }, + { + "epoch": 1.0157365347177159, + "grad_norm": 0.6018738059664507, + "learning_rate": 3.7387653680728853e-06, + "loss": 0.5266, + "step": 6261 + }, + { + "epoch": 1.0158987670343933, + "grad_norm": 0.5819924164546607, + "learning_rate": 3.7383944276861135e-06, + "loss": 0.5115, + "step": 6262 + }, + { + "epoch": 1.0160609993510707, + "grad_norm": 0.5983780341413815, + "learning_rate": 3.7380234511666077e-06, + "loss": 0.5528, + "step": 6263 + }, + { + "epoch": 1.0162232316677482, + "grad_norm": 0.592916680720818, + "learning_rate": 3.737652438525192e-06, + "loss": 0.4994, + "step": 6264 + }, + { + "epoch": 1.0163854639844256, + "grad_norm": 0.5609296127538606, + "learning_rate": 3.737281389772692e-06, + "loss": 0.5335, + "step": 6265 + }, + { + "epoch": 1.016547696301103, + "grad_norm": 0.6070994386161915, + "learning_rate": 3.7369103049199344e-06, + "loss": 0.5149, + "step": 6266 + }, + { + "epoch": 1.0167099286177808, + "grad_norm": 0.5737296208522085, + "learning_rate": 3.736539183977746e-06, + "loss": 0.5326, + "step": 6267 + }, + { + "epoch": 1.0168721609344582, + "grad_norm": 0.6234445838650821, + "learning_rate": 3.7361680269569543e-06, + "loss": 0.5389, + "step": 6268 + }, + { + "epoch": 1.0170343932511356, + "grad_norm": 0.6134854599924938, + "learning_rate": 3.7357968338683893e-06, + "loss": 0.5204, + "step": 6269 + }, + { + "epoch": 1.017196625567813, + "grad_norm": 0.5833649564568758, + "learning_rate": 3.7354256047228803e-06, + "loss": 0.5492, + "step": 6270 + }, + { + "epoch": 1.0173588578844905, + "grad_norm": 0.6028714167285253, + "learning_rate": 3.7350543395312604e-06, + "loss": 0.5343, + "step": 6271 + }, + { + "epoch": 1.017521090201168, + "grad_norm": 0.5836070502447881, + "learning_rate": 3.734683038304361e-06, + "loss": 0.5341, + "step": 6272 + }, + { + "epoch": 1.0176833225178457, + "grad_norm": 0.6030322881268925, + "learning_rate": 3.734311701053015e-06, + "loss": 0.5469, + "step": 6273 + }, + { + "epoch": 1.017845554834523, + "grad_norm": 0.5727574549440098, + "learning_rate": 3.733940327788058e-06, + "loss": 0.5235, + "step": 6274 + }, + { + "epoch": 1.0180077871512005, + "grad_norm": 0.6190261299409798, + "learning_rate": 3.733568918520325e-06, + "loss": 0.5237, + "step": 6275 + }, + { + "epoch": 1.018170019467878, + "grad_norm": 0.5683502018142262, + "learning_rate": 3.7331974732606534e-06, + "loss": 0.5146, + "step": 6276 + }, + { + "epoch": 1.0183322517845554, + "grad_norm": 0.5918112561905563, + "learning_rate": 3.7328259920198794e-06, + "loss": 0.5435, + "step": 6277 + }, + { + "epoch": 1.0184944841012329, + "grad_norm": 0.5857894087227551, + "learning_rate": 3.732454474808844e-06, + "loss": 0.5341, + "step": 6278 + }, + { + "epoch": 1.0186567164179103, + "grad_norm": 0.5736928699587178, + "learning_rate": 3.7320829216383846e-06, + "loss": 0.5316, + "step": 6279 + }, + { + "epoch": 1.018818948734588, + "grad_norm": 0.5785853741905154, + "learning_rate": 3.7317113325193432e-06, + "loss": 0.5434, + "step": 6280 + }, + { + "epoch": 1.0189811810512654, + "grad_norm": 0.5849820076664786, + "learning_rate": 3.7313397074625625e-06, + "loss": 0.5301, + "step": 6281 + }, + { + "epoch": 1.019143413367943, + "grad_norm": 0.6223317038086641, + "learning_rate": 3.7309680464788835e-06, + "loss": 0.5201, + "step": 6282 + }, + { + "epoch": 1.0193056456846203, + "grad_norm": 0.5728247957308806, + "learning_rate": 3.7305963495791513e-06, + "loss": 0.5239, + "step": 6283 + }, + { + "epoch": 1.0194678780012978, + "grad_norm": 0.5980433388391762, + "learning_rate": 3.730224616774211e-06, + "loss": 0.5115, + "step": 6284 + }, + { + "epoch": 1.0196301103179752, + "grad_norm": 0.5816057088369821, + "learning_rate": 3.7298528480749088e-06, + "loss": 0.5336, + "step": 6285 + }, + { + "epoch": 1.019792342634653, + "grad_norm": 0.6164649608440026, + "learning_rate": 3.7294810434920914e-06, + "loss": 0.5283, + "step": 6286 + }, + { + "epoch": 1.0199545749513304, + "grad_norm": 0.6011493806018358, + "learning_rate": 3.7291092030366067e-06, + "loss": 0.5123, + "step": 6287 + }, + { + "epoch": 1.0201168072680078, + "grad_norm": 0.5775104602388575, + "learning_rate": 3.728737326719304e-06, + "loss": 0.5055, + "step": 6288 + }, + { + "epoch": 1.0202790395846852, + "grad_norm": 0.5881126703007007, + "learning_rate": 3.728365414551035e-06, + "loss": 0.535, + "step": 6289 + }, + { + "epoch": 1.0204412719013627, + "grad_norm": 0.5769351238041467, + "learning_rate": 3.727993466542649e-06, + "loss": 0.5294, + "step": 6290 + }, + { + "epoch": 1.0206035042180401, + "grad_norm": 0.5742490659935897, + "learning_rate": 3.727621482705e-06, + "loss": 0.5437, + "step": 6291 + }, + { + "epoch": 1.0207657365347178, + "grad_norm": 0.5929400284922391, + "learning_rate": 3.7272494630489403e-06, + "loss": 0.508, + "step": 6292 + }, + { + "epoch": 1.0209279688513953, + "grad_norm": 0.6048870257000355, + "learning_rate": 3.726877407585325e-06, + "loss": 0.5297, + "step": 6293 + }, + { + "epoch": 1.0210902011680727, + "grad_norm": 0.609144328550923, + "learning_rate": 3.7265053163250087e-06, + "loss": 0.5077, + "step": 6294 + }, + { + "epoch": 1.0212524334847501, + "grad_norm": 0.5751096740608068, + "learning_rate": 3.726133189278848e-06, + "loss": 0.523, + "step": 6295 + }, + { + "epoch": 1.0214146658014276, + "grad_norm": 0.5726493005541645, + "learning_rate": 3.7257610264577026e-06, + "loss": 0.5231, + "step": 6296 + }, + { + "epoch": 1.021576898118105, + "grad_norm": 0.5733751280735854, + "learning_rate": 3.725388827872428e-06, + "loss": 0.5461, + "step": 6297 + }, + { + "epoch": 1.0217391304347827, + "grad_norm": 0.6237502985005352, + "learning_rate": 3.725016593533887e-06, + "loss": 0.5257, + "step": 6298 + }, + { + "epoch": 1.0219013627514602, + "grad_norm": 0.6323730682963652, + "learning_rate": 3.724644323452937e-06, + "loss": 0.5021, + "step": 6299 + }, + { + "epoch": 1.0220635950681376, + "grad_norm": 0.6078210546947189, + "learning_rate": 3.724272017640443e-06, + "loss": 0.5412, + "step": 6300 + }, + { + "epoch": 1.022225827384815, + "grad_norm": 0.5718436283484376, + "learning_rate": 3.7238996761072654e-06, + "loss": 0.5404, + "step": 6301 + }, + { + "epoch": 1.0223880597014925, + "grad_norm": 0.5679741775984851, + "learning_rate": 3.723527298864269e-06, + "loss": 0.5349, + "step": 6302 + }, + { + "epoch": 1.02255029201817, + "grad_norm": 0.5684385383197542, + "learning_rate": 3.723154885922319e-06, + "loss": 0.5424, + "step": 6303 + }, + { + "epoch": 1.0227125243348476, + "grad_norm": 0.5768200719210802, + "learning_rate": 3.72278243729228e-06, + "loss": 0.5378, + "step": 6304 + }, + { + "epoch": 1.022874756651525, + "grad_norm": 0.5955379669573472, + "learning_rate": 3.7224099529850204e-06, + "loss": 0.5082, + "step": 6305 + }, + { + "epoch": 1.0230369889682025, + "grad_norm": 0.5988504553771593, + "learning_rate": 3.7220374330114073e-06, + "loss": 0.5519, + "step": 6306 + }, + { + "epoch": 1.02319922128488, + "grad_norm": 0.5556954278482878, + "learning_rate": 3.7216648773823115e-06, + "loss": 0.5518, + "step": 6307 + }, + { + "epoch": 1.0233614536015574, + "grad_norm": 0.5760577236439619, + "learning_rate": 3.7212922861086002e-06, + "loss": 0.5434, + "step": 6308 + }, + { + "epoch": 1.0235236859182348, + "grad_norm": 0.6052401271234171, + "learning_rate": 3.7209196592011467e-06, + "loss": 0.5145, + "step": 6309 + }, + { + "epoch": 1.0236859182349125, + "grad_norm": 0.5948092070444263, + "learning_rate": 3.7205469966708226e-06, + "loss": 0.4903, + "step": 6310 + }, + { + "epoch": 1.02384815055159, + "grad_norm": 0.5732783060275909, + "learning_rate": 3.7201742985285004e-06, + "loss": 0.5232, + "step": 6311 + }, + { + "epoch": 1.0240103828682674, + "grad_norm": 0.5728623984994405, + "learning_rate": 3.7198015647850557e-06, + "loss": 0.5701, + "step": 6312 + }, + { + "epoch": 1.0241726151849448, + "grad_norm": 0.6278891200238875, + "learning_rate": 3.719428795451362e-06, + "loss": 0.5597, + "step": 6313 + }, + { + "epoch": 1.0243348475016223, + "grad_norm": 0.5787081980434172, + "learning_rate": 3.719055990538298e-06, + "loss": 0.5293, + "step": 6314 + }, + { + "epoch": 1.0244970798182997, + "grad_norm": 0.5689886967608107, + "learning_rate": 3.7186831500567382e-06, + "loss": 0.543, + "step": 6315 + }, + { + "epoch": 1.0246593121349772, + "grad_norm": 0.6542388240161642, + "learning_rate": 3.7183102740175635e-06, + "loss": 0.5404, + "step": 6316 + }, + { + "epoch": 1.0248215444516549, + "grad_norm": 0.5779194483727051, + "learning_rate": 3.7179373624316525e-06, + "loss": 0.524, + "step": 6317 + }, + { + "epoch": 1.0249837767683323, + "grad_norm": 0.5961835486085427, + "learning_rate": 3.717564415309885e-06, + "loss": 0.5286, + "step": 6318 + }, + { + "epoch": 1.0251460090850097, + "grad_norm": 0.6000456973302143, + "learning_rate": 3.717191432663143e-06, + "loss": 0.5542, + "step": 6319 + }, + { + "epoch": 1.0253082414016872, + "grad_norm": 0.5646188019764452, + "learning_rate": 3.716818414502309e-06, + "loss": 0.556, + "step": 6320 + }, + { + "epoch": 1.0254704737183646, + "grad_norm": 0.5760674554955937, + "learning_rate": 3.7164453608382665e-06, + "loss": 0.5547, + "step": 6321 + }, + { + "epoch": 1.025632706035042, + "grad_norm": 0.6174064956948346, + "learning_rate": 3.7160722716819007e-06, + "loss": 0.5461, + "step": 6322 + }, + { + "epoch": 1.0257949383517198, + "grad_norm": 0.5962578021049738, + "learning_rate": 3.715699147044097e-06, + "loss": 0.5487, + "step": 6323 + }, + { + "epoch": 1.0259571706683972, + "grad_norm": 0.6262616811273114, + "learning_rate": 3.7153259869357417e-06, + "loss": 0.5438, + "step": 6324 + }, + { + "epoch": 1.0261194029850746, + "grad_norm": 0.5624691229987938, + "learning_rate": 3.7149527913677232e-06, + "loss": 0.5246, + "step": 6325 + }, + { + "epoch": 1.026281635301752, + "grad_norm": 0.578944906978835, + "learning_rate": 3.7145795603509282e-06, + "loss": 0.5358, + "step": 6326 + }, + { + "epoch": 1.0264438676184295, + "grad_norm": 0.5681373367683722, + "learning_rate": 3.7142062938962496e-06, + "loss": 0.4986, + "step": 6327 + }, + { + "epoch": 1.026606099935107, + "grad_norm": 0.5864223576147186, + "learning_rate": 3.713832992014576e-06, + "loss": 0.5243, + "step": 6328 + }, + { + "epoch": 1.0267683322517847, + "grad_norm": 0.5603464162614934, + "learning_rate": 3.7134596547168e-06, + "loss": 0.5413, + "step": 6329 + }, + { + "epoch": 1.026930564568462, + "grad_norm": 0.5836250940244764, + "learning_rate": 3.7130862820138148e-06, + "loss": 0.5616, + "step": 6330 + }, + { + "epoch": 1.0270927968851395, + "grad_norm": 0.5869775593986873, + "learning_rate": 3.7127128739165134e-06, + "loss": 0.5015, + "step": 6331 + }, + { + "epoch": 1.027255029201817, + "grad_norm": 0.6017626988261595, + "learning_rate": 3.712339430435792e-06, + "loss": 0.5489, + "step": 6332 + }, + { + "epoch": 1.0274172615184944, + "grad_norm": 0.5784766788048106, + "learning_rate": 3.7119659515825457e-06, + "loss": 0.5314, + "step": 6333 + }, + { + "epoch": 1.0275794938351719, + "grad_norm": 0.5581618429138937, + "learning_rate": 3.7115924373676713e-06, + "loss": 0.5018, + "step": 6334 + }, + { + "epoch": 1.0277417261518496, + "grad_norm": 0.5910561078656736, + "learning_rate": 3.711218887802068e-06, + "loss": 0.475, + "step": 6335 + }, + { + "epoch": 1.027903958468527, + "grad_norm": 0.6218009944647117, + "learning_rate": 3.7108453028966334e-06, + "loss": 0.5452, + "step": 6336 + }, + { + "epoch": 1.0280661907852044, + "grad_norm": 0.6142898799598919, + "learning_rate": 3.7104716826622687e-06, + "loss": 0.5141, + "step": 6337 + }, + { + "epoch": 1.028228423101882, + "grad_norm": 0.5955531020605167, + "learning_rate": 3.7100980271098746e-06, + "loss": 0.5715, + "step": 6338 + }, + { + "epoch": 1.0283906554185593, + "grad_norm": 0.62868121324349, + "learning_rate": 3.7097243362503537e-06, + "loss": 0.5216, + "step": 6339 + }, + { + "epoch": 1.0285528877352368, + "grad_norm": 0.5957392796269989, + "learning_rate": 3.709350610094609e-06, + "loss": 0.4962, + "step": 6340 + }, + { + "epoch": 1.0287151200519142, + "grad_norm": 0.5925891667262354, + "learning_rate": 3.708976848653545e-06, + "loss": 0.4976, + "step": 6341 + }, + { + "epoch": 1.028877352368592, + "grad_norm": 0.5836052618290773, + "learning_rate": 3.7086030519380662e-06, + "loss": 0.5419, + "step": 6342 + }, + { + "epoch": 1.0290395846852693, + "grad_norm": 0.5724856238210356, + "learning_rate": 3.708229219959079e-06, + "loss": 0.5182, + "step": 6343 + }, + { + "epoch": 1.0292018170019468, + "grad_norm": 0.5602465185997902, + "learning_rate": 3.7078553527274907e-06, + "loss": 0.5301, + "step": 6344 + }, + { + "epoch": 1.0293640493186242, + "grad_norm": 0.6119077266046337, + "learning_rate": 3.707481450254212e-06, + "loss": 0.5424, + "step": 6345 + }, + { + "epoch": 1.0295262816353017, + "grad_norm": 0.5887142697594769, + "learning_rate": 3.707107512550148e-06, + "loss": 0.5331, + "step": 6346 + }, + { + "epoch": 1.0296885139519791, + "grad_norm": 0.586662894096278, + "learning_rate": 3.7067335396262125e-06, + "loss": 0.5244, + "step": 6347 + }, + { + "epoch": 1.0298507462686568, + "grad_norm": 0.5876398230053617, + "learning_rate": 3.706359531493316e-06, + "loss": 0.54, + "step": 6348 + }, + { + "epoch": 1.0300129785853342, + "grad_norm": 0.5697668923182304, + "learning_rate": 3.7059854881623703e-06, + "loss": 0.5162, + "step": 6349 + }, + { + "epoch": 1.0301752109020117, + "grad_norm": 0.5889350951928006, + "learning_rate": 3.7056114096442896e-06, + "loss": 0.5182, + "step": 6350 + }, + { + "epoch": 1.0303374432186891, + "grad_norm": 0.6111567375081172, + "learning_rate": 3.705237295949988e-06, + "loss": 0.5493, + "step": 6351 + }, + { + "epoch": 1.0304996755353666, + "grad_norm": 0.6032317553163591, + "learning_rate": 3.704863147090382e-06, + "loss": 0.5521, + "step": 6352 + }, + { + "epoch": 1.030661907852044, + "grad_norm": 0.5856167737427, + "learning_rate": 3.704488963076387e-06, + "loss": 0.5484, + "step": 6353 + }, + { + "epoch": 1.0308241401687217, + "grad_norm": 0.5767379293835833, + "learning_rate": 3.7041147439189207e-06, + "loss": 0.532, + "step": 6354 + }, + { + "epoch": 1.0309863724853991, + "grad_norm": 0.6000478621129985, + "learning_rate": 3.7037404896289025e-06, + "loss": 0.4935, + "step": 6355 + }, + { + "epoch": 1.0311486048020766, + "grad_norm": 0.5806711982845237, + "learning_rate": 3.7033662002172514e-06, + "loss": 0.4848, + "step": 6356 + }, + { + "epoch": 1.031310837118754, + "grad_norm": 0.5752683880102194, + "learning_rate": 3.702991875694888e-06, + "loss": 0.5235, + "step": 6357 + }, + { + "epoch": 1.0314730694354315, + "grad_norm": 0.5594588344369227, + "learning_rate": 3.7026175160727347e-06, + "loss": 0.4744, + "step": 6358 + }, + { + "epoch": 1.031635301752109, + "grad_norm": 0.5665440869765107, + "learning_rate": 3.702243121361714e-06, + "loss": 0.5091, + "step": 6359 + }, + { + "epoch": 1.0317975340687866, + "grad_norm": 0.5782839173357287, + "learning_rate": 3.7018686915727485e-06, + "loss": 0.525, + "step": 6360 + }, + { + "epoch": 1.031959766385464, + "grad_norm": 0.5911454177566816, + "learning_rate": 3.7014942267167653e-06, + "loss": 0.5026, + "step": 6361 + }, + { + "epoch": 1.0321219987021415, + "grad_norm": 0.5785604655807326, + "learning_rate": 3.701119726804687e-06, + "loss": 0.5364, + "step": 6362 + }, + { + "epoch": 1.032284231018819, + "grad_norm": 0.6106379631662336, + "learning_rate": 3.700745191847444e-06, + "loss": 0.5329, + "step": 6363 + }, + { + "epoch": 1.0324464633354964, + "grad_norm": 0.56692708999526, + "learning_rate": 3.700370621855961e-06, + "loss": 0.4948, + "step": 6364 + }, + { + "epoch": 1.0326086956521738, + "grad_norm": 0.5764777183196925, + "learning_rate": 3.699996016841169e-06, + "loss": 0.5163, + "step": 6365 + }, + { + "epoch": 1.0327709279688513, + "grad_norm": 0.6082792527158972, + "learning_rate": 3.699621376813997e-06, + "loss": 0.5278, + "step": 6366 + }, + { + "epoch": 1.032933160285529, + "grad_norm": 0.5793882015377807, + "learning_rate": 3.699246701785375e-06, + "loss": 0.5404, + "step": 6367 + }, + { + "epoch": 1.0330953926022064, + "grad_norm": 0.6414885478007268, + "learning_rate": 3.6988719917662374e-06, + "loss": 0.5336, + "step": 6368 + }, + { + "epoch": 1.0332576249188838, + "grad_norm": 0.5951117199406384, + "learning_rate": 3.698497246767515e-06, + "loss": 0.51, + "step": 6369 + }, + { + "epoch": 1.0334198572355613, + "grad_norm": 0.5528742396554082, + "learning_rate": 3.6981224668001427e-06, + "loss": 0.5048, + "step": 6370 + }, + { + "epoch": 1.0335820895522387, + "grad_norm": 0.7529712811795412, + "learning_rate": 3.6977476518750544e-06, + "loss": 0.5111, + "step": 6371 + }, + { + "epoch": 1.0337443218689162, + "grad_norm": 0.5971455941278582, + "learning_rate": 3.697372802003188e-06, + "loss": 0.5472, + "step": 6372 + }, + { + "epoch": 1.0339065541855939, + "grad_norm": 0.5911717122460096, + "learning_rate": 3.6969979171954786e-06, + "loss": 0.5248, + "step": 6373 + }, + { + "epoch": 1.0340687865022713, + "grad_norm": 0.588398716028952, + "learning_rate": 3.696622997462865e-06, + "loss": 0.4861, + "step": 6374 + }, + { + "epoch": 1.0342310188189487, + "grad_norm": 0.6033568615539652, + "learning_rate": 3.696248042816287e-06, + "loss": 0.5253, + "step": 6375 + }, + { + "epoch": 1.0343932511356262, + "grad_norm": 0.5767402767033643, + "learning_rate": 3.6958730532666837e-06, + "loss": 0.541, + "step": 6376 + }, + { + "epoch": 1.0345554834523036, + "grad_norm": 0.5986677666548209, + "learning_rate": 3.6954980288249966e-06, + "loss": 0.5288, + "step": 6377 + }, + { + "epoch": 1.034717715768981, + "grad_norm": 0.6131056263321483, + "learning_rate": 3.6951229695021675e-06, + "loss": 0.5257, + "step": 6378 + }, + { + "epoch": 1.0348799480856588, + "grad_norm": 0.5527027062857229, + "learning_rate": 3.6947478753091404e-06, + "loss": 0.5092, + "step": 6379 + }, + { + "epoch": 1.0350421804023362, + "grad_norm": 0.5769549998664157, + "learning_rate": 3.694372746256858e-06, + "loss": 0.5256, + "step": 6380 + }, + { + "epoch": 1.0352044127190136, + "grad_norm": 0.5968617624569424, + "learning_rate": 3.6939975823562668e-06, + "loss": 0.5101, + "step": 6381 + }, + { + "epoch": 1.035366645035691, + "grad_norm": 0.5848239532642092, + "learning_rate": 3.693622383618312e-06, + "loss": 0.5301, + "step": 6382 + }, + { + "epoch": 1.0355288773523685, + "grad_norm": 0.6113943871065743, + "learning_rate": 3.693247150053942e-06, + "loss": 0.526, + "step": 6383 + }, + { + "epoch": 1.035691109669046, + "grad_norm": 0.5565095373306002, + "learning_rate": 3.6928718816741038e-06, + "loss": 0.5345, + "step": 6384 + }, + { + "epoch": 1.0358533419857237, + "grad_norm": 0.609812795247896, + "learning_rate": 3.692496578489747e-06, + "loss": 0.5515, + "step": 6385 + }, + { + "epoch": 1.036015574302401, + "grad_norm": 0.5561400064746747, + "learning_rate": 3.6921212405118228e-06, + "loss": 0.5241, + "step": 6386 + }, + { + "epoch": 1.0361778066190785, + "grad_norm": 0.5875585099362288, + "learning_rate": 3.6917458677512807e-06, + "loss": 0.518, + "step": 6387 + }, + { + "epoch": 1.036340038935756, + "grad_norm": 0.613131100537701, + "learning_rate": 3.6913704602190753e-06, + "loss": 0.5373, + "step": 6388 + }, + { + "epoch": 1.0365022712524334, + "grad_norm": 0.5947992811501231, + "learning_rate": 3.690995017926157e-06, + "loss": 0.5191, + "step": 6389 + }, + { + "epoch": 1.0366645035691109, + "grad_norm": 0.6300135755401953, + "learning_rate": 3.690619540883482e-06, + "loss": 0.5301, + "step": 6390 + }, + { + "epoch": 1.0368267358857886, + "grad_norm": 0.6041466902658875, + "learning_rate": 3.690244029102006e-06, + "loss": 0.5208, + "step": 6391 + }, + { + "epoch": 1.036988968202466, + "grad_norm": 0.5777701219519239, + "learning_rate": 3.6898684825926845e-06, + "loss": 0.5399, + "step": 6392 + }, + { + "epoch": 1.0371512005191434, + "grad_norm": 0.6395119548031453, + "learning_rate": 3.6894929013664747e-06, + "loss": 0.5205, + "step": 6393 + }, + { + "epoch": 1.037313432835821, + "grad_norm": 0.572902866826475, + "learning_rate": 3.689117285434335e-06, + "loss": 0.5412, + "step": 6394 + }, + { + "epoch": 1.0374756651524983, + "grad_norm": 0.5949435199759754, + "learning_rate": 3.6887416348072245e-06, + "loss": 0.5133, + "step": 6395 + }, + { + "epoch": 1.0376378974691758, + "grad_norm": 0.5742447601361931, + "learning_rate": 3.6883659494961057e-06, + "loss": 0.5181, + "step": 6396 + }, + { + "epoch": 1.0378001297858535, + "grad_norm": 0.565394144673042, + "learning_rate": 3.6879902295119375e-06, + "loss": 0.506, + "step": 6397 + }, + { + "epoch": 1.037962362102531, + "grad_norm": 0.590145054049005, + "learning_rate": 3.6876144748656828e-06, + "loss": 0.5566, + "step": 6398 + }, + { + "epoch": 1.0381245944192083, + "grad_norm": 0.5541951632008811, + "learning_rate": 3.6872386855683062e-06, + "loss": 0.5192, + "step": 6399 + }, + { + "epoch": 1.0382868267358858, + "grad_norm": 0.581992723688843, + "learning_rate": 3.6868628616307707e-06, + "loss": 0.5426, + "step": 6400 + }, + { + "epoch": 1.0384490590525632, + "grad_norm": 0.5645111331404751, + "learning_rate": 3.6864870030640426e-06, + "loss": 0.5436, + "step": 6401 + }, + { + "epoch": 1.0386112913692407, + "grad_norm": 0.6012663970377603, + "learning_rate": 3.686111109879088e-06, + "loss": 0.5367, + "step": 6402 + }, + { + "epoch": 1.0387735236859181, + "grad_norm": 0.59730242708733, + "learning_rate": 3.6857351820868747e-06, + "loss": 0.5443, + "step": 6403 + }, + { + "epoch": 1.0389357560025958, + "grad_norm": 0.5689831753440736, + "learning_rate": 3.6853592196983713e-06, + "loss": 0.5269, + "step": 6404 + }, + { + "epoch": 1.0390979883192732, + "grad_norm": 0.6113645464202554, + "learning_rate": 3.6849832227245463e-06, + "loss": 0.5278, + "step": 6405 + }, + { + "epoch": 1.0392602206359507, + "grad_norm": 0.5962550070403753, + "learning_rate": 3.684607191176372e-06, + "loss": 0.5008, + "step": 6406 + }, + { + "epoch": 1.0394224529526281, + "grad_norm": 0.6135705624126395, + "learning_rate": 3.684231125064818e-06, + "loss": 0.5571, + "step": 6407 + }, + { + "epoch": 1.0395846852693056, + "grad_norm": 0.5782821750148215, + "learning_rate": 3.6838550244008574e-06, + "loss": 0.5307, + "step": 6408 + }, + { + "epoch": 1.039746917585983, + "grad_norm": 0.5770569853737846, + "learning_rate": 3.6834788891954642e-06, + "loss": 0.5124, + "step": 6409 + }, + { + "epoch": 1.0399091499026607, + "grad_norm": 0.582469981801135, + "learning_rate": 3.683102719459613e-06, + "loss": 0.5255, + "step": 6410 + }, + { + "epoch": 1.0400713822193381, + "grad_norm": 0.5593153168398912, + "learning_rate": 3.682726515204279e-06, + "loss": 0.5181, + "step": 6411 + }, + { + "epoch": 1.0402336145360156, + "grad_norm": 0.5923572524759525, + "learning_rate": 3.682350276440438e-06, + "loss": 0.5415, + "step": 6412 + }, + { + "epoch": 1.040395846852693, + "grad_norm": 0.5656226197025112, + "learning_rate": 3.6819740031790686e-06, + "loss": 0.5034, + "step": 6413 + }, + { + "epoch": 1.0405580791693705, + "grad_norm": 0.5975367324831421, + "learning_rate": 3.681597695431149e-06, + "loss": 0.5308, + "step": 6414 + }, + { + "epoch": 1.040720311486048, + "grad_norm": 0.6124723783580106, + "learning_rate": 3.681221353207659e-06, + "loss": 0.539, + "step": 6415 + }, + { + "epoch": 1.0408825438027256, + "grad_norm": 0.5962741708659371, + "learning_rate": 3.6808449765195786e-06, + "loss": 0.5103, + "step": 6416 + }, + { + "epoch": 1.041044776119403, + "grad_norm": 0.5865606034890493, + "learning_rate": 3.6804685653778906e-06, + "loss": 0.4784, + "step": 6417 + }, + { + "epoch": 1.0412070084360805, + "grad_norm": 0.5801141647593234, + "learning_rate": 3.680092119793575e-06, + "loss": 0.531, + "step": 6418 + }, + { + "epoch": 1.041369240752758, + "grad_norm": 0.582622008125494, + "learning_rate": 3.679715639777618e-06, + "loss": 0.5395, + "step": 6419 + }, + { + "epoch": 1.0415314730694354, + "grad_norm": 0.5875934492811183, + "learning_rate": 3.6793391253410036e-06, + "loss": 0.5132, + "step": 6420 + }, + { + "epoch": 1.0416937053861128, + "grad_norm": 0.6192292680741188, + "learning_rate": 3.678962576494717e-06, + "loss": 0.5081, + "step": 6421 + }, + { + "epoch": 1.0418559377027905, + "grad_norm": 0.5795362395419197, + "learning_rate": 3.678585993249745e-06, + "loss": 0.5399, + "step": 6422 + }, + { + "epoch": 1.042018170019468, + "grad_norm": 0.5926791256098234, + "learning_rate": 3.678209375617074e-06, + "loss": 0.5374, + "step": 6423 + }, + { + "epoch": 1.0421804023361454, + "grad_norm": 0.5818736234952875, + "learning_rate": 3.677832723607695e-06, + "loss": 0.5243, + "step": 6424 + }, + { + "epoch": 1.0423426346528228, + "grad_norm": 0.5786127656126063, + "learning_rate": 3.6774560372325953e-06, + "loss": 0.5343, + "step": 6425 + }, + { + "epoch": 1.0425048669695003, + "grad_norm": 0.6139033398967536, + "learning_rate": 3.677079316502767e-06, + "loss": 0.5642, + "step": 6426 + }, + { + "epoch": 1.0426670992861777, + "grad_norm": 0.5821416129390551, + "learning_rate": 3.6767025614292007e-06, + "loss": 0.536, + "step": 6427 + }, + { + "epoch": 1.0428293316028552, + "grad_norm": 0.5822304396135115, + "learning_rate": 3.67632577202289e-06, + "loss": 0.5142, + "step": 6428 + }, + { + "epoch": 1.0429915639195328, + "grad_norm": 0.5913115495646613, + "learning_rate": 3.675948948294828e-06, + "loss": 0.5243, + "step": 6429 + }, + { + "epoch": 1.0431537962362103, + "grad_norm": 0.5790267211219442, + "learning_rate": 3.6755720902560087e-06, + "loss": 0.5272, + "step": 6430 + }, + { + "epoch": 1.0433160285528877, + "grad_norm": 0.6021984900332156, + "learning_rate": 3.675195197917429e-06, + "loss": 0.5113, + "step": 6431 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.5957463333739929, + "learning_rate": 3.674818271290085e-06, + "loss": 0.518, + "step": 6432 + }, + { + "epoch": 1.0436404931862426, + "grad_norm": 0.5655302083208585, + "learning_rate": 3.6744413103849736e-06, + "loss": 0.5428, + "step": 6433 + }, + { + "epoch": 1.04380272550292, + "grad_norm": 0.7520404262399446, + "learning_rate": 3.6740643152130944e-06, + "loss": 0.5245, + "step": 6434 + }, + { + "epoch": 1.0439649578195977, + "grad_norm": 0.5621164250522684, + "learning_rate": 3.6736872857854465e-06, + "loss": 0.5054, + "step": 6435 + }, + { + "epoch": 1.0441271901362752, + "grad_norm": 0.6257879419878969, + "learning_rate": 3.6733102221130303e-06, + "loss": 0.5385, + "step": 6436 + }, + { + "epoch": 1.0442894224529526, + "grad_norm": 0.5831580220261586, + "learning_rate": 3.6729331242068487e-06, + "loss": 0.5517, + "step": 6437 + }, + { + "epoch": 1.04445165476963, + "grad_norm": 0.6324320568465853, + "learning_rate": 3.6725559920779023e-06, + "loss": 0.5088, + "step": 6438 + }, + { + "epoch": 1.0446138870863075, + "grad_norm": 0.6486444969662977, + "learning_rate": 3.6721788257371964e-06, + "loss": 0.5355, + "step": 6439 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.5890190768978774, + "learning_rate": 3.671801625195735e-06, + "loss": 0.52, + "step": 6440 + }, + { + "epoch": 1.0449383517196627, + "grad_norm": 0.5729890756073722, + "learning_rate": 3.671424390464523e-06, + "loss": 0.532, + "step": 6441 + }, + { + "epoch": 1.04510058403634, + "grad_norm": 0.5816209894131008, + "learning_rate": 3.671047121554569e-06, + "loss": 0.5178, + "step": 6442 + }, + { + "epoch": 1.0452628163530175, + "grad_norm": 0.5994786491908036, + "learning_rate": 3.670669818476878e-06, + "loss": 0.5078, + "step": 6443 + }, + { + "epoch": 1.045425048669695, + "grad_norm": 0.5617276796311861, + "learning_rate": 3.670292481242461e-06, + "loss": 0.5352, + "step": 6444 + }, + { + "epoch": 1.0455872809863724, + "grad_norm": 0.6050051084739602, + "learning_rate": 3.669915109862325e-06, + "loss": 0.5252, + "step": 6445 + }, + { + "epoch": 1.0457495133030499, + "grad_norm": 0.5643747137634516, + "learning_rate": 3.669537704347484e-06, + "loss": 0.533, + "step": 6446 + }, + { + "epoch": 1.0459117456197276, + "grad_norm": 0.6104263404008954, + "learning_rate": 3.669160264708947e-06, + "loss": 0.5276, + "step": 6447 + }, + { + "epoch": 1.046073977936405, + "grad_norm": 0.6700927184494103, + "learning_rate": 3.6687827909577266e-06, + "loss": 0.5149, + "step": 6448 + }, + { + "epoch": 1.0462362102530824, + "grad_norm": 0.5687765665892046, + "learning_rate": 3.668405283104838e-06, + "loss": 0.4917, + "step": 6449 + }, + { + "epoch": 1.04639844256976, + "grad_norm": 0.5751946941438311, + "learning_rate": 3.668027741161294e-06, + "loss": 0.4841, + "step": 6450 + }, + { + "epoch": 1.0465606748864373, + "grad_norm": 0.5935920217905931, + "learning_rate": 3.6676501651381117e-06, + "loss": 0.5276, + "step": 6451 + }, + { + "epoch": 1.0467229072031148, + "grad_norm": 0.5846229356258525, + "learning_rate": 3.6672725550463065e-06, + "loss": 0.515, + "step": 6452 + }, + { + "epoch": 1.0468851395197922, + "grad_norm": 0.5703613405726319, + "learning_rate": 3.666894910896897e-06, + "loss": 0.5086, + "step": 6453 + }, + { + "epoch": 1.04704737183647, + "grad_norm": 0.5945492303055219, + "learning_rate": 3.666517232700901e-06, + "loss": 0.5591, + "step": 6454 + }, + { + "epoch": 1.0472096041531473, + "grad_norm": 0.6009198854278179, + "learning_rate": 3.6661395204693386e-06, + "loss": 0.5458, + "step": 6455 + }, + { + "epoch": 1.0473718364698248, + "grad_norm": 0.6010099425197631, + "learning_rate": 3.665761774213229e-06, + "loss": 0.5427, + "step": 6456 + }, + { + "epoch": 1.0475340687865022, + "grad_norm": 0.5640035107791659, + "learning_rate": 3.6653839939435955e-06, + "loss": 0.5086, + "step": 6457 + }, + { + "epoch": 1.0476963011031797, + "grad_norm": 0.5899861834084227, + "learning_rate": 3.6650061796714597e-06, + "loss": 0.5456, + "step": 6458 + }, + { + "epoch": 1.0478585334198571, + "grad_norm": 0.6024844179011017, + "learning_rate": 3.6646283314078447e-06, + "loss": 0.5371, + "step": 6459 + }, + { + "epoch": 1.0480207657365348, + "grad_norm": 0.6029957629525885, + "learning_rate": 3.664250449163777e-06, + "loss": 0.5466, + "step": 6460 + }, + { + "epoch": 1.0481829980532122, + "grad_norm": 0.5583451100496718, + "learning_rate": 3.6638725329502796e-06, + "loss": 0.5065, + "step": 6461 + }, + { + "epoch": 1.0483452303698897, + "grad_norm": 0.5845839608964151, + "learning_rate": 3.6634945827783807e-06, + "loss": 0.5181, + "step": 6462 + }, + { + "epoch": 1.0485074626865671, + "grad_norm": 0.5946070618899262, + "learning_rate": 3.663116598659106e-06, + "loss": 0.5368, + "step": 6463 + }, + { + "epoch": 1.0486696950032446, + "grad_norm": 0.575285104829396, + "learning_rate": 3.662738580603486e-06, + "loss": 0.534, + "step": 6464 + }, + { + "epoch": 1.048831927319922, + "grad_norm": 0.623243275023896, + "learning_rate": 3.6623605286225496e-06, + "loss": 0.5332, + "step": 6465 + }, + { + "epoch": 1.0489941596365997, + "grad_norm": 0.6040342575457653, + "learning_rate": 3.661982442727326e-06, + "loss": 0.5463, + "step": 6466 + }, + { + "epoch": 1.0491563919532771, + "grad_norm": 0.5641919809230884, + "learning_rate": 3.6616043229288483e-06, + "loss": 0.5083, + "step": 6467 + }, + { + "epoch": 1.0493186242699546, + "grad_norm": 0.5631494511250412, + "learning_rate": 3.6612261692381473e-06, + "loss": 0.5335, + "step": 6468 + }, + { + "epoch": 1.049480856586632, + "grad_norm": 0.6958961711356862, + "learning_rate": 3.660847981666258e-06, + "loss": 0.5541, + "step": 6469 + }, + { + "epoch": 1.0496430889033095, + "grad_norm": 0.5837171229653279, + "learning_rate": 3.6604697602242146e-06, + "loss": 0.5536, + "step": 6470 + }, + { + "epoch": 1.049805321219987, + "grad_norm": 0.6234904590662955, + "learning_rate": 3.660091504923051e-06, + "loss": 0.5492, + "step": 6471 + }, + { + "epoch": 1.0499675535366646, + "grad_norm": 0.5770003747828297, + "learning_rate": 3.6597132157738053e-06, + "loss": 0.5382, + "step": 6472 + }, + { + "epoch": 1.050129785853342, + "grad_norm": 0.598570850877123, + "learning_rate": 3.6593348927875143e-06, + "loss": 0.5165, + "step": 6473 + }, + { + "epoch": 1.0502920181700195, + "grad_norm": 0.5769853657471297, + "learning_rate": 3.658956535975216e-06, + "loss": 0.5209, + "step": 6474 + }, + { + "epoch": 1.050454250486697, + "grad_norm": 0.5785107805239407, + "learning_rate": 3.6585781453479504e-06, + "loss": 0.5109, + "step": 6475 + }, + { + "epoch": 1.0506164828033744, + "grad_norm": 0.5581147813862989, + "learning_rate": 3.6581997209167567e-06, + "loss": 0.5207, + "step": 6476 + }, + { + "epoch": 1.0507787151200518, + "grad_norm": 0.6080099262105991, + "learning_rate": 3.6578212626926766e-06, + "loss": 0.5465, + "step": 6477 + }, + { + "epoch": 1.0509409474367295, + "grad_norm": 0.5890530336434245, + "learning_rate": 3.6574427706867543e-06, + "loss": 0.5525, + "step": 6478 + }, + { + "epoch": 1.051103179753407, + "grad_norm": 0.5967419932826318, + "learning_rate": 3.65706424491003e-06, + "loss": 0.5148, + "step": 6479 + }, + { + "epoch": 1.0512654120700844, + "grad_norm": 0.5760710513140689, + "learning_rate": 3.656685685373552e-06, + "loss": 0.5184, + "step": 6480 + }, + { + "epoch": 1.0514276443867618, + "grad_norm": 0.5938563122631398, + "learning_rate": 3.6563070920883603e-06, + "loss": 0.5399, + "step": 6481 + }, + { + "epoch": 1.0515898767034393, + "grad_norm": 0.5801263761427639, + "learning_rate": 3.655928465065505e-06, + "loss": 0.4963, + "step": 6482 + }, + { + "epoch": 1.0517521090201167, + "grad_norm": 0.5698537582229366, + "learning_rate": 3.6555498043160325e-06, + "loss": 0.512, + "step": 6483 + }, + { + "epoch": 1.0519143413367944, + "grad_norm": 0.5974936530513275, + "learning_rate": 3.6551711098509906e-06, + "loss": 0.4835, + "step": 6484 + }, + { + "epoch": 1.0520765736534718, + "grad_norm": 0.5957594421009585, + "learning_rate": 3.654792381681429e-06, + "loss": 0.5522, + "step": 6485 + }, + { + "epoch": 1.0522388059701493, + "grad_norm": 0.5966002230285093, + "learning_rate": 3.6544136198183965e-06, + "loss": 0.5263, + "step": 6486 + }, + { + "epoch": 1.0524010382868267, + "grad_norm": 0.5718107309225222, + "learning_rate": 3.6540348242729464e-06, + "loss": 0.5448, + "step": 6487 + }, + { + "epoch": 1.0525632706035042, + "grad_norm": 0.5868781705522675, + "learning_rate": 3.6536559950561295e-06, + "loss": 0.5578, + "step": 6488 + }, + { + "epoch": 1.0527255029201816, + "grad_norm": 0.6043022789656477, + "learning_rate": 3.6532771321789995e-06, + "loss": 0.5427, + "step": 6489 + }, + { + "epoch": 1.052887735236859, + "grad_norm": 0.6272531821517218, + "learning_rate": 3.65289823565261e-06, + "loss": 0.4917, + "step": 6490 + }, + { + "epoch": 1.0530499675535367, + "grad_norm": 0.5955322058441409, + "learning_rate": 3.6525193054880157e-06, + "loss": 0.5328, + "step": 6491 + }, + { + "epoch": 1.0532121998702142, + "grad_norm": 0.5982364369234667, + "learning_rate": 3.652140341696274e-06, + "loss": 0.5181, + "step": 6492 + }, + { + "epoch": 1.0533744321868916, + "grad_norm": 0.6076105648479536, + "learning_rate": 3.651761344288441e-06, + "loss": 0.566, + "step": 6493 + }, + { + "epoch": 1.053536664503569, + "grad_norm": 0.5875340630824745, + "learning_rate": 3.6513823132755754e-06, + "loss": 0.5232, + "step": 6494 + }, + { + "epoch": 1.0536988968202465, + "grad_norm": 0.6098532555342608, + "learning_rate": 3.6510032486687353e-06, + "loss": 0.5207, + "step": 6495 + }, + { + "epoch": 1.053861129136924, + "grad_norm": 0.5884223536037525, + "learning_rate": 3.6506241504789814e-06, + "loss": 0.544, + "step": 6496 + }, + { + "epoch": 1.0540233614536016, + "grad_norm": 0.6172321465416533, + "learning_rate": 3.6502450187173745e-06, + "loss": 0.4953, + "step": 6497 + }, + { + "epoch": 1.054185593770279, + "grad_norm": 0.6129796337511145, + "learning_rate": 3.6498658533949766e-06, + "loss": 0.5454, + "step": 6498 + }, + { + "epoch": 1.0543478260869565, + "grad_norm": 0.5790835587604344, + "learning_rate": 3.64948665452285e-06, + "loss": 0.5225, + "step": 6499 + }, + { + "epoch": 1.054510058403634, + "grad_norm": 0.5621943486516149, + "learning_rate": 3.64910742211206e-06, + "loss": 0.5208, + "step": 6500 + }, + { + "epoch": 1.0546722907203114, + "grad_norm": 0.6067898334173809, + "learning_rate": 3.6487281561736705e-06, + "loss": 0.5254, + "step": 6501 + }, + { + "epoch": 1.0548345230369889, + "grad_norm": 0.5730162508723897, + "learning_rate": 3.6483488567187473e-06, + "loss": 0.526, + "step": 6502 + }, + { + "epoch": 1.0549967553536665, + "grad_norm": 0.59104017079076, + "learning_rate": 3.647969523758358e-06, + "loss": 0.5264, + "step": 6503 + }, + { + "epoch": 1.055158987670344, + "grad_norm": 0.5730607399517774, + "learning_rate": 3.6475901573035688e-06, + "loss": 0.5098, + "step": 6504 + }, + { + "epoch": 1.0553212199870214, + "grad_norm": 0.5920567925507413, + "learning_rate": 3.6472107573654506e-06, + "loss": 0.5319, + "step": 6505 + }, + { + "epoch": 1.0554834523036989, + "grad_norm": 0.6028565542334052, + "learning_rate": 3.646831323955072e-06, + "loss": 0.5188, + "step": 6506 + }, + { + "epoch": 1.0556456846203763, + "grad_norm": 0.6191105843880745, + "learning_rate": 3.6464518570835046e-06, + "loss": 0.5544, + "step": 6507 + }, + { + "epoch": 1.0558079169370538, + "grad_norm": 0.5828939881314297, + "learning_rate": 3.646072356761819e-06, + "loss": 0.4827, + "step": 6508 + }, + { + "epoch": 1.0559701492537314, + "grad_norm": 0.6255388729473039, + "learning_rate": 3.6456928230010884e-06, + "loss": 0.557, + "step": 6509 + }, + { + "epoch": 1.056132381570409, + "grad_norm": 0.6340127998517345, + "learning_rate": 3.645313255812386e-06, + "loss": 0.5595, + "step": 6510 + }, + { + "epoch": 1.0562946138870863, + "grad_norm": 0.6124065021958905, + "learning_rate": 3.6449336552067877e-06, + "loss": 0.5055, + "step": 6511 + }, + { + "epoch": 1.0564568462037638, + "grad_norm": 0.6134965953009259, + "learning_rate": 3.6445540211953683e-06, + "loss": 0.5484, + "step": 6512 + }, + { + "epoch": 1.0566190785204412, + "grad_norm": 0.5765967317254568, + "learning_rate": 3.6441743537892045e-06, + "loss": 0.5181, + "step": 6513 + }, + { + "epoch": 1.0567813108371187, + "grad_norm": 0.5954701503823908, + "learning_rate": 3.6437946529993738e-06, + "loss": 0.5272, + "step": 6514 + }, + { + "epoch": 1.0569435431537961, + "grad_norm": 0.5675841621259191, + "learning_rate": 3.6434149188369548e-06, + "loss": 0.4996, + "step": 6515 + }, + { + "epoch": 1.0571057754704738, + "grad_norm": 0.5980888173725573, + "learning_rate": 3.643035151313028e-06, + "loss": 0.534, + "step": 6516 + }, + { + "epoch": 1.0572680077871512, + "grad_norm": 0.6186414440327729, + "learning_rate": 3.642655350438672e-06, + "loss": 0.5091, + "step": 6517 + }, + { + "epoch": 1.0574302401038287, + "grad_norm": 0.5729579072495447, + "learning_rate": 3.6422755162249694e-06, + "loss": 0.4907, + "step": 6518 + }, + { + "epoch": 1.0575924724205061, + "grad_norm": 0.5848609635743584, + "learning_rate": 3.6418956486830026e-06, + "loss": 0.5353, + "step": 6519 + }, + { + "epoch": 1.0577547047371836, + "grad_norm": 0.5953875058975243, + "learning_rate": 3.6415157478238554e-06, + "loss": 0.5608, + "step": 6520 + }, + { + "epoch": 1.057916937053861, + "grad_norm": 0.5708597815557424, + "learning_rate": 3.6411358136586117e-06, + "loss": 0.5632, + "step": 6521 + }, + { + "epoch": 1.0580791693705387, + "grad_norm": 0.6252180262624523, + "learning_rate": 3.640755846198356e-06, + "loss": 0.5331, + "step": 6522 + }, + { + "epoch": 1.0582414016872161, + "grad_norm": 0.5739762905717886, + "learning_rate": 3.6403758454541764e-06, + "loss": 0.5279, + "step": 6523 + }, + { + "epoch": 1.0584036340038936, + "grad_norm": 0.5745973270559923, + "learning_rate": 3.6399958114371597e-06, + "loss": 0.5169, + "step": 6524 + }, + { + "epoch": 1.058565866320571, + "grad_norm": 0.5722635379146572, + "learning_rate": 3.6396157441583937e-06, + "loss": 0.5264, + "step": 6525 + }, + { + "epoch": 1.0587280986372485, + "grad_norm": 0.6160773121323019, + "learning_rate": 3.6392356436289677e-06, + "loss": 0.5141, + "step": 6526 + }, + { + "epoch": 1.058890330953926, + "grad_norm": 0.6297952972232009, + "learning_rate": 3.638855509859972e-06, + "loss": 0.492, + "step": 6527 + }, + { + "epoch": 1.0590525632706036, + "grad_norm": 0.6229127064960545, + "learning_rate": 3.6384753428624973e-06, + "loss": 0.5264, + "step": 6528 + }, + { + "epoch": 1.059214795587281, + "grad_norm": 0.6369529072687444, + "learning_rate": 3.638095142647638e-06, + "loss": 0.5293, + "step": 6529 + }, + { + "epoch": 1.0593770279039585, + "grad_norm": 0.6015303204913127, + "learning_rate": 3.637714909226484e-06, + "loss": 0.5299, + "step": 6530 + }, + { + "epoch": 1.059539260220636, + "grad_norm": 0.6209660881876217, + "learning_rate": 3.6373346426101314e-06, + "loss": 0.5371, + "step": 6531 + }, + { + "epoch": 1.0597014925373134, + "grad_norm": 0.6095412082006315, + "learning_rate": 3.636954342809675e-06, + "loss": 0.5356, + "step": 6532 + }, + { + "epoch": 1.0598637248539908, + "grad_norm": 0.5908721327875521, + "learning_rate": 3.63657400983621e-06, + "loss": 0.5478, + "step": 6533 + }, + { + "epoch": 1.0600259571706685, + "grad_norm": 0.6019419366953515, + "learning_rate": 3.636193643700835e-06, + "loss": 0.5, + "step": 6534 + }, + { + "epoch": 1.060188189487346, + "grad_norm": 0.5642233274773225, + "learning_rate": 3.6358132444146465e-06, + "loss": 0.5345, + "step": 6535 + }, + { + "epoch": 1.0603504218040234, + "grad_norm": 0.6053013390830766, + "learning_rate": 3.6354328119887446e-06, + "loss": 0.5414, + "step": 6536 + }, + { + "epoch": 1.0605126541207008, + "grad_norm": 0.5625995994919462, + "learning_rate": 3.6350523464342275e-06, + "loss": 0.5187, + "step": 6537 + }, + { + "epoch": 1.0606748864373783, + "grad_norm": 0.572637182806109, + "learning_rate": 3.6346718477621977e-06, + "loss": 0.4835, + "step": 6538 + }, + { + "epoch": 1.0608371187540557, + "grad_norm": 0.556849062516536, + "learning_rate": 3.634291315983757e-06, + "loss": 0.4813, + "step": 6539 + }, + { + "epoch": 1.0609993510707332, + "grad_norm": 0.5884987211757723, + "learning_rate": 3.6339107511100065e-06, + "loss": 0.5301, + "step": 6540 + }, + { + "epoch": 1.0611615833874108, + "grad_norm": 0.6029332616708414, + "learning_rate": 3.6335301531520518e-06, + "loss": 0.5381, + "step": 6541 + }, + { + "epoch": 1.0613238157040883, + "grad_norm": 0.570203266459703, + "learning_rate": 3.6331495221209972e-06, + "loss": 0.4909, + "step": 6542 + }, + { + "epoch": 1.0614860480207657, + "grad_norm": 0.6089875537312962, + "learning_rate": 3.6327688580279484e-06, + "loss": 0.5263, + "step": 6543 + }, + { + "epoch": 1.0616482803374432, + "grad_norm": 0.6199841169588314, + "learning_rate": 3.632388160884012e-06, + "loss": 0.5785, + "step": 6544 + }, + { + "epoch": 1.0618105126541206, + "grad_norm": 0.5771170324209938, + "learning_rate": 3.632007430700295e-06, + "loss": 0.4905, + "step": 6545 + }, + { + "epoch": 1.061972744970798, + "grad_norm": 0.5957485713231548, + "learning_rate": 3.631626667487906e-06, + "loss": 0.4832, + "step": 6546 + }, + { + "epoch": 1.0621349772874757, + "grad_norm": 0.5825559927032469, + "learning_rate": 3.6312458712579564e-06, + "loss": 0.5292, + "step": 6547 + }, + { + "epoch": 1.0622972096041532, + "grad_norm": 0.5826921272885983, + "learning_rate": 3.6308650420215543e-06, + "loss": 0.5473, + "step": 6548 + }, + { + "epoch": 1.0624594419208306, + "grad_norm": 0.5719908524882343, + "learning_rate": 3.6304841797898126e-06, + "loss": 0.5496, + "step": 6549 + }, + { + "epoch": 1.062621674237508, + "grad_norm": 0.610412470853104, + "learning_rate": 3.630103284573844e-06, + "loss": 0.4913, + "step": 6550 + }, + { + "epoch": 1.0627839065541855, + "grad_norm": 0.5739975357952485, + "learning_rate": 3.62972235638476e-06, + "loss": 0.521, + "step": 6551 + }, + { + "epoch": 1.062946138870863, + "grad_norm": 0.5869000292277833, + "learning_rate": 3.6293413952336774e-06, + "loss": 0.5344, + "step": 6552 + }, + { + "epoch": 1.0631083711875406, + "grad_norm": 0.5718106810574829, + "learning_rate": 3.6289604011317104e-06, + "loss": 0.5389, + "step": 6553 + }, + { + "epoch": 1.063270603504218, + "grad_norm": 0.6185471160236738, + "learning_rate": 3.6285793740899754e-06, + "loss": 0.5413, + "step": 6554 + }, + { + "epoch": 1.0634328358208955, + "grad_norm": 0.6075700785001278, + "learning_rate": 3.6281983141195886e-06, + "loss": 0.5288, + "step": 6555 + }, + { + "epoch": 1.063595068137573, + "grad_norm": 0.5789530737931343, + "learning_rate": 3.62781722123167e-06, + "loss": 0.5061, + "step": 6556 + }, + { + "epoch": 1.0637573004542504, + "grad_norm": 0.5779127190601518, + "learning_rate": 3.6274360954373383e-06, + "loss": 0.5393, + "step": 6557 + }, + { + "epoch": 1.0639195327709279, + "grad_norm": 0.6301771486572669, + "learning_rate": 3.6270549367477124e-06, + "loss": 0.5347, + "step": 6558 + }, + { + "epoch": 1.0640817650876055, + "grad_norm": 0.5966515242825692, + "learning_rate": 3.6266737451739152e-06, + "loss": 0.5613, + "step": 6559 + }, + { + "epoch": 1.064243997404283, + "grad_norm": 0.5643658422622518, + "learning_rate": 3.626292520727067e-06, + "loss": 0.5099, + "step": 6560 + }, + { + "epoch": 1.0644062297209604, + "grad_norm": 0.5850015552040299, + "learning_rate": 3.6259112634182926e-06, + "loss": 0.5267, + "step": 6561 + }, + { + "epoch": 1.0645684620376379, + "grad_norm": 0.6270592098083224, + "learning_rate": 3.6255299732587144e-06, + "loss": 0.5141, + "step": 6562 + }, + { + "epoch": 1.0647306943543153, + "grad_norm": 0.5872612367617878, + "learning_rate": 3.6251486502594584e-06, + "loss": 0.5565, + "step": 6563 + }, + { + "epoch": 1.0648929266709928, + "grad_norm": 0.5835592547970522, + "learning_rate": 3.62476729443165e-06, + "loss": 0.5367, + "step": 6564 + }, + { + "epoch": 1.0650551589876704, + "grad_norm": 0.6022182674945654, + "learning_rate": 3.6243859057864167e-06, + "loss": 0.5345, + "step": 6565 + }, + { + "epoch": 1.065217391304348, + "grad_norm": 0.5718263993707964, + "learning_rate": 3.6240044843348843e-06, + "loss": 0.5416, + "step": 6566 + }, + { + "epoch": 1.0653796236210253, + "grad_norm": 0.5614676389620448, + "learning_rate": 3.623623030088184e-06, + "loss": 0.5406, + "step": 6567 + }, + { + "epoch": 1.0655418559377028, + "grad_norm": 0.5996219414866112, + "learning_rate": 3.623241543057445e-06, + "loss": 0.5043, + "step": 6568 + }, + { + "epoch": 1.0657040882543802, + "grad_norm": 0.5911559870227723, + "learning_rate": 3.6228600232537965e-06, + "loss": 0.5181, + "step": 6569 + }, + { + "epoch": 1.0658663205710577, + "grad_norm": 0.6820446088037604, + "learning_rate": 3.622478470688372e-06, + "loss": 0.5203, + "step": 6570 + }, + { + "epoch": 1.0660285528877353, + "grad_norm": 0.6025089749014975, + "learning_rate": 3.622096885372303e-06, + "loss": 0.5314, + "step": 6571 + }, + { + "epoch": 1.0661907852044128, + "grad_norm": 0.5956751924750509, + "learning_rate": 3.6217152673167234e-06, + "loss": 0.527, + "step": 6572 + }, + { + "epoch": 1.0663530175210902, + "grad_norm": 0.5849616741969839, + "learning_rate": 3.6213336165327667e-06, + "loss": 0.5236, + "step": 6573 + }, + { + "epoch": 1.0665152498377677, + "grad_norm": 0.5975798785441471, + "learning_rate": 3.6209519330315695e-06, + "loss": 0.5151, + "step": 6574 + }, + { + "epoch": 1.0666774821544451, + "grad_norm": 0.5973144819381094, + "learning_rate": 3.6205702168242686e-06, + "loss": 0.5135, + "step": 6575 + }, + { + "epoch": 1.0668397144711226, + "grad_norm": 0.6446446339269939, + "learning_rate": 3.6201884679220007e-06, + "loss": 0.5303, + "step": 6576 + }, + { + "epoch": 1.0670019467878, + "grad_norm": 0.6013465729775719, + "learning_rate": 3.619806686335904e-06, + "loss": 0.5325, + "step": 6577 + }, + { + "epoch": 1.0671641791044777, + "grad_norm": 0.5843968886020531, + "learning_rate": 3.619424872077117e-06, + "loss": 0.4817, + "step": 6578 + }, + { + "epoch": 1.0673264114211551, + "grad_norm": 0.5680434261089445, + "learning_rate": 3.6190430251567817e-06, + "loss": 0.5383, + "step": 6579 + }, + { + "epoch": 1.0674886437378326, + "grad_norm": 0.5812015783943735, + "learning_rate": 3.618661145586038e-06, + "loss": 0.5042, + "step": 6580 + }, + { + "epoch": 1.06765087605451, + "grad_norm": 0.5881561938063045, + "learning_rate": 3.618279233376029e-06, + "loss": 0.5296, + "step": 6581 + }, + { + "epoch": 1.0678131083711875, + "grad_norm": 0.6351156750830035, + "learning_rate": 3.6178972885378973e-06, + "loss": 0.5103, + "step": 6582 + }, + { + "epoch": 1.067975340687865, + "grad_norm": 0.5922574525599781, + "learning_rate": 3.6175153110827867e-06, + "loss": 0.5562, + "step": 6583 + }, + { + "epoch": 1.0681375730045426, + "grad_norm": 0.6265082891669366, + "learning_rate": 3.6171333010218412e-06, + "loss": 0.5337, + "step": 6584 + }, + { + "epoch": 1.06829980532122, + "grad_norm": 0.6006483245631793, + "learning_rate": 3.616751258366209e-06, + "loss": 0.5243, + "step": 6585 + }, + { + "epoch": 1.0684620376378975, + "grad_norm": 0.5711429246409468, + "learning_rate": 3.616369183127036e-06, + "loss": 0.5161, + "step": 6586 + }, + { + "epoch": 1.068624269954575, + "grad_norm": 0.6072838214464916, + "learning_rate": 3.615987075315469e-06, + "loss": 0.5632, + "step": 6587 + }, + { + "epoch": 1.0687865022712524, + "grad_norm": 0.590060992298662, + "learning_rate": 3.6156049349426592e-06, + "loss": 0.5275, + "step": 6588 + }, + { + "epoch": 1.0689487345879298, + "grad_norm": 0.600327286611885, + "learning_rate": 3.615222762019753e-06, + "loss": 0.538, + "step": 6589 + }, + { + "epoch": 1.0691109669046075, + "grad_norm": 0.6069278764858484, + "learning_rate": 3.614840556557905e-06, + "loss": 0.5182, + "step": 6590 + }, + { + "epoch": 1.069273199221285, + "grad_norm": 0.5880139640998333, + "learning_rate": 3.614458318568263e-06, + "loss": 0.5183, + "step": 6591 + }, + { + "epoch": 1.0694354315379624, + "grad_norm": 0.5754739263004486, + "learning_rate": 3.614076048061982e-06, + "loss": 0.5368, + "step": 6592 + }, + { + "epoch": 1.0695976638546398, + "grad_norm": 0.5696022445785563, + "learning_rate": 3.6136937450502154e-06, + "loss": 0.5171, + "step": 6593 + }, + { + "epoch": 1.0697598961713173, + "grad_norm": 0.6005709618900577, + "learning_rate": 3.613311409544116e-06, + "loss": 0.5519, + "step": 6594 + }, + { + "epoch": 1.0699221284879947, + "grad_norm": 0.575514031370159, + "learning_rate": 3.6129290415548417e-06, + "loss": 0.4946, + "step": 6595 + }, + { + "epoch": 1.0700843608046724, + "grad_norm": 0.5913405067129476, + "learning_rate": 3.6125466410935466e-06, + "loss": 0.5596, + "step": 6596 + }, + { + "epoch": 1.0702465931213498, + "grad_norm": 0.5647746272427547, + "learning_rate": 3.612164208171389e-06, + "loss": 0.5323, + "step": 6597 + }, + { + "epoch": 1.0704088254380273, + "grad_norm": 0.5958226885848936, + "learning_rate": 3.6117817427995283e-06, + "loss": 0.5256, + "step": 6598 + }, + { + "epoch": 1.0705710577547047, + "grad_norm": 0.5804215480014054, + "learning_rate": 3.6113992449891224e-06, + "loss": 0.5281, + "step": 6599 + }, + { + "epoch": 1.0707332900713822, + "grad_norm": 0.5907540667984521, + "learning_rate": 3.611016714751331e-06, + "loss": 0.5286, + "step": 6600 + }, + { + "epoch": 1.0708955223880596, + "grad_norm": 0.5859480467515956, + "learning_rate": 3.610634152097316e-06, + "loss": 0.5422, + "step": 6601 + }, + { + "epoch": 1.071057754704737, + "grad_norm": 0.5573150372906829, + "learning_rate": 3.610251557038239e-06, + "loss": 0.5147, + "step": 6602 + }, + { + "epoch": 1.0712199870214147, + "grad_norm": 0.6080150484786203, + "learning_rate": 3.609868929585264e-06, + "loss": 0.5165, + "step": 6603 + }, + { + "epoch": 1.0713822193380922, + "grad_norm": 0.5798130328494056, + "learning_rate": 3.6094862697495547e-06, + "loss": 0.5368, + "step": 6604 + }, + { + "epoch": 1.0715444516547696, + "grad_norm": 0.6011994764712276, + "learning_rate": 3.609103577542275e-06, + "loss": 0.5651, + "step": 6605 + }, + { + "epoch": 1.071706683971447, + "grad_norm": 0.6398884265606144, + "learning_rate": 3.6087208529745916e-06, + "loss": 0.5363, + "step": 6606 + }, + { + "epoch": 1.0718689162881245, + "grad_norm": 0.5908278696334326, + "learning_rate": 3.6083380960576706e-06, + "loss": 0.5357, + "step": 6607 + }, + { + "epoch": 1.072031148604802, + "grad_norm": 0.5718933977021896, + "learning_rate": 3.6079553068026813e-06, + "loss": 0.5325, + "step": 6608 + }, + { + "epoch": 1.0721933809214796, + "grad_norm": 0.6047082511904969, + "learning_rate": 3.6075724852207897e-06, + "loss": 0.5409, + "step": 6609 + }, + { + "epoch": 1.072355613238157, + "grad_norm": 0.6046596298082024, + "learning_rate": 3.607189631323168e-06, + "loss": 0.5088, + "step": 6610 + }, + { + "epoch": 1.0725178455548345, + "grad_norm": 0.5472486229963418, + "learning_rate": 3.6068067451209852e-06, + "loss": 0.5148, + "step": 6611 + }, + { + "epoch": 1.072680077871512, + "grad_norm": 0.5712659414143135, + "learning_rate": 3.606423826625414e-06, + "loss": 0.5247, + "step": 6612 + }, + { + "epoch": 1.0728423101881894, + "grad_norm": 0.589266310433464, + "learning_rate": 3.6060408758476263e-06, + "loss": 0.5251, + "step": 6613 + }, + { + "epoch": 1.0730045425048669, + "grad_norm": 0.5868749800423502, + "learning_rate": 3.6056578927987946e-06, + "loss": 0.5101, + "step": 6614 + }, + { + "epoch": 1.0731667748215445, + "grad_norm": 0.5811238460175383, + "learning_rate": 3.6052748774900944e-06, + "loss": 0.4923, + "step": 6615 + }, + { + "epoch": 1.073329007138222, + "grad_norm": 0.6134517060488408, + "learning_rate": 3.6048918299327e-06, + "loss": 0.5545, + "step": 6616 + }, + { + "epoch": 1.0734912394548994, + "grad_norm": 0.6115613595012999, + "learning_rate": 3.604508750137789e-06, + "loss": 0.5185, + "step": 6617 + }, + { + "epoch": 1.0736534717715769, + "grad_norm": 0.576416852984215, + "learning_rate": 3.6041256381165373e-06, + "loss": 0.5108, + "step": 6618 + }, + { + "epoch": 1.0738157040882543, + "grad_norm": 0.6003118237739737, + "learning_rate": 3.6037424938801233e-06, + "loss": 0.5093, + "step": 6619 + }, + { + "epoch": 1.0739779364049318, + "grad_norm": 0.5699856562399876, + "learning_rate": 3.6033593174397264e-06, + "loss": 0.5115, + "step": 6620 + }, + { + "epoch": 1.0741401687216094, + "grad_norm": 0.5997042385732239, + "learning_rate": 3.6029761088065255e-06, + "loss": 0.5288, + "step": 6621 + }, + { + "epoch": 1.074302401038287, + "grad_norm": 0.6205611568319058, + "learning_rate": 3.6025928679917033e-06, + "loss": 0.5794, + "step": 6622 + }, + { + "epoch": 1.0744646333549643, + "grad_norm": 0.6091383259959207, + "learning_rate": 3.6022095950064406e-06, + "loss": 0.5556, + "step": 6623 + }, + { + "epoch": 1.0746268656716418, + "grad_norm": 0.5534881086358362, + "learning_rate": 3.60182628986192e-06, + "loss": 0.516, + "step": 6624 + }, + { + "epoch": 1.0747890979883192, + "grad_norm": 0.5810475428018536, + "learning_rate": 3.601442952569325e-06, + "loss": 0.5494, + "step": 6625 + }, + { + "epoch": 1.0749513303049967, + "grad_norm": 0.5902235892220121, + "learning_rate": 3.601059583139842e-06, + "loss": 0.5108, + "step": 6626 + }, + { + "epoch": 1.0751135626216741, + "grad_norm": 0.5622559557970662, + "learning_rate": 3.6006761815846535e-06, + "loss": 0.5341, + "step": 6627 + }, + { + "epoch": 1.0752757949383518, + "grad_norm": 0.6372276491255621, + "learning_rate": 3.6002927479149494e-06, + "loss": 0.5443, + "step": 6628 + }, + { + "epoch": 1.0754380272550292, + "grad_norm": 0.5959317107579924, + "learning_rate": 3.5999092821419153e-06, + "loss": 0.525, + "step": 6629 + }, + { + "epoch": 1.0756002595717067, + "grad_norm": 0.5881313546723314, + "learning_rate": 3.5995257842767396e-06, + "loss": 0.5392, + "step": 6630 + }, + { + "epoch": 1.0757624918883841, + "grad_norm": 0.6000406123559595, + "learning_rate": 3.5991422543306128e-06, + "loss": 0.5406, + "step": 6631 + }, + { + "epoch": 1.0759247242050616, + "grad_norm": 0.6023112322816823, + "learning_rate": 3.5987586923147234e-06, + "loss": 0.5489, + "step": 6632 + }, + { + "epoch": 1.0760869565217392, + "grad_norm": 0.5963905999102519, + "learning_rate": 3.5983750982402645e-06, + "loss": 0.5219, + "step": 6633 + }, + { + "epoch": 1.0762491888384167, + "grad_norm": 0.5660356957129777, + "learning_rate": 3.5979914721184263e-06, + "loss": 0.514, + "step": 6634 + }, + { + "epoch": 1.0764114211550941, + "grad_norm": 0.6103030914922086, + "learning_rate": 3.5976078139604036e-06, + "loss": 0.5085, + "step": 6635 + }, + { + "epoch": 1.0765736534717716, + "grad_norm": 0.6081873588750002, + "learning_rate": 3.59722412377739e-06, + "loss": 0.5282, + "step": 6636 + }, + { + "epoch": 1.076735885788449, + "grad_norm": 0.6583531705834331, + "learning_rate": 3.5968404015805796e-06, + "loss": 0.5426, + "step": 6637 + }, + { + "epoch": 1.0768981181051265, + "grad_norm": 0.5726730642910888, + "learning_rate": 3.5964566473811692e-06, + "loss": 0.5472, + "step": 6638 + }, + { + "epoch": 1.077060350421804, + "grad_norm": 0.6090463048363441, + "learning_rate": 3.596072861190356e-06, + "loss": 0.5118, + "step": 6639 + }, + { + "epoch": 1.0772225827384816, + "grad_norm": 0.634177579980308, + "learning_rate": 3.595689043019336e-06, + "loss": 0.4975, + "step": 6640 + }, + { + "epoch": 1.077384815055159, + "grad_norm": 0.6114419817451183, + "learning_rate": 3.59530519287931e-06, + "loss": 0.6053, + "step": 6641 + }, + { + "epoch": 1.0775470473718365, + "grad_norm": 0.636024135192991, + "learning_rate": 3.594921310781476e-06, + "loss": 0.542, + "step": 6642 + }, + { + "epoch": 1.077709279688514, + "grad_norm": 0.6319114739398666, + "learning_rate": 3.594537396737035e-06, + "loss": 0.5112, + "step": 6643 + }, + { + "epoch": 1.0778715120051914, + "grad_norm": 0.6214049828191938, + "learning_rate": 3.5941534507571896e-06, + "loss": 0.5467, + "step": 6644 + }, + { + "epoch": 1.0780337443218688, + "grad_norm": 0.5853927290199498, + "learning_rate": 3.593769472853141e-06, + "loss": 0.5286, + "step": 6645 + }, + { + "epoch": 1.0781959766385465, + "grad_norm": 0.574861714070116, + "learning_rate": 3.593385463036093e-06, + "loss": 0.5356, + "step": 6646 + }, + { + "epoch": 1.078358208955224, + "grad_norm": 0.5922680461387907, + "learning_rate": 3.593001421317249e-06, + "loss": 0.5342, + "step": 6647 + }, + { + "epoch": 1.0785204412719014, + "grad_norm": 0.5996484147787757, + "learning_rate": 3.592617347707816e-06, + "loss": 0.5258, + "step": 6648 + }, + { + "epoch": 1.0786826735885788, + "grad_norm": 0.5732533309616125, + "learning_rate": 3.592233242218998e-06, + "loss": 0.494, + "step": 6649 + }, + { + "epoch": 1.0788449059052563, + "grad_norm": 0.5747053347800686, + "learning_rate": 3.5918491048620037e-06, + "loss": 0.51, + "step": 6650 + }, + { + "epoch": 1.0790071382219337, + "grad_norm": 0.6114911865729303, + "learning_rate": 3.5914649356480414e-06, + "loss": 0.5615, + "step": 6651 + }, + { + "epoch": 1.0791693705386114, + "grad_norm": 0.5801757879312484, + "learning_rate": 3.591080734588318e-06, + "loss": 0.5218, + "step": 6652 + }, + { + "epoch": 1.0793316028552888, + "grad_norm": 0.5772471265654219, + "learning_rate": 3.590696501694045e-06, + "loss": 0.5679, + "step": 6653 + }, + { + "epoch": 1.0794938351719663, + "grad_norm": 0.5808540428693395, + "learning_rate": 3.590312236976433e-06, + "loss": 0.5321, + "step": 6654 + }, + { + "epoch": 1.0796560674886437, + "grad_norm": 0.5614767170982472, + "learning_rate": 3.5899279404466937e-06, + "loss": 0.523, + "step": 6655 + }, + { + "epoch": 1.0798182998053212, + "grad_norm": 0.6196604276407301, + "learning_rate": 3.5895436121160388e-06, + "loss": 0.5573, + "step": 6656 + }, + { + "epoch": 1.0799805321219986, + "grad_norm": 0.6004361138931884, + "learning_rate": 3.589159251995684e-06, + "loss": 0.5083, + "step": 6657 + }, + { + "epoch": 1.0801427644386763, + "grad_norm": 0.6064311352648245, + "learning_rate": 3.588774860096841e-06, + "loss": 0.5697, + "step": 6658 + }, + { + "epoch": 1.0803049967553537, + "grad_norm": 0.5770493862229453, + "learning_rate": 3.588390436430727e-06, + "loss": 0.547, + "step": 6659 + }, + { + "epoch": 1.0804672290720312, + "grad_norm": 0.595796984054806, + "learning_rate": 3.588005981008558e-06, + "loss": 0.5255, + "step": 6660 + }, + { + "epoch": 1.0806294613887086, + "grad_norm": 0.5532468406074911, + "learning_rate": 3.5876214938415515e-06, + "loss": 0.4877, + "step": 6661 + }, + { + "epoch": 1.080791693705386, + "grad_norm": 0.5655050689441662, + "learning_rate": 3.5872369749409264e-06, + "loss": 0.5206, + "step": 6662 + }, + { + "epoch": 1.0809539260220635, + "grad_norm": 0.6013192167793939, + "learning_rate": 3.5868524243178996e-06, + "loss": 0.5748, + "step": 6663 + }, + { + "epoch": 1.081116158338741, + "grad_norm": 0.6231164656591998, + "learning_rate": 3.586467841983694e-06, + "loss": 0.5268, + "step": 6664 + }, + { + "epoch": 1.0812783906554186, + "grad_norm": 0.6360798296578335, + "learning_rate": 3.586083227949528e-06, + "loss": 0.536, + "step": 6665 + }, + { + "epoch": 1.081440622972096, + "grad_norm": 0.5609204892362988, + "learning_rate": 3.585698582226625e-06, + "loss": 0.5194, + "step": 6666 + }, + { + "epoch": 1.0816028552887735, + "grad_norm": 0.6337069137302014, + "learning_rate": 3.5853139048262076e-06, + "loss": 0.483, + "step": 6667 + }, + { + "epoch": 1.081765087605451, + "grad_norm": 0.6046311586913395, + "learning_rate": 3.5849291957594988e-06, + "loss": 0.539, + "step": 6668 + }, + { + "epoch": 1.0819273199221284, + "grad_norm": 0.609962131407889, + "learning_rate": 3.584544455037725e-06, + "loss": 0.5453, + "step": 6669 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.610478639790512, + "learning_rate": 3.5841596826721096e-06, + "loss": 0.5152, + "step": 6670 + }, + { + "epoch": 1.0822517845554835, + "grad_norm": 0.6063836689845036, + "learning_rate": 3.5837748786738812e-06, + "loss": 0.5194, + "step": 6671 + }, + { + "epoch": 1.082414016872161, + "grad_norm": 0.6231649166533815, + "learning_rate": 3.5833900430542663e-06, + "loss": 0.5284, + "step": 6672 + }, + { + "epoch": 1.0825762491888384, + "grad_norm": 0.5793129252634688, + "learning_rate": 3.583005175824493e-06, + "loss": 0.492, + "step": 6673 + }, + { + "epoch": 1.0827384815055159, + "grad_norm": 0.5878016014366144, + "learning_rate": 3.5826202769957907e-06, + "loss": 0.5134, + "step": 6674 + }, + { + "epoch": 1.0829007138221933, + "grad_norm": 0.5755920033209464, + "learning_rate": 3.5822353465793907e-06, + "loss": 0.5477, + "step": 6675 + }, + { + "epoch": 1.0830629461388708, + "grad_norm": 0.6005282032422958, + "learning_rate": 3.581850384586522e-06, + "loss": 0.5395, + "step": 6676 + }, + { + "epoch": 1.0832251784555484, + "grad_norm": 0.6053446839127151, + "learning_rate": 3.581465391028419e-06, + "loss": 0.5449, + "step": 6677 + }, + { + "epoch": 1.083387410772226, + "grad_norm": 0.583017372483867, + "learning_rate": 3.5810803659163136e-06, + "loss": 0.5115, + "step": 6678 + }, + { + "epoch": 1.0835496430889033, + "grad_norm": 0.5617258554423555, + "learning_rate": 3.5806953092614394e-06, + "loss": 0.5435, + "step": 6679 + }, + { + "epoch": 1.0837118754055808, + "grad_norm": 0.5974951952565629, + "learning_rate": 3.580310221075032e-06, + "loss": 0.5161, + "step": 6680 + }, + { + "epoch": 1.0838741077222582, + "grad_norm": 0.5923875787846925, + "learning_rate": 3.5799251013683256e-06, + "loss": 0.5309, + "step": 6681 + }, + { + "epoch": 1.0840363400389357, + "grad_norm": 0.5907532634322535, + "learning_rate": 3.5795399501525596e-06, + "loss": 0.5268, + "step": 6682 + }, + { + "epoch": 1.0841985723556133, + "grad_norm": 0.582361675882674, + "learning_rate": 3.579154767438969e-06, + "loss": 0.5307, + "step": 6683 + }, + { + "epoch": 1.0843608046722908, + "grad_norm": 0.6080415072232527, + "learning_rate": 3.5787695532387933e-06, + "loss": 0.5279, + "step": 6684 + }, + { + "epoch": 1.0845230369889682, + "grad_norm": 0.6106359158033683, + "learning_rate": 3.5783843075632725e-06, + "loss": 0.4862, + "step": 6685 + }, + { + "epoch": 1.0846852693056457, + "grad_norm": 0.6344986851877827, + "learning_rate": 3.577999030423646e-06, + "loss": 0.5382, + "step": 6686 + }, + { + "epoch": 1.0848475016223231, + "grad_norm": 0.5493580528838398, + "learning_rate": 3.5776137218311557e-06, + "loss": 0.5362, + "step": 6687 + }, + { + "epoch": 1.0850097339390006, + "grad_norm": 0.5624714693653616, + "learning_rate": 3.5772283817970425e-06, + "loss": 0.5183, + "step": 6688 + }, + { + "epoch": 1.085171966255678, + "grad_norm": 0.5930204509554877, + "learning_rate": 3.5768430103325512e-06, + "loss": 0.5086, + "step": 6689 + }, + { + "epoch": 1.0853341985723557, + "grad_norm": 0.6154789606498562, + "learning_rate": 3.5764576074489254e-06, + "loss": 0.5389, + "step": 6690 + }, + { + "epoch": 1.0854964308890331, + "grad_norm": 0.5499079756181331, + "learning_rate": 3.5760721731574098e-06, + "loss": 0.5136, + "step": 6691 + }, + { + "epoch": 1.0856586632057106, + "grad_norm": 0.5769687418389924, + "learning_rate": 3.5756867074692504e-06, + "loss": 0.5329, + "step": 6692 + }, + { + "epoch": 1.085820895522388, + "grad_norm": 0.5518517484404273, + "learning_rate": 3.575301210395693e-06, + "loss": 0.4996, + "step": 6693 + }, + { + "epoch": 1.0859831278390655, + "grad_norm": 0.5746180745566066, + "learning_rate": 3.5749156819479856e-06, + "loss": 0.5203, + "step": 6694 + }, + { + "epoch": 1.086145360155743, + "grad_norm": 0.6127859946790579, + "learning_rate": 3.5745301221373786e-06, + "loss": 0.5131, + "step": 6695 + }, + { + "epoch": 1.0863075924724206, + "grad_norm": 0.6312270251712048, + "learning_rate": 3.5741445309751198e-06, + "loss": 0.5394, + "step": 6696 + }, + { + "epoch": 1.086469824789098, + "grad_norm": 0.581160340170841, + "learning_rate": 3.5737589084724593e-06, + "loss": 0.5227, + "step": 6697 + }, + { + "epoch": 1.0866320571057755, + "grad_norm": 0.5772278379862912, + "learning_rate": 3.5733732546406497e-06, + "loss": 0.5201, + "step": 6698 + }, + { + "epoch": 1.086794289422453, + "grad_norm": 0.5958688040909174, + "learning_rate": 3.572987569490942e-06, + "loss": 0.5105, + "step": 6699 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.592270392216555, + "learning_rate": 3.5726018530345913e-06, + "loss": 0.5375, + "step": 6700 + }, + { + "epoch": 1.0871187540558078, + "grad_norm": 0.6120567686541002, + "learning_rate": 3.572216105282849e-06, + "loss": 0.5214, + "step": 6701 + }, + { + "epoch": 1.0872809863724855, + "grad_norm": 0.6186884859089615, + "learning_rate": 3.5718303262469723e-06, + "loss": 0.564, + "step": 6702 + }, + { + "epoch": 1.087443218689163, + "grad_norm": 0.6022557040947774, + "learning_rate": 3.571444515938216e-06, + "loss": 0.5446, + "step": 6703 + }, + { + "epoch": 1.0876054510058404, + "grad_norm": 0.6081726052845262, + "learning_rate": 3.571058674367838e-06, + "loss": 0.481, + "step": 6704 + }, + { + "epoch": 1.0877676833225178, + "grad_norm": 0.6051039055703453, + "learning_rate": 3.5706728015470942e-06, + "loss": 0.5377, + "step": 6705 + }, + { + "epoch": 1.0879299156391953, + "grad_norm": 0.5936295570503166, + "learning_rate": 3.5702868974872444e-06, + "loss": 0.5456, + "step": 6706 + }, + { + "epoch": 1.0880921479558727, + "grad_norm": 0.6147231058727496, + "learning_rate": 3.5699009621995485e-06, + "loss": 0.5037, + "step": 6707 + }, + { + "epoch": 1.0882543802725504, + "grad_norm": 0.5874553038817336, + "learning_rate": 3.5695149956952664e-06, + "loss": 0.5216, + "step": 6708 + }, + { + "epoch": 1.0884166125892278, + "grad_norm": 0.6068137786851859, + "learning_rate": 3.5691289979856595e-06, + "loss": 0.5519, + "step": 6709 + }, + { + "epoch": 1.0885788449059053, + "grad_norm": 0.5747903467605507, + "learning_rate": 3.568742969081991e-06, + "loss": 0.5366, + "step": 6710 + }, + { + "epoch": 1.0887410772225827, + "grad_norm": 0.5650805490934601, + "learning_rate": 3.568356908995522e-06, + "loss": 0.5193, + "step": 6711 + }, + { + "epoch": 1.0889033095392602, + "grad_norm": 0.5856247358289715, + "learning_rate": 3.5679708177375182e-06, + "loss": 0.5215, + "step": 6712 + }, + { + "epoch": 1.0890655418559376, + "grad_norm": 0.6278012506178668, + "learning_rate": 3.5675846953192446e-06, + "loss": 0.5441, + "step": 6713 + }, + { + "epoch": 1.089227774172615, + "grad_norm": 0.6067242264816933, + "learning_rate": 3.5671985417519666e-06, + "loss": 0.5037, + "step": 6714 + }, + { + "epoch": 1.0893900064892927, + "grad_norm": 0.6089505752326408, + "learning_rate": 3.5668123570469516e-06, + "loss": 0.5676, + "step": 6715 + }, + { + "epoch": 1.0895522388059702, + "grad_norm": 0.6190438906376715, + "learning_rate": 3.5664261412154667e-06, + "loss": 0.5322, + "step": 6716 + }, + { + "epoch": 1.0897144711226476, + "grad_norm": 0.6051456883551917, + "learning_rate": 3.56603989426878e-06, + "loss": 0.5487, + "step": 6717 + }, + { + "epoch": 1.089876703439325, + "grad_norm": 0.6159659537168527, + "learning_rate": 3.565653616218164e-06, + "loss": 0.5376, + "step": 6718 + }, + { + "epoch": 1.0900389357560025, + "grad_norm": 0.6516290621230121, + "learning_rate": 3.5652673070748854e-06, + "loss": 0.5636, + "step": 6719 + }, + { + "epoch": 1.0902011680726802, + "grad_norm": 0.5994708758825661, + "learning_rate": 3.5648809668502184e-06, + "loss": 0.5054, + "step": 6720 + }, + { + "epoch": 1.0903634003893576, + "grad_norm": 0.5879543726518016, + "learning_rate": 3.564494595555433e-06, + "loss": 0.5289, + "step": 6721 + }, + { + "epoch": 1.090525632706035, + "grad_norm": 0.5938943891501762, + "learning_rate": 3.564108193201804e-06, + "loss": 0.5337, + "step": 6722 + }, + { + "epoch": 1.0906878650227125, + "grad_norm": 0.601920127652967, + "learning_rate": 3.5637217598006047e-06, + "loss": 0.5133, + "step": 6723 + }, + { + "epoch": 1.09085009733939, + "grad_norm": 0.5933555829777619, + "learning_rate": 3.563335295363111e-06, + "loss": 0.5227, + "step": 6724 + }, + { + "epoch": 1.0910123296560674, + "grad_norm": 0.5670058450247422, + "learning_rate": 3.562948799900598e-06, + "loss": 0.5266, + "step": 6725 + }, + { + "epoch": 1.0911745619727449, + "grad_norm": 0.5812747985639176, + "learning_rate": 3.562562273424342e-06, + "loss": 0.4923, + "step": 6726 + }, + { + "epoch": 1.0913367942894225, + "grad_norm": 0.6166759769818873, + "learning_rate": 3.5621757159456217e-06, + "loss": 0.543, + "step": 6727 + }, + { + "epoch": 1.0914990266061, + "grad_norm": 0.565043786993577, + "learning_rate": 3.561789127475716e-06, + "loss": 0.5295, + "step": 6728 + }, + { + "epoch": 1.0916612589227774, + "grad_norm": 0.5810250874301733, + "learning_rate": 3.561402508025904e-06, + "loss": 0.5146, + "step": 6729 + }, + { + "epoch": 1.0918234912394549, + "grad_norm": 0.5780259908076086, + "learning_rate": 3.561015857607465e-06, + "loss": 0.5332, + "step": 6730 + }, + { + "epoch": 1.0919857235561323, + "grad_norm": 0.5909401791505925, + "learning_rate": 3.560629176231682e-06, + "loss": 0.5127, + "step": 6731 + }, + { + "epoch": 1.0921479558728098, + "grad_norm": 0.5709946467490741, + "learning_rate": 3.5602424639098364e-06, + "loss": 0.5165, + "step": 6732 + }, + { + "epoch": 1.0923101881894874, + "grad_norm": 0.5770179397311103, + "learning_rate": 3.5598557206532114e-06, + "loss": 0.5246, + "step": 6733 + }, + { + "epoch": 1.0924724205061649, + "grad_norm": 0.5862510006210813, + "learning_rate": 3.5594689464730914e-06, + "loss": 0.5171, + "step": 6734 + }, + { + "epoch": 1.0926346528228423, + "grad_norm": 0.5695726262096694, + "learning_rate": 3.5590821413807606e-06, + "loss": 0.5212, + "step": 6735 + }, + { + "epoch": 1.0927968851395198, + "grad_norm": 0.5739672978616942, + "learning_rate": 3.5586953053875063e-06, + "loss": 0.4826, + "step": 6736 + }, + { + "epoch": 1.0929591174561972, + "grad_norm": 0.6214367817874302, + "learning_rate": 3.5583084385046133e-06, + "loss": 0.5274, + "step": 6737 + }, + { + "epoch": 1.0931213497728747, + "grad_norm": 0.5722005017425015, + "learning_rate": 3.5579215407433714e-06, + "loss": 0.5038, + "step": 6738 + }, + { + "epoch": 1.0932835820895523, + "grad_norm": 0.570968715728372, + "learning_rate": 3.5575346121150665e-06, + "loss": 0.5142, + "step": 6739 + }, + { + "epoch": 1.0934458144062298, + "grad_norm": 0.6053609006349281, + "learning_rate": 3.55714765263099e-06, + "loss": 0.5309, + "step": 6740 + }, + { + "epoch": 1.0936080467229072, + "grad_norm": 0.6187141072602087, + "learning_rate": 3.556760662302432e-06, + "loss": 0.494, + "step": 6741 + }, + { + "epoch": 1.0937702790395847, + "grad_norm": 0.563406961908888, + "learning_rate": 3.5563736411406836e-06, + "loss": 0.5238, + "step": 6742 + }, + { + "epoch": 1.0939325113562621, + "grad_norm": 0.6208614107148006, + "learning_rate": 3.555986589157037e-06, + "loss": 0.5152, + "step": 6743 + }, + { + "epoch": 1.0940947436729396, + "grad_norm": 0.5798580408579355, + "learning_rate": 3.5555995063627842e-06, + "loss": 0.5012, + "step": 6744 + }, + { + "epoch": 1.0942569759896172, + "grad_norm": 0.5955813261154966, + "learning_rate": 3.5552123927692206e-06, + "loss": 0.5233, + "step": 6745 + }, + { + "epoch": 1.0944192083062947, + "grad_norm": 0.5907083348165361, + "learning_rate": 3.5548252483876403e-06, + "loss": 0.5097, + "step": 6746 + }, + { + "epoch": 1.0945814406229721, + "grad_norm": 0.6054913591061198, + "learning_rate": 3.5544380732293394e-06, + "loss": 0.5272, + "step": 6747 + }, + { + "epoch": 1.0947436729396496, + "grad_norm": 0.6355253706038805, + "learning_rate": 3.5540508673056147e-06, + "loss": 0.5453, + "step": 6748 + }, + { + "epoch": 1.094905905256327, + "grad_norm": 0.6036590650016058, + "learning_rate": 3.5536636306277628e-06, + "loss": 0.5028, + "step": 6749 + }, + { + "epoch": 1.0950681375730045, + "grad_norm": 0.6296605377544183, + "learning_rate": 3.553276363207083e-06, + "loss": 0.5049, + "step": 6750 + }, + { + "epoch": 1.095230369889682, + "grad_norm": 0.5978092058243233, + "learning_rate": 3.552889065054874e-06, + "loss": 0.5419, + "step": 6751 + }, + { + "epoch": 1.0953926022063596, + "grad_norm": 0.5834406830255124, + "learning_rate": 3.552501736182437e-06, + "loss": 0.485, + "step": 6752 + }, + { + "epoch": 1.095554834523037, + "grad_norm": 0.5840903750607723, + "learning_rate": 3.552114376601072e-06, + "loss": 0.5493, + "step": 6753 + }, + { + "epoch": 1.0957170668397145, + "grad_norm": 0.5836893924534506, + "learning_rate": 3.5517269863220822e-06, + "loss": 0.5386, + "step": 6754 + }, + { + "epoch": 1.095879299156392, + "grad_norm": 0.6135708497762752, + "learning_rate": 3.551339565356769e-06, + "loss": 0.5461, + "step": 6755 + }, + { + "epoch": 1.0960415314730694, + "grad_norm": 0.6008097905937188, + "learning_rate": 3.550952113716438e-06, + "loss": 0.5452, + "step": 6756 + }, + { + "epoch": 1.0962037637897468, + "grad_norm": 0.6146353530772084, + "learning_rate": 3.550564631412392e-06, + "loss": 0.5368, + "step": 6757 + }, + { + "epoch": 1.0963659961064245, + "grad_norm": 0.6009289796871937, + "learning_rate": 3.5501771184559385e-06, + "loss": 0.5326, + "step": 6758 + }, + { + "epoch": 1.096528228423102, + "grad_norm": 0.579339445952926, + "learning_rate": 3.5497895748583827e-06, + "loss": 0.533, + "step": 6759 + }, + { + "epoch": 1.0966904607397794, + "grad_norm": 0.6303112053013502, + "learning_rate": 3.549402000631032e-06, + "loss": 0.4999, + "step": 6760 + }, + { + "epoch": 1.0968526930564568, + "grad_norm": 0.5751477925977126, + "learning_rate": 3.549014395785196e-06, + "loss": 0.501, + "step": 6761 + }, + { + "epoch": 1.0970149253731343, + "grad_norm": 0.6458459288604401, + "learning_rate": 3.548626760332182e-06, + "loss": 0.5028, + "step": 6762 + }, + { + "epoch": 1.0971771576898117, + "grad_norm": 0.5849804407784305, + "learning_rate": 3.5482390942833015e-06, + "loss": 0.5082, + "step": 6763 + }, + { + "epoch": 1.0973393900064894, + "grad_norm": 0.6214446591720284, + "learning_rate": 3.547851397649865e-06, + "loss": 0.5542, + "step": 6764 + }, + { + "epoch": 1.0975016223231668, + "grad_norm": 0.592535638740625, + "learning_rate": 3.547463670443184e-06, + "loss": 0.5297, + "step": 6765 + }, + { + "epoch": 1.0976638546398443, + "grad_norm": 0.5910815142384632, + "learning_rate": 3.5470759126745726e-06, + "loss": 0.5213, + "step": 6766 + }, + { + "epoch": 1.0978260869565217, + "grad_norm": 0.6583140203395904, + "learning_rate": 3.5466881243553417e-06, + "loss": 0.526, + "step": 6767 + }, + { + "epoch": 1.0979883192731992, + "grad_norm": 0.5903711011834738, + "learning_rate": 3.5463003054968085e-06, + "loss": 0.524, + "step": 6768 + }, + { + "epoch": 1.0981505515898766, + "grad_norm": 0.651424622684269, + "learning_rate": 3.5459124561102876e-06, + "loss": 0.5563, + "step": 6769 + }, + { + "epoch": 1.0983127839065543, + "grad_norm": 0.5703996680807042, + "learning_rate": 3.545524576207095e-06, + "loss": 0.5154, + "step": 6770 + }, + { + "epoch": 1.0984750162232317, + "grad_norm": 0.596734434543672, + "learning_rate": 3.545136665798548e-06, + "loss": 0.5553, + "step": 6771 + }, + { + "epoch": 1.0986372485399092, + "grad_norm": 0.5839368523429954, + "learning_rate": 3.544748724895965e-06, + "loss": 0.5423, + "step": 6772 + }, + { + "epoch": 1.0987994808565866, + "grad_norm": 0.5938048830286449, + "learning_rate": 3.544360753510664e-06, + "loss": 0.5477, + "step": 6773 + }, + { + "epoch": 1.098961713173264, + "grad_norm": 0.6203398437643001, + "learning_rate": 3.543972751653967e-06, + "loss": 0.4979, + "step": 6774 + }, + { + "epoch": 1.0991239454899415, + "grad_norm": 0.5756272150682938, + "learning_rate": 3.543584719337192e-06, + "loss": 0.5539, + "step": 6775 + }, + { + "epoch": 1.099286177806619, + "grad_norm": 0.5963059155678683, + "learning_rate": 3.543196656571663e-06, + "loss": 0.521, + "step": 6776 + }, + { + "epoch": 1.0994484101232966, + "grad_norm": 0.5889836562967429, + "learning_rate": 3.5428085633687016e-06, + "loss": 0.5172, + "step": 6777 + }, + { + "epoch": 1.099610642439974, + "grad_norm": 0.5912686151406017, + "learning_rate": 3.5424204397396307e-06, + "loss": 0.5424, + "step": 6778 + }, + { + "epoch": 1.0997728747566515, + "grad_norm": 0.5817109203016845, + "learning_rate": 3.542032285695775e-06, + "loss": 0.5258, + "step": 6779 + }, + { + "epoch": 1.099935107073329, + "grad_norm": 0.6000389884627005, + "learning_rate": 3.5416441012484596e-06, + "loss": 0.5369, + "step": 6780 + }, + { + "epoch": 1.1000973393900064, + "grad_norm": 0.6019105113166785, + "learning_rate": 3.541255886409012e-06, + "loss": 0.5052, + "step": 6781 + }, + { + "epoch": 1.1002595717066839, + "grad_norm": 0.5757816914112658, + "learning_rate": 3.540867641188757e-06, + "loss": 0.5021, + "step": 6782 + }, + { + "epoch": 1.1004218040233615, + "grad_norm": 0.5834042012632846, + "learning_rate": 3.5404793655990236e-06, + "loss": 0.5205, + "step": 6783 + }, + { + "epoch": 1.100584036340039, + "grad_norm": 0.5944131953375033, + "learning_rate": 3.5400910596511406e-06, + "loss": 0.5382, + "step": 6784 + }, + { + "epoch": 1.1007462686567164, + "grad_norm": 0.6037996847827218, + "learning_rate": 3.539702723356438e-06, + "loss": 0.521, + "step": 6785 + }, + { + "epoch": 1.1009085009733939, + "grad_norm": 0.5916811729182233, + "learning_rate": 3.5393143567262445e-06, + "loss": 0.5499, + "step": 6786 + }, + { + "epoch": 1.1010707332900713, + "grad_norm": 0.6177915781612395, + "learning_rate": 3.5389259597718934e-06, + "loss": 0.5371, + "step": 6787 + }, + { + "epoch": 1.1012329656067488, + "grad_norm": 0.6106668276453381, + "learning_rate": 3.5385375325047167e-06, + "loss": 0.5035, + "step": 6788 + }, + { + "epoch": 1.1013951979234264, + "grad_norm": 0.6041649878979886, + "learning_rate": 3.5381490749360464e-06, + "loss": 0.5597, + "step": 6789 + }, + { + "epoch": 1.1015574302401039, + "grad_norm": 0.5835534106566602, + "learning_rate": 3.537760587077218e-06, + "loss": 0.5291, + "step": 6790 + }, + { + "epoch": 1.1017196625567813, + "grad_norm": 0.5679944651659684, + "learning_rate": 3.537372068939565e-06, + "loss": 0.4733, + "step": 6791 + }, + { + "epoch": 1.1018818948734588, + "grad_norm": 0.5928785464336359, + "learning_rate": 3.5369835205344254e-06, + "loss": 0.5142, + "step": 6792 + }, + { + "epoch": 1.1020441271901362, + "grad_norm": 0.6167562352573889, + "learning_rate": 3.5365949418731338e-06, + "loss": 0.4953, + "step": 6793 + }, + { + "epoch": 1.1022063595068137, + "grad_norm": 0.6188881488636861, + "learning_rate": 3.536206332967028e-06, + "loss": 0.512, + "step": 6794 + }, + { + "epoch": 1.1023685918234913, + "grad_norm": 0.6003619466475267, + "learning_rate": 3.535817693827448e-06, + "loss": 0.519, + "step": 6795 + }, + { + "epoch": 1.1025308241401688, + "grad_norm": 0.5742386012746149, + "learning_rate": 3.535429024465732e-06, + "loss": 0.5317, + "step": 6796 + }, + { + "epoch": 1.1026930564568462, + "grad_norm": 0.5578374212166581, + "learning_rate": 3.5350403248932197e-06, + "loss": 0.532, + "step": 6797 + }, + { + "epoch": 1.1028552887735237, + "grad_norm": 0.6128168311348104, + "learning_rate": 3.5346515951212533e-06, + "loss": 0.5393, + "step": 6798 + }, + { + "epoch": 1.1030175210902011, + "grad_norm": 0.6104032275407091, + "learning_rate": 3.5342628351611744e-06, + "loss": 0.5121, + "step": 6799 + }, + { + "epoch": 1.1031797534068786, + "grad_norm": 0.5926683509863064, + "learning_rate": 3.533874045024326e-06, + "loss": 0.5804, + "step": 6800 + }, + { + "epoch": 1.103341985723556, + "grad_norm": 0.5866055420408562, + "learning_rate": 3.533485224722052e-06, + "loss": 0.5163, + "step": 6801 + }, + { + "epoch": 1.1035042180402337, + "grad_norm": 0.5712451235421032, + "learning_rate": 3.5330963742656963e-06, + "loss": 0.536, + "step": 6802 + }, + { + "epoch": 1.1036664503569111, + "grad_norm": 0.5954305349610775, + "learning_rate": 3.5327074936666057e-06, + "loss": 0.526, + "step": 6803 + }, + { + "epoch": 1.1038286826735886, + "grad_norm": 0.5979141701620017, + "learning_rate": 3.532318582936125e-06, + "loss": 0.4954, + "step": 6804 + }, + { + "epoch": 1.103990914990266, + "grad_norm": 0.6046088946740845, + "learning_rate": 3.5319296420856026e-06, + "loss": 0.5616, + "step": 6805 + }, + { + "epoch": 1.1041531473069435, + "grad_norm": 0.5777068670671442, + "learning_rate": 3.5315406711263865e-06, + "loss": 0.4938, + "step": 6806 + }, + { + "epoch": 1.1043153796236211, + "grad_norm": 0.7047751254016767, + "learning_rate": 3.5311516700698256e-06, + "loss": 0.4663, + "step": 6807 + }, + { + "epoch": 1.1044776119402986, + "grad_norm": 0.6171264449743052, + "learning_rate": 3.5307626389272698e-06, + "loss": 0.5559, + "step": 6808 + }, + { + "epoch": 1.104639844256976, + "grad_norm": 0.6156880064262812, + "learning_rate": 3.530373577710069e-06, + "loss": 0.5359, + "step": 6809 + }, + { + "epoch": 1.1048020765736535, + "grad_norm": 0.6680822482391398, + "learning_rate": 3.5299844864295773e-06, + "loss": 0.4922, + "step": 6810 + }, + { + "epoch": 1.104964308890331, + "grad_norm": 0.6000150775724692, + "learning_rate": 3.529595365097145e-06, + "loss": 0.5574, + "step": 6811 + }, + { + "epoch": 1.1051265412070084, + "grad_norm": 0.5797539364906871, + "learning_rate": 3.5292062137241268e-06, + "loss": 0.5347, + "step": 6812 + }, + { + "epoch": 1.1052887735236858, + "grad_norm": 0.58227095485364, + "learning_rate": 3.528817032321875e-06, + "loss": 0.492, + "step": 6813 + }, + { + "epoch": 1.1054510058403635, + "grad_norm": 0.6431780478932541, + "learning_rate": 3.5284278209017475e-06, + "loss": 0.5277, + "step": 6814 + }, + { + "epoch": 1.105613238157041, + "grad_norm": 0.6030684997075815, + "learning_rate": 3.528038579475099e-06, + "loss": 0.5159, + "step": 6815 + }, + { + "epoch": 1.1057754704737184, + "grad_norm": 0.6293494267747273, + "learning_rate": 3.527649308053285e-06, + "loss": 0.4945, + "step": 6816 + }, + { + "epoch": 1.1059377027903958, + "grad_norm": 0.5906149845777905, + "learning_rate": 3.527260006647667e-06, + "loss": 0.5274, + "step": 6817 + }, + { + "epoch": 1.1060999351070733, + "grad_norm": 0.574023153199902, + "learning_rate": 3.5268706752696e-06, + "loss": 0.5334, + "step": 6818 + }, + { + "epoch": 1.1062621674237507, + "grad_norm": 0.6131473613114051, + "learning_rate": 3.526481313930446e-06, + "loss": 0.5434, + "step": 6819 + }, + { + "epoch": 1.1064243997404284, + "grad_norm": 0.6488366964124551, + "learning_rate": 3.526091922641564e-06, + "loss": 0.5171, + "step": 6820 + }, + { + "epoch": 1.1065866320571058, + "grad_norm": 0.582787086650523, + "learning_rate": 3.525702501414316e-06, + "loss": 0.5431, + "step": 6821 + }, + { + "epoch": 1.1067488643737833, + "grad_norm": 0.5731447479969933, + "learning_rate": 3.525313050260063e-06, + "loss": 0.5115, + "step": 6822 + }, + { + "epoch": 1.1069110966904607, + "grad_norm": 0.5844972866181652, + "learning_rate": 3.52492356919017e-06, + "loss": 0.5107, + "step": 6823 + }, + { + "epoch": 1.1070733290071382, + "grad_norm": 0.5835391928031265, + "learning_rate": 3.5245340582159997e-06, + "loss": 0.5258, + "step": 6824 + }, + { + "epoch": 1.1072355613238156, + "grad_norm": 0.6194496751321181, + "learning_rate": 3.5241445173489173e-06, + "loss": 0.5385, + "step": 6825 + }, + { + "epoch": 1.1073977936404933, + "grad_norm": 0.5951902546015487, + "learning_rate": 3.5237549466002883e-06, + "loss": 0.536, + "step": 6826 + }, + { + "epoch": 1.1075600259571707, + "grad_norm": 0.6334451904564486, + "learning_rate": 3.5233653459814787e-06, + "loss": 0.4962, + "step": 6827 + }, + { + "epoch": 1.1077222582738482, + "grad_norm": 0.5614855836958083, + "learning_rate": 3.5229757155038568e-06, + "loss": 0.5293, + "step": 6828 + }, + { + "epoch": 1.1078844905905256, + "grad_norm": 0.6322639463306023, + "learning_rate": 3.5225860551787903e-06, + "loss": 0.5152, + "step": 6829 + }, + { + "epoch": 1.108046722907203, + "grad_norm": 0.5694535418214652, + "learning_rate": 3.5221963650176494e-06, + "loss": 0.5429, + "step": 6830 + }, + { + "epoch": 1.1082089552238805, + "grad_norm": 0.586155791827465, + "learning_rate": 3.5218066450318016e-06, + "loss": 0.5199, + "step": 6831 + }, + { + "epoch": 1.1083711875405582, + "grad_norm": 0.605889880546372, + "learning_rate": 3.5214168952326205e-06, + "loss": 0.5339, + "step": 6832 + }, + { + "epoch": 1.1085334198572356, + "grad_norm": 0.6114921520038602, + "learning_rate": 3.5210271156314767e-06, + "loss": 0.5102, + "step": 6833 + }, + { + "epoch": 1.108695652173913, + "grad_norm": 0.6085780002449912, + "learning_rate": 3.5206373062397423e-06, + "loss": 0.5486, + "step": 6834 + }, + { + "epoch": 1.1088578844905905, + "grad_norm": 0.5830379849170938, + "learning_rate": 3.520247467068793e-06, + "loss": 0.5529, + "step": 6835 + }, + { + "epoch": 1.109020116807268, + "grad_norm": 0.5920678533242411, + "learning_rate": 3.5198575981300006e-06, + "loss": 0.5203, + "step": 6836 + }, + { + "epoch": 1.1091823491239454, + "grad_norm": 0.5802107010616966, + "learning_rate": 3.5194676994347415e-06, + "loss": 0.5305, + "step": 6837 + }, + { + "epoch": 1.1093445814406229, + "grad_norm": 0.5799439922476852, + "learning_rate": 3.519077770994391e-06, + "loss": 0.5073, + "step": 6838 + }, + { + "epoch": 1.1095068137573005, + "grad_norm": 0.6408011268635042, + "learning_rate": 3.5186878128203282e-06, + "loss": 0.5553, + "step": 6839 + }, + { + "epoch": 1.109669046073978, + "grad_norm": 0.5893068612742869, + "learning_rate": 3.5182978249239287e-06, + "loss": 0.5123, + "step": 6840 + }, + { + "epoch": 1.1098312783906554, + "grad_norm": 0.5749685399323218, + "learning_rate": 3.5179078073165716e-06, + "loss": 0.5201, + "step": 6841 + }, + { + "epoch": 1.1099935107073329, + "grad_norm": 0.5836254501881336, + "learning_rate": 3.5175177600096374e-06, + "loss": 0.5532, + "step": 6842 + }, + { + "epoch": 1.1101557430240103, + "grad_norm": 0.5838758272888057, + "learning_rate": 3.5171276830145058e-06, + "loss": 0.5537, + "step": 6843 + }, + { + "epoch": 1.1103179753406878, + "grad_norm": 0.5967767496295597, + "learning_rate": 3.516737576342558e-06, + "loss": 0.5596, + "step": 6844 + }, + { + "epoch": 1.1104802076573654, + "grad_norm": 0.5752340631390953, + "learning_rate": 3.516347440005177e-06, + "loss": 0.4896, + "step": 6845 + }, + { + "epoch": 1.1106424399740429, + "grad_norm": 0.5523753324192399, + "learning_rate": 3.5159572740137453e-06, + "loss": 0.5212, + "step": 6846 + }, + { + "epoch": 1.1108046722907203, + "grad_norm": 0.5753952945061689, + "learning_rate": 3.5155670783796457e-06, + "loss": 0.5487, + "step": 6847 + }, + { + "epoch": 1.1109669046073978, + "grad_norm": 0.625988024896482, + "learning_rate": 3.515176853114266e-06, + "loss": 0.5213, + "step": 6848 + }, + { + "epoch": 1.1111291369240752, + "grad_norm": 0.6066753851055916, + "learning_rate": 3.5147865982289887e-06, + "loss": 0.5098, + "step": 6849 + }, + { + "epoch": 1.1112913692407527, + "grad_norm": 0.587909146737431, + "learning_rate": 3.514396313735202e-06, + "loss": 0.5119, + "step": 6850 + }, + { + "epoch": 1.1114536015574303, + "grad_norm": 0.606871984043776, + "learning_rate": 3.5140059996442926e-06, + "loss": 0.5432, + "step": 6851 + }, + { + "epoch": 1.1116158338741078, + "grad_norm": 0.604112916597233, + "learning_rate": 3.513615655967648e-06, + "loss": 0.5356, + "step": 6852 + }, + { + "epoch": 1.1117780661907852, + "grad_norm": 0.6313449244130156, + "learning_rate": 3.5132252827166603e-06, + "loss": 0.5285, + "step": 6853 + }, + { + "epoch": 1.1119402985074627, + "grad_norm": 0.6485563254908272, + "learning_rate": 3.5128348799027157e-06, + "loss": 0.5313, + "step": 6854 + }, + { + "epoch": 1.1121025308241401, + "grad_norm": 0.5940960484082716, + "learning_rate": 3.512444447537207e-06, + "loss": 0.542, + "step": 6855 + }, + { + "epoch": 1.1122647631408176, + "grad_norm": 0.6118569969674198, + "learning_rate": 3.5120539856315262e-06, + "loss": 0.5151, + "step": 6856 + }, + { + "epoch": 1.1124269954574952, + "grad_norm": 0.6543173992715223, + "learning_rate": 3.5116634941970644e-06, + "loss": 0.5274, + "step": 6857 + }, + { + "epoch": 1.1125892277741727, + "grad_norm": 0.613776804971869, + "learning_rate": 3.5112729732452163e-06, + "loss": 0.5347, + "step": 6858 + }, + { + "epoch": 1.1127514600908501, + "grad_norm": 0.6069370122391635, + "learning_rate": 3.510882422787375e-06, + "loss": 0.5378, + "step": 6859 + }, + { + "epoch": 1.1129136924075276, + "grad_norm": 0.5891073458048923, + "learning_rate": 3.5104918428349364e-06, + "loss": 0.5549, + "step": 6860 + }, + { + "epoch": 1.113075924724205, + "grad_norm": 0.6005624993336832, + "learning_rate": 3.5101012333992966e-06, + "loss": 0.5309, + "step": 6861 + }, + { + "epoch": 1.1132381570408825, + "grad_norm": 0.5671390121773513, + "learning_rate": 3.5097105944918523e-06, + "loss": 0.4824, + "step": 6862 + }, + { + "epoch": 1.11340038935756, + "grad_norm": 0.6028480565704146, + "learning_rate": 3.5093199261240015e-06, + "loss": 0.5337, + "step": 6863 + }, + { + "epoch": 1.1135626216742376, + "grad_norm": 0.5774472350093158, + "learning_rate": 3.508929228307142e-06, + "loss": 0.538, + "step": 6864 + }, + { + "epoch": 1.113724853990915, + "grad_norm": 0.6117176632122829, + "learning_rate": 3.508538501052673e-06, + "loss": 0.5346, + "step": 6865 + }, + { + "epoch": 1.1138870863075925, + "grad_norm": 0.6172219016147878, + "learning_rate": 3.5081477443719964e-06, + "loss": 0.5568, + "step": 6866 + }, + { + "epoch": 1.11404931862427, + "grad_norm": 0.5953709093993393, + "learning_rate": 3.5077569582765114e-06, + "loss": 0.4937, + "step": 6867 + }, + { + "epoch": 1.1142115509409474, + "grad_norm": 0.5907161419504082, + "learning_rate": 3.5073661427776214e-06, + "loss": 0.5262, + "step": 6868 + }, + { + "epoch": 1.1143737832576248, + "grad_norm": 0.5719228549296415, + "learning_rate": 3.5069752978867284e-06, + "loss": 0.5202, + "step": 6869 + }, + { + "epoch": 1.1145360155743025, + "grad_norm": 0.6029790257590609, + "learning_rate": 3.506584423615236e-06, + "loss": 0.4876, + "step": 6870 + }, + { + "epoch": 1.11469824789098, + "grad_norm": 0.5732406453083319, + "learning_rate": 3.5061935199745507e-06, + "loss": 0.5, + "step": 6871 + }, + { + "epoch": 1.1148604802076574, + "grad_norm": 0.5762023845019246, + "learning_rate": 3.505802586976075e-06, + "loss": 0.5307, + "step": 6872 + }, + { + "epoch": 1.1150227125243348, + "grad_norm": 0.5988336409004861, + "learning_rate": 3.5054116246312176e-06, + "loss": 0.5291, + "step": 6873 + }, + { + "epoch": 1.1151849448410123, + "grad_norm": 0.6065665316276665, + "learning_rate": 3.5050206329513842e-06, + "loss": 0.5033, + "step": 6874 + }, + { + "epoch": 1.1153471771576897, + "grad_norm": 0.6319011790754585, + "learning_rate": 3.5046296119479834e-06, + "loss": 0.5258, + "step": 6875 + }, + { + "epoch": 1.1155094094743674, + "grad_norm": 0.6002002942803631, + "learning_rate": 3.5042385616324243e-06, + "loss": 0.5371, + "step": 6876 + }, + { + "epoch": 1.1156716417910448, + "grad_norm": 0.5884135066095494, + "learning_rate": 3.503847482016116e-06, + "loss": 0.547, + "step": 6877 + }, + { + "epoch": 1.1158338741077223, + "grad_norm": 0.6276694743235357, + "learning_rate": 3.503456373110469e-06, + "loss": 0.5392, + "step": 6878 + }, + { + "epoch": 1.1159961064243997, + "grad_norm": 0.5808568712601646, + "learning_rate": 3.5030652349268953e-06, + "loss": 0.5021, + "step": 6879 + }, + { + "epoch": 1.1161583387410772, + "grad_norm": 0.5919061617559691, + "learning_rate": 3.5026740674768067e-06, + "loss": 0.5364, + "step": 6880 + }, + { + "epoch": 1.1163205710577546, + "grad_norm": 0.5900725739162364, + "learning_rate": 3.5022828707716168e-06, + "loss": 0.5356, + "step": 6881 + }, + { + "epoch": 1.1164828033744323, + "grad_norm": 0.6186375581198486, + "learning_rate": 3.5018916448227393e-06, + "loss": 0.4755, + "step": 6882 + }, + { + "epoch": 1.1166450356911097, + "grad_norm": 0.6017148461170292, + "learning_rate": 3.501500389641588e-06, + "loss": 0.485, + "step": 6883 + }, + { + "epoch": 1.1168072680077872, + "grad_norm": 0.7418715881433727, + "learning_rate": 3.501109105239581e-06, + "loss": 0.4779, + "step": 6884 + }, + { + "epoch": 1.1169695003244646, + "grad_norm": 0.6025138081254001, + "learning_rate": 3.500717791628132e-06, + "loss": 0.5826, + "step": 6885 + }, + { + "epoch": 1.117131732641142, + "grad_norm": 0.6052399237880591, + "learning_rate": 3.500326448818661e-06, + "loss": 0.5302, + "step": 6886 + }, + { + "epoch": 1.1172939649578195, + "grad_norm": 0.5931763598569418, + "learning_rate": 3.499935076822585e-06, + "loss": 0.55, + "step": 6887 + }, + { + "epoch": 1.117456197274497, + "grad_norm": 0.5940352714423268, + "learning_rate": 3.499543675651323e-06, + "loss": 0.5126, + "step": 6888 + }, + { + "epoch": 1.1176184295911746, + "grad_norm": 0.6348678942834809, + "learning_rate": 3.499152245316295e-06, + "loss": 0.5449, + "step": 6889 + }, + { + "epoch": 1.117780661907852, + "grad_norm": 0.6010950749258105, + "learning_rate": 3.4987607858289215e-06, + "loss": 0.5373, + "step": 6890 + }, + { + "epoch": 1.1179428942245295, + "grad_norm": 0.5942721050253728, + "learning_rate": 3.4983692972006254e-06, + "loss": 0.5267, + "step": 6891 + }, + { + "epoch": 1.118105126541207, + "grad_norm": 0.5907458806871617, + "learning_rate": 3.4979777794428277e-06, + "loss": 0.5363, + "step": 6892 + }, + { + "epoch": 1.1182673588578844, + "grad_norm": 0.5797061654098672, + "learning_rate": 3.4975862325669524e-06, + "loss": 0.5264, + "step": 6893 + }, + { + "epoch": 1.118429591174562, + "grad_norm": 0.5967875030619092, + "learning_rate": 3.4971946565844238e-06, + "loss": 0.4729, + "step": 6894 + }, + { + "epoch": 1.1185918234912395, + "grad_norm": 0.6228360762117414, + "learning_rate": 3.496803051506667e-06, + "loss": 0.4797, + "step": 6895 + }, + { + "epoch": 1.118754055807917, + "grad_norm": 0.5916192309978829, + "learning_rate": 3.496411417345107e-06, + "loss": 0.5312, + "step": 6896 + }, + { + "epoch": 1.1189162881245944, + "grad_norm": 0.5637597922236113, + "learning_rate": 3.496019754111172e-06, + "loss": 0.525, + "step": 6897 + }, + { + "epoch": 1.1190785204412719, + "grad_norm": 0.5993035786685604, + "learning_rate": 3.4956280618162887e-06, + "loss": 0.5088, + "step": 6898 + }, + { + "epoch": 1.1192407527579493, + "grad_norm": 0.6011787482823592, + "learning_rate": 3.495236340471886e-06, + "loss": 0.4849, + "step": 6899 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.5955866029111382, + "learning_rate": 3.494844590089392e-06, + "loss": 0.5528, + "step": 6900 + }, + { + "epoch": 1.1195652173913044, + "grad_norm": 0.5811212800702255, + "learning_rate": 3.4944528106802385e-06, + "loss": 0.5234, + "step": 6901 + }, + { + "epoch": 1.1197274497079819, + "grad_norm": 0.5893690336973628, + "learning_rate": 3.494061002255856e-06, + "loss": 0.4948, + "step": 6902 + }, + { + "epoch": 1.1198896820246593, + "grad_norm": 0.6369305366630031, + "learning_rate": 3.4936691648276745e-06, + "loss": 0.5086, + "step": 6903 + }, + { + "epoch": 1.1200519143413368, + "grad_norm": 0.5724842550326511, + "learning_rate": 3.4932772984071295e-06, + "loss": 0.4493, + "step": 6904 + }, + { + "epoch": 1.1202141466580142, + "grad_norm": 0.6149527271340229, + "learning_rate": 3.4928854030056524e-06, + "loss": 0.5199, + "step": 6905 + }, + { + "epoch": 1.1203763789746917, + "grad_norm": 0.5966568472284255, + "learning_rate": 3.4924934786346787e-06, + "loss": 0.4856, + "step": 6906 + }, + { + "epoch": 1.1205386112913693, + "grad_norm": 0.612325109162989, + "learning_rate": 3.4921015253056433e-06, + "loss": 0.5104, + "step": 6907 + }, + { + "epoch": 1.1207008436080468, + "grad_norm": 0.5684886447797562, + "learning_rate": 3.491709543029982e-06, + "loss": 0.5075, + "step": 6908 + }, + { + "epoch": 1.1208630759247242, + "grad_norm": 0.582467897901383, + "learning_rate": 3.4913175318191326e-06, + "loss": 0.5214, + "step": 6909 + }, + { + "epoch": 1.1210253082414017, + "grad_norm": 0.5711629078310464, + "learning_rate": 3.4909254916845315e-06, + "loss": 0.5325, + "step": 6910 + }, + { + "epoch": 1.1211875405580791, + "grad_norm": 0.6188178054981177, + "learning_rate": 3.4905334226376182e-06, + "loss": 0.5522, + "step": 6911 + }, + { + "epoch": 1.1213497728747566, + "grad_norm": 0.5984725906611379, + "learning_rate": 3.490141324689832e-06, + "loss": 0.5407, + "step": 6912 + }, + { + "epoch": 1.1215120051914342, + "grad_norm": 0.569170877029311, + "learning_rate": 3.4897491978526126e-06, + "loss": 0.518, + "step": 6913 + }, + { + "epoch": 1.1216742375081117, + "grad_norm": 0.615984796391357, + "learning_rate": 3.4893570421374025e-06, + "loss": 0.526, + "step": 6914 + }, + { + "epoch": 1.1218364698247891, + "grad_norm": 0.5863960267896984, + "learning_rate": 3.488964857555641e-06, + "loss": 0.4986, + "step": 6915 + }, + { + "epoch": 1.1219987021414666, + "grad_norm": 0.6143718058300222, + "learning_rate": 3.4885726441187737e-06, + "loss": 0.5282, + "step": 6916 + }, + { + "epoch": 1.122160934458144, + "grad_norm": 0.5702604620709423, + "learning_rate": 3.488180401838243e-06, + "loss": 0.4942, + "step": 6917 + }, + { + "epoch": 1.1223231667748215, + "grad_norm": 0.598625030358029, + "learning_rate": 3.487788130725494e-06, + "loss": 0.5404, + "step": 6918 + }, + { + "epoch": 1.1224853990914991, + "grad_norm": 0.592871670536823, + "learning_rate": 3.487395830791971e-06, + "loss": 0.5118, + "step": 6919 + }, + { + "epoch": 1.1226476314081766, + "grad_norm": 0.5965699750848098, + "learning_rate": 3.4870035020491216e-06, + "loss": 0.5367, + "step": 6920 + }, + { + "epoch": 1.122809863724854, + "grad_norm": 0.5935586876424477, + "learning_rate": 3.4866111445083904e-06, + "loss": 0.543, + "step": 6921 + }, + { + "epoch": 1.1229720960415315, + "grad_norm": 0.7412795199503558, + "learning_rate": 3.486218758181229e-06, + "loss": 0.5275, + "step": 6922 + }, + { + "epoch": 1.123134328358209, + "grad_norm": 0.5907403101524983, + "learning_rate": 3.4858263430790816e-06, + "loss": 0.5488, + "step": 6923 + }, + { + "epoch": 1.1232965606748864, + "grad_norm": 0.5754078101258342, + "learning_rate": 3.4854338992134014e-06, + "loss": 0.4533, + "step": 6924 + }, + { + "epoch": 1.1234587929915638, + "grad_norm": 0.6013888003583413, + "learning_rate": 3.485041426595637e-06, + "loss": 0.4912, + "step": 6925 + }, + { + "epoch": 1.1236210253082415, + "grad_norm": 0.6112610478545418, + "learning_rate": 3.4846489252372395e-06, + "loss": 0.5454, + "step": 6926 + }, + { + "epoch": 1.123783257624919, + "grad_norm": 0.6286875787500303, + "learning_rate": 3.484256395149663e-06, + "loss": 0.539, + "step": 6927 + }, + { + "epoch": 1.1239454899415964, + "grad_norm": 0.6049416329280345, + "learning_rate": 3.4838638363443573e-06, + "loss": 0.5318, + "step": 6928 + }, + { + "epoch": 1.1241077222582738, + "grad_norm": 0.59254086256137, + "learning_rate": 3.4834712488327783e-06, + "loss": 0.5281, + "step": 6929 + }, + { + "epoch": 1.1242699545749513, + "grad_norm": 0.560285605007839, + "learning_rate": 3.4830786326263793e-06, + "loss": 0.4934, + "step": 6930 + }, + { + "epoch": 1.1244321868916287, + "grad_norm": 0.5790327906923777, + "learning_rate": 3.482685987736617e-06, + "loss": 0.517, + "step": 6931 + }, + { + "epoch": 1.1245944192083064, + "grad_norm": 0.6080230972724298, + "learning_rate": 3.4822933141749464e-06, + "loss": 0.5203, + "step": 6932 + }, + { + "epoch": 1.1247566515249838, + "grad_norm": 0.6038839831952947, + "learning_rate": 3.4819006119528247e-06, + "loss": 0.4984, + "step": 6933 + }, + { + "epoch": 1.1249188838416613, + "grad_norm": 0.5961116803491616, + "learning_rate": 3.481507881081711e-06, + "loss": 0.5332, + "step": 6934 + }, + { + "epoch": 1.1250811161583387, + "grad_norm": 0.5778517091302032, + "learning_rate": 3.4811151215730632e-06, + "loss": 0.5006, + "step": 6935 + }, + { + "epoch": 1.1252433484750162, + "grad_norm": 0.5732029458218564, + "learning_rate": 3.4807223334383404e-06, + "loss": 0.5121, + "step": 6936 + }, + { + "epoch": 1.1254055807916936, + "grad_norm": 0.600016069672163, + "learning_rate": 3.480329516689004e-06, + "loss": 0.5109, + "step": 6937 + }, + { + "epoch": 1.1255678131083713, + "grad_norm": 0.6123406475561317, + "learning_rate": 3.4799366713365144e-06, + "loss": 0.5395, + "step": 6938 + }, + { + "epoch": 1.1257300454250487, + "grad_norm": 0.5718045738932455, + "learning_rate": 3.479543797392334e-06, + "loss": 0.5277, + "step": 6939 + }, + { + "epoch": 1.1258922777417262, + "grad_norm": 0.6382504556625889, + "learning_rate": 3.4791508948679263e-06, + "loss": 0.5373, + "step": 6940 + }, + { + "epoch": 1.1260545100584036, + "grad_norm": 0.5989445145239688, + "learning_rate": 3.4787579637747536e-06, + "loss": 0.5161, + "step": 6941 + }, + { + "epoch": 1.126216742375081, + "grad_norm": 0.6032774001513742, + "learning_rate": 3.4783650041242823e-06, + "loss": 0.5379, + "step": 6942 + }, + { + "epoch": 1.1263789746917585, + "grad_norm": 0.6023140425811858, + "learning_rate": 3.477972015927976e-06, + "loss": 0.5333, + "step": 6943 + }, + { + "epoch": 1.1265412070084362, + "grad_norm": 0.5953941402185083, + "learning_rate": 3.477578999197302e-06, + "loss": 0.5369, + "step": 6944 + }, + { + "epoch": 1.1267034393251136, + "grad_norm": 0.632489236266952, + "learning_rate": 3.4771859539437282e-06, + "loss": 0.5277, + "step": 6945 + }, + { + "epoch": 1.126865671641791, + "grad_norm": 0.5759475185329355, + "learning_rate": 3.4767928801787205e-06, + "loss": 0.5071, + "step": 6946 + }, + { + "epoch": 1.1270279039584685, + "grad_norm": 0.5695007907198016, + "learning_rate": 3.4763997779137493e-06, + "loss": 0.5064, + "step": 6947 + }, + { + "epoch": 1.127190136275146, + "grad_norm": 0.5751958226255889, + "learning_rate": 3.476006647160283e-06, + "loss": 0.5483, + "step": 6948 + }, + { + "epoch": 1.1273523685918234, + "grad_norm": 0.5717255338533603, + "learning_rate": 3.4756134879297933e-06, + "loss": 0.573, + "step": 6949 + }, + { + "epoch": 1.1275146009085009, + "grad_norm": 0.5372212742385608, + "learning_rate": 3.4752203002337503e-06, + "loss": 0.4692, + "step": 6950 + }, + { + "epoch": 1.1276768332251785, + "grad_norm": 0.6241300445261962, + "learning_rate": 3.474827084083627e-06, + "loss": 0.5002, + "step": 6951 + }, + { + "epoch": 1.127839065541856, + "grad_norm": 0.5907802064968741, + "learning_rate": 3.4744338394908943e-06, + "loss": 0.5192, + "step": 6952 + }, + { + "epoch": 1.1280012978585334, + "grad_norm": 0.6013225772891087, + "learning_rate": 3.4740405664670286e-06, + "loss": 0.5271, + "step": 6953 + }, + { + "epoch": 1.1281635301752109, + "grad_norm": 0.5608557511083037, + "learning_rate": 3.473647265023503e-06, + "loss": 0.5325, + "step": 6954 + }, + { + "epoch": 1.1283257624918883, + "grad_norm": 0.6267625552270473, + "learning_rate": 3.4732539351717932e-06, + "loss": 0.5589, + "step": 6955 + }, + { + "epoch": 1.128487994808566, + "grad_norm": 0.6442388441801332, + "learning_rate": 3.472860576923376e-06, + "loss": 0.5281, + "step": 6956 + }, + { + "epoch": 1.1286502271252434, + "grad_norm": 0.605111656217075, + "learning_rate": 3.472467190289726e-06, + "loss": 0.518, + "step": 6957 + }, + { + "epoch": 1.1288124594419209, + "grad_norm": 0.5944145463777558, + "learning_rate": 3.4720737752823254e-06, + "loss": 0.5, + "step": 6958 + }, + { + "epoch": 1.1289746917585983, + "grad_norm": 0.5972608255095628, + "learning_rate": 3.471680331912648e-06, + "loss": 0.5056, + "step": 6959 + }, + { + "epoch": 1.1291369240752758, + "grad_norm": 0.6201640375529796, + "learning_rate": 3.471286860192177e-06, + "loss": 0.521, + "step": 6960 + }, + { + "epoch": 1.1292991563919532, + "grad_norm": 0.6070862792012062, + "learning_rate": 3.4708933601323914e-06, + "loss": 0.5227, + "step": 6961 + }, + { + "epoch": 1.1294613887086307, + "grad_norm": 0.5664859084639885, + "learning_rate": 3.4704998317447715e-06, + "loss": 0.5166, + "step": 6962 + }, + { + "epoch": 1.1296236210253083, + "grad_norm": 0.6254971863438993, + "learning_rate": 3.4701062750408016e-06, + "loss": 0.507, + "step": 6963 + }, + { + "epoch": 1.1297858533419858, + "grad_norm": 0.5724354060107842, + "learning_rate": 3.469712690031962e-06, + "loss": 0.5383, + "step": 6964 + }, + { + "epoch": 1.1299480856586632, + "grad_norm": 0.6166858293459445, + "learning_rate": 3.4693190767297385e-06, + "loss": 0.5425, + "step": 6965 + }, + { + "epoch": 1.1301103179753407, + "grad_norm": 0.5864741605180575, + "learning_rate": 3.4689254351456147e-06, + "loss": 0.5134, + "step": 6966 + }, + { + "epoch": 1.1302725502920181, + "grad_norm": 0.6002391526500547, + "learning_rate": 3.4685317652910755e-06, + "loss": 0.5221, + "step": 6967 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.6299644234826811, + "learning_rate": 3.468138067177607e-06, + "loss": 0.5335, + "step": 6968 + }, + { + "epoch": 1.1305970149253732, + "grad_norm": 0.5870967616102795, + "learning_rate": 3.467744340816697e-06, + "loss": 0.5335, + "step": 6969 + }, + { + "epoch": 1.1307592472420507, + "grad_norm": 0.5910142702673158, + "learning_rate": 3.4673505862198327e-06, + "loss": 0.4916, + "step": 6970 + }, + { + "epoch": 1.1309214795587281, + "grad_norm": 0.6077514325968864, + "learning_rate": 3.466956803398503e-06, + "loss": 0.4888, + "step": 6971 + }, + { + "epoch": 1.1310837118754056, + "grad_norm": 0.676997361462295, + "learning_rate": 3.4665629923641975e-06, + "loss": 0.5469, + "step": 6972 + }, + { + "epoch": 1.131245944192083, + "grad_norm": 0.5930542334104201, + "learning_rate": 3.4661691531284053e-06, + "loss": 0.5225, + "step": 6973 + }, + { + "epoch": 1.1314081765087605, + "grad_norm": 0.5495448576894505, + "learning_rate": 3.465775285702619e-06, + "loss": 0.4946, + "step": 6974 + }, + { + "epoch": 1.131570408825438, + "grad_norm": 0.6141441519117254, + "learning_rate": 3.465381390098329e-06, + "loss": 0.5365, + "step": 6975 + }, + { + "epoch": 1.1317326411421156, + "grad_norm": 0.5866166827023631, + "learning_rate": 3.46498746632703e-06, + "loss": 0.5314, + "step": 6976 + }, + { + "epoch": 1.131894873458793, + "grad_norm": 0.617198195058486, + "learning_rate": 3.4645935144002136e-06, + "loss": 0.5274, + "step": 6977 + }, + { + "epoch": 1.1320571057754705, + "grad_norm": 0.6243262881307763, + "learning_rate": 3.4641995343293756e-06, + "loss": 0.5416, + "step": 6978 + }, + { + "epoch": 1.132219338092148, + "grad_norm": 0.6312854248482817, + "learning_rate": 3.4638055261260094e-06, + "loss": 0.5425, + "step": 6979 + }, + { + "epoch": 1.1323815704088254, + "grad_norm": 0.5846232801427557, + "learning_rate": 3.4634114898016136e-06, + "loss": 0.5173, + "step": 6980 + }, + { + "epoch": 1.132543802725503, + "grad_norm": 0.6276332397688725, + "learning_rate": 3.4630174253676827e-06, + "loss": 0.5124, + "step": 6981 + }, + { + "epoch": 1.1327060350421805, + "grad_norm": 0.5871724124229221, + "learning_rate": 3.462623332835715e-06, + "loss": 0.5222, + "step": 6982 + }, + { + "epoch": 1.132868267358858, + "grad_norm": 0.638708601894736, + "learning_rate": 3.4622292122172103e-06, + "loss": 0.5469, + "step": 6983 + }, + { + "epoch": 1.1330304996755354, + "grad_norm": 0.5833155095579231, + "learning_rate": 3.4618350635236653e-06, + "loss": 0.5151, + "step": 6984 + }, + { + "epoch": 1.1331927319922128, + "grad_norm": 0.6027713030750509, + "learning_rate": 3.461440886766583e-06, + "loss": 0.5342, + "step": 6985 + }, + { + "epoch": 1.1333549643088903, + "grad_norm": 0.6079022348751011, + "learning_rate": 3.4610466819574617e-06, + "loss": 0.5252, + "step": 6986 + }, + { + "epoch": 1.1335171966255677, + "grad_norm": 0.6045821723132098, + "learning_rate": 3.4606524491078052e-06, + "loss": 0.5457, + "step": 6987 + }, + { + "epoch": 1.1336794289422454, + "grad_norm": 0.6323811860390988, + "learning_rate": 3.460258188229115e-06, + "loss": 0.535, + "step": 6988 + }, + { + "epoch": 1.1338416612589228, + "grad_norm": 0.5922746775021064, + "learning_rate": 3.4598638993328947e-06, + "loss": 0.5447, + "step": 6989 + }, + { + "epoch": 1.1340038935756003, + "grad_norm": 0.591396779795012, + "learning_rate": 3.4594695824306486e-06, + "loss": 0.5363, + "step": 6990 + }, + { + "epoch": 1.1341661258922777, + "grad_norm": 0.6572867750074913, + "learning_rate": 3.4590752375338814e-06, + "loss": 0.5109, + "step": 6991 + }, + { + "epoch": 1.1343283582089552, + "grad_norm": 0.6630659678763423, + "learning_rate": 3.4586808646540992e-06, + "loss": 0.5292, + "step": 6992 + }, + { + "epoch": 1.1344905905256326, + "grad_norm": 0.6135659800164072, + "learning_rate": 3.4582864638028086e-06, + "loss": 0.5238, + "step": 6993 + }, + { + "epoch": 1.1346528228423103, + "grad_norm": 0.5894154310744001, + "learning_rate": 3.4578920349915178e-06, + "loss": 0.5514, + "step": 6994 + }, + { + "epoch": 1.1348150551589877, + "grad_norm": 0.6211866716590911, + "learning_rate": 3.4574975782317332e-06, + "loss": 0.4962, + "step": 6995 + }, + { + "epoch": 1.1349772874756652, + "grad_norm": 0.6237672973575796, + "learning_rate": 3.4571030935349657e-06, + "loss": 0.5159, + "step": 6996 + }, + { + "epoch": 1.1351395197923426, + "grad_norm": 0.6215189375183953, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.5396, + "step": 6997 + }, + { + "epoch": 1.13530175210902, + "grad_norm": 0.579514986828383, + "learning_rate": 3.4563140403765205e-06, + "loss": 0.5386, + "step": 6998 + }, + { + "epoch": 1.1354639844256975, + "grad_norm": 0.5726070554569634, + "learning_rate": 3.455919471937865e-06, + "loss": 0.5097, + "step": 6999 + }, + { + "epoch": 1.135626216742375, + "grad_norm": 0.6109483519267738, + "learning_rate": 3.4555248756082704e-06, + "loss": 0.5213, + "step": 7000 + }, + { + "epoch": 1.1357884490590526, + "grad_norm": 0.5754441794587126, + "learning_rate": 3.4551302513992506e-06, + "loss": 0.5023, + "step": 7001 + }, + { + "epoch": 1.13595068137573, + "grad_norm": 0.6100176519435278, + "learning_rate": 3.454735599322318e-06, + "loss": 0.5142, + "step": 7002 + }, + { + "epoch": 1.1361129136924075, + "grad_norm": 0.5676596451343218, + "learning_rate": 3.454340919388989e-06, + "loss": 0.507, + "step": 7003 + }, + { + "epoch": 1.136275146009085, + "grad_norm": 0.5724733261174344, + "learning_rate": 3.453946211610778e-06, + "loss": 0.4792, + "step": 7004 + }, + { + "epoch": 1.1364373783257624, + "grad_norm": 0.5932101011342765, + "learning_rate": 3.453551475999203e-06, + "loss": 0.4939, + "step": 7005 + }, + { + "epoch": 1.13659961064244, + "grad_norm": 0.6018223098418763, + "learning_rate": 3.4531567125657794e-06, + "loss": 0.5319, + "step": 7006 + }, + { + "epoch": 1.1367618429591175, + "grad_norm": 0.6229826726657889, + "learning_rate": 3.4527619213220255e-06, + "loss": 0.4826, + "step": 7007 + }, + { + "epoch": 1.136924075275795, + "grad_norm": 0.5790995289003379, + "learning_rate": 3.4523671022794612e-06, + "loss": 0.5194, + "step": 7008 + }, + { + "epoch": 1.1370863075924724, + "grad_norm": 0.5796253918032372, + "learning_rate": 3.451972255449606e-06, + "loss": 0.5302, + "step": 7009 + }, + { + "epoch": 1.1372485399091499, + "grad_norm": 0.5780114105233862, + "learning_rate": 3.45157738084398e-06, + "loss": 0.5382, + "step": 7010 + }, + { + "epoch": 1.1374107722258273, + "grad_norm": 0.6324247780370199, + "learning_rate": 3.4511824784741037e-06, + "loss": 0.5044, + "step": 7011 + }, + { + "epoch": 1.1375730045425048, + "grad_norm": 0.601752381772069, + "learning_rate": 3.4507875483515015e-06, + "loss": 0.5287, + "step": 7012 + }, + { + "epoch": 1.1377352368591824, + "grad_norm": 0.6202563147722238, + "learning_rate": 3.4503925904876932e-06, + "loss": 0.5623, + "step": 7013 + }, + { + "epoch": 1.1378974691758599, + "grad_norm": 0.6092603650778324, + "learning_rate": 3.4499976048942053e-06, + "loss": 0.486, + "step": 7014 + }, + { + "epoch": 1.1380597014925373, + "grad_norm": 0.5994444734493832, + "learning_rate": 3.44960259158256e-06, + "loss": 0.4948, + "step": 7015 + }, + { + "epoch": 1.1382219338092148, + "grad_norm": 0.6421978713158114, + "learning_rate": 3.4492075505642853e-06, + "loss": 0.567, + "step": 7016 + }, + { + "epoch": 1.1383841661258922, + "grad_norm": 0.5864677089870053, + "learning_rate": 3.448812481850905e-06, + "loss": 0.5293, + "step": 7017 + }, + { + "epoch": 1.1385463984425699, + "grad_norm": 0.6186672626821863, + "learning_rate": 3.448417385453947e-06, + "loss": 0.5087, + "step": 7018 + }, + { + "epoch": 1.1387086307592473, + "grad_norm": 0.571899227801012, + "learning_rate": 3.4480222613849397e-06, + "loss": 0.5094, + "step": 7019 + }, + { + "epoch": 1.1388708630759248, + "grad_norm": 0.580329527618988, + "learning_rate": 3.4476271096554098e-06, + "loss": 0.5272, + "step": 7020 + }, + { + "epoch": 1.1390330953926022, + "grad_norm": 0.5952390556098821, + "learning_rate": 3.447231930276888e-06, + "loss": 0.5525, + "step": 7021 + }, + { + "epoch": 1.1391953277092797, + "grad_norm": 0.5704426508988536, + "learning_rate": 3.4468367232609045e-06, + "loss": 0.5355, + "step": 7022 + }, + { + "epoch": 1.1393575600259571, + "grad_norm": 0.5953405513904887, + "learning_rate": 3.446441488618991e-06, + "loss": 0.5314, + "step": 7023 + }, + { + "epoch": 1.1395197923426346, + "grad_norm": 0.5949895670777293, + "learning_rate": 3.446046226362677e-06, + "loss": 0.5434, + "step": 7024 + }, + { + "epoch": 1.1396820246593122, + "grad_norm": 0.6328169584517352, + "learning_rate": 3.445650936503497e-06, + "loss": 0.5054, + "step": 7025 + }, + { + "epoch": 1.1398442569759897, + "grad_norm": 0.6114228289133345, + "learning_rate": 3.4452556190529835e-06, + "loss": 0.4813, + "step": 7026 + }, + { + "epoch": 1.1400064892926671, + "grad_norm": 0.5860353839631727, + "learning_rate": 3.4448602740226716e-06, + "loss": 0.5467, + "step": 7027 + }, + { + "epoch": 1.1401687216093446, + "grad_norm": 0.6019469285718092, + "learning_rate": 3.444464901424096e-06, + "loss": 0.5267, + "step": 7028 + }, + { + "epoch": 1.140330953926022, + "grad_norm": 0.6672497618303957, + "learning_rate": 3.444069501268792e-06, + "loss": 0.4974, + "step": 7029 + }, + { + "epoch": 1.1404931862426995, + "grad_norm": 0.6104719230439397, + "learning_rate": 3.443674073568296e-06, + "loss": 0.5344, + "step": 7030 + }, + { + "epoch": 1.1406554185593771, + "grad_norm": 0.5679740797258646, + "learning_rate": 3.443278618334146e-06, + "loss": 0.498, + "step": 7031 + }, + { + "epoch": 1.1408176508760546, + "grad_norm": 0.584184386019966, + "learning_rate": 3.4428831355778815e-06, + "loss": 0.5411, + "step": 7032 + }, + { + "epoch": 1.140979883192732, + "grad_norm": 0.5857649305826105, + "learning_rate": 3.442487625311039e-06, + "loss": 0.5266, + "step": 7033 + }, + { + "epoch": 1.1411421155094095, + "grad_norm": 0.5962970158001383, + "learning_rate": 3.4420920875451595e-06, + "loss": 0.5358, + "step": 7034 + }, + { + "epoch": 1.141304347826087, + "grad_norm": 0.6121098586586554, + "learning_rate": 3.441696522291784e-06, + "loss": 0.5197, + "step": 7035 + }, + { + "epoch": 1.1414665801427644, + "grad_norm": 0.5892503677402385, + "learning_rate": 3.441300929562454e-06, + "loss": 0.5595, + "step": 7036 + }, + { + "epoch": 1.1416288124594418, + "grad_norm": 0.5890749430925774, + "learning_rate": 3.440905309368712e-06, + "loss": 0.4521, + "step": 7037 + }, + { + "epoch": 1.1417910447761195, + "grad_norm": 0.6088162202094941, + "learning_rate": 3.4405096617220983e-06, + "loss": 0.5435, + "step": 7038 + }, + { + "epoch": 1.141953277092797, + "grad_norm": 0.5918266658356621, + "learning_rate": 3.4401139866341605e-06, + "loss": 0.5429, + "step": 7039 + }, + { + "epoch": 1.1421155094094744, + "grad_norm": 0.5813275634467653, + "learning_rate": 3.4397182841164413e-06, + "loss": 0.5447, + "step": 7040 + }, + { + "epoch": 1.1422777417261518, + "grad_norm": 0.5884333087289692, + "learning_rate": 3.439322554180486e-06, + "loss": 0.492, + "step": 7041 + }, + { + "epoch": 1.1424399740428293, + "grad_norm": 0.6189329588811934, + "learning_rate": 3.4389267968378414e-06, + "loss": 0.4793, + "step": 7042 + }, + { + "epoch": 1.142602206359507, + "grad_norm": 0.5724649293032816, + "learning_rate": 3.438531012100054e-06, + "loss": 0.4894, + "step": 7043 + }, + { + "epoch": 1.1427644386761844, + "grad_norm": 0.6199608325770719, + "learning_rate": 3.4381351999786734e-06, + "loss": 0.47, + "step": 7044 + }, + { + "epoch": 1.1429266709928618, + "grad_norm": 0.6098790746135889, + "learning_rate": 3.437739360485246e-06, + "loss": 0.5139, + "step": 7045 + }, + { + "epoch": 1.1430889033095393, + "grad_norm": 0.5972724314364654, + "learning_rate": 3.4373434936313223e-06, + "loss": 0.4971, + "step": 7046 + }, + { + "epoch": 1.1432511356262167, + "grad_norm": 0.5997062527911553, + "learning_rate": 3.436947599428453e-06, + "loss": 0.5369, + "step": 7047 + }, + { + "epoch": 1.1434133679428942, + "grad_norm": 0.624029471024289, + "learning_rate": 3.4365516778881875e-06, + "loss": 0.5085, + "step": 7048 + }, + { + "epoch": 1.1435756002595716, + "grad_norm": 0.5843052764134508, + "learning_rate": 3.436155729022079e-06, + "loss": 0.5297, + "step": 7049 + }, + { + "epoch": 1.1437378325762493, + "grad_norm": 0.6136030815735067, + "learning_rate": 3.4357597528416804e-06, + "loss": 0.5671, + "step": 7050 + }, + { + "epoch": 1.1439000648929267, + "grad_norm": 0.6266666698525847, + "learning_rate": 3.4353637493585434e-06, + "loss": 0.5025, + "step": 7051 + }, + { + "epoch": 1.1440622972096042, + "grad_norm": 0.5718208601857067, + "learning_rate": 3.4349677185842246e-06, + "loss": 0.5348, + "step": 7052 + }, + { + "epoch": 1.1442245295262816, + "grad_norm": 0.5724277499846443, + "learning_rate": 3.434571660530277e-06, + "loss": 0.5124, + "step": 7053 + }, + { + "epoch": 1.144386761842959, + "grad_norm": 0.5650231825231775, + "learning_rate": 3.4341755752082567e-06, + "loss": 0.5259, + "step": 7054 + }, + { + "epoch": 1.1445489941596365, + "grad_norm": 0.611270256549033, + "learning_rate": 3.4337794626297223e-06, + "loss": 0.5153, + "step": 7055 + }, + { + "epoch": 1.1447112264763142, + "grad_norm": 0.5984269227191213, + "learning_rate": 3.4333833228062287e-06, + "loss": 0.5382, + "step": 7056 + }, + { + "epoch": 1.1448734587929916, + "grad_norm": 0.5808822551743034, + "learning_rate": 3.432987155749335e-06, + "loss": 0.5308, + "step": 7057 + }, + { + "epoch": 1.145035691109669, + "grad_norm": 0.5531722011525906, + "learning_rate": 3.432590961470601e-06, + "loss": 0.5097, + "step": 7058 + }, + { + "epoch": 1.1451979234263465, + "grad_norm": 0.5765694278777872, + "learning_rate": 3.4321947399815854e-06, + "loss": 0.5078, + "step": 7059 + }, + { + "epoch": 1.145360155743024, + "grad_norm": 0.5899137352639203, + "learning_rate": 3.4317984912938497e-06, + "loss": 0.5209, + "step": 7060 + }, + { + "epoch": 1.1455223880597014, + "grad_norm": 0.574368030787264, + "learning_rate": 3.4314022154189543e-06, + "loss": 0.5073, + "step": 7061 + }, + { + "epoch": 1.1456846203763789, + "grad_norm": 0.6000944693244024, + "learning_rate": 3.4310059123684616e-06, + "loss": 0.4969, + "step": 7062 + }, + { + "epoch": 1.1458468526930565, + "grad_norm": 0.5959051952628203, + "learning_rate": 3.4306095821539343e-06, + "loss": 0.5248, + "step": 7063 + }, + { + "epoch": 1.146009085009734, + "grad_norm": 0.6025889557414809, + "learning_rate": 3.430213224786938e-06, + "loss": 0.5753, + "step": 7064 + }, + { + "epoch": 1.1461713173264114, + "grad_norm": 0.621725270435976, + "learning_rate": 3.4298168402790354e-06, + "loss": 0.5541, + "step": 7065 + }, + { + "epoch": 1.1463335496430889, + "grad_norm": 0.591255003876539, + "learning_rate": 3.429420428641792e-06, + "loss": 0.5356, + "step": 7066 + }, + { + "epoch": 1.1464957819597663, + "grad_norm": 0.6106744349259912, + "learning_rate": 3.4290239898867743e-06, + "loss": 0.5413, + "step": 7067 + }, + { + "epoch": 1.146658014276444, + "grad_norm": 0.580974508443298, + "learning_rate": 3.4286275240255503e-06, + "loss": 0.5518, + "step": 7068 + }, + { + "epoch": 1.1468202465931214, + "grad_norm": 0.5699038753594562, + "learning_rate": 3.4282310310696855e-06, + "loss": 0.5494, + "step": 7069 + }, + { + "epoch": 1.1469824789097989, + "grad_norm": 0.6063303859464796, + "learning_rate": 3.42783451103075e-06, + "loss": 0.5101, + "step": 7070 + }, + { + "epoch": 1.1471447112264763, + "grad_norm": 0.6130264628858462, + "learning_rate": 3.4274379639203125e-06, + "loss": 0.5046, + "step": 7071 + }, + { + "epoch": 1.1473069435431538, + "grad_norm": 0.5897513140557412, + "learning_rate": 3.4270413897499427e-06, + "loss": 0.5068, + "step": 7072 + }, + { + "epoch": 1.1474691758598312, + "grad_norm": 0.5943374460637108, + "learning_rate": 3.426644788531213e-06, + "loss": 0.5247, + "step": 7073 + }, + { + "epoch": 1.1476314081765087, + "grad_norm": 0.5656208657418933, + "learning_rate": 3.4262481602756937e-06, + "loss": 0.4916, + "step": 7074 + }, + { + "epoch": 1.1477936404931863, + "grad_norm": 0.5855306492004265, + "learning_rate": 3.4258515049949583e-06, + "loss": 0.5648, + "step": 7075 + }, + { + "epoch": 1.1479558728098638, + "grad_norm": 0.6066852518006534, + "learning_rate": 3.425454822700578e-06, + "loss": 0.5397, + "step": 7076 + }, + { + "epoch": 1.1481181051265412, + "grad_norm": 0.6081927288311585, + "learning_rate": 3.4250581134041288e-06, + "loss": 0.5649, + "step": 7077 + }, + { + "epoch": 1.1482803374432187, + "grad_norm": 0.6279978599222925, + "learning_rate": 3.4246613771171853e-06, + "loss": 0.5291, + "step": 7078 + }, + { + "epoch": 1.1484425697598961, + "grad_norm": 0.6047837392215843, + "learning_rate": 3.4242646138513226e-06, + "loss": 0.5082, + "step": 7079 + }, + { + "epoch": 1.1486048020765736, + "grad_norm": 0.6439877384510414, + "learning_rate": 3.4238678236181166e-06, + "loss": 0.4873, + "step": 7080 + }, + { + "epoch": 1.1487670343932512, + "grad_norm": 0.6085781456191544, + "learning_rate": 3.4234710064291454e-06, + "loss": 0.5463, + "step": 7081 + }, + { + "epoch": 1.1489292667099287, + "grad_norm": 0.5679600703526375, + "learning_rate": 3.4230741622959867e-06, + "loss": 0.5593, + "step": 7082 + }, + { + "epoch": 1.1490914990266061, + "grad_norm": 0.8566997923332401, + "learning_rate": 3.42267729123022e-06, + "loss": 0.5416, + "step": 7083 + }, + { + "epoch": 1.1492537313432836, + "grad_norm": 0.6357067722965922, + "learning_rate": 3.422280393243423e-06, + "loss": 0.5084, + "step": 7084 + }, + { + "epoch": 1.149415963659961, + "grad_norm": 0.5729809921250616, + "learning_rate": 3.4218834683471767e-06, + "loss": 0.5571, + "step": 7085 + }, + { + "epoch": 1.1495781959766385, + "grad_norm": 0.5483062841028898, + "learning_rate": 3.4214865165530642e-06, + "loss": 0.5121, + "step": 7086 + }, + { + "epoch": 1.149740428293316, + "grad_norm": 0.6300557695600577, + "learning_rate": 3.4210895378726644e-06, + "loss": 0.5457, + "step": 7087 + }, + { + "epoch": 1.1499026606099936, + "grad_norm": 0.6122848517565015, + "learning_rate": 3.420692532317562e-06, + "loss": 0.5161, + "step": 7088 + }, + { + "epoch": 1.150064892926671, + "grad_norm": 0.5707459251677807, + "learning_rate": 3.4202954998993397e-06, + "loss": 0.5321, + "step": 7089 + }, + { + "epoch": 1.1502271252433485, + "grad_norm": 0.5940782641308092, + "learning_rate": 3.419898440629582e-06, + "loss": 0.5261, + "step": 7090 + }, + { + "epoch": 1.150389357560026, + "grad_norm": 0.5847036367226311, + "learning_rate": 3.419501354519874e-06, + "loss": 0.5352, + "step": 7091 + }, + { + "epoch": 1.1505515898767034, + "grad_norm": 0.5899260971338612, + "learning_rate": 3.4191042415818003e-06, + "loss": 0.506, + "step": 7092 + }, + { + "epoch": 1.150713822193381, + "grad_norm": 0.6170153346331302, + "learning_rate": 3.41870710182695e-06, + "loss": 0.535, + "step": 7093 + }, + { + "epoch": 1.1508760545100585, + "grad_norm": 0.5848800756902187, + "learning_rate": 3.4183099352669082e-06, + "loss": 0.5453, + "step": 7094 + }, + { + "epoch": 1.151038286826736, + "grad_norm": 0.6064543472132224, + "learning_rate": 3.4179127419132636e-06, + "loss": 0.502, + "step": 7095 + }, + { + "epoch": 1.1512005191434134, + "grad_norm": 0.5948857131613207, + "learning_rate": 3.4175155217776057e-06, + "loss": 0.5114, + "step": 7096 + }, + { + "epoch": 1.1513627514600908, + "grad_norm": 0.5785196938214727, + "learning_rate": 3.4171182748715235e-06, + "loss": 0.5028, + "step": 7097 + }, + { + "epoch": 1.1515249837767683, + "grad_norm": 0.6258879458431833, + "learning_rate": 3.416721001206609e-06, + "loss": 0.5487, + "step": 7098 + }, + { + "epoch": 1.1516872160934457, + "grad_norm": 0.6097179094427898, + "learning_rate": 3.416323700794451e-06, + "loss": 0.5579, + "step": 7099 + }, + { + "epoch": 1.1518494484101234, + "grad_norm": 0.5730714865080487, + "learning_rate": 3.415926373646644e-06, + "loss": 0.5341, + "step": 7100 + }, + { + "epoch": 1.1520116807268008, + "grad_norm": 0.5990556044326582, + "learning_rate": 3.4155290197747794e-06, + "loss": 0.5532, + "step": 7101 + }, + { + "epoch": 1.1521739130434783, + "grad_norm": 0.5820268988877626, + "learning_rate": 3.4151316391904514e-06, + "loss": 0.5362, + "step": 7102 + }, + { + "epoch": 1.1523361453601557, + "grad_norm": 0.5656608545234862, + "learning_rate": 3.414734231905254e-06, + "loss": 0.5063, + "step": 7103 + }, + { + "epoch": 1.1524983776768332, + "grad_norm": 0.590909007164983, + "learning_rate": 3.4143367979307834e-06, + "loss": 0.526, + "step": 7104 + }, + { + "epoch": 1.1526606099935108, + "grad_norm": 0.5816363798034904, + "learning_rate": 3.4139393372786335e-06, + "loss": 0.5384, + "step": 7105 + }, + { + "epoch": 1.1528228423101883, + "grad_norm": 0.5783868753722345, + "learning_rate": 3.4135418499604033e-06, + "loss": 0.5288, + "step": 7106 + }, + { + "epoch": 1.1529850746268657, + "grad_norm": 0.5964336283667565, + "learning_rate": 3.413144335987689e-06, + "loss": 0.5528, + "step": 7107 + }, + { + "epoch": 1.1531473069435432, + "grad_norm": 0.5509444310394487, + "learning_rate": 3.4127467953720893e-06, + "loss": 0.5217, + "step": 7108 + }, + { + "epoch": 1.1533095392602206, + "grad_norm": 0.5777162669062195, + "learning_rate": 3.4123492281252035e-06, + "loss": 0.5228, + "step": 7109 + }, + { + "epoch": 1.153471771576898, + "grad_norm": 0.618805900903058, + "learning_rate": 3.411951634258631e-06, + "loss": 0.5102, + "step": 7110 + }, + { + "epoch": 1.1536340038935755, + "grad_norm": 0.5569967684678606, + "learning_rate": 3.411554013783973e-06, + "loss": 0.5569, + "step": 7111 + }, + { + "epoch": 1.1537962362102532, + "grad_norm": 0.5884356106454138, + "learning_rate": 3.41115636671283e-06, + "loss": 0.5191, + "step": 7112 + }, + { + "epoch": 1.1539584685269306, + "grad_norm": 0.5864109587712618, + "learning_rate": 3.410758693056805e-06, + "loss": 0.5277, + "step": 7113 + }, + { + "epoch": 1.154120700843608, + "grad_norm": 0.5790518835740759, + "learning_rate": 3.4103609928275007e-06, + "loss": 0.5238, + "step": 7114 + }, + { + "epoch": 1.1542829331602855, + "grad_norm": 0.6362343935796568, + "learning_rate": 3.40996326603652e-06, + "loss": 0.5299, + "step": 7115 + }, + { + "epoch": 1.154445165476963, + "grad_norm": 0.5618604628133871, + "learning_rate": 3.4095655126954693e-06, + "loss": 0.5034, + "step": 7116 + }, + { + "epoch": 1.1546073977936404, + "grad_norm": 0.5865263496152779, + "learning_rate": 3.4091677328159517e-06, + "loss": 0.5392, + "step": 7117 + }, + { + "epoch": 1.154769630110318, + "grad_norm": 0.6145563218262081, + "learning_rate": 3.4087699264095746e-06, + "loss": 0.5189, + "step": 7118 + }, + { + "epoch": 1.1549318624269955, + "grad_norm": 0.5854542656449111, + "learning_rate": 3.408372093487945e-06, + "loss": 0.4894, + "step": 7119 + }, + { + "epoch": 1.155094094743673, + "grad_norm": 0.5742283845439129, + "learning_rate": 3.40797423406267e-06, + "loss": 0.5472, + "step": 7120 + }, + { + "epoch": 1.1552563270603504, + "grad_norm": 0.630263035838442, + "learning_rate": 3.4075763481453584e-06, + "loss": 0.5548, + "step": 7121 + }, + { + "epoch": 1.1554185593770279, + "grad_norm": 0.6002439661827991, + "learning_rate": 3.4071784357476184e-06, + "loss": 0.5626, + "step": 7122 + }, + { + "epoch": 1.1555807916937053, + "grad_norm": 0.62269636098094, + "learning_rate": 3.40678049688106e-06, + "loss": 0.5122, + "step": 7123 + }, + { + "epoch": 1.1557430240103828, + "grad_norm": 0.5660512615472864, + "learning_rate": 3.4063825315572955e-06, + "loss": 0.503, + "step": 7124 + }, + { + "epoch": 1.1559052563270604, + "grad_norm": 0.5790857887391887, + "learning_rate": 3.405984539787934e-06, + "loss": 0.5506, + "step": 7125 + }, + { + "epoch": 1.1560674886437379, + "grad_norm": 0.6090649018899856, + "learning_rate": 3.40558652158459e-06, + "loss": 0.5494, + "step": 7126 + }, + { + "epoch": 1.1562297209604153, + "grad_norm": 0.5929480837888185, + "learning_rate": 3.4051884769588758e-06, + "loss": 0.5015, + "step": 7127 + }, + { + "epoch": 1.1563919532770928, + "grad_norm": 0.5966862612535373, + "learning_rate": 3.4047904059224034e-06, + "loss": 0.4955, + "step": 7128 + }, + { + "epoch": 1.1565541855937702, + "grad_norm": 0.5939078134736059, + "learning_rate": 3.4043923084867907e-06, + "loss": 0.5408, + "step": 7129 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.5940228831227227, + "learning_rate": 3.40399418466365e-06, + "loss": 0.5078, + "step": 7130 + }, + { + "epoch": 1.1568786502271253, + "grad_norm": 0.6151851168867271, + "learning_rate": 3.403596034464599e-06, + "loss": 0.5153, + "step": 7131 + }, + { + "epoch": 1.1570408825438028, + "grad_norm": 0.6207317178782676, + "learning_rate": 3.4031978579012537e-06, + "loss": 0.505, + "step": 7132 + }, + { + "epoch": 1.1572031148604802, + "grad_norm": 0.6142917835348614, + "learning_rate": 3.402799654985233e-06, + "loss": 0.5102, + "step": 7133 + }, + { + "epoch": 1.1573653471771577, + "grad_norm": 0.5861427950115733, + "learning_rate": 3.4024014257281534e-06, + "loss": 0.5252, + "step": 7134 + }, + { + "epoch": 1.1575275794938351, + "grad_norm": 0.5959723244810287, + "learning_rate": 3.4020031701416354e-06, + "loss": 0.5284, + "step": 7135 + }, + { + "epoch": 1.1576898118105126, + "grad_norm": 0.5761667221281681, + "learning_rate": 3.4016048882372985e-06, + "loss": 0.5054, + "step": 7136 + }, + { + "epoch": 1.1578520441271902, + "grad_norm": 0.5871454124107861, + "learning_rate": 3.401206580026764e-06, + "loss": 0.5173, + "step": 7137 + }, + { + "epoch": 1.1580142764438677, + "grad_norm": 0.5887032977463937, + "learning_rate": 3.4008082455216527e-06, + "loss": 0.508, + "step": 7138 + }, + { + "epoch": 1.1581765087605451, + "grad_norm": 0.6036923901292733, + "learning_rate": 3.4004098847335877e-06, + "loss": 0.504, + "step": 7139 + }, + { + "epoch": 1.1583387410772226, + "grad_norm": 0.5815660648182386, + "learning_rate": 3.4000114976741905e-06, + "loss": 0.5163, + "step": 7140 + }, + { + "epoch": 1.1585009733939, + "grad_norm": 0.6206407809021867, + "learning_rate": 3.3996130843550856e-06, + "loss": 0.5303, + "step": 7141 + }, + { + "epoch": 1.1586632057105775, + "grad_norm": 0.6610278703031996, + "learning_rate": 3.399214644787899e-06, + "loss": 0.521, + "step": 7142 + }, + { + "epoch": 1.1588254380272551, + "grad_norm": 0.6320544428760747, + "learning_rate": 3.398816178984253e-06, + "loss": 0.5353, + "step": 7143 + }, + { + "epoch": 1.1589876703439326, + "grad_norm": 0.5686100747535534, + "learning_rate": 3.3984176869557762e-06, + "loss": 0.5257, + "step": 7144 + }, + { + "epoch": 1.15914990266061, + "grad_norm": 0.5866488285222873, + "learning_rate": 3.3980191687140945e-06, + "loss": 0.5124, + "step": 7145 + }, + { + "epoch": 1.1593121349772875, + "grad_norm": 0.6168157965476838, + "learning_rate": 3.3976206242708355e-06, + "loss": 0.5235, + "step": 7146 + }, + { + "epoch": 1.159474367293965, + "grad_norm": 0.6353978022255673, + "learning_rate": 3.397222053637628e-06, + "loss": 0.5111, + "step": 7147 + }, + { + "epoch": 1.1596365996106424, + "grad_norm": 0.5833446890478691, + "learning_rate": 3.3968234568261e-06, + "loss": 0.5067, + "step": 7148 + }, + { + "epoch": 1.1597988319273198, + "grad_norm": 0.6184639555962617, + "learning_rate": 3.3964248338478833e-06, + "loss": 0.5351, + "step": 7149 + }, + { + "epoch": 1.1599610642439975, + "grad_norm": 0.5815798068141492, + "learning_rate": 3.396026184714606e-06, + "loss": 0.5401, + "step": 7150 + }, + { + "epoch": 1.160123296560675, + "grad_norm": 0.5864084597353404, + "learning_rate": 3.395627509437902e-06, + "loss": 0.5303, + "step": 7151 + }, + { + "epoch": 1.1602855288773524, + "grad_norm": 0.6075157684431478, + "learning_rate": 3.3952288080294022e-06, + "loss": 0.5376, + "step": 7152 + }, + { + "epoch": 1.1604477611940298, + "grad_norm": 0.5824943174901827, + "learning_rate": 3.3948300805007387e-06, + "loss": 0.5479, + "step": 7153 + }, + { + "epoch": 1.1606099935107073, + "grad_norm": 0.604058147761852, + "learning_rate": 3.3944313268635475e-06, + "loss": 0.5588, + "step": 7154 + }, + { + "epoch": 1.160772225827385, + "grad_norm": 0.6254123225776791, + "learning_rate": 3.394032547129462e-06, + "loss": 0.5324, + "step": 7155 + }, + { + "epoch": 1.1609344581440624, + "grad_norm": 0.6118609996805213, + "learning_rate": 3.393633741310116e-06, + "loss": 0.5148, + "step": 7156 + }, + { + "epoch": 1.1610966904607398, + "grad_norm": 0.6375299265929741, + "learning_rate": 3.393234909417148e-06, + "loss": 0.5361, + "step": 7157 + }, + { + "epoch": 1.1612589227774173, + "grad_norm": 0.6178335306927185, + "learning_rate": 3.392836051462193e-06, + "loss": 0.48, + "step": 7158 + }, + { + "epoch": 1.1614211550940947, + "grad_norm": 0.6263685038183675, + "learning_rate": 3.3924371674568885e-06, + "loss": 0.4812, + "step": 7159 + }, + { + "epoch": 1.1615833874107722, + "grad_norm": 0.6046524219866216, + "learning_rate": 3.392038257412874e-06, + "loss": 0.501, + "step": 7160 + }, + { + "epoch": 1.1617456197274496, + "grad_norm": 0.6430102766290643, + "learning_rate": 3.3916393213417876e-06, + "loss": 0.519, + "step": 7161 + }, + { + "epoch": 1.1619078520441273, + "grad_norm": 0.6116829229502004, + "learning_rate": 3.391240359255269e-06, + "loss": 0.5315, + "step": 7162 + }, + { + "epoch": 1.1620700843608047, + "grad_norm": 0.5933098952676006, + "learning_rate": 3.390841371164959e-06, + "loss": 0.5256, + "step": 7163 + }, + { + "epoch": 1.1622323166774822, + "grad_norm": 0.6281775822325537, + "learning_rate": 3.390442357082499e-06, + "loss": 0.5238, + "step": 7164 + }, + { + "epoch": 1.1623945489941596, + "grad_norm": 0.6046444773715828, + "learning_rate": 3.3900433170195318e-06, + "loss": 0.5257, + "step": 7165 + }, + { + "epoch": 1.162556781310837, + "grad_norm": 0.629670696520008, + "learning_rate": 3.3896442509876988e-06, + "loss": 0.5067, + "step": 7166 + }, + { + "epoch": 1.1627190136275145, + "grad_norm": 0.6198480793392531, + "learning_rate": 3.389245158998645e-06, + "loss": 0.4777, + "step": 7167 + }, + { + "epoch": 1.1628812459441922, + "grad_norm": 0.6164742225277114, + "learning_rate": 3.388846041064012e-06, + "loss": 0.5186, + "step": 7168 + }, + { + "epoch": 1.1630434782608696, + "grad_norm": 0.602220438422506, + "learning_rate": 3.388446897195449e-06, + "loss": 0.5396, + "step": 7169 + }, + { + "epoch": 1.163205710577547, + "grad_norm": 0.5935855897545842, + "learning_rate": 3.3880477274045984e-06, + "loss": 0.5182, + "step": 7170 + }, + { + "epoch": 1.1633679428942245, + "grad_norm": 0.6339118498393386, + "learning_rate": 3.387648531703109e-06, + "loss": 0.5404, + "step": 7171 + }, + { + "epoch": 1.163530175210902, + "grad_norm": 0.5943198058097751, + "learning_rate": 3.3872493101026264e-06, + "loss": 0.5203, + "step": 7172 + }, + { + "epoch": 1.1636924075275794, + "grad_norm": 0.6014235353493154, + "learning_rate": 3.3868500626148e-06, + "loss": 0.5334, + "step": 7173 + }, + { + "epoch": 1.1638546398442569, + "grad_norm": 0.5788732825034094, + "learning_rate": 3.386450789251278e-06, + "loss": 0.5292, + "step": 7174 + }, + { + "epoch": 1.1640168721609345, + "grad_norm": 0.6314006151258138, + "learning_rate": 3.3860514900237106e-06, + "loss": 0.4899, + "step": 7175 + }, + { + "epoch": 1.164179104477612, + "grad_norm": 0.574269223542034, + "learning_rate": 3.3856521649437478e-06, + "loss": 0.5164, + "step": 7176 + }, + { + "epoch": 1.1643413367942894, + "grad_norm": 0.5739161612018389, + "learning_rate": 3.3852528140230407e-06, + "loss": 0.5281, + "step": 7177 + }, + { + "epoch": 1.1645035691109669, + "grad_norm": 0.5885889033264834, + "learning_rate": 3.384853437273242e-06, + "loss": 0.498, + "step": 7178 + }, + { + "epoch": 1.1646658014276443, + "grad_norm": 0.6209898334295807, + "learning_rate": 3.3844540347060027e-06, + "loss": 0.5215, + "step": 7179 + }, + { + "epoch": 1.164828033744322, + "grad_norm": 0.5872088686859027, + "learning_rate": 3.384054606332978e-06, + "loss": 0.514, + "step": 7180 + }, + { + "epoch": 1.1649902660609994, + "grad_norm": 0.6387949125873085, + "learning_rate": 3.3836551521658214e-06, + "loss": 0.4913, + "step": 7181 + }, + { + "epoch": 1.1651524983776769, + "grad_norm": 0.5818516886190745, + "learning_rate": 3.3832556722161865e-06, + "loss": 0.5421, + "step": 7182 + }, + { + "epoch": 1.1653147306943543, + "grad_norm": 0.612491736351178, + "learning_rate": 3.3828561664957314e-06, + "loss": 0.5366, + "step": 7183 + }, + { + "epoch": 1.1654769630110318, + "grad_norm": 0.6170385585328084, + "learning_rate": 3.38245663501611e-06, + "loss": 0.5233, + "step": 7184 + }, + { + "epoch": 1.1656391953277092, + "grad_norm": 0.6068905012669767, + "learning_rate": 3.3820570777889818e-06, + "loss": 0.4937, + "step": 7185 + }, + { + "epoch": 1.1658014276443867, + "grad_norm": 0.6148298014628577, + "learning_rate": 3.3816574948260027e-06, + "loss": 0.5223, + "step": 7186 + }, + { + "epoch": 1.1659636599610643, + "grad_norm": 0.6238777544822737, + "learning_rate": 3.381257886138832e-06, + "loss": 0.5274, + "step": 7187 + }, + { + "epoch": 1.1661258922777418, + "grad_norm": 0.6142187731911151, + "learning_rate": 3.3808582517391302e-06, + "loss": 0.5279, + "step": 7188 + }, + { + "epoch": 1.1662881245944192, + "grad_norm": 0.6077165274946886, + "learning_rate": 3.380458591638557e-06, + "loss": 0.535, + "step": 7189 + }, + { + "epoch": 1.1664503569110967, + "grad_norm": 0.5646419300815725, + "learning_rate": 3.380058905848772e-06, + "loss": 0.5071, + "step": 7190 + }, + { + "epoch": 1.166612589227774, + "grad_norm": 0.628061800607386, + "learning_rate": 3.3796591943814376e-06, + "loss": 0.5387, + "step": 7191 + }, + { + "epoch": 1.1667748215444518, + "grad_norm": 0.5946800215993482, + "learning_rate": 3.379259457248217e-06, + "loss": 0.4952, + "step": 7192 + }, + { + "epoch": 1.1669370538611292, + "grad_norm": 0.5735419856682161, + "learning_rate": 3.378859694460773e-06, + "loss": 0.5573, + "step": 7193 + }, + { + "epoch": 1.1670992861778067, + "grad_norm": 0.5834620695181233, + "learning_rate": 3.378459906030769e-06, + "loss": 0.5042, + "step": 7194 + }, + { + "epoch": 1.1672615184944841, + "grad_norm": 0.6041608125888375, + "learning_rate": 3.3780600919698697e-06, + "loss": 0.5061, + "step": 7195 + }, + { + "epoch": 1.1674237508111616, + "grad_norm": 0.6016699367436869, + "learning_rate": 3.3776602522897413e-06, + "loss": 0.5554, + "step": 7196 + }, + { + "epoch": 1.167585983127839, + "grad_norm": 0.591883033342458, + "learning_rate": 3.3772603870020484e-06, + "loss": 0.5528, + "step": 7197 + }, + { + "epoch": 1.1677482154445165, + "grad_norm": 0.5731590625147086, + "learning_rate": 3.3768604961184593e-06, + "loss": 0.5078, + "step": 7198 + }, + { + "epoch": 1.1679104477611941, + "grad_norm": 0.6003079265142518, + "learning_rate": 3.3764605796506416e-06, + "loss": 0.5361, + "step": 7199 + }, + { + "epoch": 1.1680726800778716, + "grad_norm": 0.5923928727405396, + "learning_rate": 3.3760606376102635e-06, + "loss": 0.512, + "step": 7200 + }, + { + "epoch": 1.168234912394549, + "grad_norm": 0.6192968306254448, + "learning_rate": 3.3756606700089938e-06, + "loss": 0.5294, + "step": 7201 + }, + { + "epoch": 1.1683971447112265, + "grad_norm": 0.582960530775308, + "learning_rate": 3.3752606768585024e-06, + "loss": 0.4897, + "step": 7202 + }, + { + "epoch": 1.168559377027904, + "grad_norm": 0.5639121849121774, + "learning_rate": 3.3748606581704606e-06, + "loss": 0.5276, + "step": 7203 + }, + { + "epoch": 1.1687216093445814, + "grad_norm": 0.6072202558953279, + "learning_rate": 3.3744606139565384e-06, + "loss": 0.5334, + "step": 7204 + }, + { + "epoch": 1.168883841661259, + "grad_norm": 0.588016566071594, + "learning_rate": 3.3740605442284097e-06, + "loss": 0.5305, + "step": 7205 + }, + { + "epoch": 1.1690460739779365, + "grad_norm": 0.5893383705934298, + "learning_rate": 3.3736604489977465e-06, + "loss": 0.5055, + "step": 7206 + }, + { + "epoch": 1.169208306294614, + "grad_norm": 0.57498179764193, + "learning_rate": 3.373260328276222e-06, + "loss": 0.4802, + "step": 7207 + }, + { + "epoch": 1.1693705386112914, + "grad_norm": 0.5820880923630649, + "learning_rate": 3.372860182075511e-06, + "loss": 0.5064, + "step": 7208 + }, + { + "epoch": 1.1695327709279688, + "grad_norm": 0.6094162239266583, + "learning_rate": 3.372460010407288e-06, + "loss": 0.5286, + "step": 7209 + }, + { + "epoch": 1.1696950032446463, + "grad_norm": 0.6047791974075575, + "learning_rate": 3.37205981328323e-06, + "loss": 0.484, + "step": 7210 + }, + { + "epoch": 1.1698572355613237, + "grad_norm": 0.7803507855247132, + "learning_rate": 3.371659590715013e-06, + "loss": 0.5233, + "step": 7211 + }, + { + "epoch": 1.1700194678780014, + "grad_norm": 0.5862756274587101, + "learning_rate": 3.3712593427143142e-06, + "loss": 0.5246, + "step": 7212 + }, + { + "epoch": 1.1701817001946788, + "grad_norm": 0.6100953726305487, + "learning_rate": 3.3708590692928115e-06, + "loss": 0.4873, + "step": 7213 + }, + { + "epoch": 1.1703439325113563, + "grad_norm": 0.586199253466017, + "learning_rate": 3.370458770462185e-06, + "loss": 0.513, + "step": 7214 + }, + { + "epoch": 1.1705061648280337, + "grad_norm": 0.595388917567349, + "learning_rate": 3.370058446234112e-06, + "loss": 0.5463, + "step": 7215 + }, + { + "epoch": 1.1706683971447112, + "grad_norm": 0.5753391308279946, + "learning_rate": 3.369658096620274e-06, + "loss": 0.5257, + "step": 7216 + }, + { + "epoch": 1.1708306294613888, + "grad_norm": 0.5905149128227234, + "learning_rate": 3.369257721632353e-06, + "loss": 0.5028, + "step": 7217 + }, + { + "epoch": 1.1709928617780663, + "grad_norm": 0.5732906520190805, + "learning_rate": 3.368857321282029e-06, + "loss": 0.5356, + "step": 7218 + }, + { + "epoch": 1.1711550940947437, + "grad_norm": 0.6109382226558967, + "learning_rate": 3.368456895580986e-06, + "loss": 0.5171, + "step": 7219 + }, + { + "epoch": 1.1713173264114212, + "grad_norm": 0.6352091257728745, + "learning_rate": 3.368056444540906e-06, + "loss": 0.5361, + "step": 7220 + }, + { + "epoch": 1.1714795587280986, + "grad_norm": 0.6087364304979356, + "learning_rate": 3.3676559681734744e-06, + "loss": 0.5508, + "step": 7221 + }, + { + "epoch": 1.171641791044776, + "grad_norm": 0.6253850881370102, + "learning_rate": 3.367255466490374e-06, + "loss": 0.557, + "step": 7222 + }, + { + "epoch": 1.1718040233614535, + "grad_norm": 0.6491545866744304, + "learning_rate": 3.3668549395032918e-06, + "loss": 0.5327, + "step": 7223 + }, + { + "epoch": 1.1719662556781312, + "grad_norm": 0.6303563679087789, + "learning_rate": 3.3664543872239137e-06, + "loss": 0.4978, + "step": 7224 + }, + { + "epoch": 1.1721284879948086, + "grad_norm": 0.6114117903632637, + "learning_rate": 3.3660538096639268e-06, + "loss": 0.4957, + "step": 7225 + }, + { + "epoch": 1.172290720311486, + "grad_norm": 0.5976762685148549, + "learning_rate": 3.365653206835018e-06, + "loss": 0.5092, + "step": 7226 + }, + { + "epoch": 1.1724529526281635, + "grad_norm": 0.5980710620774243, + "learning_rate": 3.365252578748876e-06, + "loss": 0.505, + "step": 7227 + }, + { + "epoch": 1.172615184944841, + "grad_norm": 0.7288034916428423, + "learning_rate": 3.3648519254171906e-06, + "loss": 0.5312, + "step": 7228 + }, + { + "epoch": 1.1727774172615184, + "grad_norm": 0.6002986912543672, + "learning_rate": 3.3644512468516515e-06, + "loss": 0.5073, + "step": 7229 + }, + { + "epoch": 1.172939649578196, + "grad_norm": 0.5852063665721413, + "learning_rate": 3.3640505430639493e-06, + "loss": 0.5254, + "step": 7230 + }, + { + "epoch": 1.1731018818948735, + "grad_norm": 0.5672459761852487, + "learning_rate": 3.3636498140657746e-06, + "loss": 0.505, + "step": 7231 + }, + { + "epoch": 1.173264114211551, + "grad_norm": 0.5868529850240857, + "learning_rate": 3.3632490598688204e-06, + "loss": 0.539, + "step": 7232 + }, + { + "epoch": 1.1734263465282284, + "grad_norm": 0.6270994099616161, + "learning_rate": 3.362848280484779e-06, + "loss": 0.5044, + "step": 7233 + }, + { + "epoch": 1.1735885788449059, + "grad_norm": 0.5814204020695091, + "learning_rate": 3.362447475925345e-06, + "loss": 0.5215, + "step": 7234 + }, + { + "epoch": 1.1737508111615833, + "grad_norm": 0.6180069308368858, + "learning_rate": 3.3620466462022107e-06, + "loss": 0.5177, + "step": 7235 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.6467788951323148, + "learning_rate": 3.3616457913270728e-06, + "loss": 0.5374, + "step": 7236 + }, + { + "epoch": 1.1740752757949384, + "grad_norm": 0.6066925177341683, + "learning_rate": 3.361244911311627e-06, + "loss": 0.4939, + "step": 7237 + }, + { + "epoch": 1.1742375081116159, + "grad_norm": 0.5882551373332837, + "learning_rate": 3.3608440061675684e-06, + "loss": 0.5323, + "step": 7238 + }, + { + "epoch": 1.1743997404282933, + "grad_norm": 0.5601829275567497, + "learning_rate": 3.360443075906597e-06, + "loss": 0.5225, + "step": 7239 + }, + { + "epoch": 1.1745619727449708, + "grad_norm": 0.5824502370806867, + "learning_rate": 3.3600421205404078e-06, + "loss": 0.5361, + "step": 7240 + }, + { + "epoch": 1.1747242050616482, + "grad_norm": 0.6161486256806298, + "learning_rate": 3.3596411400807016e-06, + "loss": 0.5374, + "step": 7241 + }, + { + "epoch": 1.1748864373783259, + "grad_norm": 0.5903581035685085, + "learning_rate": 3.3592401345391755e-06, + "loss": 0.55, + "step": 7242 + }, + { + "epoch": 1.1750486696950033, + "grad_norm": 0.614305504753898, + "learning_rate": 3.3588391039275324e-06, + "loss": 0.5338, + "step": 7243 + }, + { + "epoch": 1.1752109020116808, + "grad_norm": 0.5638065327841595, + "learning_rate": 3.358438048257472e-06, + "loss": 0.5096, + "step": 7244 + }, + { + "epoch": 1.1753731343283582, + "grad_norm": 0.5700942918601656, + "learning_rate": 3.358036967540695e-06, + "loss": 0.4937, + "step": 7245 + }, + { + "epoch": 1.1755353666450357, + "grad_norm": 0.6218701154917374, + "learning_rate": 3.3576358617889047e-06, + "loss": 0.5145, + "step": 7246 + }, + { + "epoch": 1.175697598961713, + "grad_norm": 0.5884776449746657, + "learning_rate": 3.3572347310138055e-06, + "loss": 0.4808, + "step": 7247 + }, + { + "epoch": 1.1758598312783906, + "grad_norm": 0.6047433288076427, + "learning_rate": 3.3568335752270986e-06, + "loss": 0.5319, + "step": 7248 + }, + { + "epoch": 1.1760220635950682, + "grad_norm": 0.6401432732887474, + "learning_rate": 3.3564323944404903e-06, + "loss": 0.5263, + "step": 7249 + }, + { + "epoch": 1.1761842959117457, + "grad_norm": 0.6142352090774121, + "learning_rate": 3.3560311886656855e-06, + "loss": 0.5292, + "step": 7250 + }, + { + "epoch": 1.1763465282284231, + "grad_norm": 0.591861412926995, + "learning_rate": 3.3556299579143897e-06, + "loss": 0.5164, + "step": 7251 + }, + { + "epoch": 1.1765087605451006, + "grad_norm": 0.6035688384737571, + "learning_rate": 3.355228702198311e-06, + "loss": 0.5151, + "step": 7252 + }, + { + "epoch": 1.176670992861778, + "grad_norm": 0.6282621987117147, + "learning_rate": 3.354827421529155e-06, + "loss": 0.5356, + "step": 7253 + }, + { + "epoch": 1.1768332251784555, + "grad_norm": 0.5970033140362413, + "learning_rate": 3.354426115918631e-06, + "loss": 0.5236, + "step": 7254 + }, + { + "epoch": 1.1769954574951331, + "grad_norm": 0.5949900902501991, + "learning_rate": 3.354024785378448e-06, + "loss": 0.5117, + "step": 7255 + }, + { + "epoch": 1.1771576898118106, + "grad_norm": 0.5993097121172705, + "learning_rate": 3.353623429920315e-06, + "loss": 0.4914, + "step": 7256 + }, + { + "epoch": 1.177319922128488, + "grad_norm": 0.5900226122469461, + "learning_rate": 3.3532220495559435e-06, + "loss": 0.52, + "step": 7257 + }, + { + "epoch": 1.1774821544451655, + "grad_norm": 0.5902810300562557, + "learning_rate": 3.3528206442970433e-06, + "loss": 0.5314, + "step": 7258 + }, + { + "epoch": 1.177644386761843, + "grad_norm": 0.6420650192401296, + "learning_rate": 3.352419214155328e-06, + "loss": 0.5346, + "step": 7259 + }, + { + "epoch": 1.1778066190785204, + "grad_norm": 0.5863035662816384, + "learning_rate": 3.3520177591425074e-06, + "loss": 0.5012, + "step": 7260 + }, + { + "epoch": 1.1779688513951978, + "grad_norm": 0.6104118607599481, + "learning_rate": 3.351616279270297e-06, + "loss": 0.5025, + "step": 7261 + }, + { + "epoch": 1.1781310837118755, + "grad_norm": 0.593661756265048, + "learning_rate": 3.3512147745504107e-06, + "loss": 0.5283, + "step": 7262 + }, + { + "epoch": 1.178293316028553, + "grad_norm": 0.557827540784861, + "learning_rate": 3.3508132449945623e-06, + "loss": 0.5128, + "step": 7263 + }, + { + "epoch": 1.1784555483452304, + "grad_norm": 0.5689002369663875, + "learning_rate": 3.3504116906144678e-06, + "loss": 0.5158, + "step": 7264 + }, + { + "epoch": 1.1786177806619078, + "grad_norm": 0.5997995824934333, + "learning_rate": 3.350010111421843e-06, + "loss": 0.5323, + "step": 7265 + }, + { + "epoch": 1.1787800129785853, + "grad_norm": 0.5918430961858273, + "learning_rate": 3.3496085074284053e-06, + "loss": 0.5472, + "step": 7266 + }, + { + "epoch": 1.178942245295263, + "grad_norm": 0.6060674387615212, + "learning_rate": 3.349206878645872e-06, + "loss": 0.5352, + "step": 7267 + }, + { + "epoch": 1.1791044776119404, + "grad_norm": 0.5936794050385346, + "learning_rate": 3.3488052250859614e-06, + "loss": 0.5095, + "step": 7268 + }, + { + "epoch": 1.1792667099286178, + "grad_norm": 0.5822701132261741, + "learning_rate": 3.3484035467603924e-06, + "loss": 0.5565, + "step": 7269 + }, + { + "epoch": 1.1794289422452953, + "grad_norm": 0.5822204715234619, + "learning_rate": 3.348001843680887e-06, + "loss": 0.5282, + "step": 7270 + }, + { + "epoch": 1.1795911745619727, + "grad_norm": 0.6600841999680258, + "learning_rate": 3.347600115859162e-06, + "loss": 0.4901, + "step": 7271 + }, + { + "epoch": 1.1797534068786502, + "grad_norm": 0.6623319708498471, + "learning_rate": 3.3471983633069414e-06, + "loss": 0.527, + "step": 7272 + }, + { + "epoch": 1.1799156391953276, + "grad_norm": 0.6159557580639917, + "learning_rate": 3.346796586035946e-06, + "loss": 0.526, + "step": 7273 + }, + { + "epoch": 1.1800778715120053, + "grad_norm": 0.5954997553742762, + "learning_rate": 3.346394784057898e-06, + "loss": 0.5279, + "step": 7274 + }, + { + "epoch": 1.1802401038286827, + "grad_norm": 0.5818684865407068, + "learning_rate": 3.3459929573845233e-06, + "loss": 0.5204, + "step": 7275 + }, + { + "epoch": 1.1804023361453602, + "grad_norm": 0.5804294454638439, + "learning_rate": 3.3455911060275433e-06, + "loss": 0.5225, + "step": 7276 + }, + { + "epoch": 1.1805645684620376, + "grad_norm": 0.605162154608931, + "learning_rate": 3.3451892299986844e-06, + "loss": 0.5058, + "step": 7277 + }, + { + "epoch": 1.180726800778715, + "grad_norm": 0.5782954419071025, + "learning_rate": 3.3447873293096704e-06, + "loss": 0.5021, + "step": 7278 + }, + { + "epoch": 1.1808890330953927, + "grad_norm": 0.5836875627938302, + "learning_rate": 3.34438540397223e-06, + "loss": 0.5361, + "step": 7279 + }, + { + "epoch": 1.1810512654120702, + "grad_norm": 0.6275675728541098, + "learning_rate": 3.3439834539980887e-06, + "loss": 0.536, + "step": 7280 + }, + { + "epoch": 1.1812134977287476, + "grad_norm": 0.6089845880057739, + "learning_rate": 3.3435814793989744e-06, + "loss": 0.5279, + "step": 7281 + }, + { + "epoch": 1.181375730045425, + "grad_norm": 0.5876246146324209, + "learning_rate": 3.343179480186616e-06, + "loss": 0.5349, + "step": 7282 + }, + { + "epoch": 1.1815379623621025, + "grad_norm": 0.5935380829352059, + "learning_rate": 3.3427774563727415e-06, + "loss": 0.5276, + "step": 7283 + }, + { + "epoch": 1.18170019467878, + "grad_norm": 0.5960377824129067, + "learning_rate": 3.342375407969082e-06, + "loss": 0.5193, + "step": 7284 + }, + { + "epoch": 1.1818624269954574, + "grad_norm": 0.587142919124341, + "learning_rate": 3.3419733349873685e-06, + "loss": 0.4851, + "step": 7285 + }, + { + "epoch": 1.182024659312135, + "grad_norm": 0.5956669733277258, + "learning_rate": 3.341571237439331e-06, + "loss": 0.5191, + "step": 7286 + }, + { + "epoch": 1.1821868916288125, + "grad_norm": 0.5651166086688073, + "learning_rate": 3.3411691153367026e-06, + "loss": 0.5403, + "step": 7287 + }, + { + "epoch": 1.18234912394549, + "grad_norm": 0.5885121403354616, + "learning_rate": 3.3407669686912148e-06, + "loss": 0.5046, + "step": 7288 + }, + { + "epoch": 1.1825113562621674, + "grad_norm": 0.6029655178858448, + "learning_rate": 3.340364797514602e-06, + "loss": 0.5062, + "step": 7289 + }, + { + "epoch": 1.1826735885788449, + "grad_norm": 0.6007429813647555, + "learning_rate": 3.339962601818598e-06, + "loss": 0.5201, + "step": 7290 + }, + { + "epoch": 1.1828358208955223, + "grad_norm": 0.6051303474939894, + "learning_rate": 3.3395603816149386e-06, + "loss": 0.5276, + "step": 7291 + }, + { + "epoch": 1.1829980532122, + "grad_norm": 0.5676209288097165, + "learning_rate": 3.339158136915358e-06, + "loss": 0.5374, + "step": 7292 + }, + { + "epoch": 1.1831602855288774, + "grad_norm": 0.5892923764417358, + "learning_rate": 3.3387558677315936e-06, + "loss": 0.5511, + "step": 7293 + }, + { + "epoch": 1.1833225178455549, + "grad_norm": 0.5993161081714871, + "learning_rate": 3.3383535740753813e-06, + "loss": 0.5214, + "step": 7294 + }, + { + "epoch": 1.1834847501622323, + "grad_norm": 0.5768926682633663, + "learning_rate": 3.337951255958461e-06, + "loss": 0.4918, + "step": 7295 + }, + { + "epoch": 1.1836469824789098, + "grad_norm": 0.5691515663374902, + "learning_rate": 3.3375489133925685e-06, + "loss": 0.523, + "step": 7296 + }, + { + "epoch": 1.1838092147955872, + "grad_norm": 0.5925397117024078, + "learning_rate": 3.337146546389445e-06, + "loss": 0.5339, + "step": 7297 + }, + { + "epoch": 1.1839714471122647, + "grad_norm": 0.5582475490971484, + "learning_rate": 3.3367441549608293e-06, + "loss": 0.5154, + "step": 7298 + }, + { + "epoch": 1.1841336794289423, + "grad_norm": 0.684648928248899, + "learning_rate": 3.3363417391184627e-06, + "loss": 0.5335, + "step": 7299 + }, + { + "epoch": 1.1842959117456198, + "grad_norm": 0.5658945474489223, + "learning_rate": 3.3359392988740856e-06, + "loss": 0.5204, + "step": 7300 + }, + { + "epoch": 1.1844581440622972, + "grad_norm": 0.5901083621103241, + "learning_rate": 3.335536834239441e-06, + "loss": 0.538, + "step": 7301 + }, + { + "epoch": 1.1846203763789747, + "grad_norm": 0.6274092772308486, + "learning_rate": 3.3351343452262713e-06, + "loss": 0.5351, + "step": 7302 + }, + { + "epoch": 1.184782608695652, + "grad_norm": 0.5999024471171764, + "learning_rate": 3.3347318318463197e-06, + "loss": 0.5404, + "step": 7303 + }, + { + "epoch": 1.1849448410123298, + "grad_norm": 0.6000796404005019, + "learning_rate": 3.3343292941113308e-06, + "loss": 0.5548, + "step": 7304 + }, + { + "epoch": 1.1851070733290072, + "grad_norm": 0.6063817026827467, + "learning_rate": 3.3339267320330495e-06, + "loss": 0.4791, + "step": 7305 + }, + { + "epoch": 1.1852693056456847, + "grad_norm": 0.618797685216283, + "learning_rate": 3.3335241456232203e-06, + "loss": 0.4994, + "step": 7306 + }, + { + "epoch": 1.1854315379623621, + "grad_norm": 0.5928957972391803, + "learning_rate": 3.333121534893591e-06, + "loss": 0.5183, + "step": 7307 + }, + { + "epoch": 1.1855937702790396, + "grad_norm": 0.5980895904469758, + "learning_rate": 3.3327188998559072e-06, + "loss": 0.5099, + "step": 7308 + }, + { + "epoch": 1.185756002595717, + "grad_norm": 0.5892275625447837, + "learning_rate": 3.332316240521918e-06, + "loss": 0.5276, + "step": 7309 + }, + { + "epoch": 1.1859182349123945, + "grad_norm": 0.6278310244757442, + "learning_rate": 3.331913556903371e-06, + "loss": 0.4792, + "step": 7310 + }, + { + "epoch": 1.1860804672290721, + "grad_norm": 0.6332043391960672, + "learning_rate": 3.3315108490120157e-06, + "loss": 0.515, + "step": 7311 + }, + { + "epoch": 1.1862426995457496, + "grad_norm": 0.6047031957299396, + "learning_rate": 3.3311081168596016e-06, + "loss": 0.5232, + "step": 7312 + }, + { + "epoch": 1.186404931862427, + "grad_norm": 0.5636960342382913, + "learning_rate": 3.33070536045788e-06, + "loss": 0.519, + "step": 7313 + }, + { + "epoch": 1.1865671641791045, + "grad_norm": 0.5849489654645921, + "learning_rate": 3.3303025798186e-06, + "loss": 0.5359, + "step": 7314 + }, + { + "epoch": 1.186729396495782, + "grad_norm": 0.5768147493462431, + "learning_rate": 3.3298997749535163e-06, + "loss": 0.528, + "step": 7315 + }, + { + "epoch": 1.1868916288124594, + "grad_norm": 0.5847824582761372, + "learning_rate": 3.32949694587438e-06, + "loss": 0.5318, + "step": 7316 + }, + { + "epoch": 1.187053861129137, + "grad_norm": 0.5662368023183787, + "learning_rate": 3.3290940925929453e-06, + "loss": 0.506, + "step": 7317 + }, + { + "epoch": 1.1872160934458145, + "grad_norm": 0.5790409911961327, + "learning_rate": 3.3286912151209656e-06, + "loss": 0.512, + "step": 7318 + }, + { + "epoch": 1.187378325762492, + "grad_norm": 0.6237523133159253, + "learning_rate": 3.3282883134701954e-06, + "loss": 0.5366, + "step": 7319 + }, + { + "epoch": 1.1875405580791694, + "grad_norm": 0.6009336553125753, + "learning_rate": 3.3278853876523915e-06, + "loss": 0.5467, + "step": 7320 + }, + { + "epoch": 1.1877027903958468, + "grad_norm": 0.5901981323347213, + "learning_rate": 3.327482437679309e-06, + "loss": 0.5625, + "step": 7321 + }, + { + "epoch": 1.1878650227125243, + "grad_norm": 0.5895919218464618, + "learning_rate": 3.327079463562705e-06, + "loss": 0.4965, + "step": 7322 + }, + { + "epoch": 1.1880272550292017, + "grad_norm": 0.5807189628356356, + "learning_rate": 3.3266764653143372e-06, + "loss": 0.4819, + "step": 7323 + }, + { + "epoch": 1.1881894873458794, + "grad_norm": 0.6312370192346751, + "learning_rate": 3.326273442945964e-06, + "loss": 0.5404, + "step": 7324 + }, + { + "epoch": 1.1883517196625568, + "grad_norm": 0.5936199746518857, + "learning_rate": 3.3258703964693443e-06, + "loss": 0.5266, + "step": 7325 + }, + { + "epoch": 1.1885139519792343, + "grad_norm": 0.6061075844607707, + "learning_rate": 3.3254673258962385e-06, + "loss": 0.5211, + "step": 7326 + }, + { + "epoch": 1.1886761842959117, + "grad_norm": 0.5882706195944778, + "learning_rate": 3.3250642312384064e-06, + "loss": 0.5317, + "step": 7327 + }, + { + "epoch": 1.1888384166125892, + "grad_norm": 0.5901622227016367, + "learning_rate": 3.3246611125076083e-06, + "loss": 0.5205, + "step": 7328 + }, + { + "epoch": 1.1890006489292668, + "grad_norm": 0.5739969964809631, + "learning_rate": 3.3242579697156073e-06, + "loss": 0.5307, + "step": 7329 + }, + { + "epoch": 1.1891628812459443, + "grad_norm": 0.6212412378284645, + "learning_rate": 3.3238548028741653e-06, + "loss": 0.505, + "step": 7330 + }, + { + "epoch": 1.1893251135626217, + "grad_norm": 0.6124393384894127, + "learning_rate": 3.323451611995046e-06, + "loss": 0.4605, + "step": 7331 + }, + { + "epoch": 1.1894873458792992, + "grad_norm": 0.6162882994380967, + "learning_rate": 3.323048397090013e-06, + "loss": 0.5371, + "step": 7332 + }, + { + "epoch": 1.1896495781959766, + "grad_norm": 0.6081632560365553, + "learning_rate": 3.322645158170831e-06, + "loss": 0.5422, + "step": 7333 + }, + { + "epoch": 1.189811810512654, + "grad_norm": 0.6049652258977323, + "learning_rate": 3.322241895249265e-06, + "loss": 0.5638, + "step": 7334 + }, + { + "epoch": 1.1899740428293315, + "grad_norm": 0.5914560598386694, + "learning_rate": 3.3218386083370814e-06, + "loss": 0.5109, + "step": 7335 + }, + { + "epoch": 1.1901362751460092, + "grad_norm": 0.6276733425364829, + "learning_rate": 3.3214352974460472e-06, + "loss": 0.5128, + "step": 7336 + }, + { + "epoch": 1.1902985074626866, + "grad_norm": 0.5846458660090468, + "learning_rate": 3.321031962587929e-06, + "loss": 0.5373, + "step": 7337 + }, + { + "epoch": 1.190460739779364, + "grad_norm": 0.6283057052661091, + "learning_rate": 3.320628603774496e-06, + "loss": 0.5094, + "step": 7338 + }, + { + "epoch": 1.1906229720960415, + "grad_norm": 0.5848965613689171, + "learning_rate": 3.320225221017516e-06, + "loss": 0.5318, + "step": 7339 + }, + { + "epoch": 1.190785204412719, + "grad_norm": 0.6170397629138701, + "learning_rate": 3.31982181432876e-06, + "loss": 0.5324, + "step": 7340 + }, + { + "epoch": 1.1909474367293964, + "grad_norm": 0.6139544769023393, + "learning_rate": 3.319418383719996e-06, + "loss": 0.5548, + "step": 7341 + }, + { + "epoch": 1.191109669046074, + "grad_norm": 0.6166323330145244, + "learning_rate": 3.319014929202997e-06, + "loss": 0.5299, + "step": 7342 + }, + { + "epoch": 1.1912719013627515, + "grad_norm": 0.5990204027932865, + "learning_rate": 3.3186114507895335e-06, + "loss": 0.5062, + "step": 7343 + }, + { + "epoch": 1.191434133679429, + "grad_norm": 0.591954202222516, + "learning_rate": 3.318207948491379e-06, + "loss": 0.4833, + "step": 7344 + }, + { + "epoch": 1.1915963659961064, + "grad_norm": 0.6385935970554785, + "learning_rate": 3.3178044223203038e-06, + "loss": 0.5491, + "step": 7345 + }, + { + "epoch": 1.1917585983127839, + "grad_norm": 0.5809831164661985, + "learning_rate": 3.3174008722880848e-06, + "loss": 0.5148, + "step": 7346 + }, + { + "epoch": 1.1919208306294613, + "grad_norm": 0.5935711288187682, + "learning_rate": 3.316997298406495e-06, + "loss": 0.4824, + "step": 7347 + }, + { + "epoch": 1.1920830629461387, + "grad_norm": 0.5985374164154951, + "learning_rate": 3.3165937006873085e-06, + "loss": 0.4908, + "step": 7348 + }, + { + "epoch": 1.1922452952628164, + "grad_norm": 0.6306130070286353, + "learning_rate": 3.3161900791423035e-06, + "loss": 0.5562, + "step": 7349 + }, + { + "epoch": 1.1924075275794939, + "grad_norm": 0.5948138191883408, + "learning_rate": 3.3157864337832546e-06, + "loss": 0.4982, + "step": 7350 + }, + { + "epoch": 1.1925697598961713, + "grad_norm": 0.611596778201077, + "learning_rate": 3.31538276462194e-06, + "loss": 0.5314, + "step": 7351 + }, + { + "epoch": 1.1927319922128488, + "grad_norm": 0.6368913257261278, + "learning_rate": 3.3149790716701365e-06, + "loss": 0.4985, + "step": 7352 + }, + { + "epoch": 1.1928942245295262, + "grad_norm": 0.5888769729701948, + "learning_rate": 3.314575354939623e-06, + "loss": 0.542, + "step": 7353 + }, + { + "epoch": 1.1930564568462039, + "grad_norm": 0.5976714079087535, + "learning_rate": 3.3141716144421805e-06, + "loss": 0.5173, + "step": 7354 + }, + { + "epoch": 1.1932186891628813, + "grad_norm": 0.570142300989732, + "learning_rate": 3.3137678501895863e-06, + "loss": 0.5271, + "step": 7355 + }, + { + "epoch": 1.1933809214795588, + "grad_norm": 0.6077385804543776, + "learning_rate": 3.3133640621936235e-06, + "loss": 0.535, + "step": 7356 + }, + { + "epoch": 1.1935431537962362, + "grad_norm": 0.6094980486278023, + "learning_rate": 3.312960250466071e-06, + "loss": 0.5367, + "step": 7357 + }, + { + "epoch": 1.1937053861129137, + "grad_norm": 0.5911949418407066, + "learning_rate": 3.312556415018713e-06, + "loss": 0.478, + "step": 7358 + }, + { + "epoch": 1.193867618429591, + "grad_norm": 0.6027129832578352, + "learning_rate": 3.312152555863331e-06, + "loss": 0.5541, + "step": 7359 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.5958611869297513, + "learning_rate": 3.3117486730117092e-06, + "loss": 0.5547, + "step": 7360 + }, + { + "epoch": 1.1941920830629462, + "grad_norm": 0.6126418954945472, + "learning_rate": 3.3113447664756305e-06, + "loss": 0.5008, + "step": 7361 + }, + { + "epoch": 1.1943543153796237, + "grad_norm": 0.6028448660688543, + "learning_rate": 3.310940836266882e-06, + "loss": 0.5233, + "step": 7362 + }, + { + "epoch": 1.1945165476963011, + "grad_norm": 0.5953397235208677, + "learning_rate": 3.3105368823972467e-06, + "loss": 0.5355, + "step": 7363 + }, + { + "epoch": 1.1946787800129786, + "grad_norm": 0.6162889260536444, + "learning_rate": 3.310132904878512e-06, + "loss": 0.5126, + "step": 7364 + }, + { + "epoch": 1.194841012329656, + "grad_norm": 0.6254974833972984, + "learning_rate": 3.3097289037224647e-06, + "loss": 0.5077, + "step": 7365 + }, + { + "epoch": 1.1950032446463337, + "grad_norm": 0.6048376405005613, + "learning_rate": 3.3093248789408923e-06, + "loss": 0.5407, + "step": 7366 + }, + { + "epoch": 1.1951654769630111, + "grad_norm": 0.5918191485503927, + "learning_rate": 3.3089208305455834e-06, + "loss": 0.5553, + "step": 7367 + }, + { + "epoch": 1.1953277092796886, + "grad_norm": 0.6210143783735378, + "learning_rate": 3.3085167585483257e-06, + "loss": 0.5409, + "step": 7368 + }, + { + "epoch": 1.195489941596366, + "grad_norm": 0.5897320843677367, + "learning_rate": 3.308112662960911e-06, + "loss": 0.5019, + "step": 7369 + }, + { + "epoch": 1.1956521739130435, + "grad_norm": 0.6008896281113101, + "learning_rate": 3.3077085437951275e-06, + "loss": 0.5523, + "step": 7370 + }, + { + "epoch": 1.195814406229721, + "grad_norm": 0.616828398564899, + "learning_rate": 3.3073044010627675e-06, + "loss": 0.5026, + "step": 7371 + }, + { + "epoch": 1.1959766385463984, + "grad_norm": 0.6300208612794286, + "learning_rate": 3.3069002347756224e-06, + "loss": 0.527, + "step": 7372 + }, + { + "epoch": 1.196138870863076, + "grad_norm": 0.6060526353693018, + "learning_rate": 3.306496044945484e-06, + "loss": 0.4915, + "step": 7373 + }, + { + "epoch": 1.1963011031797535, + "grad_norm": 0.6100256271232292, + "learning_rate": 3.3060918315841466e-06, + "loss": 0.5337, + "step": 7374 + }, + { + "epoch": 1.196463335496431, + "grad_norm": 0.6428420368178702, + "learning_rate": 3.3056875947034025e-06, + "loss": 0.4761, + "step": 7375 + }, + { + "epoch": 1.1966255678131084, + "grad_norm": 0.6207179285412382, + "learning_rate": 3.305283334315047e-06, + "loss": 0.5405, + "step": 7376 + }, + { + "epoch": 1.1967878001297858, + "grad_norm": 0.6184798814295802, + "learning_rate": 3.304879050430876e-06, + "loss": 0.5243, + "step": 7377 + }, + { + "epoch": 1.1969500324464633, + "grad_norm": 0.6017654645632612, + "learning_rate": 3.304474743062684e-06, + "loss": 0.546, + "step": 7378 + }, + { + "epoch": 1.197112264763141, + "grad_norm": 0.6331550600336432, + "learning_rate": 3.304070412222268e-06, + "loss": 0.5456, + "step": 7379 + }, + { + "epoch": 1.1972744970798184, + "grad_norm": 0.6116770742080792, + "learning_rate": 3.3036660579214247e-06, + "loss": 0.5348, + "step": 7380 + }, + { + "epoch": 1.1974367293964958, + "grad_norm": 0.6018818617801083, + "learning_rate": 3.3032616801719525e-06, + "loss": 0.5486, + "step": 7381 + }, + { + "epoch": 1.1975989617131733, + "grad_norm": 0.6044212767024654, + "learning_rate": 3.3028572789856507e-06, + "loss": 0.5553, + "step": 7382 + }, + { + "epoch": 1.1977611940298507, + "grad_norm": 0.5853228029094604, + "learning_rate": 3.3024528543743166e-06, + "loss": 0.5145, + "step": 7383 + }, + { + "epoch": 1.1979234263465282, + "grad_norm": 0.5799197863536422, + "learning_rate": 3.302048406349752e-06, + "loss": 0.5157, + "step": 7384 + }, + { + "epoch": 1.1980856586632056, + "grad_norm": 0.5944228033329256, + "learning_rate": 3.3016439349237574e-06, + "loss": 0.5333, + "step": 7385 + }, + { + "epoch": 1.1982478909798833, + "grad_norm": 0.5866615538165607, + "learning_rate": 3.301239440108132e-06, + "loss": 0.536, + "step": 7386 + }, + { + "epoch": 1.1984101232965607, + "grad_norm": 0.6193497294575891, + "learning_rate": 3.3008349219146806e-06, + "loss": 0.4837, + "step": 7387 + }, + { + "epoch": 1.1985723556132382, + "grad_norm": 0.6311872643539032, + "learning_rate": 3.300430380355204e-06, + "loss": 0.541, + "step": 7388 + }, + { + "epoch": 1.1987345879299156, + "grad_norm": 0.6123631123939502, + "learning_rate": 3.300025815441506e-06, + "loss": 0.5333, + "step": 7389 + }, + { + "epoch": 1.198896820246593, + "grad_norm": 0.6027396668342957, + "learning_rate": 3.2996212271853907e-06, + "loss": 0.5096, + "step": 7390 + }, + { + "epoch": 1.1990590525632707, + "grad_norm": 0.6186200272608684, + "learning_rate": 3.2992166155986625e-06, + "loss": 0.56, + "step": 7391 + }, + { + "epoch": 1.1992212848799482, + "grad_norm": 0.5864443386519684, + "learning_rate": 3.298811980693128e-06, + "loss": 0.5087, + "step": 7392 + }, + { + "epoch": 1.1993835171966256, + "grad_norm": 0.5964189740209186, + "learning_rate": 3.2984073224805914e-06, + "loss": 0.52, + "step": 7393 + }, + { + "epoch": 1.199545749513303, + "grad_norm": 0.5850358800409823, + "learning_rate": 3.2980026409728604e-06, + "loss": 0.5577, + "step": 7394 + }, + { + "epoch": 1.1997079818299805, + "grad_norm": 0.6140055930118801, + "learning_rate": 3.2975979361817433e-06, + "loss": 0.5032, + "step": 7395 + }, + { + "epoch": 1.199870214146658, + "grad_norm": 0.5731663553562635, + "learning_rate": 3.2971932081190474e-06, + "loss": 0.5251, + "step": 7396 + }, + { + "epoch": 1.2000324464633354, + "grad_norm": 0.5976808091278792, + "learning_rate": 3.2967884567965803e-06, + "loss": 0.5373, + "step": 7397 + }, + { + "epoch": 1.200194678780013, + "grad_norm": 0.6072891495717785, + "learning_rate": 3.296383682226153e-06, + "loss": 0.5348, + "step": 7398 + }, + { + "epoch": 1.2003569110966905, + "grad_norm": 0.5857866578616434, + "learning_rate": 3.295978884419575e-06, + "loss": 0.5217, + "step": 7399 + }, + { + "epoch": 1.200519143413368, + "grad_norm": 0.583537332995448, + "learning_rate": 3.295574063388658e-06, + "loss": 0.5217, + "step": 7400 + }, + { + "epoch": 1.2006813757300454, + "grad_norm": 0.6167259209977358, + "learning_rate": 3.2951692191452124e-06, + "loss": 0.5253, + "step": 7401 + }, + { + "epoch": 1.2008436080467229, + "grad_norm": 0.557670561323841, + "learning_rate": 3.294764351701051e-06, + "loss": 0.548, + "step": 7402 + }, + { + "epoch": 1.2010058403634003, + "grad_norm": 0.5815582417541901, + "learning_rate": 3.2943594610679862e-06, + "loss": 0.5035, + "step": 7403 + }, + { + "epoch": 1.201168072680078, + "grad_norm": 0.5925228152247011, + "learning_rate": 3.2939545472578314e-06, + "loss": 0.5111, + "step": 7404 + }, + { + "epoch": 1.2013303049967554, + "grad_norm": 0.5807609816898859, + "learning_rate": 3.293549610282402e-06, + "loss": 0.5562, + "step": 7405 + }, + { + "epoch": 1.2014925373134329, + "grad_norm": 0.6017871329286978, + "learning_rate": 3.2931446501535114e-06, + "loss": 0.5155, + "step": 7406 + }, + { + "epoch": 1.2016547696301103, + "grad_norm": 0.5967853671366986, + "learning_rate": 3.2927396668829763e-06, + "loss": 0.5076, + "step": 7407 + }, + { + "epoch": 1.2018170019467878, + "grad_norm": 0.5783549704119942, + "learning_rate": 3.2923346604826124e-06, + "loss": 0.4914, + "step": 7408 + }, + { + "epoch": 1.2019792342634652, + "grad_norm": 0.6104768424197954, + "learning_rate": 3.2919296309642364e-06, + "loss": 0.527, + "step": 7409 + }, + { + "epoch": 1.2021414665801426, + "grad_norm": 0.6196200109985142, + "learning_rate": 3.291524578339666e-06, + "loss": 0.539, + "step": 7410 + }, + { + "epoch": 1.2023036988968203, + "grad_norm": 0.6022759371538208, + "learning_rate": 3.2911195026207192e-06, + "loss": 0.5603, + "step": 7411 + }, + { + "epoch": 1.2024659312134978, + "grad_norm": 0.6101973185466264, + "learning_rate": 3.2907144038192163e-06, + "loss": 0.5148, + "step": 7412 + }, + { + "epoch": 1.2026281635301752, + "grad_norm": 0.5904734215331309, + "learning_rate": 3.290309281946975e-06, + "loss": 0.506, + "step": 7413 + }, + { + "epoch": 1.2027903958468527, + "grad_norm": 0.5749571640130522, + "learning_rate": 3.2899041370158168e-06, + "loss": 0.5088, + "step": 7414 + }, + { + "epoch": 1.20295262816353, + "grad_norm": 0.6065642775890565, + "learning_rate": 3.289498969037563e-06, + "loss": 0.5177, + "step": 7415 + }, + { + "epoch": 1.2031148604802078, + "grad_norm": 0.6149955574836181, + "learning_rate": 3.2890937780240338e-06, + "loss": 0.5126, + "step": 7416 + }, + { + "epoch": 1.2032770927968852, + "grad_norm": 0.6076469662661206, + "learning_rate": 3.288688563987052e-06, + "loss": 0.5227, + "step": 7417 + }, + { + "epoch": 1.2034393251135627, + "grad_norm": 0.5843561409520026, + "learning_rate": 3.288283326938441e-06, + "loss": 0.5116, + "step": 7418 + }, + { + "epoch": 1.20360155743024, + "grad_norm": 0.612759128227656, + "learning_rate": 3.2878780668900247e-06, + "loss": 0.5307, + "step": 7419 + }, + { + "epoch": 1.2037637897469176, + "grad_norm": 0.610379066959914, + "learning_rate": 3.287472783853626e-06, + "loss": 0.5343, + "step": 7420 + }, + { + "epoch": 1.203926022063595, + "grad_norm": 0.5712416487778776, + "learning_rate": 3.2870674778410714e-06, + "loss": 0.5252, + "step": 7421 + }, + { + "epoch": 1.2040882543802724, + "grad_norm": 0.6072482000488002, + "learning_rate": 3.286662148864186e-06, + "loss": 0.5424, + "step": 7422 + }, + { + "epoch": 1.2042504866969501, + "grad_norm": 0.6300404108918383, + "learning_rate": 3.2862567969347967e-06, + "loss": 0.5409, + "step": 7423 + }, + { + "epoch": 1.2044127190136276, + "grad_norm": 0.6100764178039064, + "learning_rate": 3.2858514220647285e-06, + "loss": 0.5268, + "step": 7424 + }, + { + "epoch": 1.204574951330305, + "grad_norm": 0.5898947771948754, + "learning_rate": 3.2854460242658116e-06, + "loss": 0.5029, + "step": 7425 + }, + { + "epoch": 1.2047371836469825, + "grad_norm": 0.5923719677912057, + "learning_rate": 3.285040603549872e-06, + "loss": 0.5322, + "step": 7426 + }, + { + "epoch": 1.20489941596366, + "grad_norm": 0.5847017046240033, + "learning_rate": 3.2846351599287407e-06, + "loss": 0.5311, + "step": 7427 + }, + { + "epoch": 1.2050616482803373, + "grad_norm": 0.5976320208171435, + "learning_rate": 3.2842296934142464e-06, + "loss": 0.516, + "step": 7428 + }, + { + "epoch": 1.205223880597015, + "grad_norm": 0.6208686286871604, + "learning_rate": 3.2838242040182188e-06, + "loss": 0.5122, + "step": 7429 + }, + { + "epoch": 1.2053861129136925, + "grad_norm": 0.6263850813141814, + "learning_rate": 3.283418691752491e-06, + "loss": 0.5431, + "step": 7430 + }, + { + "epoch": 1.20554834523037, + "grad_norm": 0.5830550997593861, + "learning_rate": 3.2830131566288915e-06, + "loss": 0.5095, + "step": 7431 + }, + { + "epoch": 1.2057105775470474, + "grad_norm": 0.5946064977549768, + "learning_rate": 3.2826075986592554e-06, + "loss": 0.5161, + "step": 7432 + }, + { + "epoch": 1.2058728098637248, + "grad_norm": 0.5733328825092304, + "learning_rate": 3.282202017855415e-06, + "loss": 0.4973, + "step": 7433 + }, + { + "epoch": 1.2060350421804023, + "grad_norm": 0.5809089036191597, + "learning_rate": 3.281796414229203e-06, + "loss": 0.5064, + "step": 7434 + }, + { + "epoch": 1.2061972744970797, + "grad_norm": 0.5897948756436986, + "learning_rate": 3.2813907877924537e-06, + "loss": 0.513, + "step": 7435 + }, + { + "epoch": 1.2063595068137574, + "grad_norm": 0.5959218414149055, + "learning_rate": 3.2809851385570047e-06, + "loss": 0.5114, + "step": 7436 + }, + { + "epoch": 1.2065217391304348, + "grad_norm": 0.5772005171951382, + "learning_rate": 3.280579466534688e-06, + "loss": 0.5131, + "step": 7437 + }, + { + "epoch": 1.2066839714471123, + "grad_norm": 0.5821876495552969, + "learning_rate": 3.280173771737343e-06, + "loss": 0.5321, + "step": 7438 + }, + { + "epoch": 1.2068462037637897, + "grad_norm": 0.6109412901518974, + "learning_rate": 3.2797680541768044e-06, + "loss": 0.5388, + "step": 7439 + }, + { + "epoch": 1.2070084360804672, + "grad_norm": 0.6084427962995732, + "learning_rate": 3.279362313864911e-06, + "loss": 0.5353, + "step": 7440 + }, + { + "epoch": 1.2071706683971448, + "grad_norm": 0.6029729628678512, + "learning_rate": 3.278956550813502e-06, + "loss": 0.5202, + "step": 7441 + }, + { + "epoch": 1.2073329007138223, + "grad_norm": 0.6463574423256966, + "learning_rate": 3.278550765034414e-06, + "loss": 0.5155, + "step": 7442 + }, + { + "epoch": 1.2074951330304997, + "grad_norm": 0.5868798747738702, + "learning_rate": 3.2781449565394884e-06, + "loss": 0.5118, + "step": 7443 + }, + { + "epoch": 1.2076573653471772, + "grad_norm": 0.5843119570315647, + "learning_rate": 3.2777391253405648e-06, + "loss": 0.5008, + "step": 7444 + }, + { + "epoch": 1.2078195976638546, + "grad_norm": 0.6059778883268634, + "learning_rate": 3.277333271449485e-06, + "loss": 0.5215, + "step": 7445 + }, + { + "epoch": 1.207981829980532, + "grad_norm": 0.6062725986257406, + "learning_rate": 3.27692739487809e-06, + "loss": 0.5704, + "step": 7446 + }, + { + "epoch": 1.2081440622972095, + "grad_norm": 0.6381482868612763, + "learning_rate": 3.276521495638221e-06, + "loss": 0.5513, + "step": 7447 + }, + { + "epoch": 1.2083062946138872, + "grad_norm": 0.5668722669475399, + "learning_rate": 3.276115573741724e-06, + "loss": 0.5622, + "step": 7448 + }, + { + "epoch": 1.2084685269305646, + "grad_norm": 0.5818896063510492, + "learning_rate": 3.2757096292004393e-06, + "loss": 0.5301, + "step": 7449 + }, + { + "epoch": 1.208630759247242, + "grad_norm": 0.5711605213066732, + "learning_rate": 3.275303662026213e-06, + "loss": 0.5262, + "step": 7450 + }, + { + "epoch": 1.2087929915639195, + "grad_norm": 0.5888474452204767, + "learning_rate": 3.27489767223089e-06, + "loss": 0.5294, + "step": 7451 + }, + { + "epoch": 1.208955223880597, + "grad_norm": 0.5824001267227643, + "learning_rate": 3.2744916598263154e-06, + "loss": 0.5299, + "step": 7452 + }, + { + "epoch": 1.2091174561972746, + "grad_norm": 0.6086641351178249, + "learning_rate": 3.274085624824335e-06, + "loss": 0.5468, + "step": 7453 + }, + { + "epoch": 1.209279688513952, + "grad_norm": 0.573180385483084, + "learning_rate": 3.273679567236797e-06, + "loss": 0.5163, + "step": 7454 + }, + { + "epoch": 1.2094419208306295, + "grad_norm": 0.5761647763914622, + "learning_rate": 3.273273487075548e-06, + "loss": 0.5383, + "step": 7455 + }, + { + "epoch": 1.209604153147307, + "grad_norm": 0.5777417759735356, + "learning_rate": 3.2728673843524367e-06, + "loss": 0.5505, + "step": 7456 + }, + { + "epoch": 1.2097663854639844, + "grad_norm": 0.5723875878560329, + "learning_rate": 3.2724612590793113e-06, + "loss": 0.5343, + "step": 7457 + }, + { + "epoch": 1.2099286177806619, + "grad_norm": 0.6290010911146159, + "learning_rate": 3.2720551112680223e-06, + "loss": 0.5388, + "step": 7458 + }, + { + "epoch": 1.2100908500973393, + "grad_norm": 0.5795048516463449, + "learning_rate": 3.2716489409304198e-06, + "loss": 0.4911, + "step": 7459 + }, + { + "epoch": 1.210253082414017, + "grad_norm": 0.5880494164342102, + "learning_rate": 3.2712427480783537e-06, + "loss": 0.5344, + "step": 7460 + }, + { + "epoch": 1.2104153147306944, + "grad_norm": 0.6682725177137444, + "learning_rate": 3.270836532723677e-06, + "loss": 0.5469, + "step": 7461 + }, + { + "epoch": 1.2105775470473719, + "grad_norm": 0.5646666323384331, + "learning_rate": 3.2704302948782394e-06, + "loss": 0.5291, + "step": 7462 + }, + { + "epoch": 1.2107397793640493, + "grad_norm": 0.6033916074652229, + "learning_rate": 3.270024034553897e-06, + "loss": 0.5415, + "step": 7463 + }, + { + "epoch": 1.2109020116807268, + "grad_norm": 0.6257917977528319, + "learning_rate": 3.269617751762501e-06, + "loss": 0.538, + "step": 7464 + }, + { + "epoch": 1.2110642439974042, + "grad_norm": 0.5827181088060881, + "learning_rate": 3.269211446515906e-06, + "loss": 0.5255, + "step": 7465 + }, + { + "epoch": 1.2112264763140819, + "grad_norm": 0.5572408200862004, + "learning_rate": 3.2688051188259677e-06, + "loss": 0.4967, + "step": 7466 + }, + { + "epoch": 1.2113887086307593, + "grad_norm": 0.5908981324936662, + "learning_rate": 3.2683987687045405e-06, + "loss": 0.526, + "step": 7467 + }, + { + "epoch": 1.2115509409474368, + "grad_norm": 0.5927918475534742, + "learning_rate": 3.2679923961634806e-06, + "loss": 0.5566, + "step": 7468 + }, + { + "epoch": 1.2117131732641142, + "grad_norm": 0.6484772511173638, + "learning_rate": 3.2675860012146456e-06, + "loss": 0.53, + "step": 7469 + }, + { + "epoch": 1.2118754055807917, + "grad_norm": 0.5773019603022471, + "learning_rate": 3.267179583869892e-06, + "loss": 0.5647, + "step": 7470 + }, + { + "epoch": 1.212037637897469, + "grad_norm": 0.5731688709983448, + "learning_rate": 3.2667731441410778e-06, + "loss": 0.5368, + "step": 7471 + }, + { + "epoch": 1.2121998702141465, + "grad_norm": 0.560847081875894, + "learning_rate": 3.266366682040063e-06, + "loss": 0.4813, + "step": 7472 + }, + { + "epoch": 1.2123621025308242, + "grad_norm": 0.6064577165808529, + "learning_rate": 3.265960197578705e-06, + "loss": 0.541, + "step": 7473 + }, + { + "epoch": 1.2125243348475017, + "grad_norm": 0.594546934599715, + "learning_rate": 3.265553690768866e-06, + "loss": 0.5398, + "step": 7474 + }, + { + "epoch": 1.212686567164179, + "grad_norm": 0.585258280607881, + "learning_rate": 3.265147161622405e-06, + "loss": 0.4905, + "step": 7475 + }, + { + "epoch": 1.2128487994808566, + "grad_norm": 0.6143380474267568, + "learning_rate": 3.2647406101511848e-06, + "loss": 0.5034, + "step": 7476 + }, + { + "epoch": 1.213011031797534, + "grad_norm": 0.5969363369790236, + "learning_rate": 3.2643340363670656e-06, + "loss": 0.5362, + "step": 7477 + }, + { + "epoch": 1.2131732641142117, + "grad_norm": 0.6298399227347683, + "learning_rate": 3.263927440281911e-06, + "loss": 0.4869, + "step": 7478 + }, + { + "epoch": 1.2133354964308891, + "grad_norm": 0.5988147218538202, + "learning_rate": 3.2635208219075854e-06, + "loss": 0.4917, + "step": 7479 + }, + { + "epoch": 1.2134977287475666, + "grad_norm": 0.6012122890753361, + "learning_rate": 3.2631141812559503e-06, + "loss": 0.4888, + "step": 7480 + }, + { + "epoch": 1.213659961064244, + "grad_norm": 0.603922629494927, + "learning_rate": 3.2627075183388725e-06, + "loss": 0.4932, + "step": 7481 + }, + { + "epoch": 1.2138221933809215, + "grad_norm": 0.5710270029437875, + "learning_rate": 3.262300833168216e-06, + "loss": 0.499, + "step": 7482 + }, + { + "epoch": 1.213984425697599, + "grad_norm": 0.5909496857083351, + "learning_rate": 3.2618941257558467e-06, + "loss": 0.5219, + "step": 7483 + }, + { + "epoch": 1.2141466580142763, + "grad_norm": 0.5541415391637299, + "learning_rate": 3.2614873961136325e-06, + "loss": 0.4776, + "step": 7484 + }, + { + "epoch": 1.214308890330954, + "grad_norm": 0.5872036720666926, + "learning_rate": 3.2610806442534383e-06, + "loss": 0.5026, + "step": 7485 + }, + { + "epoch": 1.2144711226476315, + "grad_norm": 0.6316835702057226, + "learning_rate": 3.260673870187134e-06, + "loss": 0.533, + "step": 7486 + }, + { + "epoch": 1.214633354964309, + "grad_norm": 0.6422509512103882, + "learning_rate": 3.260267073926587e-06, + "loss": 0.5306, + "step": 7487 + }, + { + "epoch": 1.2147955872809864, + "grad_norm": 0.601349137574422, + "learning_rate": 3.2598602554836662e-06, + "loss": 0.5236, + "step": 7488 + }, + { + "epoch": 1.2149578195976638, + "grad_norm": 0.5774485804826364, + "learning_rate": 3.259453414870243e-06, + "loss": 0.5278, + "step": 7489 + }, + { + "epoch": 1.2151200519143412, + "grad_norm": 0.5978632561521456, + "learning_rate": 3.2590465520981855e-06, + "loss": 0.5108, + "step": 7490 + }, + { + "epoch": 1.215282284231019, + "grad_norm": 0.5664058036483519, + "learning_rate": 3.2586396671793665e-06, + "loss": 0.5165, + "step": 7491 + }, + { + "epoch": 1.2154445165476964, + "grad_norm": 0.5694598920883918, + "learning_rate": 3.2582327601256567e-06, + "loss": 0.5362, + "step": 7492 + }, + { + "epoch": 1.2156067488643738, + "grad_norm": 0.6172726534893844, + "learning_rate": 3.257825830948929e-06, + "loss": 0.4842, + "step": 7493 + }, + { + "epoch": 1.2157689811810513, + "grad_norm": 0.5929838545666032, + "learning_rate": 3.2574188796610575e-06, + "loss": 0.5216, + "step": 7494 + }, + { + "epoch": 1.2159312134977287, + "grad_norm": 0.6200058020123256, + "learning_rate": 3.2570119062739136e-06, + "loss": 0.5214, + "step": 7495 + }, + { + "epoch": 1.2160934458144061, + "grad_norm": 0.5816970373360058, + "learning_rate": 3.2566049107993718e-06, + "loss": 0.5164, + "step": 7496 + }, + { + "epoch": 1.2162556781310836, + "grad_norm": 0.5815825792428487, + "learning_rate": 3.25619789324931e-06, + "loss": 0.5423, + "step": 7497 + }, + { + "epoch": 1.2164179104477613, + "grad_norm": 0.6021694513262902, + "learning_rate": 3.2557908536356e-06, + "loss": 0.5475, + "step": 7498 + }, + { + "epoch": 1.2165801427644387, + "grad_norm": 0.5895265525159264, + "learning_rate": 3.2553837919701203e-06, + "loss": 0.5222, + "step": 7499 + }, + { + "epoch": 1.2167423750811162, + "grad_norm": 0.585178544969193, + "learning_rate": 3.2549767082647477e-06, + "loss": 0.5133, + "step": 7500 + }, + { + "epoch": 1.2169046073977936, + "grad_norm": 0.5908727584817521, + "learning_rate": 3.254569602531359e-06, + "loss": 0.5393, + "step": 7501 + }, + { + "epoch": 1.217066839714471, + "grad_norm": 0.6465933185793092, + "learning_rate": 3.254162474781832e-06, + "loss": 0.5267, + "step": 7502 + }, + { + "epoch": 1.2172290720311487, + "grad_norm": 0.571900978885053, + "learning_rate": 3.253755325028046e-06, + "loss": 0.5051, + "step": 7503 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.6112502251980706, + "learning_rate": 3.2533481532818806e-06, + "loss": 0.493, + "step": 7504 + }, + { + "epoch": 1.2175535366645036, + "grad_norm": 0.6149156981507823, + "learning_rate": 3.2529409595552164e-06, + "loss": 0.5049, + "step": 7505 + }, + { + "epoch": 1.217715768981181, + "grad_norm": 0.6210499205746091, + "learning_rate": 3.252533743859933e-06, + "loss": 0.5261, + "step": 7506 + }, + { + "epoch": 1.2178780012978585, + "grad_norm": 0.5988160734973088, + "learning_rate": 3.2521265062079127e-06, + "loss": 0.4813, + "step": 7507 + }, + { + "epoch": 1.218040233614536, + "grad_norm": 0.6027967743178896, + "learning_rate": 3.251719246611037e-06, + "loss": 0.4895, + "step": 7508 + }, + { + "epoch": 1.2182024659312134, + "grad_norm": 0.6428001283025587, + "learning_rate": 3.2513119650811875e-06, + "loss": 0.5222, + "step": 7509 + }, + { + "epoch": 1.218364698247891, + "grad_norm": 0.6005272970538815, + "learning_rate": 3.2509046616302496e-06, + "loss": 0.5226, + "step": 7510 + }, + { + "epoch": 1.2185269305645685, + "grad_norm": 0.6158335474327894, + "learning_rate": 3.2504973362701067e-06, + "loss": 0.5788, + "step": 7511 + }, + { + "epoch": 1.218689162881246, + "grad_norm": 0.5893297533807104, + "learning_rate": 3.250089989012642e-06, + "loss": 0.4498, + "step": 7512 + }, + { + "epoch": 1.2188513951979234, + "grad_norm": 0.6136271245070204, + "learning_rate": 3.2496826198697425e-06, + "loss": 0.5303, + "step": 7513 + }, + { + "epoch": 1.2190136275146009, + "grad_norm": 0.5788674252251219, + "learning_rate": 3.249275228853292e-06, + "loss": 0.5352, + "step": 7514 + }, + { + "epoch": 1.2191758598312783, + "grad_norm": 0.5889411178547548, + "learning_rate": 3.248867815975179e-06, + "loss": 0.5466, + "step": 7515 + }, + { + "epoch": 1.219338092147956, + "grad_norm": 0.5645472680856481, + "learning_rate": 3.2484603812472892e-06, + "loss": 0.5129, + "step": 7516 + }, + { + "epoch": 1.2195003244646334, + "grad_norm": 0.5833285554935991, + "learning_rate": 3.248052924681512e-06, + "loss": 0.5277, + "step": 7517 + }, + { + "epoch": 1.2196625567813109, + "grad_norm": 0.6697752469573128, + "learning_rate": 3.2476454462897326e-06, + "loss": 0.5589, + "step": 7518 + }, + { + "epoch": 1.2198247890979883, + "grad_norm": 0.5985430634301184, + "learning_rate": 3.2472379460838438e-06, + "loss": 0.539, + "step": 7519 + }, + { + "epoch": 1.2199870214146658, + "grad_norm": 0.6645853676241881, + "learning_rate": 3.246830424075733e-06, + "loss": 0.5285, + "step": 7520 + }, + { + "epoch": 1.2201492537313432, + "grad_norm": 0.5997933138291679, + "learning_rate": 3.2464228802772914e-06, + "loss": 0.5148, + "step": 7521 + }, + { + "epoch": 1.2203114860480206, + "grad_norm": 0.5951027665640092, + "learning_rate": 3.2460153147004096e-06, + "loss": 0.5184, + "step": 7522 + }, + { + "epoch": 1.2204737183646983, + "grad_norm": 0.6126542474738942, + "learning_rate": 3.2456077273569787e-06, + "loss": 0.5208, + "step": 7523 + }, + { + "epoch": 1.2206359506813758, + "grad_norm": 0.606942459687832, + "learning_rate": 3.2452001182588923e-06, + "loss": 0.5184, + "step": 7524 + }, + { + "epoch": 1.2207981829980532, + "grad_norm": 0.6097095211218494, + "learning_rate": 3.2447924874180415e-06, + "loss": 0.4763, + "step": 7525 + }, + { + "epoch": 1.2209604153147307, + "grad_norm": 0.6326062139226604, + "learning_rate": 3.2443848348463206e-06, + "loss": 0.5026, + "step": 7526 + }, + { + "epoch": 1.221122647631408, + "grad_norm": 0.6202521090740437, + "learning_rate": 3.2439771605556235e-06, + "loss": 0.4803, + "step": 7527 + }, + { + "epoch": 1.2212848799480858, + "grad_norm": 0.5971155453155081, + "learning_rate": 3.2435694645578465e-06, + "loss": 0.5388, + "step": 7528 + }, + { + "epoch": 1.2214471122647632, + "grad_norm": 0.6014697172926431, + "learning_rate": 3.243161746864883e-06, + "loss": 0.4924, + "step": 7529 + }, + { + "epoch": 1.2216093445814407, + "grad_norm": 0.5857831298859763, + "learning_rate": 3.2427540074886294e-06, + "loss": 0.5453, + "step": 7530 + }, + { + "epoch": 1.221771576898118, + "grad_norm": 0.604432990320296, + "learning_rate": 3.242346246440983e-06, + "loss": 0.5518, + "step": 7531 + }, + { + "epoch": 1.2219338092147956, + "grad_norm": 0.594464182812507, + "learning_rate": 3.2419384637338402e-06, + "loss": 0.5401, + "step": 7532 + }, + { + "epoch": 1.222096041531473, + "grad_norm": 0.5849904666936842, + "learning_rate": 3.2415306593791003e-06, + "loss": 0.5206, + "step": 7533 + }, + { + "epoch": 1.2222582738481504, + "grad_norm": 0.6058441432704548, + "learning_rate": 3.2411228333886597e-06, + "loss": 0.5437, + "step": 7534 + }, + { + "epoch": 1.2224205061648281, + "grad_norm": 0.6069124741496538, + "learning_rate": 3.2407149857744204e-06, + "loss": 0.5492, + "step": 7535 + }, + { + "epoch": 1.2225827384815056, + "grad_norm": 0.6341475150092519, + "learning_rate": 3.240307116548279e-06, + "loss": 0.5644, + "step": 7536 + }, + { + "epoch": 1.222744970798183, + "grad_norm": 0.5766696272967755, + "learning_rate": 3.2398992257221385e-06, + "loss": 0.4806, + "step": 7537 + }, + { + "epoch": 1.2229072031148605, + "grad_norm": 0.6312501671492463, + "learning_rate": 3.239491313307899e-06, + "loss": 0.5514, + "step": 7538 + }, + { + "epoch": 1.223069435431538, + "grad_norm": 0.5781439895683769, + "learning_rate": 3.2390833793174616e-06, + "loss": 0.5047, + "step": 7539 + }, + { + "epoch": 1.2232316677482156, + "grad_norm": 0.6335592933060632, + "learning_rate": 3.2386754237627298e-06, + "loss": 0.5124, + "step": 7540 + }, + { + "epoch": 1.223393900064893, + "grad_norm": 0.6038325952370385, + "learning_rate": 3.238267446655606e-06, + "loss": 0.5345, + "step": 7541 + }, + { + "epoch": 1.2235561323815705, + "grad_norm": 0.602102151056711, + "learning_rate": 3.2378594480079933e-06, + "loss": 0.5134, + "step": 7542 + }, + { + "epoch": 1.223718364698248, + "grad_norm": 0.5789843509739361, + "learning_rate": 3.237451427831797e-06, + "loss": 0.5504, + "step": 7543 + }, + { + "epoch": 1.2238805970149254, + "grad_norm": 0.6138808762685879, + "learning_rate": 3.2370433861389207e-06, + "loss": 0.5146, + "step": 7544 + }, + { + "epoch": 1.2240428293316028, + "grad_norm": 0.5979258301043757, + "learning_rate": 3.2366353229412705e-06, + "loss": 0.5436, + "step": 7545 + }, + { + "epoch": 1.2242050616482802, + "grad_norm": 0.5880548867495564, + "learning_rate": 3.236227238250753e-06, + "loss": 0.5233, + "step": 7546 + }, + { + "epoch": 1.224367293964958, + "grad_norm": 0.6131284379773071, + "learning_rate": 3.2358191320792738e-06, + "loss": 0.5172, + "step": 7547 + }, + { + "epoch": 1.2245295262816354, + "grad_norm": 0.59097672263161, + "learning_rate": 3.235411004438741e-06, + "loss": 0.5461, + "step": 7548 + }, + { + "epoch": 1.2246917585983128, + "grad_norm": 0.5841500684449152, + "learning_rate": 3.2350028553410627e-06, + "loss": 0.4932, + "step": 7549 + }, + { + "epoch": 1.2248539909149903, + "grad_norm": 0.5881640476294797, + "learning_rate": 3.234594684798147e-06, + "loss": 0.5209, + "step": 7550 + }, + { + "epoch": 1.2250162232316677, + "grad_norm": 0.6035438532233437, + "learning_rate": 3.2341864928219036e-06, + "loss": 0.5034, + "step": 7551 + }, + { + "epoch": 1.2251784555483451, + "grad_norm": 0.570791710252654, + "learning_rate": 3.233778279424242e-06, + "loss": 0.531, + "step": 7552 + }, + { + "epoch": 1.2253406878650228, + "grad_norm": 0.5788653228181403, + "learning_rate": 3.233370044617073e-06, + "loss": 0.5267, + "step": 7553 + }, + { + "epoch": 1.2255029201817003, + "grad_norm": 0.6177689896030568, + "learning_rate": 3.2329617884123065e-06, + "loss": 0.5001, + "step": 7554 + }, + { + "epoch": 1.2256651524983777, + "grad_norm": 0.5847630764691821, + "learning_rate": 3.232553510821856e-06, + "loss": 0.4863, + "step": 7555 + }, + { + "epoch": 1.2258273848150552, + "grad_norm": 0.5729977564544194, + "learning_rate": 3.232145211857633e-06, + "loss": 0.5489, + "step": 7556 + }, + { + "epoch": 1.2259896171317326, + "grad_norm": 0.578184711336817, + "learning_rate": 3.2317368915315496e-06, + "loss": 0.5325, + "step": 7557 + }, + { + "epoch": 1.22615184944841, + "grad_norm": 0.6188879898614709, + "learning_rate": 3.231328549855522e-06, + "loss": 0.4951, + "step": 7558 + }, + { + "epoch": 1.2263140817650875, + "grad_norm": 0.5886329303143689, + "learning_rate": 3.2309201868414608e-06, + "loss": 0.5465, + "step": 7559 + }, + { + "epoch": 1.2264763140817652, + "grad_norm": 0.5964049883224021, + "learning_rate": 3.230511802501284e-06, + "loss": 0.5045, + "step": 7560 + }, + { + "epoch": 1.2266385463984426, + "grad_norm": 0.575929341658445, + "learning_rate": 3.2301033968469053e-06, + "loss": 0.5055, + "step": 7561 + }, + { + "epoch": 1.22680077871512, + "grad_norm": 0.6181022085201937, + "learning_rate": 3.229694969890242e-06, + "loss": 0.5219, + "step": 7562 + }, + { + "epoch": 1.2269630110317975, + "grad_norm": 0.613519546239579, + "learning_rate": 3.2292865216432096e-06, + "loss": 0.5182, + "step": 7563 + }, + { + "epoch": 1.227125243348475, + "grad_norm": 0.6122031493183071, + "learning_rate": 3.228878052117726e-06, + "loss": 0.5143, + "step": 7564 + }, + { + "epoch": 1.2272874756651526, + "grad_norm": 0.6038733674886597, + "learning_rate": 3.2284695613257084e-06, + "loss": 0.5477, + "step": 7565 + }, + { + "epoch": 1.22744970798183, + "grad_norm": 0.5675398569923955, + "learning_rate": 3.2280610492790763e-06, + "loss": 0.534, + "step": 7566 + }, + { + "epoch": 1.2276119402985075, + "grad_norm": 0.5916555182175155, + "learning_rate": 3.2276525159897498e-06, + "loss": 0.501, + "step": 7567 + }, + { + "epoch": 1.227774172615185, + "grad_norm": 0.5951977828628291, + "learning_rate": 3.2272439614696466e-06, + "loss": 0.5197, + "step": 7568 + }, + { + "epoch": 1.2279364049318624, + "grad_norm": 0.5625883510715306, + "learning_rate": 3.2268353857306884e-06, + "loss": 0.5188, + "step": 7569 + }, + { + "epoch": 1.2280986372485398, + "grad_norm": 0.5986192222662213, + "learning_rate": 3.2264267887847954e-06, + "loss": 0.5063, + "step": 7570 + }, + { + "epoch": 1.2282608695652173, + "grad_norm": 0.6425835180457224, + "learning_rate": 3.226018170643891e-06, + "loss": 0.5471, + "step": 7571 + }, + { + "epoch": 1.228423101881895, + "grad_norm": 0.569397338394956, + "learning_rate": 3.2256095313198943e-06, + "loss": 0.5579, + "step": 7572 + }, + { + "epoch": 1.2285853341985724, + "grad_norm": 0.6231322332810906, + "learning_rate": 3.225200870824732e-06, + "loss": 0.5322, + "step": 7573 + }, + { + "epoch": 1.2287475665152499, + "grad_norm": 0.5680308618365987, + "learning_rate": 3.2247921891703247e-06, + "loss": 0.5266, + "step": 7574 + }, + { + "epoch": 1.2289097988319273, + "grad_norm": 0.6495604668812748, + "learning_rate": 3.2243834863685987e-06, + "loss": 0.525, + "step": 7575 + }, + { + "epoch": 1.2290720311486047, + "grad_norm": 0.5681819695672451, + "learning_rate": 3.2239747624314766e-06, + "loss": 0.5263, + "step": 7576 + }, + { + "epoch": 1.2292342634652822, + "grad_norm": 0.6243961029487624, + "learning_rate": 3.223566017370885e-06, + "loss": 0.5357, + "step": 7577 + }, + { + "epoch": 1.2293964957819599, + "grad_norm": 0.6076733402222831, + "learning_rate": 3.22315725119875e-06, + "loss": 0.5119, + "step": 7578 + }, + { + "epoch": 1.2295587280986373, + "grad_norm": 0.5832161945576901, + "learning_rate": 3.2227484639269977e-06, + "loss": 0.5232, + "step": 7579 + }, + { + "epoch": 1.2297209604153148, + "grad_norm": 0.6093587838092366, + "learning_rate": 3.222339655567556e-06, + "loss": 0.5345, + "step": 7580 + }, + { + "epoch": 1.2298831927319922, + "grad_norm": 0.596489890724723, + "learning_rate": 3.221930826132352e-06, + "loss": 0.5486, + "step": 7581 + }, + { + "epoch": 1.2300454250486696, + "grad_norm": 0.5798622234777399, + "learning_rate": 3.221521975633315e-06, + "loss": 0.4957, + "step": 7582 + }, + { + "epoch": 1.230207657365347, + "grad_norm": 0.5861881610966788, + "learning_rate": 3.2211131040823727e-06, + "loss": 0.5152, + "step": 7583 + }, + { + "epoch": 1.2303698896820245, + "grad_norm": 0.5854573279524086, + "learning_rate": 3.220704211491456e-06, + "loss": 0.533, + "step": 7584 + }, + { + "epoch": 1.2305321219987022, + "grad_norm": 0.5675110024627898, + "learning_rate": 3.220295297872495e-06, + "loss": 0.5147, + "step": 7585 + }, + { + "epoch": 1.2306943543153797, + "grad_norm": 0.6129553362015339, + "learning_rate": 3.21988636323742e-06, + "loss": 0.4767, + "step": 7586 + }, + { + "epoch": 1.230856586632057, + "grad_norm": 0.577172108234473, + "learning_rate": 3.2194774075981634e-06, + "loss": 0.5017, + "step": 7587 + }, + { + "epoch": 1.2310188189487346, + "grad_norm": 0.5914111766929758, + "learning_rate": 3.2190684309666563e-06, + "loss": 0.5202, + "step": 7588 + }, + { + "epoch": 1.231181051265412, + "grad_norm": 0.6107419765184735, + "learning_rate": 3.2186594333548326e-06, + "loss": 0.5241, + "step": 7589 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.5946399681956832, + "learning_rate": 3.218250414774624e-06, + "loss": 0.5268, + "step": 7590 + }, + { + "epoch": 1.2315055158987671, + "grad_norm": 0.602937462142396, + "learning_rate": 3.217841375237967e-06, + "loss": 0.5337, + "step": 7591 + }, + { + "epoch": 1.2316677482154446, + "grad_norm": 0.6124866635174878, + "learning_rate": 3.217432314756793e-06, + "loss": 0.5236, + "step": 7592 + }, + { + "epoch": 1.231829980532122, + "grad_norm": 0.5666934549738569, + "learning_rate": 3.21702323334304e-06, + "loss": 0.5152, + "step": 7593 + }, + { + "epoch": 1.2319922128487995, + "grad_norm": 0.5791423275426627, + "learning_rate": 3.216614131008643e-06, + "loss": 0.525, + "step": 7594 + }, + { + "epoch": 1.232154445165477, + "grad_norm": 0.5703624170604727, + "learning_rate": 3.2162050077655372e-06, + "loss": 0.5029, + "step": 7595 + }, + { + "epoch": 1.2323166774821543, + "grad_norm": 0.7006724392394548, + "learning_rate": 3.21579586362566e-06, + "loss": 0.4673, + "step": 7596 + }, + { + "epoch": 1.232478909798832, + "grad_norm": 0.5835365580491079, + "learning_rate": 3.215386698600951e-06, + "loss": 0.5091, + "step": 7597 + }, + { + "epoch": 1.2326411421155095, + "grad_norm": 0.5784645231098576, + "learning_rate": 3.214977512703347e-06, + "loss": 0.5179, + "step": 7598 + }, + { + "epoch": 1.232803374432187, + "grad_norm": 0.6029024869364882, + "learning_rate": 3.214568305944786e-06, + "loss": 0.5402, + "step": 7599 + }, + { + "epoch": 1.2329656067488644, + "grad_norm": 0.6389612155880345, + "learning_rate": 3.2141590783372077e-06, + "loss": 0.5253, + "step": 7600 + }, + { + "epoch": 1.2331278390655418, + "grad_norm": 0.5791944248807971, + "learning_rate": 3.2137498298925534e-06, + "loss": 0.5436, + "step": 7601 + }, + { + "epoch": 1.2332900713822192, + "grad_norm": 0.585478595603722, + "learning_rate": 3.2133405606227636e-06, + "loss": 0.5217, + "step": 7602 + }, + { + "epoch": 1.233452303698897, + "grad_norm": 0.5993041879549997, + "learning_rate": 3.2129312705397784e-06, + "loss": 0.5084, + "step": 7603 + }, + { + "epoch": 1.2336145360155744, + "grad_norm": 0.574187049277527, + "learning_rate": 3.212521959655541e-06, + "loss": 0.4905, + "step": 7604 + }, + { + "epoch": 1.2337767683322518, + "grad_norm": 0.5722627163287628, + "learning_rate": 3.2121126279819935e-06, + "loss": 0.5441, + "step": 7605 + }, + { + "epoch": 1.2339390006489293, + "grad_norm": 0.6034287208875095, + "learning_rate": 3.211703275531078e-06, + "loss": 0.5538, + "step": 7606 + }, + { + "epoch": 1.2341012329656067, + "grad_norm": 0.5877999671383956, + "learning_rate": 3.21129390231474e-06, + "loss": 0.5107, + "step": 7607 + }, + { + "epoch": 1.2342634652822841, + "grad_norm": 0.5783474158238878, + "learning_rate": 3.2108845083449213e-06, + "loss": 0.5186, + "step": 7608 + }, + { + "epoch": 1.2344256975989616, + "grad_norm": 0.5900525897667707, + "learning_rate": 3.2104750936335707e-06, + "loss": 0.5195, + "step": 7609 + }, + { + "epoch": 1.2345879299156393, + "grad_norm": 0.6382167155824999, + "learning_rate": 3.2100656581926294e-06, + "loss": 0.5348, + "step": 7610 + }, + { + "epoch": 1.2347501622323167, + "grad_norm": 0.6264520136597833, + "learning_rate": 3.209656202034046e-06, + "loss": 0.4972, + "step": 7611 + }, + { + "epoch": 1.2349123945489942, + "grad_norm": 0.6118692029904977, + "learning_rate": 3.2092467251697667e-06, + "loss": 0.4939, + "step": 7612 + }, + { + "epoch": 1.2350746268656716, + "grad_norm": 0.6026669488957471, + "learning_rate": 3.2088372276117383e-06, + "loss": 0.5342, + "step": 7613 + }, + { + "epoch": 1.235236859182349, + "grad_norm": 0.6097642221971904, + "learning_rate": 3.2084277093719107e-06, + "loss": 0.5332, + "step": 7614 + }, + { + "epoch": 1.2353990914990267, + "grad_norm": 0.6047963542907686, + "learning_rate": 3.20801817046223e-06, + "loss": 0.4971, + "step": 7615 + }, + { + "epoch": 1.2355613238157042, + "grad_norm": 0.5875606223452748, + "learning_rate": 3.207608610894647e-06, + "loss": 0.5197, + "step": 7616 + }, + { + "epoch": 1.2357235561323816, + "grad_norm": 0.570252083633382, + "learning_rate": 3.2071990306811107e-06, + "loss": 0.5219, + "step": 7617 + }, + { + "epoch": 1.235885788449059, + "grad_norm": 0.5774061066957047, + "learning_rate": 3.2067894298335715e-06, + "loss": 0.5241, + "step": 7618 + }, + { + "epoch": 1.2360480207657365, + "grad_norm": 0.5934124328765795, + "learning_rate": 3.2063798083639806e-06, + "loss": 0.5351, + "step": 7619 + }, + { + "epoch": 1.236210253082414, + "grad_norm": 0.6206767506809117, + "learning_rate": 3.2059701662842895e-06, + "loss": 0.4842, + "step": 7620 + }, + { + "epoch": 1.2363724853990914, + "grad_norm": 0.6244081400890473, + "learning_rate": 3.20556050360645e-06, + "loss": 0.5473, + "step": 7621 + }, + { + "epoch": 1.236534717715769, + "grad_norm": 0.6318618422872987, + "learning_rate": 3.2051508203424158e-06, + "loss": 0.5312, + "step": 7622 + }, + { + "epoch": 1.2366969500324465, + "grad_norm": 0.6166486087363654, + "learning_rate": 3.2047411165041393e-06, + "loss": 0.5244, + "step": 7623 + }, + { + "epoch": 1.236859182349124, + "grad_norm": 0.607083643674625, + "learning_rate": 3.2043313921035747e-06, + "loss": 0.5025, + "step": 7624 + }, + { + "epoch": 1.2370214146658014, + "grad_norm": 0.5579015853241932, + "learning_rate": 3.2039216471526766e-06, + "loss": 0.5287, + "step": 7625 + }, + { + "epoch": 1.2371836469824788, + "grad_norm": 0.6131291792008478, + "learning_rate": 3.2035118816634e-06, + "loss": 0.5286, + "step": 7626 + }, + { + "epoch": 1.2373458792991565, + "grad_norm": 0.608732092164998, + "learning_rate": 3.2031020956477023e-06, + "loss": 0.5696, + "step": 7627 + }, + { + "epoch": 1.237508111615834, + "grad_norm": 0.6077976079952032, + "learning_rate": 3.202692289117536e-06, + "loss": 0.5187, + "step": 7628 + }, + { + "epoch": 1.2376703439325114, + "grad_norm": 0.5880118303505802, + "learning_rate": 3.2022824620848625e-06, + "loss": 0.5106, + "step": 7629 + }, + { + "epoch": 1.2378325762491889, + "grad_norm": 0.587078521054223, + "learning_rate": 3.2018726145616364e-06, + "loss": 0.519, + "step": 7630 + }, + { + "epoch": 1.2379948085658663, + "grad_norm": 0.6543100813576841, + "learning_rate": 3.2014627465598163e-06, + "loss": 0.5198, + "step": 7631 + }, + { + "epoch": 1.2381570408825437, + "grad_norm": 0.5691831888134459, + "learning_rate": 3.201052858091363e-06, + "loss": 0.5344, + "step": 7632 + }, + { + "epoch": 1.2383192731992212, + "grad_norm": 0.5794490661414371, + "learning_rate": 3.200642949168233e-06, + "loss": 0.4908, + "step": 7633 + }, + { + "epoch": 1.2384815055158989, + "grad_norm": 0.6046869127618354, + "learning_rate": 3.200233019802388e-06, + "loss": 0.5118, + "step": 7634 + }, + { + "epoch": 1.2386437378325763, + "grad_norm": 0.5971684896885666, + "learning_rate": 3.1998230700057885e-06, + "loss": 0.5692, + "step": 7635 + }, + { + "epoch": 1.2388059701492538, + "grad_norm": 0.5900874030504448, + "learning_rate": 3.199413099790395e-06, + "loss": 0.4865, + "step": 7636 + }, + { + "epoch": 1.2389682024659312, + "grad_norm": 0.601207689052054, + "learning_rate": 3.199003109168169e-06, + "loss": 0.5032, + "step": 7637 + }, + { + "epoch": 1.2391304347826086, + "grad_norm": 0.6724087226500591, + "learning_rate": 3.1985930981510733e-06, + "loss": 0.5046, + "step": 7638 + }, + { + "epoch": 1.239292667099286, + "grad_norm": 0.5575694590561413, + "learning_rate": 3.1981830667510712e-06, + "loss": 0.5039, + "step": 7639 + }, + { + "epoch": 1.2394548994159638, + "grad_norm": 0.5751415801390145, + "learning_rate": 3.197773014980126e-06, + "loss": 0.5012, + "step": 7640 + }, + { + "epoch": 1.2396171317326412, + "grad_norm": 0.6191345381186055, + "learning_rate": 3.1973629428502005e-06, + "loss": 0.5373, + "step": 7641 + }, + { + "epoch": 1.2397793640493187, + "grad_norm": 0.6087807106608981, + "learning_rate": 3.1969528503732615e-06, + "loss": 0.5201, + "step": 7642 + }, + { + "epoch": 1.239941596365996, + "grad_norm": 0.5967631427996284, + "learning_rate": 3.196542737561274e-06, + "loss": 0.5417, + "step": 7643 + }, + { + "epoch": 1.2401038286826735, + "grad_norm": 0.5953905922396334, + "learning_rate": 3.196132604426202e-06, + "loss": 0.5217, + "step": 7644 + }, + { + "epoch": 1.240266060999351, + "grad_norm": 0.6239870080302952, + "learning_rate": 3.1957224509800142e-06, + "loss": 0.524, + "step": 7645 + }, + { + "epoch": 1.2404282933160284, + "grad_norm": 0.5778494788190455, + "learning_rate": 3.1953122772346757e-06, + "loss": 0.5277, + "step": 7646 + }, + { + "epoch": 1.240590525632706, + "grad_norm": 0.5964239311570327, + "learning_rate": 3.194902083202156e-06, + "loss": 0.5126, + "step": 7647 + }, + { + "epoch": 1.2407527579493836, + "grad_norm": 0.5725126929209187, + "learning_rate": 3.194491868894422e-06, + "loss": 0.5211, + "step": 7648 + }, + { + "epoch": 1.240914990266061, + "grad_norm": 0.5813317346885268, + "learning_rate": 3.194081634323443e-06, + "loss": 0.5495, + "step": 7649 + }, + { + "epoch": 1.2410772225827384, + "grad_norm": 0.585191109226932, + "learning_rate": 3.19367137950119e-06, + "loss": 0.5453, + "step": 7650 + }, + { + "epoch": 1.241239454899416, + "grad_norm": 0.5962970862899428, + "learning_rate": 3.19326110443963e-06, + "loss": 0.5006, + "step": 7651 + }, + { + "epoch": 1.2414016872160936, + "grad_norm": 0.5886911617747023, + "learning_rate": 3.1928508091507356e-06, + "loss": 0.5323, + "step": 7652 + }, + { + "epoch": 1.241563919532771, + "grad_norm": 0.5958934301908686, + "learning_rate": 3.1924404936464775e-06, + "loss": 0.5337, + "step": 7653 + }, + { + "epoch": 1.2417261518494485, + "grad_norm": 0.5788732723089461, + "learning_rate": 3.192030157938828e-06, + "loss": 0.5104, + "step": 7654 + }, + { + "epoch": 1.241888384166126, + "grad_norm": 0.5897819699262504, + "learning_rate": 3.1916198020397594e-06, + "loss": 0.5126, + "step": 7655 + }, + { + "epoch": 1.2420506164828033, + "grad_norm": 0.6240791311734215, + "learning_rate": 3.191209425961244e-06, + "loss": 0.5332, + "step": 7656 + }, + { + "epoch": 1.2422128487994808, + "grad_norm": 0.5810575460484597, + "learning_rate": 3.190799029715256e-06, + "loss": 0.4822, + "step": 7657 + }, + { + "epoch": 1.2423750811161582, + "grad_norm": 0.586948271070028, + "learning_rate": 3.190388613313769e-06, + "loss": 0.5142, + "step": 7658 + }, + { + "epoch": 1.242537313432836, + "grad_norm": 0.5966890583051343, + "learning_rate": 3.1899781767687586e-06, + "loss": 0.5086, + "step": 7659 + }, + { + "epoch": 1.2426995457495134, + "grad_norm": 0.6034482979331008, + "learning_rate": 3.1895677200921993e-06, + "loss": 0.5031, + "step": 7660 + }, + { + "epoch": 1.2428617780661908, + "grad_norm": 0.5829886858530913, + "learning_rate": 3.1891572432960676e-06, + "loss": 0.5074, + "step": 7661 + }, + { + "epoch": 1.2430240103828682, + "grad_norm": 0.576054615886694, + "learning_rate": 3.1887467463923393e-06, + "loss": 0.5618, + "step": 7662 + }, + { + "epoch": 1.2431862426995457, + "grad_norm": 0.5943286279029459, + "learning_rate": 3.1883362293929933e-06, + "loss": 0.487, + "step": 7663 + }, + { + "epoch": 1.2433484750162231, + "grad_norm": 0.5903399933072353, + "learning_rate": 3.1879256923100043e-06, + "loss": 0.5083, + "step": 7664 + }, + { + "epoch": 1.2435107073329008, + "grad_norm": 0.6785647603837183, + "learning_rate": 3.1875151351553534e-06, + "loss": 0.5251, + "step": 7665 + }, + { + "epoch": 1.2436729396495783, + "grad_norm": 0.5600225867163662, + "learning_rate": 3.1871045579410183e-06, + "loss": 0.5153, + "step": 7666 + }, + { + "epoch": 1.2438351719662557, + "grad_norm": 0.5676340946715428, + "learning_rate": 3.186693960678978e-06, + "loss": 0.4793, + "step": 7667 + }, + { + "epoch": 1.2439974042829332, + "grad_norm": 0.5800728756327834, + "learning_rate": 3.1862833433812137e-06, + "loss": 0.4948, + "step": 7668 + }, + { + "epoch": 1.2441596365996106, + "grad_norm": 0.5717700297032339, + "learning_rate": 3.1858727060597037e-06, + "loss": 0.5386, + "step": 7669 + }, + { + "epoch": 1.244321868916288, + "grad_norm": 0.604311424745452, + "learning_rate": 3.185462048726432e-06, + "loss": 0.5355, + "step": 7670 + }, + { + "epoch": 1.2444841012329655, + "grad_norm": 0.598863812111231, + "learning_rate": 3.1850513713933796e-06, + "loss": 0.5028, + "step": 7671 + }, + { + "epoch": 1.2446463335496432, + "grad_norm": 0.5965065774565592, + "learning_rate": 3.1846406740725275e-06, + "loss": 0.5121, + "step": 7672 + }, + { + "epoch": 1.2448085658663206, + "grad_norm": 0.6069409849053269, + "learning_rate": 3.1842299567758608e-06, + "loss": 0.5171, + "step": 7673 + }, + { + "epoch": 1.244970798182998, + "grad_norm": 0.5976170650893722, + "learning_rate": 3.183819219515361e-06, + "loss": 0.5415, + "step": 7674 + }, + { + "epoch": 1.2451330304996755, + "grad_norm": 0.5588004737638329, + "learning_rate": 3.1834084623030128e-06, + "loss": 0.5236, + "step": 7675 + }, + { + "epoch": 1.245295262816353, + "grad_norm": 0.5656855805020151, + "learning_rate": 3.1829976851508017e-06, + "loss": 0.5335, + "step": 7676 + }, + { + "epoch": 1.2454574951330306, + "grad_norm": 0.5711371485105183, + "learning_rate": 3.182586888070712e-06, + "loss": 0.5009, + "step": 7677 + }, + { + "epoch": 1.245619727449708, + "grad_norm": 0.5719389695049164, + "learning_rate": 3.1821760710747307e-06, + "loss": 0.5553, + "step": 7678 + }, + { + "epoch": 1.2457819597663855, + "grad_norm": 0.6186517471212434, + "learning_rate": 3.181765234174843e-06, + "loss": 0.518, + "step": 7679 + }, + { + "epoch": 1.245944192083063, + "grad_norm": 0.5817716543222864, + "learning_rate": 3.1813543773830356e-06, + "loss": 0.5069, + "step": 7680 + }, + { + "epoch": 1.2461064243997404, + "grad_norm": 0.6048702482406744, + "learning_rate": 3.1809435007112986e-06, + "loss": 0.5331, + "step": 7681 + }, + { + "epoch": 1.2462686567164178, + "grad_norm": 0.5739359967167804, + "learning_rate": 3.1805326041716173e-06, + "loss": 0.5253, + "step": 7682 + }, + { + "epoch": 1.2464308890330953, + "grad_norm": 0.6026426499321889, + "learning_rate": 3.180121687775982e-06, + "loss": 0.5382, + "step": 7683 + }, + { + "epoch": 1.246593121349773, + "grad_norm": 0.5909629267208156, + "learning_rate": 3.1797107515363823e-06, + "loss": 0.5092, + "step": 7684 + }, + { + "epoch": 1.2467553536664504, + "grad_norm": 0.6098349898332153, + "learning_rate": 3.1792997954648076e-06, + "loss": 0.5241, + "step": 7685 + }, + { + "epoch": 1.2469175859831279, + "grad_norm": 0.5874593975006691, + "learning_rate": 3.1788888195732477e-06, + "loss": 0.5398, + "step": 7686 + }, + { + "epoch": 1.2470798182998053, + "grad_norm": 0.5994130253091184, + "learning_rate": 3.178477823873694e-06, + "loss": 0.4841, + "step": 7687 + }, + { + "epoch": 1.2472420506164827, + "grad_norm": 0.6064964402525569, + "learning_rate": 3.1780668083781396e-06, + "loss": 0.5381, + "step": 7688 + }, + { + "epoch": 1.2474042829331602, + "grad_norm": 0.5788498300249308, + "learning_rate": 3.177655773098575e-06, + "loss": 0.5097, + "step": 7689 + }, + { + "epoch": 1.2475665152498379, + "grad_norm": 0.6122404560611807, + "learning_rate": 3.1772447180469934e-06, + "loss": 0.5303, + "step": 7690 + }, + { + "epoch": 1.2477287475665153, + "grad_norm": 0.5882961331856499, + "learning_rate": 3.1768336432353887e-06, + "loss": 0.5603, + "step": 7691 + }, + { + "epoch": 1.2478909798831928, + "grad_norm": 0.5912333535148382, + "learning_rate": 3.1764225486757542e-06, + "loss": 0.5663, + "step": 7692 + }, + { + "epoch": 1.2480532121998702, + "grad_norm": 0.6156912560372296, + "learning_rate": 3.1760114343800852e-06, + "loss": 0.5127, + "step": 7693 + }, + { + "epoch": 1.2482154445165476, + "grad_norm": 0.586026622739645, + "learning_rate": 3.1756003003603765e-06, + "loss": 0.5299, + "step": 7694 + }, + { + "epoch": 1.248377676833225, + "grad_norm": 0.5948586658846599, + "learning_rate": 3.1751891466286233e-06, + "loss": 0.5263, + "step": 7695 + }, + { + "epoch": 1.2485399091499025, + "grad_norm": 0.5724352377468847, + "learning_rate": 3.1747779731968226e-06, + "loss": 0.519, + "step": 7696 + }, + { + "epoch": 1.2487021414665802, + "grad_norm": 0.5818648187602835, + "learning_rate": 3.1743667800769706e-06, + "loss": 0.524, + "step": 7697 + }, + { + "epoch": 1.2488643737832577, + "grad_norm": 0.5925449861022882, + "learning_rate": 3.1739555672810653e-06, + "loss": 0.5039, + "step": 7698 + }, + { + "epoch": 1.249026606099935, + "grad_norm": 0.5965422517854905, + "learning_rate": 3.1735443348211047e-06, + "loss": 0.5499, + "step": 7699 + }, + { + "epoch": 1.2491888384166125, + "grad_norm": 0.5957980680372648, + "learning_rate": 3.1731330827090865e-06, + "loss": 0.5515, + "step": 7700 + }, + { + "epoch": 1.24935107073329, + "grad_norm": 0.5868669353889047, + "learning_rate": 3.1727218109570107e-06, + "loss": 0.5412, + "step": 7701 + }, + { + "epoch": 1.2495133030499677, + "grad_norm": 0.5924113616637942, + "learning_rate": 3.1723105195768767e-06, + "loss": 0.506, + "step": 7702 + }, + { + "epoch": 1.249675535366645, + "grad_norm": 0.591647326110997, + "learning_rate": 3.1718992085806853e-06, + "loss": 0.5276, + "step": 7703 + }, + { + "epoch": 1.2498377676833226, + "grad_norm": 0.5947516473674501, + "learning_rate": 3.1714878779804364e-06, + "loss": 0.5374, + "step": 7704 + }, + { + "epoch": 1.25, + "grad_norm": 0.5840583609480834, + "learning_rate": 3.1710765277881316e-06, + "loss": 0.4785, + "step": 7705 + }, + { + "epoch": 1.2501622323166774, + "grad_norm": 0.6084493850251194, + "learning_rate": 3.170665158015774e-06, + "loss": 0.5518, + "step": 7706 + }, + { + "epoch": 1.250324464633355, + "grad_norm": 0.6006209151885255, + "learning_rate": 3.1702537686753643e-06, + "loss": 0.5087, + "step": 7707 + }, + { + "epoch": 1.2504866969500323, + "grad_norm": 0.5901897966393089, + "learning_rate": 3.1698423597789075e-06, + "loss": 0.5252, + "step": 7708 + }, + { + "epoch": 1.25064892926671, + "grad_norm": 0.6232448774658531, + "learning_rate": 3.1694309313384064e-06, + "loss": 0.5342, + "step": 7709 + }, + { + "epoch": 1.2508111615833875, + "grad_norm": 0.6074512751438071, + "learning_rate": 3.169019483365865e-06, + "loss": 0.4808, + "step": 7710 + }, + { + "epoch": 1.250973393900065, + "grad_norm": 0.6085815810945956, + "learning_rate": 3.168608015873289e-06, + "loss": 0.5378, + "step": 7711 + }, + { + "epoch": 1.2511356262167423, + "grad_norm": 0.6178645714023249, + "learning_rate": 3.1681965288726825e-06, + "loss": 0.5283, + "step": 7712 + }, + { + "epoch": 1.2512978585334198, + "grad_norm": 0.5806910081686849, + "learning_rate": 3.167785022376053e-06, + "loss": 0.4869, + "step": 7713 + }, + { + "epoch": 1.2514600908500975, + "grad_norm": 0.6009381579251422, + "learning_rate": 3.1673734963954068e-06, + "loss": 0.505, + "step": 7714 + }, + { + "epoch": 1.251622323166775, + "grad_norm": 0.5615635587804294, + "learning_rate": 3.16696195094275e-06, + "loss": 0.4982, + "step": 7715 + }, + { + "epoch": 1.2517845554834524, + "grad_norm": 0.6245315626314046, + "learning_rate": 3.1665503860300905e-06, + "loss": 0.5026, + "step": 7716 + }, + { + "epoch": 1.2519467878001298, + "grad_norm": 0.5572998724721606, + "learning_rate": 3.166138801669438e-06, + "loss": 0.4731, + "step": 7717 + }, + { + "epoch": 1.2521090201168072, + "grad_norm": 0.6018115861455623, + "learning_rate": 3.1657271978728e-06, + "loss": 0.543, + "step": 7718 + }, + { + "epoch": 1.2522712524334847, + "grad_norm": 0.596568298526887, + "learning_rate": 3.165315574652187e-06, + "loss": 0.5078, + "step": 7719 + }, + { + "epoch": 1.2524334847501621, + "grad_norm": 0.5783537572573824, + "learning_rate": 3.164903932019607e-06, + "loss": 0.5129, + "step": 7720 + }, + { + "epoch": 1.2525957170668396, + "grad_norm": 0.5945190054192471, + "learning_rate": 3.164492269987072e-06, + "loss": 0.5166, + "step": 7721 + }, + { + "epoch": 1.2527579493835173, + "grad_norm": 0.6044038096913086, + "learning_rate": 3.1640805885665924e-06, + "loss": 0.5213, + "step": 7722 + }, + { + "epoch": 1.2529201817001947, + "grad_norm": 0.597782078552824, + "learning_rate": 3.163668887770181e-06, + "loss": 0.5304, + "step": 7723 + }, + { + "epoch": 1.2530824140168721, + "grad_norm": 0.5711776052506445, + "learning_rate": 3.1632571676098493e-06, + "loss": 0.4887, + "step": 7724 + }, + { + "epoch": 1.2532446463335496, + "grad_norm": 0.586943846859786, + "learning_rate": 3.1628454280976096e-06, + "loss": 0.518, + "step": 7725 + }, + { + "epoch": 1.2534068786502273, + "grad_norm": 0.6154030275902933, + "learning_rate": 3.162433669245476e-06, + "loss": 0.4976, + "step": 7726 + }, + { + "epoch": 1.2535691109669047, + "grad_norm": 0.5810030996555647, + "learning_rate": 3.162021891065462e-06, + "loss": 0.5064, + "step": 7727 + }, + { + "epoch": 1.2537313432835822, + "grad_norm": 0.5976067207209984, + "learning_rate": 3.1616100935695827e-06, + "loss": 0.4636, + "step": 7728 + }, + { + "epoch": 1.2538935756002596, + "grad_norm": 0.5724327797122525, + "learning_rate": 3.161198276769852e-06, + "loss": 0.5241, + "step": 7729 + }, + { + "epoch": 1.254055807916937, + "grad_norm": 0.5775440522625213, + "learning_rate": 3.160786440678287e-06, + "loss": 0.5262, + "step": 7730 + }, + { + "epoch": 1.2542180402336145, + "grad_norm": 0.5993667717544413, + "learning_rate": 3.1603745853069024e-06, + "loss": 0.5169, + "step": 7731 + }, + { + "epoch": 1.254380272550292, + "grad_norm": 0.6055491896069152, + "learning_rate": 3.159962710667716e-06, + "loss": 0.5368, + "step": 7732 + }, + { + "epoch": 1.2545425048669694, + "grad_norm": 0.6149603505088763, + "learning_rate": 3.1595508167727444e-06, + "loss": 0.5185, + "step": 7733 + }, + { + "epoch": 1.254704737183647, + "grad_norm": 0.5883116276067412, + "learning_rate": 3.1591389036340064e-06, + "loss": 0.5215, + "step": 7734 + }, + { + "epoch": 1.2548669695003245, + "grad_norm": 0.6557080877136677, + "learning_rate": 3.158726971263519e-06, + "loss": 0.5051, + "step": 7735 + }, + { + "epoch": 1.255029201817002, + "grad_norm": 0.5659770849926095, + "learning_rate": 3.1583150196733025e-06, + "loss": 0.4943, + "step": 7736 + }, + { + "epoch": 1.2551914341336794, + "grad_norm": 0.5859698412023395, + "learning_rate": 3.1579030488753763e-06, + "loss": 0.5028, + "step": 7737 + }, + { + "epoch": 1.2553536664503568, + "grad_norm": 0.5810592473844817, + "learning_rate": 3.1574910588817594e-06, + "loss": 0.536, + "step": 7738 + }, + { + "epoch": 1.2555158987670345, + "grad_norm": 0.5894366083482583, + "learning_rate": 3.1570790497044734e-06, + "loss": 0.5166, + "step": 7739 + }, + { + "epoch": 1.255678131083712, + "grad_norm": 0.5996209143624357, + "learning_rate": 3.1566670213555394e-06, + "loss": 0.5251, + "step": 7740 + }, + { + "epoch": 1.2558403634003894, + "grad_norm": 0.5845904486058856, + "learning_rate": 3.1562549738469785e-06, + "loss": 0.5396, + "step": 7741 + }, + { + "epoch": 1.2560025957170668, + "grad_norm": 0.6048739101082554, + "learning_rate": 3.1558429071908147e-06, + "loss": 0.5251, + "step": 7742 + }, + { + "epoch": 1.2561648280337443, + "grad_norm": 0.5656923626473748, + "learning_rate": 3.155430821399069e-06, + "loss": 0.5456, + "step": 7743 + }, + { + "epoch": 1.2563270603504217, + "grad_norm": 0.6276867585439909, + "learning_rate": 3.1550187164837663e-06, + "loss": 0.5384, + "step": 7744 + }, + { + "epoch": 1.2564892926670992, + "grad_norm": 0.6145867738222371, + "learning_rate": 3.1546065924569303e-06, + "loss": 0.5077, + "step": 7745 + }, + { + "epoch": 1.2566515249837766, + "grad_norm": 0.5946204254936296, + "learning_rate": 3.154194449330585e-06, + "loss": 0.5047, + "step": 7746 + }, + { + "epoch": 1.2568137573004543, + "grad_norm": 0.5947656439665472, + "learning_rate": 3.1537822871167554e-06, + "loss": 0.5081, + "step": 7747 + }, + { + "epoch": 1.2569759896171318, + "grad_norm": 0.5931625383072193, + "learning_rate": 3.1533701058274686e-06, + "loss": 0.5237, + "step": 7748 + }, + { + "epoch": 1.2571382219338092, + "grad_norm": 0.6271804856600303, + "learning_rate": 3.152957905474748e-06, + "loss": 0.5061, + "step": 7749 + }, + { + "epoch": 1.2573004542504866, + "grad_norm": 0.593636600377872, + "learning_rate": 3.1525456860706237e-06, + "loss": 0.5385, + "step": 7750 + }, + { + "epoch": 1.2574626865671643, + "grad_norm": 0.6031006785968174, + "learning_rate": 3.152133447627122e-06, + "loss": 0.5285, + "step": 7751 + }, + { + "epoch": 1.2576249188838418, + "grad_norm": 0.6158309276577975, + "learning_rate": 3.1517211901562698e-06, + "loss": 0.5213, + "step": 7752 + }, + { + "epoch": 1.2577871512005192, + "grad_norm": 0.6031395998369712, + "learning_rate": 3.1513089136700965e-06, + "loss": 0.5125, + "step": 7753 + }, + { + "epoch": 1.2579493835171967, + "grad_norm": 0.5985975147174999, + "learning_rate": 3.1508966181806298e-06, + "loss": 0.5512, + "step": 7754 + }, + { + "epoch": 1.258111615833874, + "grad_norm": 0.5555220834297305, + "learning_rate": 3.150484303699902e-06, + "loss": 0.5119, + "step": 7755 + }, + { + "epoch": 1.2582738481505515, + "grad_norm": 0.5972042531528169, + "learning_rate": 3.1500719702399406e-06, + "loss": 0.492, + "step": 7756 + }, + { + "epoch": 1.258436080467229, + "grad_norm": 0.5772644882162214, + "learning_rate": 3.1496596178127777e-06, + "loss": 0.5454, + "step": 7757 + }, + { + "epoch": 1.2585983127839064, + "grad_norm": 0.6071972894294589, + "learning_rate": 3.1492472464304434e-06, + "loss": 0.497, + "step": 7758 + }, + { + "epoch": 1.258760545100584, + "grad_norm": 0.5772026248484011, + "learning_rate": 3.1488348561049707e-06, + "loss": 0.5328, + "step": 7759 + }, + { + "epoch": 1.2589227774172616, + "grad_norm": 0.5776639834591406, + "learning_rate": 3.1484224468483917e-06, + "loss": 0.5382, + "step": 7760 + }, + { + "epoch": 1.259085009733939, + "grad_norm": 0.6496280899328017, + "learning_rate": 3.148010018672738e-06, + "loss": 0.5188, + "step": 7761 + }, + { + "epoch": 1.2592472420506164, + "grad_norm": 0.6047355329377518, + "learning_rate": 3.147597571590045e-06, + "loss": 0.5562, + "step": 7762 + }, + { + "epoch": 1.259409474367294, + "grad_norm": 0.597222955854124, + "learning_rate": 3.1471851056123457e-06, + "loss": 0.4605, + "step": 7763 + }, + { + "epoch": 1.2595717066839716, + "grad_norm": 0.6418945891460403, + "learning_rate": 3.1467726207516747e-06, + "loss": 0.5045, + "step": 7764 + }, + { + "epoch": 1.259733939000649, + "grad_norm": 0.5737008742508781, + "learning_rate": 3.1463601170200668e-06, + "loss": 0.506, + "step": 7765 + }, + { + "epoch": 1.2598961713173265, + "grad_norm": 0.5712917370796521, + "learning_rate": 3.1459475944295582e-06, + "loss": 0.5147, + "step": 7766 + }, + { + "epoch": 1.260058403634004, + "grad_norm": 0.5992960211865076, + "learning_rate": 3.1455350529921846e-06, + "loss": 0.5504, + "step": 7767 + }, + { + "epoch": 1.2602206359506813, + "grad_norm": 0.6304510148790566, + "learning_rate": 3.145122492719983e-06, + "loss": 0.5449, + "step": 7768 + }, + { + "epoch": 1.2603828682673588, + "grad_norm": 0.6170340370587867, + "learning_rate": 3.1447099136249913e-06, + "loss": 0.5333, + "step": 7769 + }, + { + "epoch": 1.2605451005840362, + "grad_norm": 0.6135025993752303, + "learning_rate": 3.144297315719247e-06, + "loss": 0.5419, + "step": 7770 + }, + { + "epoch": 1.2607073329007137, + "grad_norm": 0.575482637345206, + "learning_rate": 3.1438846990147877e-06, + "loss": 0.5607, + "step": 7771 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.5987926231446458, + "learning_rate": 3.1434720635236528e-06, + "loss": 0.5277, + "step": 7772 + }, + { + "epoch": 1.2610317975340688, + "grad_norm": 0.6268629617858271, + "learning_rate": 3.1430594092578826e-06, + "loss": 0.5281, + "step": 7773 + }, + { + "epoch": 1.2611940298507462, + "grad_norm": 0.5768007015927658, + "learning_rate": 3.1426467362295153e-06, + "loss": 0.486, + "step": 7774 + }, + { + "epoch": 1.2613562621674237, + "grad_norm": 0.6236317348561666, + "learning_rate": 3.142234044450594e-06, + "loss": 0.5079, + "step": 7775 + }, + { + "epoch": 1.2615184944841014, + "grad_norm": 0.6104484048248678, + "learning_rate": 3.1418213339331583e-06, + "loss": 0.5156, + "step": 7776 + }, + { + "epoch": 1.2616807268007788, + "grad_norm": 0.5722345877607157, + "learning_rate": 3.141408604689249e-06, + "loss": 0.5102, + "step": 7777 + }, + { + "epoch": 1.2618429591174563, + "grad_norm": 0.6132989675718457, + "learning_rate": 3.1409958567309114e-06, + "loss": 0.4918, + "step": 7778 + }, + { + "epoch": 1.2620051914341337, + "grad_norm": 0.6094346732220124, + "learning_rate": 3.1405830900701846e-06, + "loss": 0.5057, + "step": 7779 + }, + { + "epoch": 1.2621674237508111, + "grad_norm": 0.6222330338931998, + "learning_rate": 3.140170304719115e-06, + "loss": 0.5143, + "step": 7780 + }, + { + "epoch": 1.2623296560674886, + "grad_norm": 0.5736429920572583, + "learning_rate": 3.139757500689744e-06, + "loss": 0.4992, + "step": 7781 + }, + { + "epoch": 1.262491888384166, + "grad_norm": 0.5979081456023474, + "learning_rate": 3.1393446779941176e-06, + "loss": 0.5386, + "step": 7782 + }, + { + "epoch": 1.2626541207008435, + "grad_norm": 0.6066272859189022, + "learning_rate": 3.138931836644281e-06, + "loss": 0.5003, + "step": 7783 + }, + { + "epoch": 1.2628163530175212, + "grad_norm": 0.6391350349172271, + "learning_rate": 3.1385189766522784e-06, + "loss": 0.5067, + "step": 7784 + }, + { + "epoch": 1.2629785853341986, + "grad_norm": 0.6406845025347191, + "learning_rate": 3.138106098030156e-06, + "loss": 0.5229, + "step": 7785 + }, + { + "epoch": 1.263140817650876, + "grad_norm": 0.5780631159987066, + "learning_rate": 3.1376932007899607e-06, + "loss": 0.5495, + "step": 7786 + }, + { + "epoch": 1.2633030499675535, + "grad_norm": 0.6173996983439121, + "learning_rate": 3.1372802849437405e-06, + "loss": 0.533, + "step": 7787 + }, + { + "epoch": 1.263465282284231, + "grad_norm": 0.5863256737153013, + "learning_rate": 3.136867350503543e-06, + "loss": 0.5168, + "step": 7788 + }, + { + "epoch": 1.2636275146009086, + "grad_norm": 0.5856409505162913, + "learning_rate": 3.136454397481415e-06, + "loss": 0.5423, + "step": 7789 + }, + { + "epoch": 1.263789746917586, + "grad_norm": 0.5846508209754717, + "learning_rate": 3.1360414258894057e-06, + "loss": 0.4766, + "step": 7790 + }, + { + "epoch": 1.2639519792342635, + "grad_norm": 0.574092225607239, + "learning_rate": 3.1356284357395657e-06, + "loss": 0.5148, + "step": 7791 + }, + { + "epoch": 1.264114211550941, + "grad_norm": 0.6003422344766717, + "learning_rate": 3.1352154270439426e-06, + "loss": 0.5197, + "step": 7792 + }, + { + "epoch": 1.2642764438676184, + "grad_norm": 0.592219687676602, + "learning_rate": 3.1348023998145894e-06, + "loss": 0.5237, + "step": 7793 + }, + { + "epoch": 1.2644386761842958, + "grad_norm": 0.5810099064050027, + "learning_rate": 3.134389354063555e-06, + "loss": 0.5263, + "step": 7794 + }, + { + "epoch": 1.2646009085009733, + "grad_norm": 0.5947535620114462, + "learning_rate": 3.1339762898028918e-06, + "loss": 0.5487, + "step": 7795 + }, + { + "epoch": 1.264763140817651, + "grad_norm": 0.6192551627392164, + "learning_rate": 3.133563207044652e-06, + "loss": 0.5471, + "step": 7796 + }, + { + "epoch": 1.2649253731343284, + "grad_norm": 0.5967198267233096, + "learning_rate": 3.1331501058008867e-06, + "loss": 0.5162, + "step": 7797 + }, + { + "epoch": 1.2650876054510058, + "grad_norm": 0.5887381383609123, + "learning_rate": 3.1327369860836515e-06, + "loss": 0.5185, + "step": 7798 + }, + { + "epoch": 1.2652498377676833, + "grad_norm": 0.6149888152602649, + "learning_rate": 3.1323238479049974e-06, + "loss": 0.5507, + "step": 7799 + }, + { + "epoch": 1.2654120700843607, + "grad_norm": 0.5601927223372059, + "learning_rate": 3.1319106912769797e-06, + "loss": 0.4919, + "step": 7800 + }, + { + "epoch": 1.2655743024010384, + "grad_norm": 0.6356519237473878, + "learning_rate": 3.131497516211654e-06, + "loss": 0.5303, + "step": 7801 + }, + { + "epoch": 1.2657365347177159, + "grad_norm": 0.5841339104452241, + "learning_rate": 3.131084322721074e-06, + "loss": 0.5064, + "step": 7802 + }, + { + "epoch": 1.2658987670343933, + "grad_norm": 0.5792387016351837, + "learning_rate": 3.1306711108172965e-06, + "loss": 0.5005, + "step": 7803 + }, + { + "epoch": 1.2660609993510707, + "grad_norm": 0.6304022730341329, + "learning_rate": 3.1302578805123774e-06, + "loss": 0.5108, + "step": 7804 + }, + { + "epoch": 1.2662232316677482, + "grad_norm": 0.6084923955700713, + "learning_rate": 3.1298446318183735e-06, + "loss": 0.4989, + "step": 7805 + }, + { + "epoch": 1.2663854639844256, + "grad_norm": 0.600826055661887, + "learning_rate": 3.129431364747342e-06, + "loss": 0.4996, + "step": 7806 + }, + { + "epoch": 1.266547696301103, + "grad_norm": 0.5986342082209889, + "learning_rate": 3.1290180793113423e-06, + "loss": 0.4966, + "step": 7807 + }, + { + "epoch": 1.2667099286177805, + "grad_norm": 0.6031734082315353, + "learning_rate": 3.12860477552243e-06, + "loss": 0.5428, + "step": 7808 + }, + { + "epoch": 1.2668721609344582, + "grad_norm": 0.6059070405647092, + "learning_rate": 3.1281914533926677e-06, + "loss": 0.5125, + "step": 7809 + }, + { + "epoch": 1.2670343932511356, + "grad_norm": 0.6328934638493928, + "learning_rate": 3.127778112934111e-06, + "loss": 0.5373, + "step": 7810 + }, + { + "epoch": 1.267196625567813, + "grad_norm": 0.62807184678414, + "learning_rate": 3.1273647541588235e-06, + "loss": 0.5458, + "step": 7811 + }, + { + "epoch": 1.2673588578844905, + "grad_norm": 0.58464931381212, + "learning_rate": 3.126951377078864e-06, + "loss": 0.5343, + "step": 7812 + }, + { + "epoch": 1.2675210902011682, + "grad_norm": 0.5842853702705566, + "learning_rate": 3.1265379817062943e-06, + "loss": 0.5522, + "step": 7813 + }, + { + "epoch": 1.2676833225178457, + "grad_norm": 0.5937306538081732, + "learning_rate": 3.1261245680531752e-06, + "loss": 0.5176, + "step": 7814 + }, + { + "epoch": 1.267845554834523, + "grad_norm": 0.61784261469084, + "learning_rate": 3.125711136131569e-06, + "loss": 0.4867, + "step": 7815 + }, + { + "epoch": 1.2680077871512005, + "grad_norm": 0.6012350999945837, + "learning_rate": 3.12529768595354e-06, + "loss": 0.5206, + "step": 7816 + }, + { + "epoch": 1.268170019467878, + "grad_norm": 0.6054509411683265, + "learning_rate": 3.1248842175311495e-06, + "loss": 0.543, + "step": 7817 + }, + { + "epoch": 1.2683322517845554, + "grad_norm": 0.5567980353996332, + "learning_rate": 3.124470730876462e-06, + "loss": 0.519, + "step": 7818 + }, + { + "epoch": 1.2684944841012329, + "grad_norm": 0.5867084294511139, + "learning_rate": 3.1240572260015424e-06, + "loss": 0.5085, + "step": 7819 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.5885962112616386, + "learning_rate": 3.123643702918455e-06, + "loss": 0.5338, + "step": 7820 + }, + { + "epoch": 1.268818948734588, + "grad_norm": 0.5917419515439402, + "learning_rate": 3.1232301616392656e-06, + "loss": 0.4986, + "step": 7821 + }, + { + "epoch": 1.2689811810512654, + "grad_norm": 0.5976395894526292, + "learning_rate": 3.122816602176039e-06, + "loss": 0.5429, + "step": 7822 + }, + { + "epoch": 1.269143413367943, + "grad_norm": 0.6105514599546485, + "learning_rate": 3.122403024540843e-06, + "loss": 0.4833, + "step": 7823 + }, + { + "epoch": 1.2693056456846203, + "grad_norm": 0.608974821441468, + "learning_rate": 3.121989428745745e-06, + "loss": 0.5198, + "step": 7824 + }, + { + "epoch": 1.2694678780012978, + "grad_norm": 0.5987833837476549, + "learning_rate": 3.1215758148028102e-06, + "loss": 0.5494, + "step": 7825 + }, + { + "epoch": 1.2696301103179755, + "grad_norm": 0.562398021046147, + "learning_rate": 3.121162182724109e-06, + "loss": 0.4966, + "step": 7826 + }, + { + "epoch": 1.269792342634653, + "grad_norm": 0.6394551178285092, + "learning_rate": 3.1207485325217087e-06, + "loss": 0.5531, + "step": 7827 + }, + { + "epoch": 1.2699545749513304, + "grad_norm": 0.5796323981437925, + "learning_rate": 3.120334864207678e-06, + "loss": 0.5309, + "step": 7828 + }, + { + "epoch": 1.2701168072680078, + "grad_norm": 0.6076666961986511, + "learning_rate": 3.1199211777940883e-06, + "loss": 0.504, + "step": 7829 + }, + { + "epoch": 1.2702790395846852, + "grad_norm": 0.6165096293324431, + "learning_rate": 3.1195074732930084e-06, + "loss": 0.5127, + "step": 7830 + }, + { + "epoch": 1.2704412719013627, + "grad_norm": 0.6015675699599156, + "learning_rate": 3.1190937507165093e-06, + "loss": 0.5052, + "step": 7831 + }, + { + "epoch": 1.2706035042180401, + "grad_norm": 0.594620991950792, + "learning_rate": 3.1186800100766624e-06, + "loss": 0.519, + "step": 7832 + }, + { + "epoch": 1.2707657365347176, + "grad_norm": 0.5633503013853759, + "learning_rate": 3.118266251385539e-06, + "loss": 0.4644, + "step": 7833 + }, + { + "epoch": 1.2709279688513953, + "grad_norm": 0.6430252866580256, + "learning_rate": 3.1178524746552126e-06, + "loss": 0.5435, + "step": 7834 + }, + { + "epoch": 1.2710902011680727, + "grad_norm": 0.6211974210909135, + "learning_rate": 3.1174386798977535e-06, + "loss": 0.5302, + "step": 7835 + }, + { + "epoch": 1.2712524334847501, + "grad_norm": 0.581175754191951, + "learning_rate": 3.1170248671252373e-06, + "loss": 0.5126, + "step": 7836 + }, + { + "epoch": 1.2714146658014276, + "grad_norm": 0.6064311991992679, + "learning_rate": 3.116611036349737e-06, + "loss": 0.4989, + "step": 7837 + }, + { + "epoch": 1.2715768981181053, + "grad_norm": 0.5906137277028477, + "learning_rate": 3.1161971875833274e-06, + "loss": 0.506, + "step": 7838 + }, + { + "epoch": 1.2717391304347827, + "grad_norm": 0.6165187825542793, + "learning_rate": 3.115783320838083e-06, + "loss": 0.5175, + "step": 7839 + }, + { + "epoch": 1.2719013627514602, + "grad_norm": 0.5992232958451706, + "learning_rate": 3.115369436126079e-06, + "loss": 0.5534, + "step": 7840 + }, + { + "epoch": 1.2720635950681376, + "grad_norm": 0.5928700325323893, + "learning_rate": 3.114955533459392e-06, + "loss": 0.5274, + "step": 7841 + }, + { + "epoch": 1.272225827384815, + "grad_norm": 0.5803594740325768, + "learning_rate": 3.114541612850098e-06, + "loss": 0.4968, + "step": 7842 + }, + { + "epoch": 1.2723880597014925, + "grad_norm": 0.5935731929974447, + "learning_rate": 3.1141276743102743e-06, + "loss": 0.5391, + "step": 7843 + }, + { + "epoch": 1.27255029201817, + "grad_norm": 0.6614656978191004, + "learning_rate": 3.1137137178519983e-06, + "loss": 0.5434, + "step": 7844 + }, + { + "epoch": 1.2727125243348474, + "grad_norm": 0.5977431675117972, + "learning_rate": 3.113299743487348e-06, + "loss": 0.5461, + "step": 7845 + }, + { + "epoch": 1.272874756651525, + "grad_norm": 0.6487873719406475, + "learning_rate": 3.112885751228401e-06, + "loss": 0.5198, + "step": 7846 + }, + { + "epoch": 1.2730369889682025, + "grad_norm": 0.6639980632670898, + "learning_rate": 3.112471741087239e-06, + "loss": 0.5133, + "step": 7847 + }, + { + "epoch": 1.27319922128488, + "grad_norm": 0.6041976364970184, + "learning_rate": 3.1120577130759387e-06, + "loss": 0.5765, + "step": 7848 + }, + { + "epoch": 1.2733614536015574, + "grad_norm": 0.5936560217091938, + "learning_rate": 3.111643667206582e-06, + "loss": 0.5502, + "step": 7849 + }, + { + "epoch": 1.2735236859182348, + "grad_norm": 0.592538794403324, + "learning_rate": 3.1112296034912485e-06, + "loss": 0.5498, + "step": 7850 + }, + { + "epoch": 1.2736859182349125, + "grad_norm": 0.5921103984124818, + "learning_rate": 3.11081552194202e-06, + "loss": 0.532, + "step": 7851 + }, + { + "epoch": 1.27384815055159, + "grad_norm": 0.5534402802866112, + "learning_rate": 3.1104014225709787e-06, + "loss": 0.5123, + "step": 7852 + }, + { + "epoch": 1.2740103828682674, + "grad_norm": 0.6247138030828977, + "learning_rate": 3.109987305390206e-06, + "loss": 0.5307, + "step": 7853 + }, + { + "epoch": 1.2741726151849448, + "grad_norm": 0.5961427093911256, + "learning_rate": 3.1095731704117844e-06, + "loss": 0.5245, + "step": 7854 + }, + { + "epoch": 1.2743348475016223, + "grad_norm": 0.6149371783099002, + "learning_rate": 3.109159017647798e-06, + "loss": 0.5021, + "step": 7855 + }, + { + "epoch": 1.2744970798182997, + "grad_norm": 0.5859842161089769, + "learning_rate": 3.10874484711033e-06, + "loss": 0.5338, + "step": 7856 + }, + { + "epoch": 1.2746593121349772, + "grad_norm": 0.6002695755986817, + "learning_rate": 3.1083306588114648e-06, + "loss": 0.5252, + "step": 7857 + }, + { + "epoch": 1.2748215444516546, + "grad_norm": 0.6114225856120571, + "learning_rate": 3.1079164527632875e-06, + "loss": 0.5228, + "step": 7858 + }, + { + "epoch": 1.2749837767683323, + "grad_norm": 0.590825770173253, + "learning_rate": 3.107502228977882e-06, + "loss": 0.4804, + "step": 7859 + }, + { + "epoch": 1.2751460090850097, + "grad_norm": 0.6121215413170625, + "learning_rate": 3.1070879874673365e-06, + "loss": 0.5346, + "step": 7860 + }, + { + "epoch": 1.2753082414016872, + "grad_norm": 0.5825504720574707, + "learning_rate": 3.106673728243736e-06, + "loss": 0.5025, + "step": 7861 + }, + { + "epoch": 1.2754704737183646, + "grad_norm": 0.5972369053188131, + "learning_rate": 3.1062594513191673e-06, + "loss": 0.553, + "step": 7862 + }, + { + "epoch": 1.2756327060350423, + "grad_norm": 0.5958749009002481, + "learning_rate": 3.1058451567057185e-06, + "loss": 0.5149, + "step": 7863 + }, + { + "epoch": 1.2757949383517198, + "grad_norm": 0.5953359786290271, + "learning_rate": 3.1054308444154756e-06, + "loss": 0.5089, + "step": 7864 + }, + { + "epoch": 1.2759571706683972, + "grad_norm": 0.5985721111077896, + "learning_rate": 3.1050165144605304e-06, + "loss": 0.5299, + "step": 7865 + }, + { + "epoch": 1.2761194029850746, + "grad_norm": 0.5937728191840975, + "learning_rate": 3.1046021668529684e-06, + "loss": 0.5461, + "step": 7866 + }, + { + "epoch": 1.276281635301752, + "grad_norm": 0.5792847369040145, + "learning_rate": 3.1041878016048814e-06, + "loss": 0.5322, + "step": 7867 + }, + { + "epoch": 1.2764438676184295, + "grad_norm": 0.5931936976232349, + "learning_rate": 3.1037734187283587e-06, + "loss": 0.5368, + "step": 7868 + }, + { + "epoch": 1.276606099935107, + "grad_norm": 0.6371490386554033, + "learning_rate": 3.1033590182354896e-06, + "loss": 0.5047, + "step": 7869 + }, + { + "epoch": 1.2767683322517844, + "grad_norm": 0.605480585131738, + "learning_rate": 3.1029446001383666e-06, + "loss": 0.5291, + "step": 7870 + }, + { + "epoch": 1.276930564568462, + "grad_norm": 0.5817381763738237, + "learning_rate": 3.102530164449081e-06, + "loss": 0.5522, + "step": 7871 + }, + { + "epoch": 1.2770927968851395, + "grad_norm": 0.5890816578194364, + "learning_rate": 3.1021157111797246e-06, + "loss": 0.5001, + "step": 7872 + }, + { + "epoch": 1.277255029201817, + "grad_norm": 0.5809809037146965, + "learning_rate": 3.1017012403423886e-06, + "loss": 0.4844, + "step": 7873 + }, + { + "epoch": 1.2774172615184944, + "grad_norm": 0.5822986400065925, + "learning_rate": 3.1012867519491684e-06, + "loss": 0.547, + "step": 7874 + }, + { + "epoch": 1.2775794938351719, + "grad_norm": 0.6047895929169027, + "learning_rate": 3.1008722460121563e-06, + "loss": 0.547, + "step": 7875 + }, + { + "epoch": 1.2777417261518496, + "grad_norm": 0.5861680120119833, + "learning_rate": 3.100457722543447e-06, + "loss": 0.535, + "step": 7876 + }, + { + "epoch": 1.277903958468527, + "grad_norm": 0.5962529936886392, + "learning_rate": 3.1000431815551333e-06, + "loss": 0.4858, + "step": 7877 + }, + { + "epoch": 1.2780661907852044, + "grad_norm": 0.632807253998138, + "learning_rate": 3.0996286230593127e-06, + "loss": 0.5443, + "step": 7878 + }, + { + "epoch": 1.278228423101882, + "grad_norm": 0.6436972503615995, + "learning_rate": 3.099214047068079e-06, + "loss": 0.5437, + "step": 7879 + }, + { + "epoch": 1.2783906554185593, + "grad_norm": 0.6081523779281167, + "learning_rate": 3.09879945359353e-06, + "loss": 0.5169, + "step": 7880 + }, + { + "epoch": 1.2785528877352368, + "grad_norm": 0.5918210549173207, + "learning_rate": 3.098384842647761e-06, + "loss": 0.5146, + "step": 7881 + }, + { + "epoch": 1.2787151200519142, + "grad_norm": 0.6079922691001038, + "learning_rate": 3.0979702142428693e-06, + "loss": 0.5245, + "step": 7882 + }, + { + "epoch": 1.278877352368592, + "grad_norm": 0.5697842940878174, + "learning_rate": 3.097555568390953e-06, + "loss": 0.5379, + "step": 7883 + }, + { + "epoch": 1.2790395846852693, + "grad_norm": 0.5815320956148311, + "learning_rate": 3.0971409051041097e-06, + "loss": 0.517, + "step": 7884 + }, + { + "epoch": 1.2792018170019468, + "grad_norm": 0.5824593002539747, + "learning_rate": 3.0967262243944384e-06, + "loss": 0.4771, + "step": 7885 + }, + { + "epoch": 1.2793640493186242, + "grad_norm": 0.5630671956501785, + "learning_rate": 3.0963115262740384e-06, + "loss": 0.4972, + "step": 7886 + }, + { + "epoch": 1.2795262816353017, + "grad_norm": 0.5773970553128088, + "learning_rate": 3.0958968107550095e-06, + "loss": 0.5225, + "step": 7887 + }, + { + "epoch": 1.2796885139519794, + "grad_norm": 0.5893997191641382, + "learning_rate": 3.0954820778494516e-06, + "loss": 0.5047, + "step": 7888 + }, + { + "epoch": 1.2798507462686568, + "grad_norm": 0.6283348203356592, + "learning_rate": 3.0950673275694645e-06, + "loss": 0.5066, + "step": 7889 + }, + { + "epoch": 1.2800129785853342, + "grad_norm": 0.5879028537266087, + "learning_rate": 3.0946525599271514e-06, + "loss": 0.4981, + "step": 7890 + }, + { + "epoch": 1.2801752109020117, + "grad_norm": 0.6479406280343896, + "learning_rate": 3.094237774934612e-06, + "loss": 0.5127, + "step": 7891 + }, + { + "epoch": 1.2803374432186891, + "grad_norm": 0.6046176868439355, + "learning_rate": 3.0938229726039504e-06, + "loss": 0.5237, + "step": 7892 + }, + { + "epoch": 1.2804996755353666, + "grad_norm": 0.6016684586666171, + "learning_rate": 3.0934081529472677e-06, + "loss": 0.5126, + "step": 7893 + }, + { + "epoch": 1.280661907852044, + "grad_norm": 0.6370656003064893, + "learning_rate": 3.092993315976668e-06, + "loss": 0.5139, + "step": 7894 + }, + { + "epoch": 1.2808241401687215, + "grad_norm": 0.6091933578664058, + "learning_rate": 3.092578461704255e-06, + "loss": 0.5289, + "step": 7895 + }, + { + "epoch": 1.2809863724853991, + "grad_norm": 0.5971199719721703, + "learning_rate": 3.092163590142132e-06, + "loss": 0.5353, + "step": 7896 + }, + { + "epoch": 1.2811486048020766, + "grad_norm": 0.6039041085303944, + "learning_rate": 3.091748701302405e-06, + "loss": 0.5158, + "step": 7897 + }, + { + "epoch": 1.281310837118754, + "grad_norm": 0.6012177881082872, + "learning_rate": 3.091333795197179e-06, + "loss": 0.4911, + "step": 7898 + }, + { + "epoch": 1.2814730694354315, + "grad_norm": 0.5862151630803134, + "learning_rate": 3.090918871838559e-06, + "loss": 0.5239, + "step": 7899 + }, + { + "epoch": 1.2816353017521092, + "grad_norm": 0.6482667085465605, + "learning_rate": 3.0905039312386527e-06, + "loss": 0.5203, + "step": 7900 + }, + { + "epoch": 1.2817975340687866, + "grad_norm": 0.6335779056620323, + "learning_rate": 3.090088973409565e-06, + "loss": 0.5357, + "step": 7901 + }, + { + "epoch": 1.281959766385464, + "grad_norm": 0.651118016102775, + "learning_rate": 3.089673998363404e-06, + "loss": 0.5053, + "step": 7902 + }, + { + "epoch": 1.2821219987021415, + "grad_norm": 0.5839974829959964, + "learning_rate": 3.0892590061122776e-06, + "loss": 0.5138, + "step": 7903 + }, + { + "epoch": 1.282284231018819, + "grad_norm": 0.6050793473083197, + "learning_rate": 3.088843996668294e-06, + "loss": 0.5334, + "step": 7904 + }, + { + "epoch": 1.2824464633354964, + "grad_norm": 0.5953394758007233, + "learning_rate": 3.0884289700435623e-06, + "loss": 0.5132, + "step": 7905 + }, + { + "epoch": 1.2826086956521738, + "grad_norm": 0.5854626852122078, + "learning_rate": 3.0880139262501913e-06, + "loss": 0.5235, + "step": 7906 + }, + { + "epoch": 1.2827709279688513, + "grad_norm": 0.6193161080247125, + "learning_rate": 3.0875988653002904e-06, + "loss": 0.523, + "step": 7907 + }, + { + "epoch": 1.282933160285529, + "grad_norm": 0.5892212632835717, + "learning_rate": 3.0871837872059705e-06, + "loss": 0.5157, + "step": 7908 + }, + { + "epoch": 1.2830953926022064, + "grad_norm": 0.6331914403490728, + "learning_rate": 3.0867686919793423e-06, + "loss": 0.5545, + "step": 7909 + }, + { + "epoch": 1.2832576249188838, + "grad_norm": 0.5972241580568319, + "learning_rate": 3.0863535796325173e-06, + "loss": 0.5145, + "step": 7910 + }, + { + "epoch": 1.2834198572355613, + "grad_norm": 0.5600908113230022, + "learning_rate": 3.085938450177607e-06, + "loss": 0.5432, + "step": 7911 + }, + { + "epoch": 1.2835820895522387, + "grad_norm": 0.6146439005223426, + "learning_rate": 3.085523303626723e-06, + "loss": 0.5552, + "step": 7912 + }, + { + "epoch": 1.2837443218689164, + "grad_norm": 0.60068729418455, + "learning_rate": 3.0851081399919792e-06, + "loss": 0.4987, + "step": 7913 + }, + { + "epoch": 1.2839065541855939, + "grad_norm": 0.5934931844633746, + "learning_rate": 3.0846929592854872e-06, + "loss": 0.5381, + "step": 7914 + }, + { + "epoch": 1.2840687865022713, + "grad_norm": 0.589535236322525, + "learning_rate": 3.0842777615193633e-06, + "loss": 0.5091, + "step": 7915 + }, + { + "epoch": 1.2842310188189487, + "grad_norm": 0.5748358986133821, + "learning_rate": 3.083862546705719e-06, + "loss": 0.5453, + "step": 7916 + }, + { + "epoch": 1.2843932511356262, + "grad_norm": 0.5539734380538207, + "learning_rate": 3.0834473148566716e-06, + "loss": 0.5499, + "step": 7917 + }, + { + "epoch": 1.2845554834523036, + "grad_norm": 0.5835368014153803, + "learning_rate": 3.083032065984335e-06, + "loss": 0.5236, + "step": 7918 + }, + { + "epoch": 1.284717715768981, + "grad_norm": 0.6022389432286887, + "learning_rate": 3.082616800100825e-06, + "loss": 0.5443, + "step": 7919 + }, + { + "epoch": 1.2848799480856585, + "grad_norm": 0.6047943010937328, + "learning_rate": 3.0822015172182573e-06, + "loss": 0.529, + "step": 7920 + }, + { + "epoch": 1.2850421804023362, + "grad_norm": 0.6421388626400266, + "learning_rate": 3.08178621734875e-06, + "loss": 0.4814, + "step": 7921 + }, + { + "epoch": 1.2852044127190136, + "grad_norm": 0.591777079369143, + "learning_rate": 3.081370900504419e-06, + "loss": 0.4978, + "step": 7922 + }, + { + "epoch": 1.285366645035691, + "grad_norm": 0.5976776986301945, + "learning_rate": 3.080955566697383e-06, + "loss": 0.5072, + "step": 7923 + }, + { + "epoch": 1.2855288773523685, + "grad_norm": 0.6223144901402006, + "learning_rate": 3.08054021593976e-06, + "loss": 0.5409, + "step": 7924 + }, + { + "epoch": 1.2856911096690462, + "grad_norm": 0.5987145437464788, + "learning_rate": 3.0801248482436684e-06, + "loss": 0.4921, + "step": 7925 + }, + { + "epoch": 1.2858533419857237, + "grad_norm": 0.5927425361231746, + "learning_rate": 3.0797094636212284e-06, + "loss": 0.5401, + "step": 7926 + }, + { + "epoch": 1.286015574302401, + "grad_norm": 0.5606097354779283, + "learning_rate": 3.0792940620845578e-06, + "loss": 0.5109, + "step": 7927 + }, + { + "epoch": 1.2861778066190785, + "grad_norm": 0.5872248264265226, + "learning_rate": 3.0788786436457783e-06, + "loss": 0.5425, + "step": 7928 + }, + { + "epoch": 1.286340038935756, + "grad_norm": 0.5976329381305517, + "learning_rate": 3.0784632083170108e-06, + "loss": 0.4811, + "step": 7929 + }, + { + "epoch": 1.2865022712524334, + "grad_norm": 0.6264093677812548, + "learning_rate": 3.0780477561103756e-06, + "loss": 0.5289, + "step": 7930 + }, + { + "epoch": 1.2866645035691109, + "grad_norm": 0.640555747382457, + "learning_rate": 3.077632287037995e-06, + "loss": 0.5273, + "step": 7931 + }, + { + "epoch": 1.2868267358857883, + "grad_norm": 0.5809503288598311, + "learning_rate": 3.0772168011119894e-06, + "loss": 0.5354, + "step": 7932 + }, + { + "epoch": 1.286988968202466, + "grad_norm": 0.5906079256565967, + "learning_rate": 3.0768012983444847e-06, + "loss": 0.509, + "step": 7933 + }, + { + "epoch": 1.2871512005191434, + "grad_norm": 0.609714682345799, + "learning_rate": 3.076385778747602e-06, + "loss": 0.5334, + "step": 7934 + }, + { + "epoch": 1.287313432835821, + "grad_norm": 0.5819489261935288, + "learning_rate": 3.0759702423334647e-06, + "loss": 0.5189, + "step": 7935 + }, + { + "epoch": 1.2874756651524983, + "grad_norm": 0.6237267509109733, + "learning_rate": 3.075554689114198e-06, + "loss": 0.4912, + "step": 7936 + }, + { + "epoch": 1.2876378974691758, + "grad_norm": 0.5875273313454128, + "learning_rate": 3.0751391191019257e-06, + "loss": 0.4999, + "step": 7937 + }, + { + "epoch": 1.2878001297858535, + "grad_norm": 0.6193529252129026, + "learning_rate": 3.0747235323087734e-06, + "loss": 0.5136, + "step": 7938 + }, + { + "epoch": 1.287962362102531, + "grad_norm": 0.5813671621509379, + "learning_rate": 3.074307928746867e-06, + "loss": 0.4985, + "step": 7939 + }, + { + "epoch": 1.2881245944192083, + "grad_norm": 0.6062631869979583, + "learning_rate": 3.0738923084283314e-06, + "loss": 0.5421, + "step": 7940 + }, + { + "epoch": 1.2882868267358858, + "grad_norm": 0.5893318375092795, + "learning_rate": 3.0734766713652936e-06, + "loss": 0.5224, + "step": 7941 + }, + { + "epoch": 1.2884490590525632, + "grad_norm": 0.6426062528646131, + "learning_rate": 3.0730610175698814e-06, + "loss": 0.5277, + "step": 7942 + }, + { + "epoch": 1.2886112913692407, + "grad_norm": 0.601666493033751, + "learning_rate": 3.0726453470542223e-06, + "loss": 0.4977, + "step": 7943 + }, + { + "epoch": 1.2887735236859181, + "grad_norm": 0.6448181956570913, + "learning_rate": 3.0722296598304448e-06, + "loss": 0.5225, + "step": 7944 + }, + { + "epoch": 1.2889357560025956, + "grad_norm": 0.5845919112040362, + "learning_rate": 3.071813955910675e-06, + "loss": 0.5478, + "step": 7945 + }, + { + "epoch": 1.2890979883192732, + "grad_norm": 0.6061677877668972, + "learning_rate": 3.0713982353070448e-06, + "loss": 0.5382, + "step": 7946 + }, + { + "epoch": 1.2892602206359507, + "grad_norm": 0.5670408602869444, + "learning_rate": 3.070982498031682e-06, + "loss": 0.5082, + "step": 7947 + }, + { + "epoch": 1.2894224529526281, + "grad_norm": 0.6216609966855416, + "learning_rate": 3.0705667440967176e-06, + "loss": 0.4886, + "step": 7948 + }, + { + "epoch": 1.2895846852693056, + "grad_norm": 0.653151456894224, + "learning_rate": 3.070150973514281e-06, + "loss": 0.511, + "step": 7949 + }, + { + "epoch": 1.2897469175859833, + "grad_norm": 0.5858483015230397, + "learning_rate": 3.0697351862965034e-06, + "loss": 0.5262, + "step": 7950 + }, + { + "epoch": 1.2899091499026607, + "grad_norm": 0.6268548899724776, + "learning_rate": 3.069319382455517e-06, + "loss": 0.5354, + "step": 7951 + }, + { + "epoch": 1.2900713822193381, + "grad_norm": 0.5781941423528683, + "learning_rate": 3.068903562003454e-06, + "loss": 0.5164, + "step": 7952 + }, + { + "epoch": 1.2902336145360156, + "grad_norm": 0.5946088058788611, + "learning_rate": 3.0684877249524456e-06, + "loss": 0.5415, + "step": 7953 + }, + { + "epoch": 1.290395846852693, + "grad_norm": 0.6349012825474214, + "learning_rate": 3.068071871314626e-06, + "loss": 0.5392, + "step": 7954 + }, + { + "epoch": 1.2905580791693705, + "grad_norm": 0.5906966955504762, + "learning_rate": 3.0676560011021274e-06, + "loss": 0.5422, + "step": 7955 + }, + { + "epoch": 1.290720311486048, + "grad_norm": 0.6213060237934032, + "learning_rate": 3.0672401143270835e-06, + "loss": 0.5034, + "step": 7956 + }, + { + "epoch": 1.2908825438027254, + "grad_norm": 0.6056638607481678, + "learning_rate": 3.0668242110016312e-06, + "loss": 0.5589, + "step": 7957 + }, + { + "epoch": 1.291044776119403, + "grad_norm": 0.5598673241101341, + "learning_rate": 3.0664082911379017e-06, + "loss": 0.5027, + "step": 7958 + }, + { + "epoch": 1.2912070084360805, + "grad_norm": 0.5846254942554693, + "learning_rate": 3.0659923547480325e-06, + "loss": 0.5232, + "step": 7959 + }, + { + "epoch": 1.291369240752758, + "grad_norm": 0.5620512519541645, + "learning_rate": 3.0655764018441596e-06, + "loss": 0.5166, + "step": 7960 + }, + { + "epoch": 1.2915314730694354, + "grad_norm": 0.6305955112793491, + "learning_rate": 3.065160432438418e-06, + "loss": 0.5141, + "step": 7961 + }, + { + "epoch": 1.2916937053861128, + "grad_norm": 0.6044588512945519, + "learning_rate": 3.064744446542946e-06, + "loss": 0.5396, + "step": 7962 + }, + { + "epoch": 1.2918559377027905, + "grad_norm": 0.5550337669611851, + "learning_rate": 3.064328444169879e-06, + "loss": 0.5269, + "step": 7963 + }, + { + "epoch": 1.292018170019468, + "grad_norm": 0.6031363988366195, + "learning_rate": 3.0639124253313573e-06, + "loss": 0.534, + "step": 7964 + }, + { + "epoch": 1.2921804023361454, + "grad_norm": 0.5770842765299716, + "learning_rate": 3.063496390039516e-06, + "loss": 0.4932, + "step": 7965 + }, + { + "epoch": 1.2923426346528228, + "grad_norm": 0.5841243649005514, + "learning_rate": 3.063080338306496e-06, + "loss": 0.5126, + "step": 7966 + }, + { + "epoch": 1.2925048669695003, + "grad_norm": 0.5951066172472904, + "learning_rate": 3.0626642701444366e-06, + "loss": 0.5346, + "step": 7967 + }, + { + "epoch": 1.2926670992861777, + "grad_norm": 0.6316641622296189, + "learning_rate": 3.0622481855654757e-06, + "loss": 0.5049, + "step": 7968 + }, + { + "epoch": 1.2928293316028552, + "grad_norm": 0.582430851626227, + "learning_rate": 3.061832084581755e-06, + "loss": 0.5552, + "step": 7969 + }, + { + "epoch": 1.2929915639195328, + "grad_norm": 0.5978129537352909, + "learning_rate": 3.061415967205414e-06, + "loss": 0.5173, + "step": 7970 + }, + { + "epoch": 1.2931537962362103, + "grad_norm": 0.6066814301118234, + "learning_rate": 3.0609998334485947e-06, + "loss": 0.5337, + "step": 7971 + }, + { + "epoch": 1.2933160285528877, + "grad_norm": 0.5962548956479449, + "learning_rate": 3.060583683323438e-06, + "loss": 0.5246, + "step": 7972 + }, + { + "epoch": 1.2934782608695652, + "grad_norm": 0.6003395751182125, + "learning_rate": 3.060167516842087e-06, + "loss": 0.5264, + "step": 7973 + }, + { + "epoch": 1.2936404931862426, + "grad_norm": 0.6094054076985304, + "learning_rate": 3.0597513340166825e-06, + "loss": 0.5187, + "step": 7974 + }, + { + "epoch": 1.2938027255029203, + "grad_norm": 0.6185446650309637, + "learning_rate": 3.05933513485937e-06, + "loss": 0.5224, + "step": 7975 + }, + { + "epoch": 1.2939649578195977, + "grad_norm": 0.5793758856126364, + "learning_rate": 3.0589189193822894e-06, + "loss": 0.522, + "step": 7976 + }, + { + "epoch": 1.2941271901362752, + "grad_norm": 0.5778178969241126, + "learning_rate": 3.0585026875975882e-06, + "loss": 0.5308, + "step": 7977 + }, + { + "epoch": 1.2942894224529526, + "grad_norm": 0.5986235408642255, + "learning_rate": 3.058086439517409e-06, + "loss": 0.5297, + "step": 7978 + }, + { + "epoch": 1.29445165476963, + "grad_norm": 0.6839055792093095, + "learning_rate": 3.057670175153896e-06, + "loss": 0.5116, + "step": 7979 + }, + { + "epoch": 1.2946138870863075, + "grad_norm": 0.6132434713842818, + "learning_rate": 3.0572538945191975e-06, + "loss": 0.5184, + "step": 7980 + }, + { + "epoch": 1.294776119402985, + "grad_norm": 0.5854651819859403, + "learning_rate": 3.056837597625456e-06, + "loss": 0.5456, + "step": 7981 + }, + { + "epoch": 1.2949383517196624, + "grad_norm": 0.588372007792828, + "learning_rate": 3.0564212844848207e-06, + "loss": 0.498, + "step": 7982 + }, + { + "epoch": 1.29510058403634, + "grad_norm": 0.588203159508837, + "learning_rate": 3.056004955109436e-06, + "loss": 0.5272, + "step": 7983 + }, + { + "epoch": 1.2952628163530175, + "grad_norm": 0.5831917200546959, + "learning_rate": 3.0555886095114504e-06, + "loss": 0.5083, + "step": 7984 + }, + { + "epoch": 1.295425048669695, + "grad_norm": 0.6094661580215818, + "learning_rate": 3.0551722477030117e-06, + "loss": 0.5392, + "step": 7985 + }, + { + "epoch": 1.2955872809863724, + "grad_norm": 0.5850057062189492, + "learning_rate": 3.0547558696962677e-06, + "loss": 0.545, + "step": 7986 + }, + { + "epoch": 1.29574951330305, + "grad_norm": 0.6012149212952075, + "learning_rate": 3.0543394755033666e-06, + "loss": 0.4998, + "step": 7987 + }, + { + "epoch": 1.2959117456197276, + "grad_norm": 0.6184620821783111, + "learning_rate": 3.053923065136459e-06, + "loss": 0.4941, + "step": 7988 + }, + { + "epoch": 1.296073977936405, + "grad_norm": 0.5893267386742064, + "learning_rate": 3.0535066386076934e-06, + "loss": 0.5295, + "step": 7989 + }, + { + "epoch": 1.2962362102530824, + "grad_norm": 0.6120190129184554, + "learning_rate": 3.0530901959292203e-06, + "loss": 0.5257, + "step": 7990 + }, + { + "epoch": 1.29639844256976, + "grad_norm": 0.5885780894504035, + "learning_rate": 3.05267373711319e-06, + "loss": 0.537, + "step": 7991 + }, + { + "epoch": 1.2965606748864373, + "grad_norm": 0.6101991830117489, + "learning_rate": 3.0522572621717543e-06, + "loss": 0.525, + "step": 7992 + }, + { + "epoch": 1.2967229072031148, + "grad_norm": 0.6052870177886231, + "learning_rate": 3.0518407711170645e-06, + "loss": 0.5485, + "step": 7993 + }, + { + "epoch": 1.2968851395197922, + "grad_norm": 0.5968531434385703, + "learning_rate": 3.051424263961271e-06, + "loss": 0.5287, + "step": 7994 + }, + { + "epoch": 1.29704737183647, + "grad_norm": 0.6129920742692836, + "learning_rate": 3.051007740716529e-06, + "loss": 0.5244, + "step": 7995 + }, + { + "epoch": 1.2972096041531473, + "grad_norm": 0.6104517370720389, + "learning_rate": 3.050591201394989e-06, + "loss": 0.5351, + "step": 7996 + }, + { + "epoch": 1.2973718364698248, + "grad_norm": 0.6026686683211259, + "learning_rate": 3.050174646008806e-06, + "loss": 0.536, + "step": 7997 + }, + { + "epoch": 1.2975340687865022, + "grad_norm": 0.581028375690912, + "learning_rate": 3.0497580745701334e-06, + "loss": 0.5466, + "step": 7998 + }, + { + "epoch": 1.2976963011031797, + "grad_norm": 0.6152032210650473, + "learning_rate": 3.0493414870911247e-06, + "loss": 0.5127, + "step": 7999 + }, + { + "epoch": 1.2978585334198574, + "grad_norm": 0.6011179148915161, + "learning_rate": 3.0489248835839364e-06, + "loss": 0.5045, + "step": 8000 + }, + { + "epoch": 1.2980207657365348, + "grad_norm": 0.5764866544691188, + "learning_rate": 3.0485082640607217e-06, + "loss": 0.5467, + "step": 8001 + }, + { + "epoch": 1.2981829980532122, + "grad_norm": 0.6069657245458853, + "learning_rate": 3.048091628533638e-06, + "loss": 0.5226, + "step": 8002 + }, + { + "epoch": 1.2983452303698897, + "grad_norm": 0.6237522716517041, + "learning_rate": 3.047674977014841e-06, + "loss": 0.5465, + "step": 8003 + }, + { + "epoch": 1.2985074626865671, + "grad_norm": 0.5766884647076893, + "learning_rate": 3.0472583095164875e-06, + "loss": 0.5128, + "step": 8004 + }, + { + "epoch": 1.2986696950032446, + "grad_norm": 0.5841983670355919, + "learning_rate": 3.0468416260507345e-06, + "loss": 0.5164, + "step": 8005 + }, + { + "epoch": 1.298831927319922, + "grad_norm": 0.5686818326014983, + "learning_rate": 3.0464249266297387e-06, + "loss": 0.5064, + "step": 8006 + }, + { + "epoch": 1.2989941596365995, + "grad_norm": 0.5913452354067847, + "learning_rate": 3.0460082112656596e-06, + "loss": 0.5381, + "step": 8007 + }, + { + "epoch": 1.2991563919532771, + "grad_norm": 0.6026282560815087, + "learning_rate": 3.0455914799706555e-06, + "loss": 0.5001, + "step": 8008 + }, + { + "epoch": 1.2993186242699546, + "grad_norm": 0.5516826441778317, + "learning_rate": 3.045174732756885e-06, + "loss": 0.5052, + "step": 8009 + }, + { + "epoch": 1.299480856586632, + "grad_norm": 0.5799236899306216, + "learning_rate": 3.044757969636507e-06, + "loss": 0.4849, + "step": 8010 + }, + { + "epoch": 1.2996430889033095, + "grad_norm": 0.5833526946072113, + "learning_rate": 3.044341190621683e-06, + "loss": 0.5479, + "step": 8011 + }, + { + "epoch": 1.2998053212199872, + "grad_norm": 0.6045666474455451, + "learning_rate": 3.0439243957245713e-06, + "loss": 0.5025, + "step": 8012 + }, + { + "epoch": 1.2999675535366646, + "grad_norm": 0.5837479270169954, + "learning_rate": 3.0435075849573346e-06, + "loss": 0.5159, + "step": 8013 + }, + { + "epoch": 1.300129785853342, + "grad_norm": 0.5876919992868564, + "learning_rate": 3.0430907583321338e-06, + "loss": 0.5413, + "step": 8014 + }, + { + "epoch": 1.3002920181700195, + "grad_norm": 0.5581649977519693, + "learning_rate": 3.04267391586113e-06, + "loss": 0.5238, + "step": 8015 + }, + { + "epoch": 1.300454250486697, + "grad_norm": 0.5778398482989177, + "learning_rate": 3.0422570575564863e-06, + "loss": 0.5214, + "step": 8016 + }, + { + "epoch": 1.3006164828033744, + "grad_norm": 0.5831618573043745, + "learning_rate": 3.0418401834303647e-06, + "loss": 0.5223, + "step": 8017 + }, + { + "epoch": 1.3007787151200518, + "grad_norm": 0.5943176684832107, + "learning_rate": 3.041423293494929e-06, + "loss": 0.4998, + "step": 8018 + }, + { + "epoch": 1.3009409474367293, + "grad_norm": 0.5985092204214422, + "learning_rate": 3.041006387762342e-06, + "loss": 0.5288, + "step": 8019 + }, + { + "epoch": 1.301103179753407, + "grad_norm": 0.6363780471947018, + "learning_rate": 3.0405894662447682e-06, + "loss": 0.5638, + "step": 8020 + }, + { + "epoch": 1.3012654120700844, + "grad_norm": 0.6017781849988304, + "learning_rate": 3.0401725289543728e-06, + "loss": 0.5257, + "step": 8021 + }, + { + "epoch": 1.3014276443867618, + "grad_norm": 0.5623802995998523, + "learning_rate": 3.0397555759033204e-06, + "loss": 0.5279, + "step": 8022 + }, + { + "epoch": 1.3015898767034393, + "grad_norm": 0.5957687988655295, + "learning_rate": 3.0393386071037758e-06, + "loss": 0.5392, + "step": 8023 + }, + { + "epoch": 1.3017521090201167, + "grad_norm": 0.6146624906923865, + "learning_rate": 3.0389216225679052e-06, + "loss": 0.5438, + "step": 8024 + }, + { + "epoch": 1.3019143413367944, + "grad_norm": 0.5907885272546293, + "learning_rate": 3.0385046223078756e-06, + "loss": 0.5249, + "step": 8025 + }, + { + "epoch": 1.3020765736534718, + "grad_norm": 0.6226026079494021, + "learning_rate": 3.038087606335854e-06, + "loss": 0.4936, + "step": 8026 + }, + { + "epoch": 1.3022388059701493, + "grad_norm": 0.6315740052121337, + "learning_rate": 3.037670574664007e-06, + "loss": 0.5598, + "step": 8027 + }, + { + "epoch": 1.3024010382868267, + "grad_norm": 0.5901816002065963, + "learning_rate": 3.0372535273045018e-06, + "loss": 0.5299, + "step": 8028 + }, + { + "epoch": 1.3025632706035042, + "grad_norm": 0.6305110735745298, + "learning_rate": 3.036836464269508e-06, + "loss": 0.5097, + "step": 8029 + }, + { + "epoch": 1.3027255029201816, + "grad_norm": 0.5832518879508433, + "learning_rate": 3.036419385571193e-06, + "loss": 0.526, + "step": 8030 + }, + { + "epoch": 1.302887735236859, + "grad_norm": 0.5947979172935683, + "learning_rate": 3.036002291221728e-06, + "loss": 0.5187, + "step": 8031 + }, + { + "epoch": 1.3030499675535367, + "grad_norm": 0.5982132894182957, + "learning_rate": 3.0355851812332798e-06, + "loss": 0.5047, + "step": 8032 + }, + { + "epoch": 1.3032121998702142, + "grad_norm": 0.6067685361551102, + "learning_rate": 3.0351680556180207e-06, + "loss": 0.5456, + "step": 8033 + }, + { + "epoch": 1.3033744321868916, + "grad_norm": 0.6061863554493617, + "learning_rate": 3.03475091438812e-06, + "loss": 0.5291, + "step": 8034 + }, + { + "epoch": 1.303536664503569, + "grad_norm": 0.6092773789442066, + "learning_rate": 3.0343337575557488e-06, + "loss": 0.5328, + "step": 8035 + }, + { + "epoch": 1.3036988968202465, + "grad_norm": 0.5723296398319866, + "learning_rate": 3.0339165851330797e-06, + "loss": 0.5051, + "step": 8036 + }, + { + "epoch": 1.3038611291369242, + "grad_norm": 0.6032104042432072, + "learning_rate": 3.0334993971322825e-06, + "loss": 0.5324, + "step": 8037 + }, + { + "epoch": 1.3040233614536016, + "grad_norm": 0.5767429329357482, + "learning_rate": 3.033082193565532e-06, + "loss": 0.5287, + "step": 8038 + }, + { + "epoch": 1.304185593770279, + "grad_norm": 0.5801285951997821, + "learning_rate": 3.032664974444998e-06, + "loss": 0.5135, + "step": 8039 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.5894813907871531, + "learning_rate": 3.0322477397828567e-06, + "loss": 0.5274, + "step": 8040 + }, + { + "epoch": 1.304510058403634, + "grad_norm": 0.5997104479683832, + "learning_rate": 3.0318304895912797e-06, + "loss": 0.5335, + "step": 8041 + }, + { + "epoch": 1.3046722907203114, + "grad_norm": 0.5929943628043364, + "learning_rate": 3.0314132238824416e-06, + "loss": 0.5332, + "step": 8042 + }, + { + "epoch": 1.3048345230369889, + "grad_norm": 0.6021951580341068, + "learning_rate": 3.030995942668518e-06, + "loss": 0.4965, + "step": 8043 + }, + { + "epoch": 1.3049967553536663, + "grad_norm": 0.6081483671061435, + "learning_rate": 3.0305786459616833e-06, + "loss": 0.5369, + "step": 8044 + }, + { + "epoch": 1.305158987670344, + "grad_norm": 0.6137474102377437, + "learning_rate": 3.0301613337741128e-06, + "loss": 0.5333, + "step": 8045 + }, + { + "epoch": 1.3053212199870214, + "grad_norm": 0.5870682701262803, + "learning_rate": 3.029744006117983e-06, + "loss": 0.5085, + "step": 8046 + }, + { + "epoch": 1.3054834523036989, + "grad_norm": 0.6174114797196213, + "learning_rate": 3.029326663005469e-06, + "loss": 0.5281, + "step": 8047 + }, + { + "epoch": 1.3056456846203763, + "grad_norm": 0.5703138778168856, + "learning_rate": 3.028909304448748e-06, + "loss": 0.5125, + "step": 8048 + }, + { + "epoch": 1.3058079169370538, + "grad_norm": 0.5816702529945186, + "learning_rate": 3.0284919304600003e-06, + "loss": 0.5652, + "step": 8049 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.5891446227646964, + "learning_rate": 3.028074541051399e-06, + "loss": 0.4841, + "step": 8050 + }, + { + "epoch": 1.306132381570409, + "grad_norm": 0.5761446014131785, + "learning_rate": 3.0276571362351255e-06, + "loss": 0.5076, + "step": 8051 + }, + { + "epoch": 1.3062946138870863, + "grad_norm": 0.5984122059072299, + "learning_rate": 3.027239716023357e-06, + "loss": 0.5123, + "step": 8052 + }, + { + "epoch": 1.3064568462037638, + "grad_norm": 0.6064593811021762, + "learning_rate": 3.0268222804282725e-06, + "loss": 0.4945, + "step": 8053 + }, + { + "epoch": 1.3066190785204412, + "grad_norm": 0.5793220048365181, + "learning_rate": 3.026404829462054e-06, + "loss": 0.5305, + "step": 8054 + }, + { + "epoch": 1.3067813108371187, + "grad_norm": 0.5916670361401817, + "learning_rate": 3.025987363136878e-06, + "loss": 0.5515, + "step": 8055 + }, + { + "epoch": 1.3069435431537961, + "grad_norm": 0.5792407134421644, + "learning_rate": 3.0255698814649277e-06, + "loss": 0.5223, + "step": 8056 + }, + { + "epoch": 1.3071057754704738, + "grad_norm": 0.6331866759943838, + "learning_rate": 3.025152384458382e-06, + "loss": 0.5306, + "step": 8057 + }, + { + "epoch": 1.3072680077871512, + "grad_norm": 0.6037831140503745, + "learning_rate": 3.0247348721294227e-06, + "loss": 0.5149, + "step": 8058 + }, + { + "epoch": 1.3074302401038287, + "grad_norm": 0.5879266951727095, + "learning_rate": 3.024317344490233e-06, + "loss": 0.4843, + "step": 8059 + }, + { + "epoch": 1.3075924724205061, + "grad_norm": 0.5700185042516394, + "learning_rate": 3.023899801552994e-06, + "loss": 0.5184, + "step": 8060 + }, + { + "epoch": 1.3077547047371836, + "grad_norm": 0.5705319924822803, + "learning_rate": 3.0234822433298884e-06, + "loss": 0.5046, + "step": 8061 + }, + { + "epoch": 1.3079169370538613, + "grad_norm": 0.5818676768325353, + "learning_rate": 3.023064669833099e-06, + "loss": 0.5062, + "step": 8062 + }, + { + "epoch": 1.3080791693705387, + "grad_norm": 0.6017562680476228, + "learning_rate": 3.02264708107481e-06, + "loss": 0.5025, + "step": 8063 + }, + { + "epoch": 1.3082414016872161, + "grad_norm": 0.61745157414027, + "learning_rate": 3.0222294770672054e-06, + "loss": 0.4838, + "step": 8064 + }, + { + "epoch": 1.3084036340038936, + "grad_norm": 0.6029172936258782, + "learning_rate": 3.0218118578224696e-06, + "loss": 0.5399, + "step": 8065 + }, + { + "epoch": 1.308565866320571, + "grad_norm": 0.631342095412832, + "learning_rate": 3.021394223352787e-06, + "loss": 0.5172, + "step": 8066 + }, + { + "epoch": 1.3087280986372485, + "grad_norm": 0.6202574996075696, + "learning_rate": 3.0209765736703444e-06, + "loss": 0.5329, + "step": 8067 + }, + { + "epoch": 1.308890330953926, + "grad_norm": 0.5746505131255732, + "learning_rate": 3.0205589087873255e-06, + "loss": 0.5031, + "step": 8068 + }, + { + "epoch": 1.3090525632706034, + "grad_norm": 0.5738688967219205, + "learning_rate": 3.0201412287159183e-06, + "loss": 0.52, + "step": 8069 + }, + { + "epoch": 1.309214795587281, + "grad_norm": 0.6006481295313103, + "learning_rate": 3.019723533468308e-06, + "loss": 0.5424, + "step": 8070 + }, + { + "epoch": 1.3093770279039585, + "grad_norm": 0.6309237532139538, + "learning_rate": 3.019305823056683e-06, + "loss": 0.4653, + "step": 8071 + }, + { + "epoch": 1.309539260220636, + "grad_norm": 0.6240636584274305, + "learning_rate": 3.018888097493231e-06, + "loss": 0.5249, + "step": 8072 + }, + { + "epoch": 1.3097014925373134, + "grad_norm": 0.6191829351839087, + "learning_rate": 3.0184703567901386e-06, + "loss": 0.5714, + "step": 8073 + }, + { + "epoch": 1.309863724853991, + "grad_norm": 0.6049802157045923, + "learning_rate": 3.0180526009595955e-06, + "loss": 0.5316, + "step": 8074 + }, + { + "epoch": 1.3100259571706685, + "grad_norm": 0.5746954059748521, + "learning_rate": 3.0176348300137898e-06, + "loss": 0.545, + "step": 8075 + }, + { + "epoch": 1.310188189487346, + "grad_norm": 0.6160876673372149, + "learning_rate": 3.0172170439649113e-06, + "loss": 0.5257, + "step": 8076 + }, + { + "epoch": 1.3103504218040234, + "grad_norm": 0.6020672982174725, + "learning_rate": 3.0167992428251503e-06, + "loss": 0.5613, + "step": 8077 + }, + { + "epoch": 1.3105126541207008, + "grad_norm": 0.5919989248727643, + "learning_rate": 3.0163814266066956e-06, + "loss": 0.5103, + "step": 8078 + }, + { + "epoch": 1.3106748864373783, + "grad_norm": 0.6028697588914176, + "learning_rate": 3.0159635953217393e-06, + "loss": 0.4909, + "step": 8079 + }, + { + "epoch": 1.3108371187540557, + "grad_norm": 0.5863367998942342, + "learning_rate": 3.0155457489824707e-06, + "loss": 0.5147, + "step": 8080 + }, + { + "epoch": 1.3109993510707332, + "grad_norm": 0.5784292056458935, + "learning_rate": 3.015127887601083e-06, + "loss": 0.547, + "step": 8081 + }, + { + "epoch": 1.3111615833874108, + "grad_norm": 0.6202434661995209, + "learning_rate": 3.014710011189768e-06, + "loss": 0.5308, + "step": 8082 + }, + { + "epoch": 1.3113238157040883, + "grad_norm": 0.6060614937810986, + "learning_rate": 3.0142921197607177e-06, + "loss": 0.5596, + "step": 8083 + }, + { + "epoch": 1.3114860480207657, + "grad_norm": 0.586539450265341, + "learning_rate": 3.013874213326125e-06, + "loss": 0.5007, + "step": 8084 + }, + { + "epoch": 1.3116482803374432, + "grad_norm": 0.6152140540093675, + "learning_rate": 3.0134562918981836e-06, + "loss": 0.5374, + "step": 8085 + }, + { + "epoch": 1.3118105126541206, + "grad_norm": 0.5855372516369407, + "learning_rate": 3.013038355489086e-06, + "loss": 0.531, + "step": 8086 + }, + { + "epoch": 1.3119727449707983, + "grad_norm": 0.5967729140863569, + "learning_rate": 3.0126204041110275e-06, + "loss": 0.5402, + "step": 8087 + }, + { + "epoch": 1.3121349772874757, + "grad_norm": 0.5734112264582546, + "learning_rate": 3.0122024377762028e-06, + "loss": 0.5383, + "step": 8088 + }, + { + "epoch": 1.3122972096041532, + "grad_norm": 0.5903271966877203, + "learning_rate": 3.0117844564968056e-06, + "loss": 0.5179, + "step": 8089 + }, + { + "epoch": 1.3124594419208306, + "grad_norm": 0.5855489604873282, + "learning_rate": 3.0113664602850333e-06, + "loss": 0.5155, + "step": 8090 + }, + { + "epoch": 1.312621674237508, + "grad_norm": 0.6109457423896993, + "learning_rate": 3.0109484491530806e-06, + "loss": 0.5064, + "step": 8091 + }, + { + "epoch": 1.3127839065541855, + "grad_norm": 0.5953651871533667, + "learning_rate": 3.0105304231131443e-06, + "loss": 0.5274, + "step": 8092 + }, + { + "epoch": 1.312946138870863, + "grad_norm": 0.6299064626538516, + "learning_rate": 3.0101123821774204e-06, + "loss": 0.5091, + "step": 8093 + }, + { + "epoch": 1.3131083711875404, + "grad_norm": 0.5728341811789244, + "learning_rate": 3.009694326358107e-06, + "loss": 0.5039, + "step": 8094 + }, + { + "epoch": 1.313270603504218, + "grad_norm": 0.5923647071186615, + "learning_rate": 3.009276255667401e-06, + "loss": 0.5292, + "step": 8095 + }, + { + "epoch": 1.3134328358208955, + "grad_norm": 0.602569480430132, + "learning_rate": 3.0088581701175015e-06, + "loss": 0.5233, + "step": 8096 + }, + { + "epoch": 1.313595068137573, + "grad_norm": 0.6183379525218005, + "learning_rate": 3.0084400697206063e-06, + "loss": 0.5184, + "step": 8097 + }, + { + "epoch": 1.3137573004542504, + "grad_norm": 0.6160056551684281, + "learning_rate": 3.0080219544889134e-06, + "loss": 0.5248, + "step": 8098 + }, + { + "epoch": 1.313919532770928, + "grad_norm": 0.6038872259407514, + "learning_rate": 3.0076038244346246e-06, + "loss": 0.5129, + "step": 8099 + }, + { + "epoch": 1.3140817650876055, + "grad_norm": 0.5955078768066541, + "learning_rate": 3.007185679569938e-06, + "loss": 0.5632, + "step": 8100 + }, + { + "epoch": 1.314243997404283, + "grad_norm": 0.5709482857626114, + "learning_rate": 3.006767519907054e-06, + "loss": 0.511, + "step": 8101 + }, + { + "epoch": 1.3144062297209604, + "grad_norm": 0.6584438677829383, + "learning_rate": 3.0063493454581733e-06, + "loss": 0.5537, + "step": 8102 + }, + { + "epoch": 1.3145684620376379, + "grad_norm": 0.5657139526057564, + "learning_rate": 3.0059311562354974e-06, + "loss": 0.5302, + "step": 8103 + }, + { + "epoch": 1.3147306943543153, + "grad_norm": 0.6023806693939534, + "learning_rate": 3.0055129522512273e-06, + "loss": 0.5599, + "step": 8104 + }, + { + "epoch": 1.3148929266709928, + "grad_norm": 0.5771089407332272, + "learning_rate": 3.0050947335175656e-06, + "loss": 0.5132, + "step": 8105 + }, + { + "epoch": 1.3150551589876702, + "grad_norm": 0.6032322918647006, + "learning_rate": 3.0046765000467144e-06, + "loss": 0.52, + "step": 8106 + }, + { + "epoch": 1.315217391304348, + "grad_norm": 0.6108747412340932, + "learning_rate": 3.004258251850877e-06, + "loss": 0.5575, + "step": 8107 + }, + { + "epoch": 1.3153796236210253, + "grad_norm": 0.6140183382037183, + "learning_rate": 3.0038399889422553e-06, + "loss": 0.5339, + "step": 8108 + }, + { + "epoch": 1.3155418559377028, + "grad_norm": 0.6076157069907702, + "learning_rate": 3.0034217113330538e-06, + "loss": 0.5005, + "step": 8109 + }, + { + "epoch": 1.3157040882543802, + "grad_norm": 0.6281797696198845, + "learning_rate": 3.003003419035478e-06, + "loss": 0.5293, + "step": 8110 + }, + { + "epoch": 1.3158663205710577, + "grad_norm": 0.5914327645209472, + "learning_rate": 3.00258511206173e-06, + "loss": 0.5411, + "step": 8111 + }, + { + "epoch": 1.3160285528877353, + "grad_norm": 0.6144379613996865, + "learning_rate": 3.002166790424016e-06, + "loss": 0.5562, + "step": 8112 + }, + { + "epoch": 1.3161907852044128, + "grad_norm": 0.5762933884914333, + "learning_rate": 3.001748454134542e-06, + "loss": 0.5367, + "step": 8113 + }, + { + "epoch": 1.3163530175210902, + "grad_norm": 0.5940735887506983, + "learning_rate": 3.0013301032055126e-06, + "loss": 0.56, + "step": 8114 + }, + { + "epoch": 1.3165152498377677, + "grad_norm": 0.6141337986793636, + "learning_rate": 3.0009117376491348e-06, + "loss": 0.5041, + "step": 8115 + }, + { + "epoch": 1.3166774821544451, + "grad_norm": 0.5903712433009662, + "learning_rate": 3.0004933574776146e-06, + "loss": 0.5191, + "step": 8116 + }, + { + "epoch": 1.3168397144711226, + "grad_norm": 0.6387039034269278, + "learning_rate": 3.00007496270316e-06, + "loss": 0.5367, + "step": 8117 + }, + { + "epoch": 1.3170019467878, + "grad_norm": 0.6061831348442807, + "learning_rate": 2.9996565533379785e-06, + "loss": 0.4999, + "step": 8118 + }, + { + "epoch": 1.3171641791044777, + "grad_norm": 0.5992895019077915, + "learning_rate": 2.9992381293942775e-06, + "loss": 0.5595, + "step": 8119 + }, + { + "epoch": 1.3173264114211551, + "grad_norm": 0.5864469985619831, + "learning_rate": 2.998819690884266e-06, + "loss": 0.51, + "step": 8120 + }, + { + "epoch": 1.3174886437378326, + "grad_norm": 0.6000179007050691, + "learning_rate": 2.9984012378201516e-06, + "loss": 0.5321, + "step": 8121 + }, + { + "epoch": 1.31765087605451, + "grad_norm": 0.5998600036099675, + "learning_rate": 2.9979827702141446e-06, + "loss": 0.5, + "step": 8122 + }, + { + "epoch": 1.3178131083711875, + "grad_norm": 0.5880013509277895, + "learning_rate": 2.9975642880784543e-06, + "loss": 0.51, + "step": 8123 + }, + { + "epoch": 1.3179753406878651, + "grad_norm": 0.5903570495639366, + "learning_rate": 2.9971457914252917e-06, + "loss": 0.5496, + "step": 8124 + }, + { + "epoch": 1.3181375730045426, + "grad_norm": 0.6495716610766474, + "learning_rate": 2.9967272802668657e-06, + "loss": 0.5411, + "step": 8125 + }, + { + "epoch": 1.31829980532122, + "grad_norm": 0.6044748258341962, + "learning_rate": 2.9963087546153887e-06, + "loss": 0.5195, + "step": 8126 + }, + { + "epoch": 1.3184620376378975, + "grad_norm": 0.5795286774988166, + "learning_rate": 2.9958902144830704e-06, + "loss": 0.5237, + "step": 8127 + }, + { + "epoch": 1.318624269954575, + "grad_norm": 0.602690181545821, + "learning_rate": 2.995471659882125e-06, + "loss": 0.5232, + "step": 8128 + }, + { + "epoch": 1.3187865022712524, + "grad_norm": 0.6304392661035475, + "learning_rate": 2.9950530908247617e-06, + "loss": 0.5455, + "step": 8129 + }, + { + "epoch": 1.3189487345879298, + "grad_norm": 0.5950057768395516, + "learning_rate": 2.9946345073231964e-06, + "loss": 0.52, + "step": 8130 + }, + { + "epoch": 1.3191109669046073, + "grad_norm": 0.6344880853748341, + "learning_rate": 2.994215909389639e-06, + "loss": 0.5302, + "step": 8131 + }, + { + "epoch": 1.319273199221285, + "grad_norm": 0.595544150742791, + "learning_rate": 2.993797297036305e-06, + "loss": 0.5163, + "step": 8132 + }, + { + "epoch": 1.3194354315379624, + "grad_norm": 0.5752097765225519, + "learning_rate": 2.9933786702754076e-06, + "loss": 0.4796, + "step": 8133 + }, + { + "epoch": 1.3195976638546398, + "grad_norm": 0.6078149895017975, + "learning_rate": 2.9929600291191607e-06, + "loss": 0.5204, + "step": 8134 + }, + { + "epoch": 1.3197598961713173, + "grad_norm": 0.587208361602753, + "learning_rate": 2.9925413735797803e-06, + "loss": 0.5336, + "step": 8135 + }, + { + "epoch": 1.3199221284879947, + "grad_norm": 0.6243865776108781, + "learning_rate": 2.9921227036694813e-06, + "loss": 0.4968, + "step": 8136 + }, + { + "epoch": 1.3200843608046724, + "grad_norm": 0.5722583794015351, + "learning_rate": 2.9917040194004776e-06, + "loss": 0.4996, + "step": 8137 + }, + { + "epoch": 1.3202465931213498, + "grad_norm": 0.5816121222527965, + "learning_rate": 2.9912853207849875e-06, + "loss": 0.5405, + "step": 8138 + }, + { + "epoch": 1.3204088254380273, + "grad_norm": 0.6143901233038228, + "learning_rate": 2.9908666078352254e-06, + "loss": 0.4923, + "step": 8139 + }, + { + "epoch": 1.3205710577547047, + "grad_norm": 0.6269444226798743, + "learning_rate": 2.9904478805634084e-06, + "loss": 0.5482, + "step": 8140 + }, + { + "epoch": 1.3207332900713822, + "grad_norm": 0.5866848743357386, + "learning_rate": 2.990029138981756e-06, + "loss": 0.5056, + "step": 8141 + }, + { + "epoch": 1.3208955223880596, + "grad_norm": 0.5803611034829945, + "learning_rate": 2.9896103831024827e-06, + "loss": 0.5429, + "step": 8142 + }, + { + "epoch": 1.321057754704737, + "grad_norm": 0.6099110738383007, + "learning_rate": 2.9891916129378084e-06, + "loss": 0.5644, + "step": 8143 + }, + { + "epoch": 1.3212199870214147, + "grad_norm": 0.5940343957713402, + "learning_rate": 2.9887728284999512e-06, + "loss": 0.5171, + "step": 8144 + }, + { + "epoch": 1.3213822193380922, + "grad_norm": 0.6010409353272089, + "learning_rate": 2.9883540298011297e-06, + "loss": 0.5223, + "step": 8145 + }, + { + "epoch": 1.3215444516547696, + "grad_norm": 0.5803993136994532, + "learning_rate": 2.9879352168535645e-06, + "loss": 0.4895, + "step": 8146 + }, + { + "epoch": 1.321706683971447, + "grad_norm": 0.6021387193319403, + "learning_rate": 2.9875163896694735e-06, + "loss": 0.5044, + "step": 8147 + }, + { + "epoch": 1.3218689162881245, + "grad_norm": 0.6234230534211576, + "learning_rate": 2.9870975482610786e-06, + "loss": 0.5136, + "step": 8148 + }, + { + "epoch": 1.3220311486048022, + "grad_norm": 0.5891853328480882, + "learning_rate": 2.9866786926405987e-06, + "loss": 0.4853, + "step": 8149 + }, + { + "epoch": 1.3221933809214796, + "grad_norm": 0.6354240750483695, + "learning_rate": 2.986259822820256e-06, + "loss": 0.5359, + "step": 8150 + }, + { + "epoch": 1.322355613238157, + "grad_norm": 0.5982009119309106, + "learning_rate": 2.9858409388122715e-06, + "loss": 0.5356, + "step": 8151 + }, + { + "epoch": 1.3225178455548345, + "grad_norm": 0.6104083999545807, + "learning_rate": 2.985422040628867e-06, + "loss": 0.5153, + "step": 8152 + }, + { + "epoch": 1.322680077871512, + "grad_norm": 0.641366865820546, + "learning_rate": 2.985003128282265e-06, + "loss": 0.5117, + "step": 8153 + }, + { + "epoch": 1.3228423101881894, + "grad_norm": 0.5964165642096197, + "learning_rate": 2.984584201784687e-06, + "loss": 0.5201, + "step": 8154 + }, + { + "epoch": 1.3230045425048669, + "grad_norm": 0.5794728316456017, + "learning_rate": 2.9841652611483572e-06, + "loss": 0.529, + "step": 8155 + }, + { + "epoch": 1.3231667748215443, + "grad_norm": 0.5797889389019604, + "learning_rate": 2.9837463063854995e-06, + "loss": 0.5005, + "step": 8156 + }, + { + "epoch": 1.323329007138222, + "grad_norm": 0.6013206721616065, + "learning_rate": 2.9833273375083365e-06, + "loss": 0.4839, + "step": 8157 + }, + { + "epoch": 1.3234912394548994, + "grad_norm": 0.6165105944725049, + "learning_rate": 2.982908354529094e-06, + "loss": 0.5421, + "step": 8158 + }, + { + "epoch": 1.3236534717715769, + "grad_norm": 0.5923420957790886, + "learning_rate": 2.9824893574599944e-06, + "loss": 0.4797, + "step": 8159 + }, + { + "epoch": 1.3238157040882543, + "grad_norm": 0.5682842284194038, + "learning_rate": 2.9820703463132646e-06, + "loss": 0.503, + "step": 8160 + }, + { + "epoch": 1.323977936404932, + "grad_norm": 0.5828700644991296, + "learning_rate": 2.9816513211011295e-06, + "loss": 0.5093, + "step": 8161 + }, + { + "epoch": 1.3241401687216094, + "grad_norm": 0.5994900617060667, + "learning_rate": 2.9812322818358157e-06, + "loss": 0.5457, + "step": 8162 + }, + { + "epoch": 1.324302401038287, + "grad_norm": 0.5844604494619897, + "learning_rate": 2.980813228529548e-06, + "loss": 0.4913, + "step": 8163 + }, + { + "epoch": 1.3244646333549643, + "grad_norm": 0.5791219192382496, + "learning_rate": 2.9803941611945565e-06, + "loss": 0.5177, + "step": 8164 + }, + { + "epoch": 1.3246268656716418, + "grad_norm": 0.5808176650612386, + "learning_rate": 2.979975079843064e-06, + "loss": 0.5302, + "step": 8165 + }, + { + "epoch": 1.3247890979883192, + "grad_norm": 0.6339369824801194, + "learning_rate": 2.9795559844873013e-06, + "loss": 0.5433, + "step": 8166 + }, + { + "epoch": 1.3249513303049967, + "grad_norm": 0.5864320652065418, + "learning_rate": 2.9791368751394945e-06, + "loss": 0.5615, + "step": 8167 + }, + { + "epoch": 1.3251135626216741, + "grad_norm": 0.5716801750213759, + "learning_rate": 2.9787177518118727e-06, + "loss": 0.4991, + "step": 8168 + }, + { + "epoch": 1.3252757949383518, + "grad_norm": 0.5873800740025468, + "learning_rate": 2.978298614516665e-06, + "loss": 0.5037, + "step": 8169 + }, + { + "epoch": 1.3254380272550292, + "grad_norm": 0.5839027990205456, + "learning_rate": 2.9778794632661e-06, + "loss": 0.5312, + "step": 8170 + }, + { + "epoch": 1.3256002595717067, + "grad_norm": 0.5365312335820458, + "learning_rate": 2.9774602980724082e-06, + "loss": 0.5115, + "step": 8171 + }, + { + "epoch": 1.3257624918883841, + "grad_norm": 0.6203662763933023, + "learning_rate": 2.9770411189478183e-06, + "loss": 0.5261, + "step": 8172 + }, + { + "epoch": 1.3259247242050616, + "grad_norm": 0.5846920660571692, + "learning_rate": 2.976621925904562e-06, + "loss": 0.5043, + "step": 8173 + }, + { + "epoch": 1.3260869565217392, + "grad_norm": 0.5835373215424938, + "learning_rate": 2.976202718954869e-06, + "loss": 0.5615, + "step": 8174 + }, + { + "epoch": 1.3262491888384167, + "grad_norm": 0.557237293582999, + "learning_rate": 2.9757834981109723e-06, + "loss": 0.5241, + "step": 8175 + }, + { + "epoch": 1.3264114211550941, + "grad_norm": 0.6063621167901678, + "learning_rate": 2.9753642633851014e-06, + "loss": 0.5172, + "step": 8176 + }, + { + "epoch": 1.3265736534717716, + "grad_norm": 0.623891792927038, + "learning_rate": 2.97494501478949e-06, + "loss": 0.5593, + "step": 8177 + }, + { + "epoch": 1.326735885788449, + "grad_norm": 0.6153740294407147, + "learning_rate": 2.9745257523363685e-06, + "loss": 0.5205, + "step": 8178 + }, + { + "epoch": 1.3268981181051265, + "grad_norm": 0.560625346532912, + "learning_rate": 2.9741064760379723e-06, + "loss": 0.5291, + "step": 8179 + }, + { + "epoch": 1.327060350421804, + "grad_norm": 0.610732444639031, + "learning_rate": 2.973687185906533e-06, + "loss": 0.5212, + "step": 8180 + }, + { + "epoch": 1.3272225827384814, + "grad_norm": 0.5752063385647399, + "learning_rate": 2.973267881954285e-06, + "loss": 0.5257, + "step": 8181 + }, + { + "epoch": 1.327384815055159, + "grad_norm": 0.5916869281119537, + "learning_rate": 2.9728485641934625e-06, + "loss": 0.5077, + "step": 8182 + }, + { + "epoch": 1.3275470473718365, + "grad_norm": 0.5848191089190653, + "learning_rate": 2.972429232636298e-06, + "loss": 0.5196, + "step": 8183 + }, + { + "epoch": 1.327709279688514, + "grad_norm": 0.603600036386621, + "learning_rate": 2.9720098872950297e-06, + "loss": 0.5044, + "step": 8184 + }, + { + "epoch": 1.3278715120051914, + "grad_norm": 0.5947184799097931, + "learning_rate": 2.97159052818189e-06, + "loss": 0.5089, + "step": 8185 + }, + { + "epoch": 1.328033744321869, + "grad_norm": 0.622568728974122, + "learning_rate": 2.971171155309116e-06, + "loss": 0.4796, + "step": 8186 + }, + { + "epoch": 1.3281959766385465, + "grad_norm": 0.6090887583561293, + "learning_rate": 2.9707517686889435e-06, + "loss": 0.515, + "step": 8187 + }, + { + "epoch": 1.328358208955224, + "grad_norm": 0.6136801451087143, + "learning_rate": 2.9703323683336093e-06, + "loss": 0.5417, + "step": 8188 + }, + { + "epoch": 1.3285204412719014, + "grad_norm": 0.6225313599931049, + "learning_rate": 2.9699129542553494e-06, + "loss": 0.4713, + "step": 8189 + }, + { + "epoch": 1.3286826735885788, + "grad_norm": 0.6230638993792176, + "learning_rate": 2.9694935264664015e-06, + "loss": 0.4984, + "step": 8190 + }, + { + "epoch": 1.3288449059052563, + "grad_norm": 0.606269465665871, + "learning_rate": 2.9690740849790033e-06, + "loss": 0.5151, + "step": 8191 + }, + { + "epoch": 1.3290071382219337, + "grad_norm": 0.5911535885874095, + "learning_rate": 2.9686546298053938e-06, + "loss": 0.5029, + "step": 8192 + }, + { + "epoch": 1.3291693705386112, + "grad_norm": 0.5964070209001591, + "learning_rate": 2.9682351609578103e-06, + "loss": 0.5371, + "step": 8193 + }, + { + "epoch": 1.3293316028552888, + "grad_norm": 0.6049781605747594, + "learning_rate": 2.9678156784484918e-06, + "loss": 0.5271, + "step": 8194 + }, + { + "epoch": 1.3294938351719663, + "grad_norm": 0.5827115526332645, + "learning_rate": 2.967396182289678e-06, + "loss": 0.517, + "step": 8195 + }, + { + "epoch": 1.3296560674886437, + "grad_norm": 0.6010796870318718, + "learning_rate": 2.9669766724936074e-06, + "loss": 0.5076, + "step": 8196 + }, + { + "epoch": 1.3298182998053212, + "grad_norm": 0.5873842600484507, + "learning_rate": 2.9665571490725213e-06, + "loss": 0.5209, + "step": 8197 + }, + { + "epoch": 1.3299805321219986, + "grad_norm": 0.5914721593959129, + "learning_rate": 2.966137612038661e-06, + "loss": 0.5193, + "step": 8198 + }, + { + "epoch": 1.3301427644386763, + "grad_norm": 0.5990221504889652, + "learning_rate": 2.965718061404265e-06, + "loss": 0.5153, + "step": 8199 + }, + { + "epoch": 1.3303049967553537, + "grad_norm": 0.618926108571251, + "learning_rate": 2.965298497181577e-06, + "loss": 0.5035, + "step": 8200 + }, + { + "epoch": 1.3304672290720312, + "grad_norm": 0.5951456887540816, + "learning_rate": 2.9648789193828365e-06, + "loss": 0.5297, + "step": 8201 + }, + { + "epoch": 1.3306294613887086, + "grad_norm": 0.6284562924989452, + "learning_rate": 2.9644593280202873e-06, + "loss": 0.5171, + "step": 8202 + }, + { + "epoch": 1.330791693705386, + "grad_norm": 0.5902215361819929, + "learning_rate": 2.96403972310617e-06, + "loss": 0.5184, + "step": 8203 + }, + { + "epoch": 1.3309539260220635, + "grad_norm": 0.6149024548755724, + "learning_rate": 2.9636201046527295e-06, + "loss": 0.5094, + "step": 8204 + }, + { + "epoch": 1.331116158338741, + "grad_norm": 0.6140857040297101, + "learning_rate": 2.9632004726722073e-06, + "loss": 0.5278, + "step": 8205 + }, + { + "epoch": 1.3312783906554186, + "grad_norm": 0.6133854436537352, + "learning_rate": 2.962780827176849e-06, + "loss": 0.5104, + "step": 8206 + }, + { + "epoch": 1.331440622972096, + "grad_norm": 0.6083625036667263, + "learning_rate": 2.9623611681788967e-06, + "loss": 0.5212, + "step": 8207 + }, + { + "epoch": 1.3316028552887735, + "grad_norm": 0.6449019690820414, + "learning_rate": 2.9619414956905947e-06, + "loss": 0.4967, + "step": 8208 + }, + { + "epoch": 1.331765087605451, + "grad_norm": 0.622236179511098, + "learning_rate": 2.96152180972419e-06, + "loss": 0.5541, + "step": 8209 + }, + { + "epoch": 1.3319273199221284, + "grad_norm": 0.6080760798449544, + "learning_rate": 2.961102110291926e-06, + "loss": 0.5609, + "step": 8210 + }, + { + "epoch": 1.332089552238806, + "grad_norm": 0.5879421691070716, + "learning_rate": 2.9606823974060484e-06, + "loss": 0.5105, + "step": 8211 + }, + { + "epoch": 1.3322517845554835, + "grad_norm": 0.6302358529359584, + "learning_rate": 2.960262671078804e-06, + "loss": 0.5506, + "step": 8212 + }, + { + "epoch": 1.332414016872161, + "grad_norm": 0.6035836366708399, + "learning_rate": 2.959842931322439e-06, + "loss": 0.5711, + "step": 8213 + }, + { + "epoch": 1.3325762491888384, + "grad_norm": 0.5962928259240524, + "learning_rate": 2.9594231781491993e-06, + "loss": 0.5291, + "step": 8214 + }, + { + "epoch": 1.3327384815055159, + "grad_norm": 0.5915128338322151, + "learning_rate": 2.9590034115713328e-06, + "loss": 0.5406, + "step": 8215 + }, + { + "epoch": 1.3329007138221933, + "grad_norm": 0.5687538588656257, + "learning_rate": 2.958583631601088e-06, + "loss": 0.5241, + "step": 8216 + }, + { + "epoch": 1.3330629461388708, + "grad_norm": 0.5701965892862619, + "learning_rate": 2.958163838250711e-06, + "loss": 0.5618, + "step": 8217 + }, + { + "epoch": 1.3332251784555482, + "grad_norm": 0.5784325336555223, + "learning_rate": 2.957744031532451e-06, + "loss": 0.5204, + "step": 8218 + }, + { + "epoch": 1.333387410772226, + "grad_norm": 0.5869522090826382, + "learning_rate": 2.9573242114585564e-06, + "loss": 0.5058, + "step": 8219 + }, + { + "epoch": 1.3335496430889033, + "grad_norm": 0.5795953983647185, + "learning_rate": 2.9569043780412776e-06, + "loss": 0.4892, + "step": 8220 + }, + { + "epoch": 1.3337118754055808, + "grad_norm": 0.5799315814073635, + "learning_rate": 2.9564845312928624e-06, + "loss": 0.548, + "step": 8221 + }, + { + "epoch": 1.3338741077222582, + "grad_norm": 0.5751109275621211, + "learning_rate": 2.9560646712255626e-06, + "loss": 0.5437, + "step": 8222 + }, + { + "epoch": 1.3340363400389357, + "grad_norm": 0.6096521489779535, + "learning_rate": 2.9556447978516255e-06, + "loss": 0.5296, + "step": 8223 + }, + { + "epoch": 1.3341985723556133, + "grad_norm": 0.5673923515474241, + "learning_rate": 2.9552249111833055e-06, + "loss": 0.5197, + "step": 8224 + }, + { + "epoch": 1.3343608046722908, + "grad_norm": 0.6073217515293456, + "learning_rate": 2.9548050112328508e-06, + "loss": 0.501, + "step": 8225 + }, + { + "epoch": 1.3345230369889682, + "grad_norm": 0.6395501000200998, + "learning_rate": 2.954385098012513e-06, + "loss": 0.5297, + "step": 8226 + }, + { + "epoch": 1.3346852693056457, + "grad_norm": 0.6131690699094768, + "learning_rate": 2.9539651715345464e-06, + "loss": 0.5364, + "step": 8227 + }, + { + "epoch": 1.3348475016223231, + "grad_norm": 0.5857892729467428, + "learning_rate": 2.9535452318112005e-06, + "loss": 0.5057, + "step": 8228 + }, + { + "epoch": 1.3350097339390006, + "grad_norm": 0.5741863010440849, + "learning_rate": 2.95312527885473e-06, + "loss": 0.5237, + "step": 8229 + }, + { + "epoch": 1.335171966255678, + "grad_norm": 0.5988687774524578, + "learning_rate": 2.9527053126773866e-06, + "loss": 0.5069, + "step": 8230 + }, + { + "epoch": 1.3353341985723557, + "grad_norm": 0.5473260912937736, + "learning_rate": 2.9522853332914235e-06, + "loss": 0.4954, + "step": 8231 + }, + { + "epoch": 1.3354964308890331, + "grad_norm": 0.6181696743146752, + "learning_rate": 2.9518653407090954e-06, + "loss": 0.5284, + "step": 8232 + }, + { + "epoch": 1.3356586632057106, + "grad_norm": 0.5874936639180567, + "learning_rate": 2.951445334942656e-06, + "loss": 0.5175, + "step": 8233 + }, + { + "epoch": 1.335820895522388, + "grad_norm": 0.5996147132215375, + "learning_rate": 2.9510253160043602e-06, + "loss": 0.5464, + "step": 8234 + }, + { + "epoch": 1.3359831278390655, + "grad_norm": 0.6040539647103986, + "learning_rate": 2.950605283906462e-06, + "loss": 0.5372, + "step": 8235 + }, + { + "epoch": 1.3361453601557431, + "grad_norm": 0.6263989724765803, + "learning_rate": 2.950185238661218e-06, + "loss": 0.5146, + "step": 8236 + }, + { + "epoch": 1.3363075924724206, + "grad_norm": 0.5873404518159228, + "learning_rate": 2.9497651802808825e-06, + "loss": 0.5094, + "step": 8237 + }, + { + "epoch": 1.336469824789098, + "grad_norm": 0.5925158483746672, + "learning_rate": 2.949345108777713e-06, + "loss": 0.5185, + "step": 8238 + }, + { + "epoch": 1.3366320571057755, + "grad_norm": 0.5956970276239495, + "learning_rate": 2.9489250241639644e-06, + "loss": 0.5275, + "step": 8239 + }, + { + "epoch": 1.336794289422453, + "grad_norm": 0.5837256351741928, + "learning_rate": 2.948504926451896e-06, + "loss": 0.5055, + "step": 8240 + }, + { + "epoch": 1.3369565217391304, + "grad_norm": 0.6000662279497724, + "learning_rate": 2.948084815653762e-06, + "loss": 0.512, + "step": 8241 + }, + { + "epoch": 1.3371187540558078, + "grad_norm": 0.599747098924868, + "learning_rate": 2.947664691781822e-06, + "loss": 0.4973, + "step": 8242 + }, + { + "epoch": 1.3372809863724853, + "grad_norm": 0.5629078503165977, + "learning_rate": 2.947244554848333e-06, + "loss": 0.5356, + "step": 8243 + }, + { + "epoch": 1.337443218689163, + "grad_norm": 0.5986017095374733, + "learning_rate": 2.9468244048655536e-06, + "loss": 0.5148, + "step": 8244 + }, + { + "epoch": 1.3376054510058404, + "grad_norm": 0.6047141374958247, + "learning_rate": 2.946404241845744e-06, + "loss": 0.4985, + "step": 8245 + }, + { + "epoch": 1.3377676833225178, + "grad_norm": 0.6184179309565555, + "learning_rate": 2.9459840658011605e-06, + "loss": 0.5003, + "step": 8246 + }, + { + "epoch": 1.3379299156391953, + "grad_norm": 0.6095267281944923, + "learning_rate": 2.945563876744065e-06, + "loss": 0.5458, + "step": 8247 + }, + { + "epoch": 1.338092147955873, + "grad_norm": 0.5799585289819063, + "learning_rate": 2.945143674686717e-06, + "loss": 0.5241, + "step": 8248 + }, + { + "epoch": 1.3382543802725504, + "grad_norm": 0.6098407867611677, + "learning_rate": 2.944723459641376e-06, + "loss": 0.5453, + "step": 8249 + }, + { + "epoch": 1.3384166125892278, + "grad_norm": 0.5891809949955205, + "learning_rate": 2.944303231620303e-06, + "loss": 0.519, + "step": 8250 + }, + { + "epoch": 1.3385788449059053, + "grad_norm": 0.5765525716675437, + "learning_rate": 2.943882990635759e-06, + "loss": 0.5262, + "step": 8251 + }, + { + "epoch": 1.3387410772225827, + "grad_norm": 0.6173504299390856, + "learning_rate": 2.9434627367000047e-06, + "loss": 0.5231, + "step": 8252 + }, + { + "epoch": 1.3389033095392602, + "grad_norm": 0.5871310616278389, + "learning_rate": 2.9430424698253034e-06, + "loss": 0.4919, + "step": 8253 + }, + { + "epoch": 1.3390655418559376, + "grad_norm": 0.5832024525474675, + "learning_rate": 2.942622190023916e-06, + "loss": 0.5157, + "step": 8254 + }, + { + "epoch": 1.339227774172615, + "grad_norm": 0.5668902131422805, + "learning_rate": 2.9422018973081057e-06, + "loss": 0.51, + "step": 8255 + }, + { + "epoch": 1.3393900064892927, + "grad_norm": 0.5896131017793008, + "learning_rate": 2.941781591690136e-06, + "loss": 0.5269, + "step": 8256 + }, + { + "epoch": 1.3395522388059702, + "grad_norm": 0.620950655628483, + "learning_rate": 2.9413612731822683e-06, + "loss": 0.4973, + "step": 8257 + }, + { + "epoch": 1.3397144711226476, + "grad_norm": 0.590750744588412, + "learning_rate": 2.940940941796768e-06, + "loss": 0.5193, + "step": 8258 + }, + { + "epoch": 1.339876703439325, + "grad_norm": 0.6131577503374313, + "learning_rate": 2.9405205975458985e-06, + "loss": 0.5142, + "step": 8259 + }, + { + "epoch": 1.3400389357560025, + "grad_norm": 0.6358681204252137, + "learning_rate": 2.940100240441924e-06, + "loss": 0.5251, + "step": 8260 + }, + { + "epoch": 1.3402011680726802, + "grad_norm": 0.6302327849659772, + "learning_rate": 2.9396798704971097e-06, + "loss": 0.53, + "step": 8261 + }, + { + "epoch": 1.3403634003893576, + "grad_norm": 0.6153591227776202, + "learning_rate": 2.9392594877237194e-06, + "loss": 0.5379, + "step": 8262 + }, + { + "epoch": 1.340525632706035, + "grad_norm": 0.6226099258885451, + "learning_rate": 2.9388390921340214e-06, + "loss": 0.5246, + "step": 8263 + }, + { + "epoch": 1.3406878650227125, + "grad_norm": 0.6143953334625472, + "learning_rate": 2.9384186837402795e-06, + "loss": 0.5396, + "step": 8264 + }, + { + "epoch": 1.34085009733939, + "grad_norm": 0.6092677663044462, + "learning_rate": 2.9379982625547608e-06, + "loss": 0.5236, + "step": 8265 + }, + { + "epoch": 1.3410123296560674, + "grad_norm": 0.6079607414291852, + "learning_rate": 2.9375778285897315e-06, + "loss": 0.5017, + "step": 8266 + }, + { + "epoch": 1.3411745619727449, + "grad_norm": 0.6203860756678082, + "learning_rate": 2.937157381857459e-06, + "loss": 0.5296, + "step": 8267 + }, + { + "epoch": 1.3413367942894223, + "grad_norm": 0.5917403530140766, + "learning_rate": 2.9367369223702102e-06, + "loss": 0.5163, + "step": 8268 + }, + { + "epoch": 1.3414990266061, + "grad_norm": 0.6001500969742627, + "learning_rate": 2.9363164501402537e-06, + "loss": 0.5255, + "step": 8269 + }, + { + "epoch": 1.3416612589227774, + "grad_norm": 0.595938319705067, + "learning_rate": 2.9358959651798562e-06, + "loss": 0.5234, + "step": 8270 + }, + { + "epoch": 1.3418234912394549, + "grad_norm": 0.6449343536719387, + "learning_rate": 2.9354754675012882e-06, + "loss": 0.5283, + "step": 8271 + }, + { + "epoch": 1.3419857235561323, + "grad_norm": 0.624736690698057, + "learning_rate": 2.9350549571168173e-06, + "loss": 0.521, + "step": 8272 + }, + { + "epoch": 1.34214795587281, + "grad_norm": 0.58022968023946, + "learning_rate": 2.9346344340387132e-06, + "loss": 0.5414, + "step": 8273 + }, + { + "epoch": 1.3423101881894874, + "grad_norm": 0.5928413890314599, + "learning_rate": 2.9342138982792456e-06, + "loss": 0.5167, + "step": 8274 + }, + { + "epoch": 1.3424724205061649, + "grad_norm": 0.6045574500041473, + "learning_rate": 2.9337933498506833e-06, + "loss": 0.5166, + "step": 8275 + }, + { + "epoch": 1.3426346528228423, + "grad_norm": 0.6619446834685317, + "learning_rate": 2.933372788765299e-06, + "loss": 0.5165, + "step": 8276 + }, + { + "epoch": 1.3427968851395198, + "grad_norm": 0.5532125304712454, + "learning_rate": 2.932952215035361e-06, + "loss": 0.5229, + "step": 8277 + }, + { + "epoch": 1.3429591174561972, + "grad_norm": 0.6464689953362075, + "learning_rate": 2.932531628673142e-06, + "loss": 0.5023, + "step": 8278 + }, + { + "epoch": 1.3431213497728747, + "grad_norm": 0.616093729560311, + "learning_rate": 2.9321110296909133e-06, + "loss": 0.5035, + "step": 8279 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.6108359840167323, + "learning_rate": 2.9316904181009464e-06, + "loss": 0.5167, + "step": 8280 + }, + { + "epoch": 1.3434458144062298, + "grad_norm": 0.5682975674289683, + "learning_rate": 2.9312697939155132e-06, + "loss": 0.4934, + "step": 8281 + }, + { + "epoch": 1.3436080467229072, + "grad_norm": 0.5753576973028, + "learning_rate": 2.930849157146887e-06, + "loss": 0.5141, + "step": 8282 + }, + { + "epoch": 1.3437702790395847, + "grad_norm": 0.6279158355740215, + "learning_rate": 2.9304285078073402e-06, + "loss": 0.525, + "step": 8283 + }, + { + "epoch": 1.3439325113562621, + "grad_norm": 0.5773210440336597, + "learning_rate": 2.930007845909146e-06, + "loss": 0.4855, + "step": 8284 + }, + { + "epoch": 1.3440947436729396, + "grad_norm": 0.6279678806956981, + "learning_rate": 2.92958717146458e-06, + "loss": 0.5033, + "step": 8285 + }, + { + "epoch": 1.3442569759896172, + "grad_norm": 0.5637384153507586, + "learning_rate": 2.929166484485913e-06, + "loss": 0.5096, + "step": 8286 + }, + { + "epoch": 1.3444192083062947, + "grad_norm": 0.609953418645141, + "learning_rate": 2.9287457849854217e-06, + "loss": 0.5348, + "step": 8287 + }, + { + "epoch": 1.3445814406229721, + "grad_norm": 0.645777244842919, + "learning_rate": 2.92832507297538e-06, + "loss": 0.5283, + "step": 8288 + }, + { + "epoch": 1.3447436729396496, + "grad_norm": 0.6137202345981572, + "learning_rate": 2.9279043484680637e-06, + "loss": 0.4958, + "step": 8289 + }, + { + "epoch": 1.344905905256327, + "grad_norm": 0.595133773995363, + "learning_rate": 2.9274836114757478e-06, + "loss": 0.5091, + "step": 8290 + }, + { + "epoch": 1.3450681375730045, + "grad_norm": 0.5753205413923215, + "learning_rate": 2.9270628620107083e-06, + "loss": 0.4946, + "step": 8291 + }, + { + "epoch": 1.345230369889682, + "grad_norm": 0.5959654382762087, + "learning_rate": 2.926642100085222e-06, + "loss": 0.514, + "step": 8292 + }, + { + "epoch": 1.3453926022063596, + "grad_norm": 0.5756543025105005, + "learning_rate": 2.9262213257115634e-06, + "loss": 0.5276, + "step": 8293 + }, + { + "epoch": 1.345554834523037, + "grad_norm": 0.6140598174424045, + "learning_rate": 2.9258005389020126e-06, + "loss": 0.5047, + "step": 8294 + }, + { + "epoch": 1.3457170668397145, + "grad_norm": 0.5834633784884485, + "learning_rate": 2.925379739668845e-06, + "loss": 0.523, + "step": 8295 + }, + { + "epoch": 1.345879299156392, + "grad_norm": 0.6008140593338631, + "learning_rate": 2.9249589280243383e-06, + "loss": 0.498, + "step": 8296 + }, + { + "epoch": 1.3460415314730694, + "grad_norm": 0.5743712740919384, + "learning_rate": 2.9245381039807714e-06, + "loss": 0.5324, + "step": 8297 + }, + { + "epoch": 1.346203763789747, + "grad_norm": 0.5789190535841274, + "learning_rate": 2.924117267550422e-06, + "loss": 0.5181, + "step": 8298 + }, + { + "epoch": 1.3463659961064245, + "grad_norm": 0.6162798207474948, + "learning_rate": 2.923696418745569e-06, + "loss": 0.5051, + "step": 8299 + }, + { + "epoch": 1.346528228423102, + "grad_norm": 0.5987332906682284, + "learning_rate": 2.9232755575784915e-06, + "loss": 0.5229, + "step": 8300 + }, + { + "epoch": 1.3466904607397794, + "grad_norm": 0.58165514675173, + "learning_rate": 2.9228546840614695e-06, + "loss": 0.4983, + "step": 8301 + }, + { + "epoch": 1.3468526930564568, + "grad_norm": 0.568141805209564, + "learning_rate": 2.9224337982067825e-06, + "loss": 0.5124, + "step": 8302 + }, + { + "epoch": 1.3470149253731343, + "grad_norm": 0.5985173145659004, + "learning_rate": 2.9220129000267107e-06, + "loss": 0.5111, + "step": 8303 + }, + { + "epoch": 1.3471771576898117, + "grad_norm": 0.6137234457601061, + "learning_rate": 2.921591989533535e-06, + "loss": 0.5482, + "step": 8304 + }, + { + "epoch": 1.3473393900064892, + "grad_norm": 0.6011424838537585, + "learning_rate": 2.9211710667395355e-06, + "loss": 0.512, + "step": 8305 + }, + { + "epoch": 1.3475016223231668, + "grad_norm": 0.6252097830108032, + "learning_rate": 2.9207501316569936e-06, + "loss": 0.5369, + "step": 8306 + }, + { + "epoch": 1.3476638546398443, + "grad_norm": 0.5851666997327487, + "learning_rate": 2.9203291842981922e-06, + "loss": 0.5276, + "step": 8307 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.6280632620812096, + "learning_rate": 2.919908224675412e-06, + "loss": 0.5353, + "step": 8308 + }, + { + "epoch": 1.3479883192731992, + "grad_norm": 0.5775551853735748, + "learning_rate": 2.9194872528009367e-06, + "loss": 0.5185, + "step": 8309 + }, + { + "epoch": 1.3481505515898766, + "grad_norm": 0.5806438062922676, + "learning_rate": 2.919066268687048e-06, + "loss": 0.5192, + "step": 8310 + }, + { + "epoch": 1.3483127839065543, + "grad_norm": 0.5736340863941292, + "learning_rate": 2.9186452723460284e-06, + "loss": 0.5431, + "step": 8311 + }, + { + "epoch": 1.3484750162232317, + "grad_norm": 0.5800721807646648, + "learning_rate": 2.918224263790163e-06, + "loss": 0.527, + "step": 8312 + }, + { + "epoch": 1.3486372485399092, + "grad_norm": 0.5896900623204478, + "learning_rate": 2.9178032430317344e-06, + "loss": 0.5414, + "step": 8313 + }, + { + "epoch": 1.3487994808565866, + "grad_norm": 0.6049764325052499, + "learning_rate": 2.9173822100830277e-06, + "loss": 0.5322, + "step": 8314 + }, + { + "epoch": 1.348961713173264, + "grad_norm": 0.6341022152301169, + "learning_rate": 2.9169611649563255e-06, + "loss": 0.5389, + "step": 8315 + }, + { + "epoch": 1.3491239454899415, + "grad_norm": 0.5768631358220775, + "learning_rate": 2.916540107663915e-06, + "loss": 0.4937, + "step": 8316 + }, + { + "epoch": 1.349286177806619, + "grad_norm": 0.5699620395950571, + "learning_rate": 2.9161190382180802e-06, + "loss": 0.5101, + "step": 8317 + }, + { + "epoch": 1.3494484101232966, + "grad_norm": 0.6133937879748176, + "learning_rate": 2.9156979566311066e-06, + "loss": 0.497, + "step": 8318 + }, + { + "epoch": 1.349610642439974, + "grad_norm": 0.5916693103790704, + "learning_rate": 2.9152768629152818e-06, + "loss": 0.5223, + "step": 8319 + }, + { + "epoch": 1.3497728747566515, + "grad_norm": 0.604433433796066, + "learning_rate": 2.914855757082889e-06, + "loss": 0.5201, + "step": 8320 + }, + { + "epoch": 1.349935107073329, + "grad_norm": 0.625254172109739, + "learning_rate": 2.914434639146217e-06, + "loss": 0.5258, + "step": 8321 + }, + { + "epoch": 1.3500973393900064, + "grad_norm": 0.5773668163520033, + "learning_rate": 2.9140135091175527e-06, + "loss": 0.5308, + "step": 8322 + }, + { + "epoch": 1.350259571706684, + "grad_norm": 0.59044703370769, + "learning_rate": 2.9135923670091826e-06, + "loss": 0.5597, + "step": 8323 + }, + { + "epoch": 1.3504218040233615, + "grad_norm": 0.5811036896407005, + "learning_rate": 2.9131712128333945e-06, + "loss": 0.5231, + "step": 8324 + }, + { + "epoch": 1.350584036340039, + "grad_norm": 0.5884641897042802, + "learning_rate": 2.9127500466024777e-06, + "loss": 0.4938, + "step": 8325 + }, + { + "epoch": 1.3507462686567164, + "grad_norm": 0.6135865808359421, + "learning_rate": 2.9123288683287197e-06, + "loss": 0.5275, + "step": 8326 + }, + { + "epoch": 1.3509085009733939, + "grad_norm": 0.5783076594995692, + "learning_rate": 2.9119076780244095e-06, + "loss": 0.5039, + "step": 8327 + }, + { + "epoch": 1.3510707332900713, + "grad_norm": 0.5746939949346855, + "learning_rate": 2.911486475701835e-06, + "loss": 0.5176, + "step": 8328 + }, + { + "epoch": 1.3512329656067488, + "grad_norm": 0.6202908280426741, + "learning_rate": 2.911065261373287e-06, + "loss": 0.5224, + "step": 8329 + }, + { + "epoch": 1.3513951979234262, + "grad_norm": 0.6302396237551041, + "learning_rate": 2.9106440350510556e-06, + "loss": 0.5068, + "step": 8330 + }, + { + "epoch": 1.3515574302401039, + "grad_norm": 0.6024419630292969, + "learning_rate": 2.9102227967474293e-06, + "loss": 0.5328, + "step": 8331 + }, + { + "epoch": 1.3517196625567813, + "grad_norm": 0.5745748468620506, + "learning_rate": 2.909801546474701e-06, + "loss": 0.544, + "step": 8332 + }, + { + "epoch": 1.3518818948734588, + "grad_norm": 0.5872038010009378, + "learning_rate": 2.9093802842451585e-06, + "loss": 0.5077, + "step": 8333 + }, + { + "epoch": 1.3520441271901362, + "grad_norm": 0.6001654995218282, + "learning_rate": 2.908959010071096e-06, + "loss": 0.4852, + "step": 8334 + }, + { + "epoch": 1.352206359506814, + "grad_norm": 0.5954118989057702, + "learning_rate": 2.9085377239648026e-06, + "loss": 0.5279, + "step": 8335 + }, + { + "epoch": 1.3523685918234913, + "grad_norm": 0.6223449423633395, + "learning_rate": 2.9081164259385713e-06, + "loss": 0.535, + "step": 8336 + }, + { + "epoch": 1.3525308241401688, + "grad_norm": 0.5966393823164294, + "learning_rate": 2.9076951160046957e-06, + "loss": 0.5197, + "step": 8337 + }, + { + "epoch": 1.3526930564568462, + "grad_norm": 0.5839256384181494, + "learning_rate": 2.907273794175466e-06, + "loss": 0.5024, + "step": 8338 + }, + { + "epoch": 1.3528552887735237, + "grad_norm": 0.619565931903227, + "learning_rate": 2.906852460463176e-06, + "loss": 0.5339, + "step": 8339 + }, + { + "epoch": 1.3530175210902011, + "grad_norm": 0.5778955959345063, + "learning_rate": 2.9064311148801196e-06, + "loss": 0.5052, + "step": 8340 + }, + { + "epoch": 1.3531797534068786, + "grad_norm": 0.5873322445666254, + "learning_rate": 2.9060097574385898e-06, + "loss": 0.545, + "step": 8341 + }, + { + "epoch": 1.353341985723556, + "grad_norm": 0.5940261546602381, + "learning_rate": 2.905588388150881e-06, + "loss": 0.5108, + "step": 8342 + }, + { + "epoch": 1.3535042180402337, + "grad_norm": 0.5968322088110195, + "learning_rate": 2.9051670070292874e-06, + "loss": 0.5344, + "step": 8343 + }, + { + "epoch": 1.3536664503569111, + "grad_norm": 0.5929099089653398, + "learning_rate": 2.904745614086103e-06, + "loss": 0.5455, + "step": 8344 + }, + { + "epoch": 1.3538286826735886, + "grad_norm": 0.6152703647486185, + "learning_rate": 2.9043242093336237e-06, + "loss": 0.5396, + "step": 8345 + }, + { + "epoch": 1.353990914990266, + "grad_norm": 0.6248108873406861, + "learning_rate": 2.9039027927841452e-06, + "loss": 0.5041, + "step": 8346 + }, + { + "epoch": 1.3541531473069435, + "grad_norm": 0.5910113709045929, + "learning_rate": 2.9034813644499617e-06, + "loss": 0.5464, + "step": 8347 + }, + { + "epoch": 1.3543153796236211, + "grad_norm": 0.6181070533788331, + "learning_rate": 2.9030599243433703e-06, + "loss": 0.5202, + "step": 8348 + }, + { + "epoch": 1.3544776119402986, + "grad_norm": 0.6651451220409732, + "learning_rate": 2.902638472476667e-06, + "loss": 0.5156, + "step": 8349 + }, + { + "epoch": 1.354639844256976, + "grad_norm": 0.6046934996325316, + "learning_rate": 2.9022170088621497e-06, + "loss": 0.529, + "step": 8350 + }, + { + "epoch": 1.3548020765736535, + "grad_norm": 0.6037522262829816, + "learning_rate": 2.9017955335121133e-06, + "loss": 0.5112, + "step": 8351 + }, + { + "epoch": 1.354964308890331, + "grad_norm": 0.6250951022946739, + "learning_rate": 2.9013740464388575e-06, + "loss": 0.5172, + "step": 8352 + }, + { + "epoch": 1.3551265412070084, + "grad_norm": 0.614407027273557, + "learning_rate": 2.9009525476546783e-06, + "loss": 0.5656, + "step": 8353 + }, + { + "epoch": 1.3552887735236858, + "grad_norm": 0.6057929651447554, + "learning_rate": 2.9005310371718743e-06, + "loss": 0.5064, + "step": 8354 + }, + { + "epoch": 1.3554510058403633, + "grad_norm": 0.5597159789472897, + "learning_rate": 2.9001095150027447e-06, + "loss": 0.5082, + "step": 8355 + }, + { + "epoch": 1.355613238157041, + "grad_norm": 0.6196476300620053, + "learning_rate": 2.8996879811595872e-06, + "loss": 0.505, + "step": 8356 + }, + { + "epoch": 1.3557754704737184, + "grad_norm": 0.6198540338328953, + "learning_rate": 2.8992664356547017e-06, + "loss": 0.5243, + "step": 8357 + }, + { + "epoch": 1.3559377027903958, + "grad_norm": 0.6182807012036943, + "learning_rate": 2.898844878500388e-06, + "loss": 0.5621, + "step": 8358 + }, + { + "epoch": 1.3560999351070733, + "grad_norm": 0.588081538527557, + "learning_rate": 2.8984233097089448e-06, + "loss": 0.534, + "step": 8359 + }, + { + "epoch": 1.356262167423751, + "grad_norm": 0.5638841619510953, + "learning_rate": 2.898001729292673e-06, + "loss": 0.5252, + "step": 8360 + }, + { + "epoch": 1.3564243997404284, + "grad_norm": 0.6073669853400835, + "learning_rate": 2.897580137263873e-06, + "loss": 0.5262, + "step": 8361 + }, + { + "epoch": 1.3565866320571058, + "grad_norm": 0.5792006790664823, + "learning_rate": 2.8971585336348446e-06, + "loss": 0.5441, + "step": 8362 + }, + { + "epoch": 1.3567488643737833, + "grad_norm": 0.6092236000720659, + "learning_rate": 2.8967369184178905e-06, + "loss": 0.5085, + "step": 8363 + }, + { + "epoch": 1.3569110966904607, + "grad_norm": 0.624042324956364, + "learning_rate": 2.8963152916253113e-06, + "loss": 0.4713, + "step": 8364 + }, + { + "epoch": 1.3570733290071382, + "grad_norm": 0.5764499130799619, + "learning_rate": 2.8958936532694092e-06, + "loss": 0.5341, + "step": 8365 + }, + { + "epoch": 1.3572355613238156, + "grad_norm": 0.61782598303677, + "learning_rate": 2.8954720033624867e-06, + "loss": 0.5167, + "step": 8366 + }, + { + "epoch": 1.357397793640493, + "grad_norm": 0.611121273247324, + "learning_rate": 2.895050341916845e-06, + "loss": 0.5014, + "step": 8367 + }, + { + "epoch": 1.3575600259571707, + "grad_norm": 0.6046844640363024, + "learning_rate": 2.894628668944789e-06, + "loss": 0.5302, + "step": 8368 + }, + { + "epoch": 1.3577222582738482, + "grad_norm": 0.6144528832816012, + "learning_rate": 2.8942069844586197e-06, + "loss": 0.5466, + "step": 8369 + }, + { + "epoch": 1.3578844905905256, + "grad_norm": 0.5869829233418332, + "learning_rate": 2.8937852884706417e-06, + "loss": 0.5478, + "step": 8370 + }, + { + "epoch": 1.358046722907203, + "grad_norm": 0.600333738406782, + "learning_rate": 2.8933635809931594e-06, + "loss": 0.5125, + "step": 8371 + }, + { + "epoch": 1.3582089552238805, + "grad_norm": 0.6680795887170035, + "learning_rate": 2.892941862038475e-06, + "loss": 0.5336, + "step": 8372 + }, + { + "epoch": 1.3583711875405582, + "grad_norm": 0.6137980453074784, + "learning_rate": 2.8925201316188966e-06, + "loss": 0.5559, + "step": 8373 + }, + { + "epoch": 1.3585334198572356, + "grad_norm": 0.6083568889246652, + "learning_rate": 2.8920983897467254e-06, + "loss": 0.5116, + "step": 8374 + }, + { + "epoch": 1.358695652173913, + "grad_norm": 0.6097165296152304, + "learning_rate": 2.891676636434268e-06, + "loss": 0.5201, + "step": 8375 + }, + { + "epoch": 1.3588578844905905, + "grad_norm": 0.6216804610369778, + "learning_rate": 2.8912548716938306e-06, + "loss": 0.5405, + "step": 8376 + }, + { + "epoch": 1.359020116807268, + "grad_norm": 0.6030777770487387, + "learning_rate": 2.8908330955377183e-06, + "loss": 0.491, + "step": 8377 + }, + { + "epoch": 1.3591823491239454, + "grad_norm": 0.604827640350107, + "learning_rate": 2.8904113079782374e-06, + "loss": 0.5332, + "step": 8378 + }, + { + "epoch": 1.3593445814406229, + "grad_norm": 0.6306228469835428, + "learning_rate": 2.889989509027695e-06, + "loss": 0.5099, + "step": 8379 + }, + { + "epoch": 1.3595068137573005, + "grad_norm": 0.6508173190992956, + "learning_rate": 2.889567698698396e-06, + "loss": 0.5237, + "step": 8380 + }, + { + "epoch": 1.359669046073978, + "grad_norm": 0.5854767842724753, + "learning_rate": 2.8891458770026502e-06, + "loss": 0.4833, + "step": 8381 + }, + { + "epoch": 1.3598312783906554, + "grad_norm": 0.6060775587187095, + "learning_rate": 2.8887240439527643e-06, + "loss": 0.5595, + "step": 8382 + }, + { + "epoch": 1.3599935107073329, + "grad_norm": 0.5989096413022755, + "learning_rate": 2.888302199561045e-06, + "loss": 0.4998, + "step": 8383 + }, + { + "epoch": 1.3601557430240103, + "grad_norm": 0.6014862607203448, + "learning_rate": 2.8878803438398017e-06, + "loss": 0.5395, + "step": 8384 + }, + { + "epoch": 1.360317975340688, + "grad_norm": 0.7149985534391913, + "learning_rate": 2.887458476801342e-06, + "loss": 0.5017, + "step": 8385 + }, + { + "epoch": 1.3604802076573654, + "grad_norm": 0.6061175733128863, + "learning_rate": 2.8870365984579764e-06, + "loss": 0.5358, + "step": 8386 + }, + { + "epoch": 1.3606424399740429, + "grad_norm": 0.5646199102096199, + "learning_rate": 2.886614708822012e-06, + "loss": 0.5055, + "step": 8387 + }, + { + "epoch": 1.3608046722907203, + "grad_norm": 0.6003400603471774, + "learning_rate": 2.8861928079057593e-06, + "loss": 0.5281, + "step": 8388 + }, + { + "epoch": 1.3609669046073978, + "grad_norm": 0.6107979493536784, + "learning_rate": 2.8857708957215284e-06, + "loss": 0.5251, + "step": 8389 + }, + { + "epoch": 1.3611291369240752, + "grad_norm": 0.6192894345304679, + "learning_rate": 2.88534897228163e-06, + "loss": 0.5344, + "step": 8390 + }, + { + "epoch": 1.3612913692407527, + "grad_norm": 0.5849370688887492, + "learning_rate": 2.8849270375983727e-06, + "loss": 0.5326, + "step": 8391 + }, + { + "epoch": 1.3614536015574301, + "grad_norm": 0.6143656509200928, + "learning_rate": 2.8845050916840676e-06, + "loss": 0.5364, + "step": 8392 + }, + { + "epoch": 1.3616158338741078, + "grad_norm": 0.6050081135695349, + "learning_rate": 2.8840831345510285e-06, + "loss": 0.5359, + "step": 8393 + }, + { + "epoch": 1.3617780661907852, + "grad_norm": 0.5830059946275135, + "learning_rate": 2.883661166211564e-06, + "loss": 0.4735, + "step": 8394 + }, + { + "epoch": 1.3619402985074627, + "grad_norm": 0.6252416567844123, + "learning_rate": 2.883239186677987e-06, + "loss": 0.5115, + "step": 8395 + }, + { + "epoch": 1.3621025308241401, + "grad_norm": 0.5892127894024866, + "learning_rate": 2.88281719596261e-06, + "loss": 0.5232, + "step": 8396 + }, + { + "epoch": 1.3622647631408176, + "grad_norm": 0.5789035114924399, + "learning_rate": 2.8823951940777443e-06, + "loss": 0.5276, + "step": 8397 + }, + { + "epoch": 1.3624269954574952, + "grad_norm": 0.5956223843675705, + "learning_rate": 2.8819731810357034e-06, + "loss": 0.5125, + "step": 8398 + }, + { + "epoch": 1.3625892277741727, + "grad_norm": 0.622097611805415, + "learning_rate": 2.8815511568488006e-06, + "loss": 0.5015, + "step": 8399 + }, + { + "epoch": 1.3627514600908501, + "grad_norm": 0.6053296914322496, + "learning_rate": 2.8811291215293498e-06, + "loss": 0.499, + "step": 8400 + }, + { + "epoch": 1.3629136924075276, + "grad_norm": 0.5670904601771238, + "learning_rate": 2.8807070750896633e-06, + "loss": 0.4895, + "step": 8401 + }, + { + "epoch": 1.363075924724205, + "grad_norm": 0.6141358769030456, + "learning_rate": 2.8802850175420566e-06, + "loss": 0.5372, + "step": 8402 + }, + { + "epoch": 1.3632381570408825, + "grad_norm": 0.5975572466337292, + "learning_rate": 2.879862948898842e-06, + "loss": 0.5397, + "step": 8403 + }, + { + "epoch": 1.36340038935756, + "grad_norm": 0.610188327513196, + "learning_rate": 2.879440869172338e-06, + "loss": 0.5534, + "step": 8404 + }, + { + "epoch": 1.3635626216742376, + "grad_norm": 0.6219899810835585, + "learning_rate": 2.8790187783748557e-06, + "loss": 0.5239, + "step": 8405 + }, + { + "epoch": 1.363724853990915, + "grad_norm": 0.5767260595388302, + "learning_rate": 2.8785966765187125e-06, + "loss": 0.5437, + "step": 8406 + }, + { + "epoch": 1.3638870863075925, + "grad_norm": 0.5654628366314953, + "learning_rate": 2.878174563616224e-06, + "loss": 0.5165, + "step": 8407 + }, + { + "epoch": 1.36404931862427, + "grad_norm": 0.5676267956717388, + "learning_rate": 2.8777524396797056e-06, + "loss": 0.5773, + "step": 8408 + }, + { + "epoch": 1.3642115509409474, + "grad_norm": 0.59808448000056, + "learning_rate": 2.8773303047214745e-06, + "loss": 0.496, + "step": 8409 + }, + { + "epoch": 1.364373783257625, + "grad_norm": 0.5905024872945713, + "learning_rate": 2.8769081587538463e-06, + "loss": 0.5446, + "step": 8410 + }, + { + "epoch": 1.3645360155743025, + "grad_norm": 0.5857077143948971, + "learning_rate": 2.8764860017891394e-06, + "loss": 0.5263, + "step": 8411 + }, + { + "epoch": 1.36469824789098, + "grad_norm": 0.602554826065286, + "learning_rate": 2.876063833839669e-06, + "loss": 0.5383, + "step": 8412 + }, + { + "epoch": 1.3648604802076574, + "grad_norm": 0.6242277636911726, + "learning_rate": 2.8756416549177544e-06, + "loss": 0.5285, + "step": 8413 + }, + { + "epoch": 1.3650227125243348, + "grad_norm": 0.5963338820789936, + "learning_rate": 2.875219465035714e-06, + "loss": 0.5037, + "step": 8414 + }, + { + "epoch": 1.3651849448410123, + "grad_norm": 0.5839312962274241, + "learning_rate": 2.874797264205865e-06, + "loss": 0.54, + "step": 8415 + }, + { + "epoch": 1.3653471771576897, + "grad_norm": 0.6275486603382542, + "learning_rate": 2.8743750524405254e-06, + "loss": 0.5372, + "step": 8416 + }, + { + "epoch": 1.3655094094743672, + "grad_norm": 0.5904976782186191, + "learning_rate": 2.873952829752015e-06, + "loss": 0.4924, + "step": 8417 + }, + { + "epoch": 1.3656716417910448, + "grad_norm": 0.6271871224183583, + "learning_rate": 2.8735305961526533e-06, + "loss": 0.5021, + "step": 8418 + }, + { + "epoch": 1.3658338741077223, + "grad_norm": 0.5800842023423126, + "learning_rate": 2.87310835165476e-06, + "loss": 0.5032, + "step": 8419 + }, + { + "epoch": 1.3659961064243997, + "grad_norm": 0.5862085636629407, + "learning_rate": 2.8726860962706537e-06, + "loss": 0.5212, + "step": 8420 + }, + { + "epoch": 1.3661583387410772, + "grad_norm": 0.5842497148781727, + "learning_rate": 2.872263830012655e-06, + "loss": 0.4646, + "step": 8421 + }, + { + "epoch": 1.3663205710577548, + "grad_norm": 0.5930667886534786, + "learning_rate": 2.871841552893086e-06, + "loss": 0.5199, + "step": 8422 + }, + { + "epoch": 1.3664828033744323, + "grad_norm": 0.5993635561361039, + "learning_rate": 2.8714192649242643e-06, + "loss": 0.5068, + "step": 8423 + }, + { + "epoch": 1.3666450356911097, + "grad_norm": 0.6145365155546808, + "learning_rate": 2.870996966118515e-06, + "loss": 0.5334, + "step": 8424 + }, + { + "epoch": 1.3668072680077872, + "grad_norm": 0.5516320894179849, + "learning_rate": 2.8705746564881566e-06, + "loss": 0.5205, + "step": 8425 + }, + { + "epoch": 1.3669695003244646, + "grad_norm": 0.5883953477125217, + "learning_rate": 2.8701523360455115e-06, + "loss": 0.5027, + "step": 8426 + }, + { + "epoch": 1.367131732641142, + "grad_norm": 0.6222242217222425, + "learning_rate": 2.8697300048029023e-06, + "loss": 0.5222, + "step": 8427 + }, + { + "epoch": 1.3672939649578195, + "grad_norm": 0.6887968865434541, + "learning_rate": 2.8693076627726506e-06, + "loss": 0.5569, + "step": 8428 + }, + { + "epoch": 1.367456197274497, + "grad_norm": 0.590491593935969, + "learning_rate": 2.868885309967081e-06, + "loss": 0.5153, + "step": 8429 + }, + { + "epoch": 1.3676184295911746, + "grad_norm": 0.6080968602673872, + "learning_rate": 2.8684629463985135e-06, + "loss": 0.5013, + "step": 8430 + }, + { + "epoch": 1.367780661907852, + "grad_norm": 0.5941575615405431, + "learning_rate": 2.8680405720792744e-06, + "loss": 0.5312, + "step": 8431 + }, + { + "epoch": 1.3679428942245295, + "grad_norm": 0.5852989884479666, + "learning_rate": 2.8676181870216856e-06, + "loss": 0.4914, + "step": 8432 + }, + { + "epoch": 1.368105126541207, + "grad_norm": 0.6064889355675929, + "learning_rate": 2.8671957912380717e-06, + "loss": 0.5462, + "step": 8433 + }, + { + "epoch": 1.3682673588578844, + "grad_norm": 0.6069947455206238, + "learning_rate": 2.8667733847407564e-06, + "loss": 0.5229, + "step": 8434 + }, + { + "epoch": 1.368429591174562, + "grad_norm": 0.565931466150739, + "learning_rate": 2.866350967542064e-06, + "loss": 0.5136, + "step": 8435 + }, + { + "epoch": 1.3685918234912395, + "grad_norm": 0.6018989800451162, + "learning_rate": 2.8659285396543213e-06, + "loss": 0.5077, + "step": 8436 + }, + { + "epoch": 1.368754055807917, + "grad_norm": 0.5939818329677089, + "learning_rate": 2.8655061010898515e-06, + "loss": 0.5323, + "step": 8437 + }, + { + "epoch": 1.3689162881245944, + "grad_norm": 0.5803968994703924, + "learning_rate": 2.8650836518609814e-06, + "loss": 0.526, + "step": 8438 + }, + { + "epoch": 1.3690785204412719, + "grad_norm": 0.5896498025594865, + "learning_rate": 2.8646611919800364e-06, + "loss": 0.503, + "step": 8439 + }, + { + "epoch": 1.3692407527579493, + "grad_norm": 0.5835037139824767, + "learning_rate": 2.8642387214593424e-06, + "loss": 0.4943, + "step": 8440 + }, + { + "epoch": 1.3694029850746268, + "grad_norm": 0.6331771999124317, + "learning_rate": 2.863816240311226e-06, + "loss": 0.5229, + "step": 8441 + }, + { + "epoch": 1.3695652173913042, + "grad_norm": 0.6338742928869868, + "learning_rate": 2.8633937485480144e-06, + "loss": 0.5064, + "step": 8442 + }, + { + "epoch": 1.3697274497079819, + "grad_norm": 0.6134911390406889, + "learning_rate": 2.8629712461820338e-06, + "loss": 0.5288, + "step": 8443 + }, + { + "epoch": 1.3698896820246593, + "grad_norm": 0.6045606923991574, + "learning_rate": 2.862548733225612e-06, + "loss": 0.4936, + "step": 8444 + }, + { + "epoch": 1.3700519143413368, + "grad_norm": 0.5992122128464562, + "learning_rate": 2.8621262096910774e-06, + "loss": 0.5008, + "step": 8445 + }, + { + "epoch": 1.3702141466580142, + "grad_norm": 0.6469332352785605, + "learning_rate": 2.8617036755907563e-06, + "loss": 0.5324, + "step": 8446 + }, + { + "epoch": 1.3703763789746919, + "grad_norm": 0.6010824742850635, + "learning_rate": 2.8612811309369794e-06, + "loss": 0.4871, + "step": 8447 + }, + { + "epoch": 1.3705386112913693, + "grad_norm": 0.5874518645035879, + "learning_rate": 2.8608585757420726e-06, + "loss": 0.5257, + "step": 8448 + }, + { + "epoch": 1.3707008436080468, + "grad_norm": 0.6205949947528545, + "learning_rate": 2.860436010018367e-06, + "loss": 0.5199, + "step": 8449 + }, + { + "epoch": 1.3708630759247242, + "grad_norm": 0.638642362298772, + "learning_rate": 2.8600134337781917e-06, + "loss": 0.5083, + "step": 8450 + }, + { + "epoch": 1.3710253082414017, + "grad_norm": 0.6341383010059597, + "learning_rate": 2.8595908470338746e-06, + "loss": 0.4934, + "step": 8451 + }, + { + "epoch": 1.3711875405580791, + "grad_norm": 0.5541480490693727, + "learning_rate": 2.859168249797747e-06, + "loss": 0.4654, + "step": 8452 + }, + { + "epoch": 1.3713497728747566, + "grad_norm": 0.5996587404899959, + "learning_rate": 2.8587456420821385e-06, + "loss": 0.5131, + "step": 8453 + }, + { + "epoch": 1.371512005191434, + "grad_norm": 0.5928429687695214, + "learning_rate": 2.858323023899379e-06, + "loss": 0.5326, + "step": 8454 + }, + { + "epoch": 1.3716742375081117, + "grad_norm": 0.5888333157421588, + "learning_rate": 2.8579003952618006e-06, + "loss": 0.5314, + "step": 8455 + }, + { + "epoch": 1.3718364698247891, + "grad_norm": 0.6258484810358451, + "learning_rate": 2.8574777561817343e-06, + "loss": 0.5461, + "step": 8456 + }, + { + "epoch": 1.3719987021414666, + "grad_norm": 0.6342576789546112, + "learning_rate": 2.85705510667151e-06, + "loss": 0.5092, + "step": 8457 + }, + { + "epoch": 1.372160934458144, + "grad_norm": 0.5962860443567667, + "learning_rate": 2.8566324467434604e-06, + "loss": 0.4922, + "step": 8458 + }, + { + "epoch": 1.3723231667748215, + "grad_norm": 0.6047716084662558, + "learning_rate": 2.856209776409917e-06, + "loss": 0.5477, + "step": 8459 + }, + { + "epoch": 1.3724853990914991, + "grad_norm": 0.5885357365180433, + "learning_rate": 2.8557870956832135e-06, + "loss": 0.5524, + "step": 8460 + }, + { + "epoch": 1.3726476314081766, + "grad_norm": 0.5694469930013492, + "learning_rate": 2.8553644045756803e-06, + "loss": 0.5181, + "step": 8461 + }, + { + "epoch": 1.372809863724854, + "grad_norm": 0.604787031073294, + "learning_rate": 2.854941703099652e-06, + "loss": 0.5119, + "step": 8462 + }, + { + "epoch": 1.3729720960415315, + "grad_norm": 0.6132368732629859, + "learning_rate": 2.8545189912674607e-06, + "loss": 0.5328, + "step": 8463 + }, + { + "epoch": 1.373134328358209, + "grad_norm": 0.6226129109884105, + "learning_rate": 2.854096269091441e-06, + "loss": 0.5254, + "step": 8464 + }, + { + "epoch": 1.3732965606748864, + "grad_norm": 0.6010150944247011, + "learning_rate": 2.8536735365839263e-06, + "loss": 0.5353, + "step": 8465 + }, + { + "epoch": 1.3734587929915638, + "grad_norm": 0.6475308552161051, + "learning_rate": 2.8532507937572495e-06, + "loss": 0.4954, + "step": 8466 + }, + { + "epoch": 1.3736210253082415, + "grad_norm": 0.6171215080274643, + "learning_rate": 2.8528280406237463e-06, + "loss": 0.5269, + "step": 8467 + }, + { + "epoch": 1.373783257624919, + "grad_norm": 0.584452691431792, + "learning_rate": 2.852405277195752e-06, + "loss": 0.5118, + "step": 8468 + }, + { + "epoch": 1.3739454899415964, + "grad_norm": 0.5980936578683956, + "learning_rate": 2.8519825034856e-06, + "loss": 0.5149, + "step": 8469 + }, + { + "epoch": 1.3741077222582738, + "grad_norm": 0.5898161436978517, + "learning_rate": 2.851559719505626e-06, + "loss": 0.5097, + "step": 8470 + }, + { + "epoch": 1.3742699545749513, + "grad_norm": 0.6209609007659647, + "learning_rate": 2.8511369252681657e-06, + "loss": 0.503, + "step": 8471 + }, + { + "epoch": 1.374432186891629, + "grad_norm": 0.5736955830407209, + "learning_rate": 2.8507141207855554e-06, + "loss": 0.5208, + "step": 8472 + }, + { + "epoch": 1.3745944192083064, + "grad_norm": 0.584995186783499, + "learning_rate": 2.8502913060701314e-06, + "loss": 0.5085, + "step": 8473 + }, + { + "epoch": 1.3747566515249838, + "grad_norm": 0.5891129656258911, + "learning_rate": 2.8498684811342305e-06, + "loss": 0.5398, + "step": 8474 + }, + { + "epoch": 1.3749188838416613, + "grad_norm": 0.6007174286142496, + "learning_rate": 2.849445645990188e-06, + "loss": 0.5298, + "step": 8475 + }, + { + "epoch": 1.3750811161583387, + "grad_norm": 0.6201457303115017, + "learning_rate": 2.8490228006503424e-06, + "loss": 0.5056, + "step": 8476 + }, + { + "epoch": 1.3752433484750162, + "grad_norm": 0.620394570674001, + "learning_rate": 2.8485999451270298e-06, + "loss": 0.5257, + "step": 8477 + }, + { + "epoch": 1.3754055807916936, + "grad_norm": 0.6021946328941833, + "learning_rate": 2.8481770794325895e-06, + "loss": 0.5331, + "step": 8478 + }, + { + "epoch": 1.375567813108371, + "grad_norm": 0.5944678356296352, + "learning_rate": 2.8477542035793583e-06, + "loss": 0.5452, + "step": 8479 + }, + { + "epoch": 1.3757300454250487, + "grad_norm": 0.6178748330244537, + "learning_rate": 2.8473313175796753e-06, + "loss": 0.4993, + "step": 8480 + }, + { + "epoch": 1.3758922777417262, + "grad_norm": 0.6071986482415301, + "learning_rate": 2.846908421445878e-06, + "loss": 0.5393, + "step": 8481 + }, + { + "epoch": 1.3760545100584036, + "grad_norm": 0.641300246121441, + "learning_rate": 2.8464855151903065e-06, + "loss": 0.4709, + "step": 8482 + }, + { + "epoch": 1.376216742375081, + "grad_norm": 0.5928607019189688, + "learning_rate": 2.8460625988252987e-06, + "loss": 0.5059, + "step": 8483 + }, + { + "epoch": 1.3763789746917585, + "grad_norm": 0.5834340700459972, + "learning_rate": 2.8456396723631953e-06, + "loss": 0.4767, + "step": 8484 + }, + { + "epoch": 1.3765412070084362, + "grad_norm": 0.5959626509300505, + "learning_rate": 2.8452167358163353e-06, + "loss": 0.4833, + "step": 8485 + }, + { + "epoch": 1.3767034393251136, + "grad_norm": 0.5926203196587273, + "learning_rate": 2.8447937891970585e-06, + "loss": 0.502, + "step": 8486 + }, + { + "epoch": 1.376865671641791, + "grad_norm": 0.5966156894048605, + "learning_rate": 2.844370832517706e-06, + "loss": 0.5023, + "step": 8487 + }, + { + "epoch": 1.3770279039584685, + "grad_norm": 0.6102627518972662, + "learning_rate": 2.8439478657906183e-06, + "loss": 0.5158, + "step": 8488 + }, + { + "epoch": 1.377190136275146, + "grad_norm": 0.5832037231082353, + "learning_rate": 2.8435248890281363e-06, + "loss": 0.5169, + "step": 8489 + }, + { + "epoch": 1.3773523685918234, + "grad_norm": 0.5852526435427472, + "learning_rate": 2.8431019022426002e-06, + "loss": 0.494, + "step": 8490 + }, + { + "epoch": 1.3775146009085009, + "grad_norm": 0.5970277486426928, + "learning_rate": 2.8426789054463533e-06, + "loss": 0.5217, + "step": 8491 + }, + { + "epoch": 1.3776768332251785, + "grad_norm": 0.5792402114514171, + "learning_rate": 2.842255898651737e-06, + "loss": 0.4619, + "step": 8492 + }, + { + "epoch": 1.377839065541856, + "grad_norm": 0.5900545191759771, + "learning_rate": 2.8418328818710914e-06, + "loss": 0.5192, + "step": 8493 + }, + { + "epoch": 1.3780012978585334, + "grad_norm": 0.6097196472629144, + "learning_rate": 2.8414098551167617e-06, + "loss": 0.5224, + "step": 8494 + }, + { + "epoch": 1.3781635301752109, + "grad_norm": 0.624727025324973, + "learning_rate": 2.8409868184010886e-06, + "loss": 0.5184, + "step": 8495 + }, + { + "epoch": 1.3783257624918883, + "grad_norm": 0.6360956030549487, + "learning_rate": 2.8405637717364163e-06, + "loss": 0.5288, + "step": 8496 + }, + { + "epoch": 1.378487994808566, + "grad_norm": 0.5843718263868403, + "learning_rate": 2.8401407151350866e-06, + "loss": 0.5175, + "step": 8497 + }, + { + "epoch": 1.3786502271252434, + "grad_norm": 0.6411766745202403, + "learning_rate": 2.839717648609445e-06, + "loss": 0.5192, + "step": 8498 + }, + { + "epoch": 1.3788124594419209, + "grad_norm": 0.6085070792312655, + "learning_rate": 2.8392945721718345e-06, + "loss": 0.5096, + "step": 8499 + }, + { + "epoch": 1.3789746917585983, + "grad_norm": 0.5883026210629183, + "learning_rate": 2.8388714858345983e-06, + "loss": 0.5098, + "step": 8500 + }, + { + "epoch": 1.3791369240752758, + "grad_norm": 0.6094024778807431, + "learning_rate": 2.838448389610082e-06, + "loss": 0.526, + "step": 8501 + }, + { + "epoch": 1.3792991563919532, + "grad_norm": 0.6339869035713129, + "learning_rate": 2.8380252835106297e-06, + "loss": 0.5444, + "step": 8502 + }, + { + "epoch": 1.3794613887086307, + "grad_norm": 0.5999888553346228, + "learning_rate": 2.8376021675485877e-06, + "loss": 0.5201, + "step": 8503 + }, + { + "epoch": 1.3796236210253081, + "grad_norm": 0.6369185359044672, + "learning_rate": 2.837179041736299e-06, + "loss": 0.5306, + "step": 8504 + }, + { + "epoch": 1.3797858533419858, + "grad_norm": 0.6002484015687087, + "learning_rate": 2.8367559060861107e-06, + "loss": 0.5214, + "step": 8505 + }, + { + "epoch": 1.3799480856586632, + "grad_norm": 0.581150175182678, + "learning_rate": 2.836332760610368e-06, + "loss": 0.5339, + "step": 8506 + }, + { + "epoch": 1.3801103179753407, + "grad_norm": 0.5807168250961298, + "learning_rate": 2.835909605321418e-06, + "loss": 0.5074, + "step": 8507 + }, + { + "epoch": 1.3802725502920181, + "grad_norm": 0.6085503514219589, + "learning_rate": 2.835486440231606e-06, + "loss": 0.5282, + "step": 8508 + }, + { + "epoch": 1.3804347826086958, + "grad_norm": 0.5819678192232035, + "learning_rate": 2.8350632653532795e-06, + "loss": 0.5102, + "step": 8509 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.5773628565951133, + "learning_rate": 2.834640080698785e-06, + "loss": 0.49, + "step": 8510 + }, + { + "epoch": 1.3807592472420507, + "grad_norm": 0.6071733116245205, + "learning_rate": 2.8342168862804708e-06, + "loss": 0.5158, + "step": 8511 + }, + { + "epoch": 1.3809214795587281, + "grad_norm": 0.5824385038520578, + "learning_rate": 2.833793682110683e-06, + "loss": 0.511, + "step": 8512 + }, + { + "epoch": 1.3810837118754056, + "grad_norm": 0.6345046019779188, + "learning_rate": 2.8333704682017703e-06, + "loss": 0.4947, + "step": 8513 + }, + { + "epoch": 1.381245944192083, + "grad_norm": 0.6623509859814455, + "learning_rate": 2.8329472445660817e-06, + "loss": 0.5071, + "step": 8514 + }, + { + "epoch": 1.3814081765087605, + "grad_norm": 0.5870121447014822, + "learning_rate": 2.8325240112159637e-06, + "loss": 0.4957, + "step": 8515 + }, + { + "epoch": 1.381570408825438, + "grad_norm": 0.6111445996177504, + "learning_rate": 2.8321007681637664e-06, + "loss": 0.5264, + "step": 8516 + }, + { + "epoch": 1.3817326411421156, + "grad_norm": 0.6183203192597893, + "learning_rate": 2.831677515421838e-06, + "loss": 0.4767, + "step": 8517 + }, + { + "epoch": 1.381894873458793, + "grad_norm": 0.5635204784773624, + "learning_rate": 2.8312542530025283e-06, + "loss": 0.5379, + "step": 8518 + }, + { + "epoch": 1.3820571057754705, + "grad_norm": 0.6095294057052364, + "learning_rate": 2.8308309809181865e-06, + "loss": 0.4978, + "step": 8519 + }, + { + "epoch": 1.382219338092148, + "grad_norm": 0.5973745176773689, + "learning_rate": 2.8304076991811623e-06, + "loss": 0.4969, + "step": 8520 + }, + { + "epoch": 1.3823815704088254, + "grad_norm": 0.5868589091316233, + "learning_rate": 2.829984407803808e-06, + "loss": 0.5283, + "step": 8521 + }, + { + "epoch": 1.382543802725503, + "grad_norm": 0.6298039351978318, + "learning_rate": 2.82956110679847e-06, + "loss": 0.5059, + "step": 8522 + }, + { + "epoch": 1.3827060350421805, + "grad_norm": 0.5786790693821262, + "learning_rate": 2.8291377961775026e-06, + "loss": 0.5202, + "step": 8523 + }, + { + "epoch": 1.382868267358858, + "grad_norm": 0.5635386607797479, + "learning_rate": 2.8287144759532553e-06, + "loss": 0.5353, + "step": 8524 + }, + { + "epoch": 1.3830304996755354, + "grad_norm": 0.6076843353133603, + "learning_rate": 2.828291146138079e-06, + "loss": 0.5383, + "step": 8525 + }, + { + "epoch": 1.3831927319922128, + "grad_norm": 0.5773822859099123, + "learning_rate": 2.8278678067443255e-06, + "loss": 0.5106, + "step": 8526 + }, + { + "epoch": 1.3833549643088903, + "grad_norm": 0.59464606623009, + "learning_rate": 2.827444457784347e-06, + "loss": 0.5347, + "step": 8527 + }, + { + "epoch": 1.3835171966255677, + "grad_norm": 0.5979643921960172, + "learning_rate": 2.827021099270495e-06, + "loss": 0.4983, + "step": 8528 + }, + { + "epoch": 1.3836794289422452, + "grad_norm": 0.5736303336753502, + "learning_rate": 2.8265977312151228e-06, + "loss": 0.5235, + "step": 8529 + }, + { + "epoch": 1.3838416612589228, + "grad_norm": 0.6110543369211159, + "learning_rate": 2.826174353630582e-06, + "loss": 0.5095, + "step": 8530 + }, + { + "epoch": 1.3840038935756003, + "grad_norm": 0.631380339713548, + "learning_rate": 2.825750966529226e-06, + "loss": 0.5402, + "step": 8531 + }, + { + "epoch": 1.3841661258922777, + "grad_norm": 0.6298265383537829, + "learning_rate": 2.825327569923408e-06, + "loss": 0.5394, + "step": 8532 + }, + { + "epoch": 1.3843283582089552, + "grad_norm": 0.6169765212433806, + "learning_rate": 2.824904163825481e-06, + "loss": 0.52, + "step": 8533 + }, + { + "epoch": 1.3844905905256328, + "grad_norm": 0.6029380350669018, + "learning_rate": 2.8244807482478e-06, + "loss": 0.5325, + "step": 8534 + }, + { + "epoch": 1.3846528228423103, + "grad_norm": 0.5723935570995103, + "learning_rate": 2.8240573232027175e-06, + "loss": 0.5169, + "step": 8535 + }, + { + "epoch": 1.3848150551589877, + "grad_norm": 0.6104800904029797, + "learning_rate": 2.823633888702589e-06, + "loss": 0.5232, + "step": 8536 + }, + { + "epoch": 1.3849772874756652, + "grad_norm": 0.6146455783226784, + "learning_rate": 2.8232104447597688e-06, + "loss": 0.5211, + "step": 8537 + }, + { + "epoch": 1.3851395197923426, + "grad_norm": 0.5706723678524426, + "learning_rate": 2.8227869913866102e-06, + "loss": 0.5133, + "step": 8538 + }, + { + "epoch": 1.38530175210902, + "grad_norm": 0.5964871069808612, + "learning_rate": 2.8223635285954715e-06, + "loss": 0.5024, + "step": 8539 + }, + { + "epoch": 1.3854639844256975, + "grad_norm": 0.585226024686069, + "learning_rate": 2.821940056398705e-06, + "loss": 0.4802, + "step": 8540 + }, + { + "epoch": 1.385626216742375, + "grad_norm": 0.6256352370616358, + "learning_rate": 2.821516574808668e-06, + "loss": 0.5619, + "step": 8541 + }, + { + "epoch": 1.3857884490590526, + "grad_norm": 0.6106534031867287, + "learning_rate": 2.8210930838377164e-06, + "loss": 0.5408, + "step": 8542 + }, + { + "epoch": 1.38595068137573, + "grad_norm": 0.5865495486975911, + "learning_rate": 2.8206695834982064e-06, + "loss": 0.4935, + "step": 8543 + }, + { + "epoch": 1.3861129136924075, + "grad_norm": 0.6231583929648259, + "learning_rate": 2.820246073802494e-06, + "loss": 0.5228, + "step": 8544 + }, + { + "epoch": 1.386275146009085, + "grad_norm": 0.6141982779742077, + "learning_rate": 2.819822554762936e-06, + "loss": 0.5179, + "step": 8545 + }, + { + "epoch": 1.3864373783257624, + "grad_norm": 0.5889023835736233, + "learning_rate": 2.8193990263918897e-06, + "loss": 0.514, + "step": 8546 + }, + { + "epoch": 1.38659961064244, + "grad_norm": 0.6024506128107986, + "learning_rate": 2.818975488701713e-06, + "loss": 0.523, + "step": 8547 + }, + { + "epoch": 1.3867618429591175, + "grad_norm": 0.601657965144203, + "learning_rate": 2.8185519417047624e-06, + "loss": 0.5314, + "step": 8548 + }, + { + "epoch": 1.386924075275795, + "grad_norm": 0.611338321642387, + "learning_rate": 2.8181283854133967e-06, + "loss": 0.5226, + "step": 8549 + }, + { + "epoch": 1.3870863075924724, + "grad_norm": 0.582338284551207, + "learning_rate": 2.8177048198399733e-06, + "loss": 0.5216, + "step": 8550 + }, + { + "epoch": 1.3872485399091499, + "grad_norm": 0.622112040638797, + "learning_rate": 2.81728124499685e-06, + "loss": 0.5397, + "step": 8551 + }, + { + "epoch": 1.3874107722258273, + "grad_norm": 0.606486568919188, + "learning_rate": 2.816857660896388e-06, + "loss": 0.4852, + "step": 8552 + }, + { + "epoch": 1.3875730045425048, + "grad_norm": 0.6204222471114491, + "learning_rate": 2.816434067550944e-06, + "loss": 0.5143, + "step": 8553 + }, + { + "epoch": 1.3877352368591824, + "grad_norm": 0.5766053238150313, + "learning_rate": 2.816010464972878e-06, + "loss": 0.4747, + "step": 8554 + }, + { + "epoch": 1.3878974691758599, + "grad_norm": 0.5972999224523946, + "learning_rate": 2.8155868531745488e-06, + "loss": 0.5219, + "step": 8555 + }, + { + "epoch": 1.3880597014925373, + "grad_norm": 0.6020431450392598, + "learning_rate": 2.815163232168317e-06, + "loss": 0.5068, + "step": 8556 + }, + { + "epoch": 1.3882219338092148, + "grad_norm": 0.6099498274733208, + "learning_rate": 2.814739601966543e-06, + "loss": 0.5433, + "step": 8557 + }, + { + "epoch": 1.3883841661258922, + "grad_norm": 0.6379668098881627, + "learning_rate": 2.814315962581585e-06, + "loss": 0.5154, + "step": 8558 + }, + { + "epoch": 1.3885463984425699, + "grad_norm": 0.6222182138473601, + "learning_rate": 2.8138923140258063e-06, + "loss": 0.5104, + "step": 8559 + }, + { + "epoch": 1.3887086307592473, + "grad_norm": 0.5754406848895194, + "learning_rate": 2.813468656311566e-06, + "loss": 0.5389, + "step": 8560 + }, + { + "epoch": 1.3888708630759248, + "grad_norm": 0.6055709110221122, + "learning_rate": 2.813044989451226e-06, + "loss": 0.5288, + "step": 8561 + }, + { + "epoch": 1.3890330953926022, + "grad_norm": 0.5915250928057407, + "learning_rate": 2.8126213134571467e-06, + "loss": 0.5434, + "step": 8562 + }, + { + "epoch": 1.3891953277092797, + "grad_norm": 0.5753883763193376, + "learning_rate": 2.8121976283416904e-06, + "loss": 0.4991, + "step": 8563 + }, + { + "epoch": 1.3893575600259571, + "grad_norm": 0.5761255709528013, + "learning_rate": 2.8117739341172184e-06, + "loss": 0.5214, + "step": 8564 + }, + { + "epoch": 1.3895197923426346, + "grad_norm": 0.5798232834447011, + "learning_rate": 2.8113502307960936e-06, + "loss": 0.4959, + "step": 8565 + }, + { + "epoch": 1.389682024659312, + "grad_norm": 0.5941707045876626, + "learning_rate": 2.8109265183906786e-06, + "loss": 0.5101, + "step": 8566 + }, + { + "epoch": 1.3898442569759897, + "grad_norm": 0.5686248602502149, + "learning_rate": 2.8105027969133357e-06, + "loss": 0.5067, + "step": 8567 + }, + { + "epoch": 1.3900064892926671, + "grad_norm": 0.5923147358834981, + "learning_rate": 2.810079066376428e-06, + "loss": 0.5328, + "step": 8568 + }, + { + "epoch": 1.3901687216093446, + "grad_norm": 0.5935261994258388, + "learning_rate": 2.809655326792317e-06, + "loss": 0.5026, + "step": 8569 + }, + { + "epoch": 1.390330953926022, + "grad_norm": 0.6493542693426168, + "learning_rate": 2.80923157817337e-06, + "loss": 0.5088, + "step": 8570 + }, + { + "epoch": 1.3904931862426995, + "grad_norm": 0.6111431363410407, + "learning_rate": 2.8088078205319465e-06, + "loss": 0.5236, + "step": 8571 + }, + { + "epoch": 1.3906554185593771, + "grad_norm": 0.5809978052462685, + "learning_rate": 2.8083840538804134e-06, + "loss": 0.552, + "step": 8572 + }, + { + "epoch": 1.3908176508760546, + "grad_norm": 0.5916955955475878, + "learning_rate": 2.8079602782311343e-06, + "loss": 0.5098, + "step": 8573 + }, + { + "epoch": 1.390979883192732, + "grad_norm": 0.6175427986428856, + "learning_rate": 2.807536493596473e-06, + "loss": 0.5243, + "step": 8574 + }, + { + "epoch": 1.3911421155094095, + "grad_norm": 0.5702466210610945, + "learning_rate": 2.8071126999887947e-06, + "loss": 0.4995, + "step": 8575 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.6338520654877632, + "learning_rate": 2.8066888974204646e-06, + "loss": 0.5028, + "step": 8576 + }, + { + "epoch": 1.3914665801427644, + "grad_norm": 0.633022068391817, + "learning_rate": 2.8062650859038485e-06, + "loss": 0.537, + "step": 8577 + }, + { + "epoch": 1.3916288124594418, + "grad_norm": 0.6183921545373059, + "learning_rate": 2.8058412654513114e-06, + "loss": 0.5175, + "step": 8578 + }, + { + "epoch": 1.3917910447761195, + "grad_norm": 0.5916444590534577, + "learning_rate": 2.8054174360752186e-06, + "loss": 0.5128, + "step": 8579 + }, + { + "epoch": 1.391953277092797, + "grad_norm": 0.6022251854340098, + "learning_rate": 2.8049935977879374e-06, + "loss": 0.5251, + "step": 8580 + }, + { + "epoch": 1.3921155094094744, + "grad_norm": 0.6289239703649555, + "learning_rate": 2.8045697506018334e-06, + "loss": 0.535, + "step": 8581 + }, + { + "epoch": 1.3922777417261518, + "grad_norm": 0.5588990258575702, + "learning_rate": 2.8041458945292726e-06, + "loss": 0.5228, + "step": 8582 + }, + { + "epoch": 1.3924399740428293, + "grad_norm": 0.5986389911418512, + "learning_rate": 2.8037220295826233e-06, + "loss": 0.5352, + "step": 8583 + }, + { + "epoch": 1.392602206359507, + "grad_norm": 0.6057196894748536, + "learning_rate": 2.803298155774252e-06, + "loss": 0.5512, + "step": 8584 + }, + { + "epoch": 1.3927644386761844, + "grad_norm": 0.5880564847730319, + "learning_rate": 2.8028742731165264e-06, + "loss": 0.5248, + "step": 8585 + }, + { + "epoch": 1.3929266709928618, + "grad_norm": 0.5695483342544003, + "learning_rate": 2.802450381621814e-06, + "loss": 0.4988, + "step": 8586 + }, + { + "epoch": 1.3930889033095393, + "grad_norm": 0.5833403947049192, + "learning_rate": 2.8020264813024817e-06, + "loss": 0.5166, + "step": 8587 + }, + { + "epoch": 1.3932511356262167, + "grad_norm": 0.5548439691135351, + "learning_rate": 2.8016025721708995e-06, + "loss": 0.5243, + "step": 8588 + }, + { + "epoch": 1.3934133679428942, + "grad_norm": 0.565614825766033, + "learning_rate": 2.8011786542394344e-06, + "loss": 0.5293, + "step": 8589 + }, + { + "epoch": 1.3935756002595716, + "grad_norm": 0.6144596278214431, + "learning_rate": 2.8007547275204557e-06, + "loss": 0.5402, + "step": 8590 + }, + { + "epoch": 1.393737832576249, + "grad_norm": 0.5881427763150416, + "learning_rate": 2.8003307920263317e-06, + "loss": 0.5361, + "step": 8591 + }, + { + "epoch": 1.3939000648929267, + "grad_norm": 0.6278706078203531, + "learning_rate": 2.799906847769433e-06, + "loss": 0.5053, + "step": 8592 + }, + { + "epoch": 1.3940622972096042, + "grad_norm": 0.5877826437477921, + "learning_rate": 2.7994828947621277e-06, + "loss": 0.5276, + "step": 8593 + }, + { + "epoch": 1.3942245295262816, + "grad_norm": 0.6429375576337446, + "learning_rate": 2.7990589330167857e-06, + "loss": 0.5207, + "step": 8594 + }, + { + "epoch": 1.394386761842959, + "grad_norm": 0.5790779910063388, + "learning_rate": 2.7986349625457782e-06, + "loss": 0.5151, + "step": 8595 + }, + { + "epoch": 1.3945489941596367, + "grad_norm": 0.6004130000334523, + "learning_rate": 2.7982109833614733e-06, + "loss": 0.5229, + "step": 8596 + }, + { + "epoch": 1.3947112264763142, + "grad_norm": 0.6031175851766349, + "learning_rate": 2.797786995476243e-06, + "loss": 0.5075, + "step": 8597 + }, + { + "epoch": 1.3948734587929916, + "grad_norm": 0.63175730141642, + "learning_rate": 2.797362998902458e-06, + "loss": 0.5261, + "step": 8598 + }, + { + "epoch": 1.395035691109669, + "grad_norm": 0.5740544264501064, + "learning_rate": 2.796938993652489e-06, + "loss": 0.5026, + "step": 8599 + }, + { + "epoch": 1.3951979234263465, + "grad_norm": 0.6577255546171006, + "learning_rate": 2.7965149797387077e-06, + "loss": 0.536, + "step": 8600 + }, + { + "epoch": 1.395360155743024, + "grad_norm": 0.6097902822046694, + "learning_rate": 2.7960909571734833e-06, + "loss": 0.5415, + "step": 8601 + }, + { + "epoch": 1.3955223880597014, + "grad_norm": 0.5776647621879887, + "learning_rate": 2.795666925969191e-06, + "loss": 0.4861, + "step": 8602 + }, + { + "epoch": 1.3956846203763789, + "grad_norm": 0.5710525107280331, + "learning_rate": 2.795242886138201e-06, + "loss": 0.5539, + "step": 8603 + }, + { + "epoch": 1.3958468526930565, + "grad_norm": 0.6014004887539196, + "learning_rate": 2.7948188376928848e-06, + "loss": 0.554, + "step": 8604 + }, + { + "epoch": 1.396009085009734, + "grad_norm": 0.5886319885149257, + "learning_rate": 2.7943947806456157e-06, + "loss": 0.5188, + "step": 8605 + }, + { + "epoch": 1.3961713173264114, + "grad_norm": 0.6483539494872994, + "learning_rate": 2.7939707150087677e-06, + "loss": 0.5612, + "step": 8606 + }, + { + "epoch": 1.3963335496430889, + "grad_norm": 0.5953109882683203, + "learning_rate": 2.7935466407947115e-06, + "loss": 0.5301, + "step": 8607 + }, + { + "epoch": 1.3964957819597663, + "grad_norm": 0.589650776905655, + "learning_rate": 2.7931225580158216e-06, + "loss": 0.5069, + "step": 8608 + }, + { + "epoch": 1.396658014276444, + "grad_norm": 0.5944296570877173, + "learning_rate": 2.7926984666844718e-06, + "loss": 0.5294, + "step": 8609 + }, + { + "epoch": 1.3968202465931214, + "grad_norm": 0.5885771382263882, + "learning_rate": 2.7922743668130355e-06, + "loss": 0.5667, + "step": 8610 + }, + { + "epoch": 1.3969824789097989, + "grad_norm": 0.6019943472884789, + "learning_rate": 2.7918502584138862e-06, + "loss": 0.5389, + "step": 8611 + }, + { + "epoch": 1.3971447112264763, + "grad_norm": 0.6116679361710629, + "learning_rate": 2.7914261414993983e-06, + "loss": 0.5218, + "step": 8612 + }, + { + "epoch": 1.3973069435431538, + "grad_norm": 0.5691609683625295, + "learning_rate": 2.791002016081947e-06, + "loss": 0.4896, + "step": 8613 + }, + { + "epoch": 1.3974691758598312, + "grad_norm": 0.6226332052709049, + "learning_rate": 2.790577882173906e-06, + "loss": 0.534, + "step": 8614 + }, + { + "epoch": 1.3976314081765087, + "grad_norm": 0.5906094314063471, + "learning_rate": 2.7901537397876517e-06, + "loss": 0.5237, + "step": 8615 + }, + { + "epoch": 1.397793640493186, + "grad_norm": 0.5872498492407738, + "learning_rate": 2.789729588935558e-06, + "loss": 0.5026, + "step": 8616 + }, + { + "epoch": 1.3979558728098638, + "grad_norm": 0.5936920204663171, + "learning_rate": 2.789305429630001e-06, + "loss": 0.526, + "step": 8617 + }, + { + "epoch": 1.3981181051265412, + "grad_norm": 0.6493169094554266, + "learning_rate": 2.788881261883356e-06, + "loss": 0.5527, + "step": 8618 + }, + { + "epoch": 1.3982803374432187, + "grad_norm": 0.6220983941220413, + "learning_rate": 2.788457085707999e-06, + "loss": 0.525, + "step": 8619 + }, + { + "epoch": 1.3984425697598961, + "grad_norm": 0.6041139814537608, + "learning_rate": 2.788032901116307e-06, + "loss": 0.5556, + "step": 8620 + }, + { + "epoch": 1.3986048020765738, + "grad_norm": 0.5976596091923012, + "learning_rate": 2.787608708120656e-06, + "loss": 0.5065, + "step": 8621 + }, + { + "epoch": 1.3987670343932512, + "grad_norm": 0.5768767811227494, + "learning_rate": 2.787184506733423e-06, + "loss": 0.4867, + "step": 8622 + }, + { + "epoch": 1.3989292667099287, + "grad_norm": 0.5992951442223955, + "learning_rate": 2.7867602969669844e-06, + "loss": 0.4932, + "step": 8623 + }, + { + "epoch": 1.3990914990266061, + "grad_norm": 0.6031716643181552, + "learning_rate": 2.7863360788337173e-06, + "loss": 0.5249, + "step": 8624 + }, + { + "epoch": 1.3992537313432836, + "grad_norm": 0.5944972969879035, + "learning_rate": 2.7859118523459995e-06, + "loss": 0.5308, + "step": 8625 + }, + { + "epoch": 1.399415963659961, + "grad_norm": 0.6047472856176749, + "learning_rate": 2.7854876175162095e-06, + "loss": 0.5193, + "step": 8626 + }, + { + "epoch": 1.3995781959766385, + "grad_norm": 0.6312592596868001, + "learning_rate": 2.785063374356723e-06, + "loss": 0.5454, + "step": 8627 + }, + { + "epoch": 1.399740428293316, + "grad_norm": 0.6005393283535593, + "learning_rate": 2.784639122879921e-06, + "loss": 0.4862, + "step": 8628 + }, + { + "epoch": 1.3999026606099936, + "grad_norm": 0.6016172451231925, + "learning_rate": 2.7842148630981798e-06, + "loss": 0.501, + "step": 8629 + }, + { + "epoch": 1.400064892926671, + "grad_norm": 0.6033031014487619, + "learning_rate": 2.783790595023878e-06, + "loss": 0.5377, + "step": 8630 + }, + { + "epoch": 1.4002271252433485, + "grad_norm": 0.5831644410929645, + "learning_rate": 2.783366318669397e-06, + "loss": 0.5286, + "step": 8631 + }, + { + "epoch": 1.400389357560026, + "grad_norm": 0.6092300068212533, + "learning_rate": 2.782942034047113e-06, + "loss": 0.5062, + "step": 8632 + }, + { + "epoch": 1.4005515898767034, + "grad_norm": 0.6096774895145872, + "learning_rate": 2.782517741169407e-06, + "loss": 0.5246, + "step": 8633 + }, + { + "epoch": 1.400713822193381, + "grad_norm": 0.5996960661115673, + "learning_rate": 2.782093440048658e-06, + "loss": 0.5369, + "step": 8634 + }, + { + "epoch": 1.4008760545100585, + "grad_norm": 0.6205448326618913, + "learning_rate": 2.7816691306972466e-06, + "loss": 0.476, + "step": 8635 + }, + { + "epoch": 1.401038286826736, + "grad_norm": 0.6496612150023462, + "learning_rate": 2.781244813127552e-06, + "loss": 0.5086, + "step": 8636 + }, + { + "epoch": 1.4012005191434134, + "grad_norm": 0.5895706809280022, + "learning_rate": 2.7808204873519545e-06, + "loss": 0.5461, + "step": 8637 + }, + { + "epoch": 1.4013627514600908, + "grad_norm": 0.5755843642236708, + "learning_rate": 2.780396153382836e-06, + "loss": 0.4856, + "step": 8638 + }, + { + "epoch": 1.4015249837767683, + "grad_norm": 0.6100727128091573, + "learning_rate": 2.779971811232576e-06, + "loss": 0.4994, + "step": 8639 + }, + { + "epoch": 1.4016872160934457, + "grad_norm": 0.6099461414290083, + "learning_rate": 2.7795474609135564e-06, + "loss": 0.4965, + "step": 8640 + }, + { + "epoch": 1.4018494484101234, + "grad_norm": 0.5849446317142031, + "learning_rate": 2.7791231024381582e-06, + "loss": 0.4958, + "step": 8641 + }, + { + "epoch": 1.4020116807268008, + "grad_norm": 0.6086164326612934, + "learning_rate": 2.7786987358187627e-06, + "loss": 0.5048, + "step": 8642 + }, + { + "epoch": 1.4021739130434783, + "grad_norm": 0.6391108111472354, + "learning_rate": 2.7782743610677514e-06, + "loss": 0.4765, + "step": 8643 + }, + { + "epoch": 1.4023361453601557, + "grad_norm": 0.5645768062539852, + "learning_rate": 2.7778499781975075e-06, + "loss": 0.5189, + "step": 8644 + }, + { + "epoch": 1.4024983776768332, + "grad_norm": 0.5921860328884001, + "learning_rate": 2.777425587220412e-06, + "loss": 0.5344, + "step": 8645 + }, + { + "epoch": 1.4026606099935108, + "grad_norm": 0.5934909061179691, + "learning_rate": 2.7770011881488477e-06, + "loss": 0.5313, + "step": 8646 + }, + { + "epoch": 1.4028228423101883, + "grad_norm": 0.6112541916819253, + "learning_rate": 2.7765767809951984e-06, + "loss": 0.5356, + "step": 8647 + }, + { + "epoch": 1.4029850746268657, + "grad_norm": 0.6129338981020962, + "learning_rate": 2.7761523657718453e-06, + "loss": 0.5268, + "step": 8648 + }, + { + "epoch": 1.4031473069435432, + "grad_norm": 0.5911408038315957, + "learning_rate": 2.7757279424911735e-06, + "loss": 0.522, + "step": 8649 + }, + { + "epoch": 1.4033095392602206, + "grad_norm": 0.6218441875495948, + "learning_rate": 2.7753035111655645e-06, + "loss": 0.5096, + "step": 8650 + }, + { + "epoch": 1.403471771576898, + "grad_norm": 0.607123304712777, + "learning_rate": 2.774879071807403e-06, + "loss": 0.5172, + "step": 8651 + }, + { + "epoch": 1.4036340038935755, + "grad_norm": 0.6041211356605356, + "learning_rate": 2.774454624429074e-06, + "loss": 0.5248, + "step": 8652 + }, + { + "epoch": 1.403796236210253, + "grad_norm": 0.6235616502185788, + "learning_rate": 2.7740301690429586e-06, + "loss": 0.4954, + "step": 8653 + }, + { + "epoch": 1.4039584685269306, + "grad_norm": 0.5751678065084719, + "learning_rate": 2.7736057056614445e-06, + "loss": 0.5157, + "step": 8654 + }, + { + "epoch": 1.404120700843608, + "grad_norm": 0.5859181773084622, + "learning_rate": 2.7731812342969146e-06, + "loss": 0.536, + "step": 8655 + }, + { + "epoch": 1.4042829331602855, + "grad_norm": 0.6062884008756297, + "learning_rate": 2.772756754961753e-06, + "loss": 0.547, + "step": 8656 + }, + { + "epoch": 1.404445165476963, + "grad_norm": 0.5967950968762984, + "learning_rate": 2.7723322676683462e-06, + "loss": 0.5179, + "step": 8657 + }, + { + "epoch": 1.4046073977936404, + "grad_norm": 0.6222593347314643, + "learning_rate": 2.7719077724290793e-06, + "loss": 0.5084, + "step": 8658 + }, + { + "epoch": 1.404769630110318, + "grad_norm": 0.6167362727204903, + "learning_rate": 2.7714832692563374e-06, + "loss": 0.5, + "step": 8659 + }, + { + "epoch": 1.4049318624269955, + "grad_norm": 0.5838056724996682, + "learning_rate": 2.7710587581625064e-06, + "loss": 0.5155, + "step": 8660 + }, + { + "epoch": 1.405094094743673, + "grad_norm": 0.5849708949966081, + "learning_rate": 2.7706342391599715e-06, + "loss": 0.4819, + "step": 8661 + }, + { + "epoch": 1.4052563270603504, + "grad_norm": 0.6116238018996479, + "learning_rate": 2.7702097122611215e-06, + "loss": 0.5317, + "step": 8662 + }, + { + "epoch": 1.4054185593770279, + "grad_norm": 0.6287314336818569, + "learning_rate": 2.7697851774783395e-06, + "loss": 0.4915, + "step": 8663 + }, + { + "epoch": 1.4055807916937053, + "grad_norm": 0.6147701941548449, + "learning_rate": 2.7693606348240142e-06, + "loss": 0.5151, + "step": 8664 + }, + { + "epoch": 1.4057430240103828, + "grad_norm": 0.5874320799959883, + "learning_rate": 2.7689360843105323e-06, + "loss": 0.5095, + "step": 8665 + }, + { + "epoch": 1.4059052563270604, + "grad_norm": 0.581443675246537, + "learning_rate": 2.7685115259502804e-06, + "loss": 0.5136, + "step": 8666 + }, + { + "epoch": 1.4060674886437379, + "grad_norm": 0.5885479035357459, + "learning_rate": 2.768086959755647e-06, + "loss": 0.5352, + "step": 8667 + }, + { + "epoch": 1.4062297209604153, + "grad_norm": 0.6260710179648721, + "learning_rate": 2.7676623857390176e-06, + "loss": 0.5373, + "step": 8668 + }, + { + "epoch": 1.4063919532770928, + "grad_norm": 0.5865228718182685, + "learning_rate": 2.767237803912783e-06, + "loss": 0.5444, + "step": 8669 + }, + { + "epoch": 1.4065541855937702, + "grad_norm": 0.6297535831319031, + "learning_rate": 2.7668132142893276e-06, + "loss": 0.522, + "step": 8670 + }, + { + "epoch": 1.4067164179104479, + "grad_norm": 0.5900994587798116, + "learning_rate": 2.7663886168810427e-06, + "loss": 0.5391, + "step": 8671 + }, + { + "epoch": 1.4068786502271253, + "grad_norm": 0.6053875555030757, + "learning_rate": 2.765964011700316e-06, + "loss": 0.5185, + "step": 8672 + }, + { + "epoch": 1.4070408825438028, + "grad_norm": 0.648366472401452, + "learning_rate": 2.7655393987595357e-06, + "loss": 0.4905, + "step": 8673 + }, + { + "epoch": 1.4072031148604802, + "grad_norm": 0.5808695567746402, + "learning_rate": 2.765114778071092e-06, + "loss": 0.4908, + "step": 8674 + }, + { + "epoch": 1.4073653471771577, + "grad_norm": 0.574917290432722, + "learning_rate": 2.7646901496473717e-06, + "loss": 0.5264, + "step": 8675 + }, + { + "epoch": 1.4075275794938351, + "grad_norm": 0.5846366417498707, + "learning_rate": 2.7642655135007664e-06, + "loss": 0.5174, + "step": 8676 + }, + { + "epoch": 1.4076898118105126, + "grad_norm": 0.6281489128037981, + "learning_rate": 2.7638408696436655e-06, + "loss": 0.5135, + "step": 8677 + }, + { + "epoch": 1.40785204412719, + "grad_norm": 0.6077191271904877, + "learning_rate": 2.7634162180884576e-06, + "loss": 0.5134, + "step": 8678 + }, + { + "epoch": 1.4080142764438677, + "grad_norm": 0.6032587260176997, + "learning_rate": 2.7629915588475333e-06, + "loss": 0.5229, + "step": 8679 + }, + { + "epoch": 1.4081765087605451, + "grad_norm": 0.6141112819648533, + "learning_rate": 2.762566891933285e-06, + "loss": 0.5134, + "step": 8680 + }, + { + "epoch": 1.4083387410772226, + "grad_norm": 0.6249109092722194, + "learning_rate": 2.762142217358099e-06, + "loss": 0.5366, + "step": 8681 + }, + { + "epoch": 1.4085009733939, + "grad_norm": 0.6287556469841857, + "learning_rate": 2.7617175351343707e-06, + "loss": 0.5327, + "step": 8682 + }, + { + "epoch": 1.4086632057105777, + "grad_norm": 0.5933029931019443, + "learning_rate": 2.761292845274488e-06, + "loss": 0.5185, + "step": 8683 + }, + { + "epoch": 1.4088254380272551, + "grad_norm": 0.6325156973075783, + "learning_rate": 2.7608681477908423e-06, + "loss": 0.5301, + "step": 8684 + }, + { + "epoch": 1.4089876703439326, + "grad_norm": 0.5754306780873932, + "learning_rate": 2.760443442695827e-06, + "loss": 0.5133, + "step": 8685 + }, + { + "epoch": 1.40914990266061, + "grad_norm": 0.6296534698329143, + "learning_rate": 2.7600187300018315e-06, + "loss": 0.5171, + "step": 8686 + }, + { + "epoch": 1.4093121349772875, + "grad_norm": 0.5967174320818592, + "learning_rate": 2.7595940097212502e-06, + "loss": 0.5352, + "step": 8687 + }, + { + "epoch": 1.409474367293965, + "grad_norm": 0.583067526589369, + "learning_rate": 2.7591692818664724e-06, + "loss": 0.516, + "step": 8688 + }, + { + "epoch": 1.4096365996106424, + "grad_norm": 0.5691844599989301, + "learning_rate": 2.7587445464498918e-06, + "loss": 0.5436, + "step": 8689 + }, + { + "epoch": 1.4097988319273198, + "grad_norm": 0.8694934104322569, + "learning_rate": 2.758319803483901e-06, + "loss": 0.5169, + "step": 8690 + }, + { + "epoch": 1.4099610642439975, + "grad_norm": 0.6108701416445236, + "learning_rate": 2.7578950529808927e-06, + "loss": 0.5152, + "step": 8691 + }, + { + "epoch": 1.410123296560675, + "grad_norm": 0.5948967187903011, + "learning_rate": 2.75747029495326e-06, + "loss": 0.5345, + "step": 8692 + }, + { + "epoch": 1.4102855288773524, + "grad_norm": 0.6436548284904545, + "learning_rate": 2.757045529413395e-06, + "loss": 0.5281, + "step": 8693 + }, + { + "epoch": 1.4104477611940298, + "grad_norm": 0.565967889595258, + "learning_rate": 2.756620756373692e-06, + "loss": 0.5573, + "step": 8694 + }, + { + "epoch": 1.4106099935107073, + "grad_norm": 0.6034080956189127, + "learning_rate": 2.756195975846546e-06, + "loss": 0.4932, + "step": 8695 + }, + { + "epoch": 1.410772225827385, + "grad_norm": 0.5821009351880978, + "learning_rate": 2.755771187844349e-06, + "loss": 0.5337, + "step": 8696 + }, + { + "epoch": 1.4109344581440624, + "grad_norm": 0.5882337725633688, + "learning_rate": 2.7553463923794956e-06, + "loss": 0.4852, + "step": 8697 + }, + { + "epoch": 1.4110966904607398, + "grad_norm": 0.6140523068332235, + "learning_rate": 2.7549215894643794e-06, + "loss": 0.5238, + "step": 8698 + }, + { + "epoch": 1.4112589227774173, + "grad_norm": 0.6304410319368092, + "learning_rate": 2.7544967791113953e-06, + "loss": 0.5288, + "step": 8699 + }, + { + "epoch": 1.4114211550940947, + "grad_norm": 0.6224826828356058, + "learning_rate": 2.7540719613329393e-06, + "loss": 0.5326, + "step": 8700 + }, + { + "epoch": 1.4115833874107722, + "grad_norm": 0.6144641984540528, + "learning_rate": 2.7536471361414046e-06, + "loss": 0.5405, + "step": 8701 + }, + { + "epoch": 1.4117456197274496, + "grad_norm": 0.5944920978765562, + "learning_rate": 2.7532223035491877e-06, + "loss": 0.5308, + "step": 8702 + }, + { + "epoch": 1.411907852044127, + "grad_norm": 0.5663172741100556, + "learning_rate": 2.7527974635686835e-06, + "loss": 0.5285, + "step": 8703 + }, + { + "epoch": 1.4120700843608047, + "grad_norm": 0.6082858011286839, + "learning_rate": 2.752372616212286e-06, + "loss": 0.5268, + "step": 8704 + }, + { + "epoch": 1.4122323166774822, + "grad_norm": 0.6208258872238164, + "learning_rate": 2.751947761492394e-06, + "loss": 0.5317, + "step": 8705 + }, + { + "epoch": 1.4123945489941596, + "grad_norm": 0.5604493226981095, + "learning_rate": 2.751522899421401e-06, + "loss": 0.4963, + "step": 8706 + }, + { + "epoch": 1.412556781310837, + "grad_norm": 0.5963030315929995, + "learning_rate": 2.7510980300117052e-06, + "loss": 0.5474, + "step": 8707 + }, + { + "epoch": 1.4127190136275147, + "grad_norm": 0.5683754231556907, + "learning_rate": 2.7506731532757013e-06, + "loss": 0.503, + "step": 8708 + }, + { + "epoch": 1.4128812459441922, + "grad_norm": 0.6089037125822702, + "learning_rate": 2.750248269225787e-06, + "loss": 0.5427, + "step": 8709 + }, + { + "epoch": 1.4130434782608696, + "grad_norm": 0.6039887043333959, + "learning_rate": 2.749823377874359e-06, + "loss": 0.522, + "step": 8710 + }, + { + "epoch": 1.413205710577547, + "grad_norm": 0.6018163713282298, + "learning_rate": 2.7493984792338137e-06, + "loss": 0.5187, + "step": 8711 + }, + { + "epoch": 1.4133679428942245, + "grad_norm": 0.586046993533632, + "learning_rate": 2.7489735733165494e-06, + "loss": 0.4951, + "step": 8712 + }, + { + "epoch": 1.413530175210902, + "grad_norm": 0.5530247768650607, + "learning_rate": 2.748548660134963e-06, + "loss": 0.4899, + "step": 8713 + }, + { + "epoch": 1.4136924075275794, + "grad_norm": 0.6125440265951104, + "learning_rate": 2.7481237397014532e-06, + "loss": 0.5392, + "step": 8714 + }, + { + "epoch": 1.4138546398442569, + "grad_norm": 0.5829828061346294, + "learning_rate": 2.7476988120284166e-06, + "loss": 0.5298, + "step": 8715 + }, + { + "epoch": 1.4140168721609345, + "grad_norm": 0.5954254081340516, + "learning_rate": 2.747273877128252e-06, + "loss": 0.4874, + "step": 8716 + }, + { + "epoch": 1.414179104477612, + "grad_norm": 0.5689139137411346, + "learning_rate": 2.7468489350133576e-06, + "loss": 0.5304, + "step": 8717 + }, + { + "epoch": 1.4143413367942894, + "grad_norm": 0.5983781458211042, + "learning_rate": 2.7464239856961316e-06, + "loss": 0.5278, + "step": 8718 + }, + { + "epoch": 1.4145035691109669, + "grad_norm": 0.5796714028766742, + "learning_rate": 2.745999029188974e-06, + "loss": 0.5102, + "step": 8719 + }, + { + "epoch": 1.4146658014276443, + "grad_norm": 0.61724361262669, + "learning_rate": 2.745574065504283e-06, + "loss": 0.5266, + "step": 8720 + }, + { + "epoch": 1.414828033744322, + "grad_norm": 0.5969332616215645, + "learning_rate": 2.7451490946544572e-06, + "loss": 0.4762, + "step": 8721 + }, + { + "epoch": 1.4149902660609994, + "grad_norm": 0.5843716236666628, + "learning_rate": 2.744724116651897e-06, + "loss": 0.5178, + "step": 8722 + }, + { + "epoch": 1.4151524983776769, + "grad_norm": 0.5909468575931378, + "learning_rate": 2.7442991315090025e-06, + "loss": 0.5221, + "step": 8723 + }, + { + "epoch": 1.4153147306943543, + "grad_norm": 0.6121494856836921, + "learning_rate": 2.743874139238171e-06, + "loss": 0.5084, + "step": 8724 + }, + { + "epoch": 1.4154769630110318, + "grad_norm": 0.635261932049002, + "learning_rate": 2.743449139851805e-06, + "loss": 0.5372, + "step": 8725 + }, + { + "epoch": 1.4156391953277092, + "grad_norm": 0.565466428919873, + "learning_rate": 2.7430241333623044e-06, + "loss": 0.5317, + "step": 8726 + }, + { + "epoch": 1.4158014276443867, + "grad_norm": 0.6230660872478816, + "learning_rate": 2.7425991197820686e-06, + "loss": 0.508, + "step": 8727 + }, + { + "epoch": 1.4159636599610643, + "grad_norm": 0.6036662993662091, + "learning_rate": 2.742174099123499e-06, + "loss": 0.5354, + "step": 8728 + }, + { + "epoch": 1.4161258922777418, + "grad_norm": 0.60274099060548, + "learning_rate": 2.741749071398996e-06, + "loss": 0.5636, + "step": 8729 + }, + { + "epoch": 1.4162881245944192, + "grad_norm": 0.6206777998150862, + "learning_rate": 2.741324036620961e-06, + "loss": 0.4975, + "step": 8730 + }, + { + "epoch": 1.4164503569110967, + "grad_norm": 0.6026714566621911, + "learning_rate": 2.7408989948017957e-06, + "loss": 0.5329, + "step": 8731 + }, + { + "epoch": 1.416612589227774, + "grad_norm": 0.5998639773360255, + "learning_rate": 2.740473945953901e-06, + "loss": 0.5191, + "step": 8732 + }, + { + "epoch": 1.4167748215444518, + "grad_norm": 0.5900396381357884, + "learning_rate": 2.740048890089679e-06, + "loss": 0.5265, + "step": 8733 + }, + { + "epoch": 1.4169370538611292, + "grad_norm": 0.5889177537191098, + "learning_rate": 2.739623827221531e-06, + "loss": 0.5083, + "step": 8734 + }, + { + "epoch": 1.4170992861778067, + "grad_norm": 0.6135103621225664, + "learning_rate": 2.7391987573618584e-06, + "loss": 0.498, + "step": 8735 + }, + { + "epoch": 1.4172615184944841, + "grad_norm": 0.6373576734890103, + "learning_rate": 2.7387736805230663e-06, + "loss": 0.5408, + "step": 8736 + }, + { + "epoch": 1.4174237508111616, + "grad_norm": 0.6336063977908363, + "learning_rate": 2.738348596717554e-06, + "loss": 0.5231, + "step": 8737 + }, + { + "epoch": 1.417585983127839, + "grad_norm": 0.571140667263574, + "learning_rate": 2.7379235059577263e-06, + "loss": 0.4887, + "step": 8738 + }, + { + "epoch": 1.4177482154445165, + "grad_norm": 0.66472948338441, + "learning_rate": 2.7374984082559854e-06, + "loss": 0.5285, + "step": 8739 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.6082189110522304, + "learning_rate": 2.7370733036247337e-06, + "loss": 0.5355, + "step": 8740 + }, + { + "epoch": 1.4180726800778716, + "grad_norm": 0.6102564858940992, + "learning_rate": 2.7366481920763766e-06, + "loss": 0.5239, + "step": 8741 + }, + { + "epoch": 1.418234912394549, + "grad_norm": 0.5957409581682557, + "learning_rate": 2.736223073623315e-06, + "loss": 0.5207, + "step": 8742 + }, + { + "epoch": 1.4183971447112265, + "grad_norm": 0.6024452762160597, + "learning_rate": 2.7357979482779546e-06, + "loss": 0.5351, + "step": 8743 + }, + { + "epoch": 1.418559377027904, + "grad_norm": 0.5684062824258734, + "learning_rate": 2.7353728160526983e-06, + "loss": 0.5215, + "step": 8744 + }, + { + "epoch": 1.4187216093445814, + "grad_norm": 0.5775332856599844, + "learning_rate": 2.7349476769599505e-06, + "loss": 0.5045, + "step": 8745 + }, + { + "epoch": 1.418883841661259, + "grad_norm": 0.588538857180141, + "learning_rate": 2.7345225310121155e-06, + "loss": 0.5207, + "step": 8746 + }, + { + "epoch": 1.4190460739779365, + "grad_norm": 0.5943752699667506, + "learning_rate": 2.734097378221597e-06, + "loss": 0.5009, + "step": 8747 + }, + { + "epoch": 1.419208306294614, + "grad_norm": 0.6248048549792726, + "learning_rate": 2.7336722186008013e-06, + "loss": 0.5317, + "step": 8748 + }, + { + "epoch": 1.4193705386112914, + "grad_norm": 0.5842951651421076, + "learning_rate": 2.733247052162133e-06, + "loss": 0.5255, + "step": 8749 + }, + { + "epoch": 1.4195327709279688, + "grad_norm": 0.6384718949473898, + "learning_rate": 2.732821878917996e-06, + "loss": 0.5087, + "step": 8750 + }, + { + "epoch": 1.4196950032446463, + "grad_norm": 0.5787018787664509, + "learning_rate": 2.732396698880797e-06, + "loss": 0.5312, + "step": 8751 + }, + { + "epoch": 1.4198572355613237, + "grad_norm": 0.6092490928645525, + "learning_rate": 2.7319715120629407e-06, + "loss": 0.5086, + "step": 8752 + }, + { + "epoch": 1.4200194678780014, + "grad_norm": 0.6101750495292206, + "learning_rate": 2.7315463184768327e-06, + "loss": 0.5247, + "step": 8753 + }, + { + "epoch": 1.4201817001946788, + "grad_norm": 0.6154481341513894, + "learning_rate": 2.73112111813488e-06, + "loss": 0.4803, + "step": 8754 + }, + { + "epoch": 1.4203439325113563, + "grad_norm": 0.615930327144342, + "learning_rate": 2.7306959110494867e-06, + "loss": 0.5131, + "step": 8755 + }, + { + "epoch": 1.4205061648280337, + "grad_norm": 0.5947576302475429, + "learning_rate": 2.730270697233062e-06, + "loss": 0.5396, + "step": 8756 + }, + { + "epoch": 1.4206683971447112, + "grad_norm": 0.6021216497622272, + "learning_rate": 2.7298454766980093e-06, + "loss": 0.5369, + "step": 8757 + }, + { + "epoch": 1.4208306294613888, + "grad_norm": 0.616421517539027, + "learning_rate": 2.729420249456737e-06, + "loss": 0.5145, + "step": 8758 + }, + { + "epoch": 1.4209928617780663, + "grad_norm": 0.6029847606059155, + "learning_rate": 2.7289950155216523e-06, + "loss": 0.5301, + "step": 8759 + }, + { + "epoch": 1.4211550940947437, + "grad_norm": 0.5909367897264313, + "learning_rate": 2.7285697749051604e-06, + "loss": 0.5288, + "step": 8760 + }, + { + "epoch": 1.4213173264114212, + "grad_norm": 0.6093954289287231, + "learning_rate": 2.7281445276196715e-06, + "loss": 0.5279, + "step": 8761 + }, + { + "epoch": 1.4214795587280986, + "grad_norm": 0.5869117407365295, + "learning_rate": 2.7277192736775904e-06, + "loss": 0.5226, + "step": 8762 + }, + { + "epoch": 1.421641791044776, + "grad_norm": 0.6049452751690095, + "learning_rate": 2.727294013091326e-06, + "loss": 0.5158, + "step": 8763 + }, + { + "epoch": 1.4218040233614535, + "grad_norm": 0.6212638900423614, + "learning_rate": 2.726868745873286e-06, + "loss": 0.5536, + "step": 8764 + }, + { + "epoch": 1.421966255678131, + "grad_norm": 0.5806367543147464, + "learning_rate": 2.7264434720358786e-06, + "loss": 0.5371, + "step": 8765 + }, + { + "epoch": 1.4221284879948086, + "grad_norm": 0.5850691600001998, + "learning_rate": 2.7260181915915118e-06, + "loss": 0.5057, + "step": 8766 + }, + { + "epoch": 1.422290720311486, + "grad_norm": 0.597236592751807, + "learning_rate": 2.7255929045525936e-06, + "loss": 0.5137, + "step": 8767 + }, + { + "epoch": 1.4224529526281635, + "grad_norm": 0.5889716908188444, + "learning_rate": 2.725167610931534e-06, + "loss": 0.4951, + "step": 8768 + }, + { + "epoch": 1.422615184944841, + "grad_norm": 0.6068312677429744, + "learning_rate": 2.724742310740741e-06, + "loss": 0.5125, + "step": 8769 + }, + { + "epoch": 1.4227774172615186, + "grad_norm": 0.6337602613540981, + "learning_rate": 2.7243170039926225e-06, + "loss": 0.5165, + "step": 8770 + }, + { + "epoch": 1.422939649578196, + "grad_norm": 0.5992285308564167, + "learning_rate": 2.7238916906995895e-06, + "loss": 0.4829, + "step": 8771 + }, + { + "epoch": 1.4231018818948735, + "grad_norm": 0.593112473355465, + "learning_rate": 2.7234663708740512e-06, + "loss": 0.4889, + "step": 8772 + }, + { + "epoch": 1.423264114211551, + "grad_norm": 0.6017515895863101, + "learning_rate": 2.7230410445284154e-06, + "loss": 0.5111, + "step": 8773 + }, + { + "epoch": 1.4234263465282284, + "grad_norm": 0.638467920881662, + "learning_rate": 2.722615711675094e-06, + "loss": 0.5262, + "step": 8774 + }, + { + "epoch": 1.4235885788449059, + "grad_norm": 0.6035866998452438, + "learning_rate": 2.7221903723264962e-06, + "loss": 0.5332, + "step": 8775 + }, + { + "epoch": 1.4237508111615833, + "grad_norm": 0.5797601025688474, + "learning_rate": 2.721765026495032e-06, + "loss": 0.5177, + "step": 8776 + }, + { + "epoch": 1.4239130434782608, + "grad_norm": 0.6207042790231053, + "learning_rate": 2.721339674193112e-06, + "loss": 0.5317, + "step": 8777 + }, + { + "epoch": 1.4240752757949384, + "grad_norm": 0.6505673505676373, + "learning_rate": 2.7209143154331464e-06, + "loss": 0.5089, + "step": 8778 + }, + { + "epoch": 1.4242375081116159, + "grad_norm": 0.5979949855080768, + "learning_rate": 2.7204889502275465e-06, + "loss": 0.5274, + "step": 8779 + }, + { + "epoch": 1.4243997404282933, + "grad_norm": 0.6281177133388262, + "learning_rate": 2.720063578588722e-06, + "loss": 0.5288, + "step": 8780 + }, + { + "epoch": 1.4245619727449708, + "grad_norm": 0.6141291600550016, + "learning_rate": 2.7196382005290854e-06, + "loss": 0.5084, + "step": 8781 + }, + { + "epoch": 1.4247242050616482, + "grad_norm": 0.6030526756578475, + "learning_rate": 2.7192128160610474e-06, + "loss": 0.5541, + "step": 8782 + }, + { + "epoch": 1.4248864373783259, + "grad_norm": 0.588523974661859, + "learning_rate": 2.7187874251970196e-06, + "loss": 0.5184, + "step": 8783 + }, + { + "epoch": 1.4250486696950033, + "grad_norm": 0.6126720734602932, + "learning_rate": 2.718362027949414e-06, + "loss": 0.5301, + "step": 8784 + }, + { + "epoch": 1.4252109020116808, + "grad_norm": 0.5597000400321717, + "learning_rate": 2.717936624330641e-06, + "loss": 0.5282, + "step": 8785 + }, + { + "epoch": 1.4253731343283582, + "grad_norm": 0.5622779337242111, + "learning_rate": 2.717511214353114e-06, + "loss": 0.5359, + "step": 8786 + }, + { + "epoch": 1.4255353666450357, + "grad_norm": 0.6371554624823101, + "learning_rate": 2.7170857980292452e-06, + "loss": 0.528, + "step": 8787 + }, + { + "epoch": 1.425697598961713, + "grad_norm": 0.5843529380136079, + "learning_rate": 2.7166603753714465e-06, + "loss": 0.5135, + "step": 8788 + }, + { + "epoch": 1.4258598312783906, + "grad_norm": 0.5824597347700813, + "learning_rate": 2.716234946392131e-06, + "loss": 0.5252, + "step": 8789 + }, + { + "epoch": 1.426022063595068, + "grad_norm": 0.5783057725531976, + "learning_rate": 2.715809511103711e-06, + "loss": 0.5272, + "step": 8790 + }, + { + "epoch": 1.4261842959117457, + "grad_norm": 0.5922073739625998, + "learning_rate": 2.7153840695185985e-06, + "loss": 0.506, + "step": 8791 + }, + { + "epoch": 1.4263465282284231, + "grad_norm": 0.5825951248140876, + "learning_rate": 2.7149586216492084e-06, + "loss": 0.5152, + "step": 8792 + }, + { + "epoch": 1.4265087605451006, + "grad_norm": 0.6080316072429435, + "learning_rate": 2.714533167507954e-06, + "loss": 0.5264, + "step": 8793 + }, + { + "epoch": 1.426670992861778, + "grad_norm": 0.5955994701233284, + "learning_rate": 2.714107707107248e-06, + "loss": 0.5482, + "step": 8794 + }, + { + "epoch": 1.4268332251784557, + "grad_norm": 0.6116791745512155, + "learning_rate": 2.7136822404595043e-06, + "loss": 0.5298, + "step": 8795 + }, + { + "epoch": 1.4269954574951331, + "grad_norm": 0.6284908456896818, + "learning_rate": 2.713256767577136e-06, + "loss": 0.5156, + "step": 8796 + }, + { + "epoch": 1.4271576898118106, + "grad_norm": 0.5984906240247287, + "learning_rate": 2.712831288472559e-06, + "loss": 0.5393, + "step": 8797 + }, + { + "epoch": 1.427319922128488, + "grad_norm": 0.6114673852910458, + "learning_rate": 2.7124058031581856e-06, + "loss": 0.485, + "step": 8798 + }, + { + "epoch": 1.4274821544451655, + "grad_norm": 0.6134280327249114, + "learning_rate": 2.7119803116464317e-06, + "loss": 0.5243, + "step": 8799 + }, + { + "epoch": 1.427644386761843, + "grad_norm": 0.587451506348978, + "learning_rate": 2.7115548139497106e-06, + "loss": 0.5278, + "step": 8800 + }, + { + "epoch": 1.4278066190785204, + "grad_norm": 0.591814781840253, + "learning_rate": 2.711129310080438e-06, + "loss": 0.5145, + "step": 8801 + }, + { + "epoch": 1.4279688513951978, + "grad_norm": 0.6187666513722586, + "learning_rate": 2.710703800051029e-06, + "loss": 0.4973, + "step": 8802 + }, + { + "epoch": 1.4281310837118755, + "grad_norm": 0.5904596810488149, + "learning_rate": 2.7102782838738972e-06, + "loss": 0.5156, + "step": 8803 + }, + { + "epoch": 1.428293316028553, + "grad_norm": 0.5731797150088601, + "learning_rate": 2.70985276156146e-06, + "loss": 0.5227, + "step": 8804 + }, + { + "epoch": 1.4284555483452304, + "grad_norm": 0.5984860955556484, + "learning_rate": 2.7094272331261324e-06, + "loss": 0.5132, + "step": 8805 + }, + { + "epoch": 1.4286177806619078, + "grad_norm": 0.6082288017461502, + "learning_rate": 2.709001698580329e-06, + "loss": 0.5371, + "step": 8806 + }, + { + "epoch": 1.4287800129785853, + "grad_norm": 0.6118251023635242, + "learning_rate": 2.7085761579364665e-06, + "loss": 0.5666, + "step": 8807 + }, + { + "epoch": 1.428942245295263, + "grad_norm": 0.5996551806214917, + "learning_rate": 2.7081506112069605e-06, + "loss": 0.5472, + "step": 8808 + }, + { + "epoch": 1.4291044776119404, + "grad_norm": 0.6490167868467966, + "learning_rate": 2.7077250584042274e-06, + "loss": 0.518, + "step": 8809 + }, + { + "epoch": 1.4292667099286178, + "grad_norm": 0.5945787091357416, + "learning_rate": 2.707299499540684e-06, + "loss": 0.5282, + "step": 8810 + }, + { + "epoch": 1.4294289422452953, + "grad_norm": 0.6136841202998322, + "learning_rate": 2.7068739346287466e-06, + "loss": 0.5142, + "step": 8811 + }, + { + "epoch": 1.4295911745619727, + "grad_norm": 0.6219309584428073, + "learning_rate": 2.7064483636808314e-06, + "loss": 0.5377, + "step": 8812 + }, + { + "epoch": 1.4297534068786502, + "grad_norm": 0.6207256325322809, + "learning_rate": 2.7060227867093557e-06, + "loss": 0.5268, + "step": 8813 + }, + { + "epoch": 1.4299156391953276, + "grad_norm": 0.5757617123830885, + "learning_rate": 2.7055972037267366e-06, + "loss": 0.5164, + "step": 8814 + }, + { + "epoch": 1.4300778715120053, + "grad_norm": 0.5813611457669263, + "learning_rate": 2.705171614745392e-06, + "loss": 0.4911, + "step": 8815 + }, + { + "epoch": 1.4302401038286827, + "grad_norm": 0.5927151137388488, + "learning_rate": 2.704746019777738e-06, + "loss": 0.5265, + "step": 8816 + }, + { + "epoch": 1.4304023361453602, + "grad_norm": 0.6306958924693237, + "learning_rate": 2.7043204188361937e-06, + "loss": 0.524, + "step": 8817 + }, + { + "epoch": 1.4305645684620376, + "grad_norm": 0.634382395455546, + "learning_rate": 2.7038948119331758e-06, + "loss": 0.5402, + "step": 8818 + }, + { + "epoch": 1.430726800778715, + "grad_norm": 0.6442927368629305, + "learning_rate": 2.7034691990811028e-06, + "loss": 0.5058, + "step": 8819 + }, + { + "epoch": 1.4308890330953927, + "grad_norm": 0.5798329092868639, + "learning_rate": 2.7030435802923927e-06, + "loss": 0.5153, + "step": 8820 + }, + { + "epoch": 1.4310512654120702, + "grad_norm": 0.6367081738910955, + "learning_rate": 2.702617955579463e-06, + "loss": 0.4997, + "step": 8821 + }, + { + "epoch": 1.4312134977287476, + "grad_norm": 0.5700228487905709, + "learning_rate": 2.702192324954734e-06, + "loss": 0.5069, + "step": 8822 + }, + { + "epoch": 1.431375730045425, + "grad_norm": 0.5934169220930829, + "learning_rate": 2.701766688430623e-06, + "loss": 0.518, + "step": 8823 + }, + { + "epoch": 1.4315379623621025, + "grad_norm": 0.5771149964291767, + "learning_rate": 2.7013410460195494e-06, + "loss": 0.5013, + "step": 8824 + }, + { + "epoch": 1.43170019467878, + "grad_norm": 0.6010961669582424, + "learning_rate": 2.7009153977339313e-06, + "loss": 0.5196, + "step": 8825 + }, + { + "epoch": 1.4318624269954574, + "grad_norm": 0.6197893696558109, + "learning_rate": 2.700489743586189e-06, + "loss": 0.5624, + "step": 8826 + }, + { + "epoch": 1.4320246593121349, + "grad_norm": 0.6244080049069582, + "learning_rate": 2.7000640835887413e-06, + "loss": 0.4978, + "step": 8827 + }, + { + "epoch": 1.4321868916288125, + "grad_norm": 0.6369186480844865, + "learning_rate": 2.6996384177540087e-06, + "loss": 0.5301, + "step": 8828 + }, + { + "epoch": 1.43234912394549, + "grad_norm": 0.6063356429379336, + "learning_rate": 2.6992127460944084e-06, + "loss": 0.4973, + "step": 8829 + }, + { + "epoch": 1.4325113562621674, + "grad_norm": 0.6198389536865974, + "learning_rate": 2.6987870686223625e-06, + "loss": 0.5117, + "step": 8830 + }, + { + "epoch": 1.4326735885788449, + "grad_norm": 0.5975867631565458, + "learning_rate": 2.698361385350291e-06, + "loss": 0.5234, + "step": 8831 + }, + { + "epoch": 1.4328358208955223, + "grad_norm": 0.5829005794587352, + "learning_rate": 2.6979356962906123e-06, + "loss": 0.5233, + "step": 8832 + }, + { + "epoch": 1.4329980532122, + "grad_norm": 0.5957797324452051, + "learning_rate": 2.6975100014557492e-06, + "loss": 0.4952, + "step": 8833 + }, + { + "epoch": 1.4331602855288774, + "grad_norm": 0.6477105631917315, + "learning_rate": 2.69708430085812e-06, + "loss": 0.4871, + "step": 8834 + }, + { + "epoch": 1.4333225178455549, + "grad_norm": 0.6386123279387769, + "learning_rate": 2.696658594510147e-06, + "loss": 0.479, + "step": 8835 + }, + { + "epoch": 1.4334847501622323, + "grad_norm": 0.5931693719920577, + "learning_rate": 2.69623288242425e-06, + "loss": 0.5032, + "step": 8836 + }, + { + "epoch": 1.4336469824789098, + "grad_norm": 0.5977412483090079, + "learning_rate": 2.69580716461285e-06, + "loss": 0.4909, + "step": 8837 + }, + { + "epoch": 1.4338092147955872, + "grad_norm": 0.608461799595921, + "learning_rate": 2.69538144108837e-06, + "loss": 0.5251, + "step": 8838 + }, + { + "epoch": 1.4339714471122647, + "grad_norm": 0.6128598100548789, + "learning_rate": 2.6949557118632286e-06, + "loss": 0.4797, + "step": 8839 + }, + { + "epoch": 1.4341336794289423, + "grad_norm": 0.5908062337632037, + "learning_rate": 2.6945299769498497e-06, + "loss": 0.5032, + "step": 8840 + }, + { + "epoch": 1.4342959117456198, + "grad_norm": 0.6224112949317828, + "learning_rate": 2.694104236360654e-06, + "loss": 0.5005, + "step": 8841 + }, + { + "epoch": 1.4344581440622972, + "grad_norm": 0.5878877733456493, + "learning_rate": 2.693678490108064e-06, + "loss": 0.5057, + "step": 8842 + }, + { + "epoch": 1.4346203763789747, + "grad_norm": 0.600702187604439, + "learning_rate": 2.6932527382045e-06, + "loss": 0.5357, + "step": 8843 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 0.5638849650710855, + "learning_rate": 2.6928269806623856e-06, + "loss": 0.5239, + "step": 8844 + }, + { + "epoch": 1.4349448410123298, + "grad_norm": 0.5591078508995777, + "learning_rate": 2.6924012174941433e-06, + "loss": 0.5264, + "step": 8845 + }, + { + "epoch": 1.4351070733290072, + "grad_norm": 0.6491301698992251, + "learning_rate": 2.6919754487121956e-06, + "loss": 0.5281, + "step": 8846 + }, + { + "epoch": 1.4352693056456847, + "grad_norm": 0.5924772821902876, + "learning_rate": 2.6915496743289643e-06, + "loss": 0.5261, + "step": 8847 + }, + { + "epoch": 1.4354315379623621, + "grad_norm": 0.5618876384814769, + "learning_rate": 2.6911238943568728e-06, + "loss": 0.5303, + "step": 8848 + }, + { + "epoch": 1.4355937702790396, + "grad_norm": 0.5702790224332459, + "learning_rate": 2.6906981088083446e-06, + "loss": 0.5408, + "step": 8849 + }, + { + "epoch": 1.435756002595717, + "grad_norm": 0.5973926843595366, + "learning_rate": 2.6902723176958013e-06, + "loss": 0.5294, + "step": 8850 + }, + { + "epoch": 1.4359182349123945, + "grad_norm": 0.6403380830990136, + "learning_rate": 2.6898465210316686e-06, + "loss": 0.5278, + "step": 8851 + }, + { + "epoch": 1.436080467229072, + "grad_norm": 0.6413236962554392, + "learning_rate": 2.6894207188283677e-06, + "loss": 0.5214, + "step": 8852 + }, + { + "epoch": 1.4362426995457496, + "grad_norm": 0.61529646342333, + "learning_rate": 2.6889949110983247e-06, + "loss": 0.5261, + "step": 8853 + }, + { + "epoch": 1.436404931862427, + "grad_norm": 0.5942270773529159, + "learning_rate": 2.688569097853961e-06, + "loss": 0.5401, + "step": 8854 + }, + { + "epoch": 1.4365671641791045, + "grad_norm": 0.5787501257292809, + "learning_rate": 2.6881432791077016e-06, + "loss": 0.509, + "step": 8855 + }, + { + "epoch": 1.436729396495782, + "grad_norm": 0.5956491620718172, + "learning_rate": 2.687717454871971e-06, + "loss": 0.5082, + "step": 8856 + }, + { + "epoch": 1.4368916288124596, + "grad_norm": 0.5939396399735256, + "learning_rate": 2.687291625159193e-06, + "loss": 0.507, + "step": 8857 + }, + { + "epoch": 1.437053861129137, + "grad_norm": 0.6222730397379512, + "learning_rate": 2.686865789981793e-06, + "loss": 0.5501, + "step": 8858 + }, + { + "epoch": 1.4372160934458145, + "grad_norm": 0.5746377904621749, + "learning_rate": 2.6864399493521935e-06, + "loss": 0.5459, + "step": 8859 + }, + { + "epoch": 1.437378325762492, + "grad_norm": 0.5814575291410882, + "learning_rate": 2.686014103282821e-06, + "loss": 0.518, + "step": 8860 + }, + { + "epoch": 1.4375405580791694, + "grad_norm": 0.603005536504369, + "learning_rate": 2.685588251786101e-06, + "loss": 0.564, + "step": 8861 + }, + { + "epoch": 1.4377027903958468, + "grad_norm": 0.6170785949846357, + "learning_rate": 2.6851623948744572e-06, + "loss": 0.5125, + "step": 8862 + }, + { + "epoch": 1.4378650227125243, + "grad_norm": 0.6095068539752259, + "learning_rate": 2.6847365325603147e-06, + "loss": 0.5438, + "step": 8863 + }, + { + "epoch": 1.4380272550292017, + "grad_norm": 0.6013035612046214, + "learning_rate": 2.6843106648561013e-06, + "loss": 0.5428, + "step": 8864 + }, + { + "epoch": 1.4381894873458794, + "grad_norm": 0.6074968828074186, + "learning_rate": 2.6838847917742396e-06, + "loss": 0.5204, + "step": 8865 + }, + { + "epoch": 1.4383517196625568, + "grad_norm": 0.6234586634632374, + "learning_rate": 2.6834589133271565e-06, + "loss": 0.4978, + "step": 8866 + }, + { + "epoch": 1.4385139519792343, + "grad_norm": 0.6067613653839349, + "learning_rate": 2.6830330295272783e-06, + "loss": 0.5505, + "step": 8867 + }, + { + "epoch": 1.4386761842959117, + "grad_norm": 0.6363956803467834, + "learning_rate": 2.68260714038703e-06, + "loss": 0.509, + "step": 8868 + }, + { + "epoch": 1.4388384166125892, + "grad_norm": 0.5831571947344859, + "learning_rate": 2.68218124591884e-06, + "loss": 0.5446, + "step": 8869 + }, + { + "epoch": 1.4390006489292668, + "grad_norm": 0.5811809214665656, + "learning_rate": 2.6817553461351326e-06, + "loss": 0.5352, + "step": 8870 + }, + { + "epoch": 1.4391628812459443, + "grad_norm": 0.5854645507978856, + "learning_rate": 2.6813294410483355e-06, + "loss": 0.5179, + "step": 8871 + }, + { + "epoch": 1.4393251135626217, + "grad_norm": 0.6101575271014977, + "learning_rate": 2.680903530670873e-06, + "loss": 0.4872, + "step": 8872 + }, + { + "epoch": 1.4394873458792992, + "grad_norm": 0.6030600582401171, + "learning_rate": 2.6804776150151752e-06, + "loss": 0.5049, + "step": 8873 + }, + { + "epoch": 1.4396495781959766, + "grad_norm": 0.6244849996094057, + "learning_rate": 2.6800516940936673e-06, + "loss": 0.5379, + "step": 8874 + }, + { + "epoch": 1.439811810512654, + "grad_norm": 0.5926262464885299, + "learning_rate": 2.6796257679187763e-06, + "loss": 0.5022, + "step": 8875 + }, + { + "epoch": 1.4399740428293315, + "grad_norm": 0.6114454809555389, + "learning_rate": 2.6791998365029304e-06, + "loss": 0.4934, + "step": 8876 + }, + { + "epoch": 1.440136275146009, + "grad_norm": 0.6149767982071206, + "learning_rate": 2.6787738998585555e-06, + "loss": 0.5013, + "step": 8877 + }, + { + "epoch": 1.4402985074626866, + "grad_norm": 0.5942605775411175, + "learning_rate": 2.678347957998081e-06, + "loss": 0.5106, + "step": 8878 + }, + { + "epoch": 1.440460739779364, + "grad_norm": 0.5996830240367871, + "learning_rate": 2.6779220109339337e-06, + "loss": 0.5107, + "step": 8879 + }, + { + "epoch": 1.4406229720960415, + "grad_norm": 0.6237973978418839, + "learning_rate": 2.677496058678542e-06, + "loss": 0.5281, + "step": 8880 + }, + { + "epoch": 1.440785204412719, + "grad_norm": 0.5859886049158837, + "learning_rate": 2.6770701012443335e-06, + "loss": 0.5168, + "step": 8881 + }, + { + "epoch": 1.4409474367293966, + "grad_norm": 0.6064943208568921, + "learning_rate": 2.6766441386437363e-06, + "loss": 0.5427, + "step": 8882 + }, + { + "epoch": 1.441109669046074, + "grad_norm": 0.6156744744605733, + "learning_rate": 2.6762181708891783e-06, + "loss": 0.5176, + "step": 8883 + }, + { + "epoch": 1.4412719013627515, + "grad_norm": 0.6514937379229113, + "learning_rate": 2.6757921979930888e-06, + "loss": 0.5191, + "step": 8884 + }, + { + "epoch": 1.441434133679429, + "grad_norm": 0.6298647297678233, + "learning_rate": 2.675366219967897e-06, + "loss": 0.4937, + "step": 8885 + }, + { + "epoch": 1.4415963659961064, + "grad_norm": 0.6229964608776335, + "learning_rate": 2.67494023682603e-06, + "loss": 0.4915, + "step": 8886 + }, + { + "epoch": 1.4417585983127839, + "grad_norm": 0.5988825804455616, + "learning_rate": 2.674514248579919e-06, + "loss": 0.505, + "step": 8887 + }, + { + "epoch": 1.4419208306294613, + "grad_norm": 0.5607104843323393, + "learning_rate": 2.6740882552419908e-06, + "loss": 0.4963, + "step": 8888 + }, + { + "epoch": 1.4420830629461387, + "grad_norm": 0.5807004453161598, + "learning_rate": 2.673662256824676e-06, + "loss": 0.5361, + "step": 8889 + }, + { + "epoch": 1.4422452952628164, + "grad_norm": 0.6202124240390985, + "learning_rate": 2.6732362533404033e-06, + "loss": 0.531, + "step": 8890 + }, + { + "epoch": 1.4424075275794939, + "grad_norm": 0.5906032412877072, + "learning_rate": 2.672810244801603e-06, + "loss": 0.529, + "step": 8891 + }, + { + "epoch": 1.4425697598961713, + "grad_norm": 0.600641728958167, + "learning_rate": 2.6723842312207043e-06, + "loss": 0.5135, + "step": 8892 + }, + { + "epoch": 1.4427319922128488, + "grad_norm": 0.7073157672167055, + "learning_rate": 2.671958212610137e-06, + "loss": 0.5378, + "step": 8893 + }, + { + "epoch": 1.4428942245295262, + "grad_norm": 0.6028484389099806, + "learning_rate": 2.671532188982331e-06, + "loss": 0.4943, + "step": 8894 + }, + { + "epoch": 1.4430564568462039, + "grad_norm": 0.586723063551742, + "learning_rate": 2.6711061603497162e-06, + "loss": 0.5405, + "step": 8895 + }, + { + "epoch": 1.4432186891628813, + "grad_norm": 0.641127383666308, + "learning_rate": 2.6706801267247233e-06, + "loss": 0.5302, + "step": 8896 + }, + { + "epoch": 1.4433809214795588, + "grad_norm": 0.6152588481745999, + "learning_rate": 2.6702540881197836e-06, + "loss": 0.5549, + "step": 8897 + }, + { + "epoch": 1.4435431537962362, + "grad_norm": 0.5722641704257759, + "learning_rate": 2.6698280445473264e-06, + "loss": 0.5182, + "step": 8898 + }, + { + "epoch": 1.4437053861129137, + "grad_norm": 0.6404777967677853, + "learning_rate": 2.6694019960197827e-06, + "loss": 0.5335, + "step": 8899 + }, + { + "epoch": 1.443867618429591, + "grad_norm": 0.6240441984127836, + "learning_rate": 2.6689759425495833e-06, + "loss": 0.5085, + "step": 8900 + }, + { + "epoch": 1.4440298507462686, + "grad_norm": 0.5793799942216948, + "learning_rate": 2.6685498841491587e-06, + "loss": 0.5498, + "step": 8901 + }, + { + "epoch": 1.4441920830629462, + "grad_norm": 0.6048721530433119, + "learning_rate": 2.6681238208309405e-06, + "loss": 0.5145, + "step": 8902 + }, + { + "epoch": 1.4443543153796237, + "grad_norm": 0.6020146077170503, + "learning_rate": 2.667697752607361e-06, + "loss": 0.5085, + "step": 8903 + }, + { + "epoch": 1.4445165476963011, + "grad_norm": 0.5580004868005998, + "learning_rate": 2.6672716794908502e-06, + "loss": 0.5061, + "step": 8904 + }, + { + "epoch": 1.4446787800129786, + "grad_norm": 0.6134359122005101, + "learning_rate": 2.666845601493841e-06, + "loss": 0.528, + "step": 8905 + }, + { + "epoch": 1.444841012329656, + "grad_norm": 0.6352095846719373, + "learning_rate": 2.6664195186287634e-06, + "loss": 0.4715, + "step": 8906 + }, + { + "epoch": 1.4450032446463337, + "grad_norm": 0.5939431629281776, + "learning_rate": 2.665993430908051e-06, + "loss": 0.5414, + "step": 8907 + }, + { + "epoch": 1.4451654769630111, + "grad_norm": 0.5832283383061421, + "learning_rate": 2.6655673383441345e-06, + "loss": 0.5039, + "step": 8908 + }, + { + "epoch": 1.4453277092796886, + "grad_norm": 0.5943805917061403, + "learning_rate": 2.6651412409494466e-06, + "loss": 0.5184, + "step": 8909 + }, + { + "epoch": 1.445489941596366, + "grad_norm": 0.6481269330881959, + "learning_rate": 2.664715138736419e-06, + "loss": 0.5072, + "step": 8910 + }, + { + "epoch": 1.4456521739130435, + "grad_norm": 0.6142726083107636, + "learning_rate": 2.6642890317174854e-06, + "loss": 0.4943, + "step": 8911 + }, + { + "epoch": 1.445814406229721, + "grad_norm": 0.5784891843978001, + "learning_rate": 2.663862919905077e-06, + "loss": 0.5064, + "step": 8912 + }, + { + "epoch": 1.4459766385463984, + "grad_norm": 0.5975028400828379, + "learning_rate": 2.663436803311627e-06, + "loss": 0.4758, + "step": 8913 + }, + { + "epoch": 1.4461388708630758, + "grad_norm": 0.5730130705370785, + "learning_rate": 2.6630106819495687e-06, + "loss": 0.4788, + "step": 8914 + }, + { + "epoch": 1.4463011031797535, + "grad_norm": 0.6069529748964831, + "learning_rate": 2.6625845558313346e-06, + "loss": 0.5204, + "step": 8915 + }, + { + "epoch": 1.446463335496431, + "grad_norm": 0.6015322180943744, + "learning_rate": 2.6621584249693577e-06, + "loss": 0.5383, + "step": 8916 + }, + { + "epoch": 1.4466255678131084, + "grad_norm": 0.6135087470695193, + "learning_rate": 2.6617322893760716e-06, + "loss": 0.5251, + "step": 8917 + }, + { + "epoch": 1.4467878001297858, + "grad_norm": 0.5647757691154369, + "learning_rate": 2.66130614906391e-06, + "loss": 0.4755, + "step": 8918 + }, + { + "epoch": 1.4469500324464633, + "grad_norm": 0.5963710334132164, + "learning_rate": 2.660880004045305e-06, + "loss": 0.5155, + "step": 8919 + }, + { + "epoch": 1.447112264763141, + "grad_norm": 0.5950919130595917, + "learning_rate": 2.660453854332691e-06, + "loss": 0.5024, + "step": 8920 + }, + { + "epoch": 1.4472744970798184, + "grad_norm": 0.6089331425597868, + "learning_rate": 2.6600276999385033e-06, + "loss": 0.5326, + "step": 8921 + }, + { + "epoch": 1.4474367293964958, + "grad_norm": 0.6307109745747873, + "learning_rate": 2.659601540875174e-06, + "loss": 0.5609, + "step": 8922 + }, + { + "epoch": 1.4475989617131733, + "grad_norm": 0.5938605163015764, + "learning_rate": 2.6591753771551377e-06, + "loss": 0.5086, + "step": 8923 + }, + { + "epoch": 1.4477611940298507, + "grad_norm": 0.589329610195793, + "learning_rate": 2.6587492087908283e-06, + "loss": 0.5142, + "step": 8924 + }, + { + "epoch": 1.4479234263465282, + "grad_norm": 0.5921607540536784, + "learning_rate": 2.6583230357946815e-06, + "loss": 0.5349, + "step": 8925 + }, + { + "epoch": 1.4480856586632056, + "grad_norm": 0.6219622302162027, + "learning_rate": 2.65789685817913e-06, + "loss": 0.4875, + "step": 8926 + }, + { + "epoch": 1.4482478909798833, + "grad_norm": 0.6171752206271681, + "learning_rate": 2.6574706759566098e-06, + "loss": 0.515, + "step": 8927 + }, + { + "epoch": 1.4484101232965607, + "grad_norm": 0.5898283362782458, + "learning_rate": 2.6570444891395547e-06, + "loss": 0.5368, + "step": 8928 + }, + { + "epoch": 1.4485723556132382, + "grad_norm": 0.6449941459159585, + "learning_rate": 2.6566182977404e-06, + "loss": 0.5093, + "step": 8929 + }, + { + "epoch": 1.4487345879299156, + "grad_norm": 0.6085289429098721, + "learning_rate": 2.6561921017715806e-06, + "loss": 0.5324, + "step": 8930 + }, + { + "epoch": 1.448896820246593, + "grad_norm": 0.616956019882692, + "learning_rate": 2.6557659012455315e-06, + "loss": 0.5254, + "step": 8931 + }, + { + "epoch": 1.4490590525632707, + "grad_norm": 0.6094448828658822, + "learning_rate": 2.655339696174689e-06, + "loss": 0.5249, + "step": 8932 + }, + { + "epoch": 1.4492212848799482, + "grad_norm": 0.5893958524826151, + "learning_rate": 2.654913486571487e-06, + "loss": 0.5614, + "step": 8933 + }, + { + "epoch": 1.4493835171966256, + "grad_norm": 0.6295205453105612, + "learning_rate": 2.6544872724483622e-06, + "loss": 0.5291, + "step": 8934 + }, + { + "epoch": 1.449545749513303, + "grad_norm": 0.6031003625248335, + "learning_rate": 2.6540610538177495e-06, + "loss": 0.5326, + "step": 8935 + }, + { + "epoch": 1.4497079818299805, + "grad_norm": 0.5994302878012017, + "learning_rate": 2.6536348306920854e-06, + "loss": 0.5402, + "step": 8936 + }, + { + "epoch": 1.449870214146658, + "grad_norm": 0.5760671328657323, + "learning_rate": 2.653208603083805e-06, + "loss": 0.523, + "step": 8937 + }, + { + "epoch": 1.4500324464633354, + "grad_norm": 0.6061729837406852, + "learning_rate": 2.6527823710053464e-06, + "loss": 0.5061, + "step": 8938 + }, + { + "epoch": 1.4501946787800128, + "grad_norm": 0.6078848452426212, + "learning_rate": 2.652356134469143e-06, + "loss": 0.5455, + "step": 8939 + }, + { + "epoch": 1.4503569110966905, + "grad_norm": 0.665219134002755, + "learning_rate": 2.6519298934876325e-06, + "loss": 0.5379, + "step": 8940 + }, + { + "epoch": 1.450519143413368, + "grad_norm": 0.6392742600750732, + "learning_rate": 2.6515036480732515e-06, + "loss": 0.5369, + "step": 8941 + }, + { + "epoch": 1.4506813757300454, + "grad_norm": 0.5786728566189663, + "learning_rate": 2.6510773982384363e-06, + "loss": 0.5218, + "step": 8942 + }, + { + "epoch": 1.4508436080467229, + "grad_norm": 0.6077714402732225, + "learning_rate": 2.6506511439956243e-06, + "loss": 0.5039, + "step": 8943 + }, + { + "epoch": 1.4510058403634005, + "grad_norm": 0.5849193958656719, + "learning_rate": 2.650224885357251e-06, + "loss": 0.4757, + "step": 8944 + }, + { + "epoch": 1.451168072680078, + "grad_norm": 0.6111674579515805, + "learning_rate": 2.6497986223357553e-06, + "loss": 0.5208, + "step": 8945 + }, + { + "epoch": 1.4513303049967554, + "grad_norm": 0.6178941494425642, + "learning_rate": 2.6493723549435726e-06, + "loss": 0.489, + "step": 8946 + }, + { + "epoch": 1.4514925373134329, + "grad_norm": 0.619042879744563, + "learning_rate": 2.64894608319314e-06, + "loss": 0.5119, + "step": 8947 + }, + { + "epoch": 1.4516547696301103, + "grad_norm": 0.6185357915033878, + "learning_rate": 2.648519807096897e-06, + "loss": 0.5419, + "step": 8948 + }, + { + "epoch": 1.4518170019467878, + "grad_norm": 0.6021792480800776, + "learning_rate": 2.648093526667278e-06, + "loss": 0.5237, + "step": 8949 + }, + { + "epoch": 1.4519792342634652, + "grad_norm": 0.6019799014999136, + "learning_rate": 2.6476672419167244e-06, + "loss": 0.5345, + "step": 8950 + }, + { + "epoch": 1.4521414665801426, + "grad_norm": 0.6164870090276632, + "learning_rate": 2.6472409528576706e-06, + "loss": 0.5633, + "step": 8951 + }, + { + "epoch": 1.4523036988968203, + "grad_norm": 0.6224267367784171, + "learning_rate": 2.646814659502556e-06, + "loss": 0.5256, + "step": 8952 + }, + { + "epoch": 1.4524659312134978, + "grad_norm": 0.6185952052217875, + "learning_rate": 2.6463883618638175e-06, + "loss": 0.5385, + "step": 8953 + }, + { + "epoch": 1.4526281635301752, + "grad_norm": 0.6031546400214445, + "learning_rate": 2.645962059953895e-06, + "loss": 0.5143, + "step": 8954 + }, + { + "epoch": 1.4527903958468527, + "grad_norm": 0.5683597527376808, + "learning_rate": 2.6455357537852257e-06, + "loss": 0.5489, + "step": 8955 + }, + { + "epoch": 1.45295262816353, + "grad_norm": 0.5623194895381192, + "learning_rate": 2.6451094433702478e-06, + "loss": 0.5164, + "step": 8956 + }, + { + "epoch": 1.4531148604802078, + "grad_norm": 0.5993532389831859, + "learning_rate": 2.6446831287214e-06, + "loss": 0.511, + "step": 8957 + }, + { + "epoch": 1.4532770927968852, + "grad_norm": 0.6102071488136626, + "learning_rate": 2.6442568098511205e-06, + "loss": 0.5276, + "step": 8958 + }, + { + "epoch": 1.4534393251135627, + "grad_norm": 0.58388934527439, + "learning_rate": 2.6438304867718497e-06, + "loss": 0.5205, + "step": 8959 + }, + { + "epoch": 1.45360155743024, + "grad_norm": 0.6260628947531487, + "learning_rate": 2.6434041594960237e-06, + "loss": 0.5032, + "step": 8960 + }, + { + "epoch": 1.4537637897469176, + "grad_norm": 0.5981570271919205, + "learning_rate": 2.6429778280360844e-06, + "loss": 0.4979, + "step": 8961 + }, + { + "epoch": 1.453926022063595, + "grad_norm": 0.5994815794420771, + "learning_rate": 2.6425514924044685e-06, + "loss": 0.5196, + "step": 8962 + }, + { + "epoch": 1.4540882543802724, + "grad_norm": 0.5948180913469852, + "learning_rate": 2.642125152613618e-06, + "loss": 0.5171, + "step": 8963 + }, + { + "epoch": 1.45425048669695, + "grad_norm": 0.5744602753876062, + "learning_rate": 2.641698808675969e-06, + "loss": 0.5346, + "step": 8964 + }, + { + "epoch": 1.4544127190136276, + "grad_norm": 0.5960998292571071, + "learning_rate": 2.641272460603963e-06, + "loss": 0.5259, + "step": 8965 + }, + { + "epoch": 1.454574951330305, + "grad_norm": 0.6223925050971614, + "learning_rate": 2.640846108410039e-06, + "loss": 0.5234, + "step": 8966 + }, + { + "epoch": 1.4547371836469825, + "grad_norm": 0.6125827855756464, + "learning_rate": 2.640419752106637e-06, + "loss": 0.5074, + "step": 8967 + }, + { + "epoch": 1.45489941596366, + "grad_norm": 0.6099028308567668, + "learning_rate": 2.6399933917061963e-06, + "loss": 0.5396, + "step": 8968 + }, + { + "epoch": 1.4550616482803376, + "grad_norm": 0.5868470852290153, + "learning_rate": 2.639567027221158e-06, + "loss": 0.5, + "step": 8969 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.5793080708103057, + "learning_rate": 2.63914065866396e-06, + "loss": 0.5231, + "step": 8970 + }, + { + "epoch": 1.4553861129136925, + "grad_norm": 0.5982993788140827, + "learning_rate": 2.638714286047045e-06, + "loss": 0.5352, + "step": 8971 + }, + { + "epoch": 1.45554834523037, + "grad_norm": 0.6162835856585201, + "learning_rate": 2.6382879093828516e-06, + "loss": 0.5217, + "step": 8972 + }, + { + "epoch": 1.4557105775470474, + "grad_norm": 0.6026660796252534, + "learning_rate": 2.6378615286838215e-06, + "loss": 0.5152, + "step": 8973 + }, + { + "epoch": 1.4558728098637248, + "grad_norm": 0.5892748877256679, + "learning_rate": 2.6374351439623942e-06, + "loss": 0.5341, + "step": 8974 + }, + { + "epoch": 1.4560350421804023, + "grad_norm": 0.6105484455509332, + "learning_rate": 2.6370087552310096e-06, + "loss": 0.5407, + "step": 8975 + }, + { + "epoch": 1.4561972744970797, + "grad_norm": 0.6029347302759896, + "learning_rate": 2.636582362502111e-06, + "loss": 0.5082, + "step": 8976 + }, + { + "epoch": 1.4563595068137574, + "grad_norm": 0.5789626557260389, + "learning_rate": 2.6361559657881367e-06, + "loss": 0.5244, + "step": 8977 + }, + { + "epoch": 1.4565217391304348, + "grad_norm": 0.6234661928573865, + "learning_rate": 2.63572956510153e-06, + "loss": 0.5177, + "step": 8978 + }, + { + "epoch": 1.4566839714471123, + "grad_norm": 0.5692426392077267, + "learning_rate": 2.635303160454731e-06, + "loss": 0.5267, + "step": 8979 + }, + { + "epoch": 1.4568462037637897, + "grad_norm": 0.5721475185203558, + "learning_rate": 2.6348767518601793e-06, + "loss": 0.5071, + "step": 8980 + }, + { + "epoch": 1.4570084360804672, + "grad_norm": 0.6076667956372839, + "learning_rate": 2.6344503393303194e-06, + "loss": 0.4843, + "step": 8981 + }, + { + "epoch": 1.4571706683971448, + "grad_norm": 0.6158790013951345, + "learning_rate": 2.6340239228775904e-06, + "loss": 0.5192, + "step": 8982 + }, + { + "epoch": 1.4573329007138223, + "grad_norm": 0.551772973554965, + "learning_rate": 2.633597502514435e-06, + "loss": 0.5018, + "step": 8983 + }, + { + "epoch": 1.4574951330304997, + "grad_norm": 0.6034227006084176, + "learning_rate": 2.633171078253294e-06, + "loss": 0.5179, + "step": 8984 + }, + { + "epoch": 1.4576573653471772, + "grad_norm": 0.6026391435351823, + "learning_rate": 2.6327446501066107e-06, + "loss": 0.5115, + "step": 8985 + }, + { + "epoch": 1.4578195976638546, + "grad_norm": 0.6062706667677763, + "learning_rate": 2.6323182180868255e-06, + "loss": 0.5143, + "step": 8986 + }, + { + "epoch": 1.457981829980532, + "grad_norm": 0.6055153839225441, + "learning_rate": 2.6318917822063806e-06, + "loss": 0.5135, + "step": 8987 + }, + { + "epoch": 1.4581440622972095, + "grad_norm": 0.5551162895513265, + "learning_rate": 2.6314653424777194e-06, + "loss": 0.5172, + "step": 8988 + }, + { + "epoch": 1.4583062946138872, + "grad_norm": 0.5830614313815545, + "learning_rate": 2.631038898913283e-06, + "loss": 0.5163, + "step": 8989 + }, + { + "epoch": 1.4584685269305646, + "grad_norm": 0.5879160835292095, + "learning_rate": 2.6306124515255143e-06, + "loss": 0.5201, + "step": 8990 + }, + { + "epoch": 1.458630759247242, + "grad_norm": 0.6256964197504725, + "learning_rate": 2.630186000326856e-06, + "loss": 0.5341, + "step": 8991 + }, + { + "epoch": 1.4587929915639195, + "grad_norm": 0.5965991729097911, + "learning_rate": 2.62975954532975e-06, + "loss": 0.5001, + "step": 8992 + }, + { + "epoch": 1.458955223880597, + "grad_norm": 0.6430481950230992, + "learning_rate": 2.6293330865466386e-06, + "loss": 0.5315, + "step": 8993 + }, + { + "epoch": 1.4591174561972746, + "grad_norm": 0.6046392600192, + "learning_rate": 2.6289066239899663e-06, + "loss": 0.4977, + "step": 8994 + }, + { + "epoch": 1.459279688513952, + "grad_norm": 0.6170122311802323, + "learning_rate": 2.628480157672175e-06, + "loss": 0.5129, + "step": 8995 + }, + { + "epoch": 1.4594419208306295, + "grad_norm": 0.6230203216010265, + "learning_rate": 2.628053687605708e-06, + "loss": 0.5151, + "step": 8996 + }, + { + "epoch": 1.459604153147307, + "grad_norm": 0.6127961758151282, + "learning_rate": 2.627627213803008e-06, + "loss": 0.5237, + "step": 8997 + }, + { + "epoch": 1.4597663854639844, + "grad_norm": 0.6039668060475171, + "learning_rate": 2.627200736276518e-06, + "loss": 0.5292, + "step": 8998 + }, + { + "epoch": 1.4599286177806619, + "grad_norm": 0.5907173398610664, + "learning_rate": 2.626774255038683e-06, + "loss": 0.529, + "step": 8999 + }, + { + "epoch": 1.4600908500973393, + "grad_norm": 0.6137626506497487, + "learning_rate": 2.6263477701019447e-06, + "loss": 0.5488, + "step": 9000 + }, + { + "epoch": 1.4602530824140167, + "grad_norm": 0.6042672567397459, + "learning_rate": 2.625921281478748e-06, + "loss": 0.5173, + "step": 9001 + }, + { + "epoch": 1.4604153147306944, + "grad_norm": 0.5883150888554457, + "learning_rate": 2.6254947891815354e-06, + "loss": 0.5006, + "step": 9002 + }, + { + "epoch": 1.4605775470473719, + "grad_norm": 0.6264518586963895, + "learning_rate": 2.625068293222751e-06, + "loss": 0.5229, + "step": 9003 + }, + { + "epoch": 1.4607397793640493, + "grad_norm": 0.6325704388078683, + "learning_rate": 2.6246417936148397e-06, + "loss": 0.5237, + "step": 9004 + }, + { + "epoch": 1.4609020116807268, + "grad_norm": 0.5859918179587726, + "learning_rate": 2.6242152903702444e-06, + "loss": 0.518, + "step": 9005 + }, + { + "epoch": 1.4610642439974044, + "grad_norm": 0.6446663731733274, + "learning_rate": 2.62378878350141e-06, + "loss": 0.5138, + "step": 9006 + }, + { + "epoch": 1.4612264763140819, + "grad_norm": 0.6100605279037928, + "learning_rate": 2.62336227302078e-06, + "loss": 0.5137, + "step": 9007 + }, + { + "epoch": 1.4613887086307593, + "grad_norm": 0.6060245897643872, + "learning_rate": 2.6229357589407993e-06, + "loss": 0.5119, + "step": 9008 + }, + { + "epoch": 1.4615509409474368, + "grad_norm": 0.5883103906147389, + "learning_rate": 2.622509241273912e-06, + "loss": 0.4992, + "step": 9009 + }, + { + "epoch": 1.4617131732641142, + "grad_norm": 0.5894329630724723, + "learning_rate": 2.6220827200325628e-06, + "loss": 0.5114, + "step": 9010 + }, + { + "epoch": 1.4618754055807917, + "grad_norm": 0.6104196162243748, + "learning_rate": 2.621656195229196e-06, + "loss": 0.5031, + "step": 9011 + }, + { + "epoch": 1.462037637897469, + "grad_norm": 0.6458474344962067, + "learning_rate": 2.621229666876257e-06, + "loss": 0.5071, + "step": 9012 + }, + { + "epoch": 1.4621998702141465, + "grad_norm": 0.6249611580170976, + "learning_rate": 2.62080313498619e-06, + "loss": 0.5203, + "step": 9013 + }, + { + "epoch": 1.4623621025308242, + "grad_norm": 0.5868850312814575, + "learning_rate": 2.620376599571441e-06, + "loss": 0.508, + "step": 9014 + }, + { + "epoch": 1.4625243348475017, + "grad_norm": 0.6266442225934207, + "learning_rate": 2.619950060644454e-06, + "loss": 0.5533, + "step": 9015 + }, + { + "epoch": 1.462686567164179, + "grad_norm": 0.5842877184559975, + "learning_rate": 2.6195235182176735e-06, + "loss": 0.5583, + "step": 9016 + }, + { + "epoch": 1.4628487994808566, + "grad_norm": 0.5996202887851826, + "learning_rate": 2.619096972303547e-06, + "loss": 0.5011, + "step": 9017 + }, + { + "epoch": 1.463011031797534, + "grad_norm": 0.6122829352390847, + "learning_rate": 2.618670422914518e-06, + "loss": 0.5515, + "step": 9018 + }, + { + "epoch": 1.4631732641142117, + "grad_norm": 0.5894993552806252, + "learning_rate": 2.618243870063033e-06, + "loss": 0.5144, + "step": 9019 + }, + { + "epoch": 1.4633354964308891, + "grad_norm": 0.613796787171134, + "learning_rate": 2.6178173137615366e-06, + "loss": 0.5316, + "step": 9020 + }, + { + "epoch": 1.4634977287475666, + "grad_norm": 0.6260359226165181, + "learning_rate": 2.6173907540224746e-06, + "loss": 0.5037, + "step": 9021 + }, + { + "epoch": 1.463659961064244, + "grad_norm": 0.590570321955733, + "learning_rate": 2.6169641908582943e-06, + "loss": 0.5119, + "step": 9022 + }, + { + "epoch": 1.4638221933809215, + "grad_norm": 0.5837511003917922, + "learning_rate": 2.6165376242814394e-06, + "loss": 0.5035, + "step": 9023 + }, + { + "epoch": 1.463984425697599, + "grad_norm": 0.6088209393056966, + "learning_rate": 2.616111054304358e-06, + "loss": 0.5502, + "step": 9024 + }, + { + "epoch": 1.4641466580142763, + "grad_norm": 0.6192397795070579, + "learning_rate": 2.6156844809394936e-06, + "loss": 0.5171, + "step": 9025 + }, + { + "epoch": 1.4643088903309538, + "grad_norm": 0.6006750440057612, + "learning_rate": 2.615257904199295e-06, + "loss": 0.5211, + "step": 9026 + }, + { + "epoch": 1.4644711226476315, + "grad_norm": 0.5533446894550322, + "learning_rate": 2.614831324096207e-06, + "loss": 0.4905, + "step": 9027 + }, + { + "epoch": 1.464633354964309, + "grad_norm": 0.5889062726682261, + "learning_rate": 2.6144047406426767e-06, + "loss": 0.5486, + "step": 9028 + }, + { + "epoch": 1.4647955872809864, + "grad_norm": 0.5981562582374346, + "learning_rate": 2.613978153851149e-06, + "loss": 0.5677, + "step": 9029 + }, + { + "epoch": 1.4649578195976638, + "grad_norm": 0.5793174483157632, + "learning_rate": 2.613551563734072e-06, + "loss": 0.4784, + "step": 9030 + }, + { + "epoch": 1.4651200519143415, + "grad_norm": 0.5824000197762894, + "learning_rate": 2.6131249703038923e-06, + "loss": 0.5343, + "step": 9031 + }, + { + "epoch": 1.465282284231019, + "grad_norm": 0.5823590203792299, + "learning_rate": 2.612698373573056e-06, + "loss": 0.5381, + "step": 9032 + }, + { + "epoch": 1.4654445165476964, + "grad_norm": 0.6468202888097714, + "learning_rate": 2.6122717735540103e-06, + "loss": 0.5326, + "step": 9033 + }, + { + "epoch": 1.4656067488643738, + "grad_norm": 0.6223463231736188, + "learning_rate": 2.6118451702592017e-06, + "loss": 0.5052, + "step": 9034 + }, + { + "epoch": 1.4657689811810513, + "grad_norm": 0.6211659173084492, + "learning_rate": 2.6114185637010792e-06, + "loss": 0.5067, + "step": 9035 + }, + { + "epoch": 1.4659312134977287, + "grad_norm": 0.5828655761831867, + "learning_rate": 2.610991953892086e-06, + "loss": 0.4994, + "step": 9036 + }, + { + "epoch": 1.4660934458144061, + "grad_norm": 0.6262746526754595, + "learning_rate": 2.610565340844674e-06, + "loss": 0.5002, + "step": 9037 + }, + { + "epoch": 1.4662556781310836, + "grad_norm": 0.6088320691645163, + "learning_rate": 2.6101387245712865e-06, + "loss": 0.5346, + "step": 9038 + }, + { + "epoch": 1.4664179104477613, + "grad_norm": 0.6219891942566621, + "learning_rate": 2.6097121050843733e-06, + "loss": 0.5385, + "step": 9039 + }, + { + "epoch": 1.4665801427644387, + "grad_norm": 0.6489501751252259, + "learning_rate": 2.6092854823963807e-06, + "loss": 0.5036, + "step": 9040 + }, + { + "epoch": 1.4667423750811162, + "grad_norm": 0.5920977731264865, + "learning_rate": 2.608858856519757e-06, + "loss": 0.569, + "step": 9041 + }, + { + "epoch": 1.4669046073977936, + "grad_norm": 0.5682545581584275, + "learning_rate": 2.6084322274669506e-06, + "loss": 0.4782, + "step": 9042 + }, + { + "epoch": 1.467066839714471, + "grad_norm": 0.6256742897326926, + "learning_rate": 2.6080055952504074e-06, + "loss": 0.5162, + "step": 9043 + }, + { + "epoch": 1.4672290720311487, + "grad_norm": 0.6227946256363104, + "learning_rate": 2.607578959882577e-06, + "loss": 0.5171, + "step": 9044 + }, + { + "epoch": 1.4673913043478262, + "grad_norm": 0.5971365198789251, + "learning_rate": 2.607152321375906e-06, + "loss": 0.5222, + "step": 9045 + }, + { + "epoch": 1.4675535366645036, + "grad_norm": 0.6036789305333292, + "learning_rate": 2.6067256797428437e-06, + "loss": 0.5006, + "step": 9046 + }, + { + "epoch": 1.467715768981181, + "grad_norm": 0.6407525955997978, + "learning_rate": 2.606299034995837e-06, + "loss": 0.5212, + "step": 9047 + }, + { + "epoch": 1.4678780012978585, + "grad_norm": 0.5930948357357391, + "learning_rate": 2.6058723871473357e-06, + "loss": 0.4976, + "step": 9048 + }, + { + "epoch": 1.468040233614536, + "grad_norm": 0.5931900014386401, + "learning_rate": 2.6054457362097863e-06, + "loss": 0.5247, + "step": 9049 + }, + { + "epoch": 1.4682024659312134, + "grad_norm": 0.6236672545334495, + "learning_rate": 2.605019082195639e-06, + "loss": 0.5343, + "step": 9050 + }, + { + "epoch": 1.4683646982478908, + "grad_norm": 0.6065255433689493, + "learning_rate": 2.604592425117341e-06, + "loss": 0.5361, + "step": 9051 + }, + { + "epoch": 1.4685269305645685, + "grad_norm": 0.5597665254622963, + "learning_rate": 2.6041657649873413e-06, + "loss": 0.5288, + "step": 9052 + }, + { + "epoch": 1.468689162881246, + "grad_norm": 0.5999465178124821, + "learning_rate": 2.60373910181809e-06, + "loss": 0.5259, + "step": 9053 + }, + { + "epoch": 1.4688513951979234, + "grad_norm": 0.6138732603763704, + "learning_rate": 2.603312435622033e-06, + "loss": 0.5021, + "step": 9054 + }, + { + "epoch": 1.4690136275146009, + "grad_norm": 0.580171815899423, + "learning_rate": 2.6028857664116224e-06, + "loss": 0.5104, + "step": 9055 + }, + { + "epoch": 1.4691758598312785, + "grad_norm": 0.608107146270005, + "learning_rate": 2.602459094199304e-06, + "loss": 0.5379, + "step": 9056 + }, + { + "epoch": 1.469338092147956, + "grad_norm": 0.6130411784583162, + "learning_rate": 2.6020324189975294e-06, + "loss": 0.5081, + "step": 9057 + }, + { + "epoch": 1.4695003244646334, + "grad_norm": 0.623947213972028, + "learning_rate": 2.601605740818746e-06, + "loss": 0.4994, + "step": 9058 + }, + { + "epoch": 1.4696625567813109, + "grad_norm": 0.6284896613704257, + "learning_rate": 2.601179059675404e-06, + "loss": 0.5149, + "step": 9059 + }, + { + "epoch": 1.4698247890979883, + "grad_norm": 0.610832498366593, + "learning_rate": 2.6007523755799536e-06, + "loss": 0.5578, + "step": 9060 + }, + { + "epoch": 1.4699870214146658, + "grad_norm": 0.6042595395488345, + "learning_rate": 2.600325688544841e-06, + "loss": 0.5039, + "step": 9061 + }, + { + "epoch": 1.4701492537313432, + "grad_norm": 0.6029663503777459, + "learning_rate": 2.5998989985825196e-06, + "loss": 0.5146, + "step": 9062 + }, + { + "epoch": 1.4703114860480206, + "grad_norm": 0.6208411516523779, + "learning_rate": 2.5994723057054362e-06, + "loss": 0.5063, + "step": 9063 + }, + { + "epoch": 1.4704737183646983, + "grad_norm": 0.588619609958395, + "learning_rate": 2.599045609926042e-06, + "loss": 0.5272, + "step": 9064 + }, + { + "epoch": 1.4706359506813758, + "grad_norm": 0.6123299713866938, + "learning_rate": 2.5986189112567857e-06, + "loss": 0.5074, + "step": 9065 + }, + { + "epoch": 1.4707981829980532, + "grad_norm": 0.6285814649939697, + "learning_rate": 2.5981922097101177e-06, + "loss": 0.5016, + "step": 9066 + }, + { + "epoch": 1.4709604153147307, + "grad_norm": 0.5979691025537822, + "learning_rate": 2.597765505298487e-06, + "loss": 0.505, + "step": 9067 + }, + { + "epoch": 1.471122647631408, + "grad_norm": 0.6100014630635713, + "learning_rate": 2.5973387980343447e-06, + "loss": 0.5149, + "step": 9068 + }, + { + "epoch": 1.4712848799480858, + "grad_norm": 0.5936740708404616, + "learning_rate": 2.596912087930141e-06, + "loss": 0.5055, + "step": 9069 + }, + { + "epoch": 1.4714471122647632, + "grad_norm": 0.6295087210139182, + "learning_rate": 2.596485374998325e-06, + "loss": 0.5127, + "step": 9070 + }, + { + "epoch": 1.4716093445814407, + "grad_norm": 0.5902874124581154, + "learning_rate": 2.596058659251348e-06, + "loss": 0.5148, + "step": 9071 + }, + { + "epoch": 1.471771576898118, + "grad_norm": 0.5986384540571008, + "learning_rate": 2.5956319407016584e-06, + "loss": 0.5109, + "step": 9072 + }, + { + "epoch": 1.4719338092147956, + "grad_norm": 0.602650939257983, + "learning_rate": 2.59520521936171e-06, + "loss": 0.5167, + "step": 9073 + }, + { + "epoch": 1.472096041531473, + "grad_norm": 0.6205460657625659, + "learning_rate": 2.59477849524395e-06, + "loss": 0.5096, + "step": 9074 + }, + { + "epoch": 1.4722582738481504, + "grad_norm": 0.6018312170423307, + "learning_rate": 2.5943517683608306e-06, + "loss": 0.4937, + "step": 9075 + }, + { + "epoch": 1.4724205061648281, + "grad_norm": 0.6168990805974047, + "learning_rate": 2.593925038724802e-06, + "loss": 0.5325, + "step": 9076 + }, + { + "epoch": 1.4725827384815056, + "grad_norm": 0.6092701059912846, + "learning_rate": 2.5934983063483154e-06, + "loss": 0.5217, + "step": 9077 + }, + { + "epoch": 1.472744970798183, + "grad_norm": 0.5798436034666942, + "learning_rate": 2.593071571243821e-06, + "loss": 0.5073, + "step": 9078 + }, + { + "epoch": 1.4729072031148605, + "grad_norm": 0.6034636822580017, + "learning_rate": 2.592644833423769e-06, + "loss": 0.5271, + "step": 9079 + }, + { + "epoch": 1.473069435431538, + "grad_norm": 0.635671472864535, + "learning_rate": 2.5922180929006124e-06, + "loss": 0.472, + "step": 9080 + }, + { + "epoch": 1.4732316677482156, + "grad_norm": 0.5942820893637455, + "learning_rate": 2.5917913496868007e-06, + "loss": 0.5179, + "step": 9081 + }, + { + "epoch": 1.473393900064893, + "grad_norm": 0.6043143726105231, + "learning_rate": 2.5913646037947857e-06, + "loss": 0.5296, + "step": 9082 + }, + { + "epoch": 1.4735561323815705, + "grad_norm": 0.5953267595962441, + "learning_rate": 2.5909378552370183e-06, + "loss": 0.537, + "step": 9083 + }, + { + "epoch": 1.473718364698248, + "grad_norm": 0.5690218724872512, + "learning_rate": 2.59051110402595e-06, + "loss": 0.4885, + "step": 9084 + }, + { + "epoch": 1.4738805970149254, + "grad_norm": 0.6728833751947549, + "learning_rate": 2.5900843501740316e-06, + "loss": 0.5315, + "step": 9085 + }, + { + "epoch": 1.4740428293316028, + "grad_norm": 0.6505636128248325, + "learning_rate": 2.5896575936937147e-06, + "loss": 0.557, + "step": 9086 + }, + { + "epoch": 1.4742050616482802, + "grad_norm": 0.6038188670241011, + "learning_rate": 2.5892308345974517e-06, + "loss": 0.5083, + "step": 9087 + }, + { + "epoch": 1.4743672939649577, + "grad_norm": 0.6159133880426071, + "learning_rate": 2.5888040728976928e-06, + "loss": 0.4974, + "step": 9088 + }, + { + "epoch": 1.4745295262816354, + "grad_norm": 0.5670445527828014, + "learning_rate": 2.588377308606891e-06, + "loss": 0.5143, + "step": 9089 + }, + { + "epoch": 1.4746917585983128, + "grad_norm": 0.5957613237320353, + "learning_rate": 2.587950541737496e-06, + "loss": 0.5257, + "step": 9090 + }, + { + "epoch": 1.4748539909149903, + "grad_norm": 0.6017662715287582, + "learning_rate": 2.587523772301963e-06, + "loss": 0.5531, + "step": 9091 + }, + { + "epoch": 1.4750162232316677, + "grad_norm": 0.6119508814119387, + "learning_rate": 2.5870970003127405e-06, + "loss": 0.5281, + "step": 9092 + }, + { + "epoch": 1.4751784555483454, + "grad_norm": 0.6121618355502443, + "learning_rate": 2.5866702257822824e-06, + "loss": 0.5213, + "step": 9093 + }, + { + "epoch": 1.4753406878650228, + "grad_norm": 0.5823354079898191, + "learning_rate": 2.5862434487230404e-06, + "loss": 0.5174, + "step": 9094 + }, + { + "epoch": 1.4755029201817003, + "grad_norm": 0.5816310109723857, + "learning_rate": 2.5858166691474656e-06, + "loss": 0.5319, + "step": 9095 + }, + { + "epoch": 1.4756651524983777, + "grad_norm": 0.5842920778945075, + "learning_rate": 2.5853898870680118e-06, + "loss": 0.4988, + "step": 9096 + }, + { + "epoch": 1.4758273848150552, + "grad_norm": 0.6005722010536308, + "learning_rate": 2.5849631024971296e-06, + "loss": 0.4971, + "step": 9097 + }, + { + "epoch": 1.4759896171317326, + "grad_norm": 0.5830915919193868, + "learning_rate": 2.5845363154472725e-06, + "loss": 0.5087, + "step": 9098 + }, + { + "epoch": 1.47615184944841, + "grad_norm": 0.5910648216721134, + "learning_rate": 2.584109525930893e-06, + "loss": 0.5328, + "step": 9099 + }, + { + "epoch": 1.4763140817650875, + "grad_norm": 0.5951165282874561, + "learning_rate": 2.5836827339604425e-06, + "loss": 0.5313, + "step": 9100 + }, + { + "epoch": 1.4764763140817652, + "grad_norm": 0.6096701541218782, + "learning_rate": 2.583255939548375e-06, + "loss": 0.5152, + "step": 9101 + }, + { + "epoch": 1.4766385463984426, + "grad_norm": 0.6094664070910516, + "learning_rate": 2.5828291427071413e-06, + "loss": 0.5368, + "step": 9102 + }, + { + "epoch": 1.47680077871512, + "grad_norm": 0.5985799891174045, + "learning_rate": 2.5824023434491953e-06, + "loss": 0.5242, + "step": 9103 + }, + { + "epoch": 1.4769630110317975, + "grad_norm": 0.5645455350363137, + "learning_rate": 2.5819755417869894e-06, + "loss": 0.5028, + "step": 9104 + }, + { + "epoch": 1.477125243348475, + "grad_norm": 0.6328598409649592, + "learning_rate": 2.581548737732977e-06, + "loss": 0.5312, + "step": 9105 + }, + { + "epoch": 1.4772874756651526, + "grad_norm": 0.6103190245405089, + "learning_rate": 2.5811219312996106e-06, + "loss": 0.5348, + "step": 9106 + }, + { + "epoch": 1.47744970798183, + "grad_norm": 0.6022109863856472, + "learning_rate": 2.5806951224993424e-06, + "loss": 0.4982, + "step": 9107 + }, + { + "epoch": 1.4776119402985075, + "grad_norm": 0.5872760523464449, + "learning_rate": 2.5802683113446264e-06, + "loss": 0.5127, + "step": 9108 + }, + { + "epoch": 1.477774172615185, + "grad_norm": 0.6312765007981107, + "learning_rate": 2.5798414978479163e-06, + "loss": 0.5086, + "step": 9109 + }, + { + "epoch": 1.4779364049318624, + "grad_norm": 0.6039794910796225, + "learning_rate": 2.5794146820216636e-06, + "loss": 0.5155, + "step": 9110 + }, + { + "epoch": 1.4780986372485398, + "grad_norm": 0.6115562613131991, + "learning_rate": 2.5789878638783235e-06, + "loss": 0.4898, + "step": 9111 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.6360678802607302, + "learning_rate": 2.5785610434303465e-06, + "loss": 0.4933, + "step": 9112 + }, + { + "epoch": 1.4784231018818947, + "grad_norm": 0.6336606634231157, + "learning_rate": 2.5781342206901883e-06, + "loss": 0.5119, + "step": 9113 + }, + { + "epoch": 1.4785853341985724, + "grad_norm": 0.6028384795546402, + "learning_rate": 2.577707395670302e-06, + "loss": 0.498, + "step": 9114 + }, + { + "epoch": 1.4787475665152499, + "grad_norm": 0.6011272204259014, + "learning_rate": 2.57728056838314e-06, + "loss": 0.5112, + "step": 9115 + }, + { + "epoch": 1.4789097988319273, + "grad_norm": 0.5773861243026964, + "learning_rate": 2.576853738841158e-06, + "loss": 0.5338, + "step": 9116 + }, + { + "epoch": 1.4790720311486047, + "grad_norm": 0.5854931677229994, + "learning_rate": 2.576426907056807e-06, + "loss": 0.4998, + "step": 9117 + }, + { + "epoch": 1.4792342634652824, + "grad_norm": 0.6126301632076092, + "learning_rate": 2.576000073042543e-06, + "loss": 0.5306, + "step": 9118 + }, + { + "epoch": 1.4793964957819599, + "grad_norm": 0.6033405160876869, + "learning_rate": 2.575573236810818e-06, + "loss": 0.5107, + "step": 9119 + }, + { + "epoch": 1.4795587280986373, + "grad_norm": 0.626942398837317, + "learning_rate": 2.575146398374087e-06, + "loss": 0.5226, + "step": 9120 + }, + { + "epoch": 1.4797209604153148, + "grad_norm": 0.6615045662993058, + "learning_rate": 2.574719557744803e-06, + "loss": 0.5194, + "step": 9121 + }, + { + "epoch": 1.4798831927319922, + "grad_norm": 0.5940059330122862, + "learning_rate": 2.574292714935421e-06, + "loss": 0.5048, + "step": 9122 + }, + { + "epoch": 1.4800454250486696, + "grad_norm": 0.6204121968280993, + "learning_rate": 2.5738658699583947e-06, + "loss": 0.5068, + "step": 9123 + }, + { + "epoch": 1.480207657365347, + "grad_norm": 0.5764906064308796, + "learning_rate": 2.5734390228261774e-06, + "loss": 0.5245, + "step": 9124 + }, + { + "epoch": 1.4803698896820245, + "grad_norm": 0.5779346827914652, + "learning_rate": 2.5730121735512242e-06, + "loss": 0.5285, + "step": 9125 + }, + { + "epoch": 1.4805321219987022, + "grad_norm": 0.6012152328153673, + "learning_rate": 2.572585322145989e-06, + "loss": 0.5195, + "step": 9126 + }, + { + "epoch": 1.4806943543153797, + "grad_norm": 0.6060797610110902, + "learning_rate": 2.572158468622926e-06, + "loss": 0.4673, + "step": 9127 + }, + { + "epoch": 1.480856586632057, + "grad_norm": 0.6022279034251445, + "learning_rate": 2.5717316129944897e-06, + "loss": 0.5543, + "step": 9128 + }, + { + "epoch": 1.4810188189487346, + "grad_norm": 0.6180357406353872, + "learning_rate": 2.5713047552731345e-06, + "loss": 0.5187, + "step": 9129 + }, + { + "epoch": 1.481181051265412, + "grad_norm": 0.6187904819695497, + "learning_rate": 2.5708778954713143e-06, + "loss": 0.5103, + "step": 9130 + }, + { + "epoch": 1.4813432835820897, + "grad_norm": 0.6067997460547112, + "learning_rate": 2.5704510336014844e-06, + "loss": 0.4722, + "step": 9131 + }, + { + "epoch": 1.4815055158987671, + "grad_norm": 0.6294959064140453, + "learning_rate": 2.5700241696760986e-06, + "loss": 0.5372, + "step": 9132 + }, + { + "epoch": 1.4816677482154446, + "grad_norm": 0.5942586463301811, + "learning_rate": 2.569597303707612e-06, + "loss": 0.5492, + "step": 9133 + }, + { + "epoch": 1.481829980532122, + "grad_norm": 0.5757863697269807, + "learning_rate": 2.5691704357084803e-06, + "loss": 0.4973, + "step": 9134 + }, + { + "epoch": 1.4819922128487995, + "grad_norm": 0.5820840075978182, + "learning_rate": 2.5687435656911556e-06, + "loss": 0.5332, + "step": 9135 + }, + { + "epoch": 1.482154445165477, + "grad_norm": 0.6116235474591858, + "learning_rate": 2.568316693668096e-06, + "loss": 0.5599, + "step": 9136 + }, + { + "epoch": 1.4823166774821543, + "grad_norm": 0.5883852233583315, + "learning_rate": 2.5678898196517538e-06, + "loss": 0.5077, + "step": 9137 + }, + { + "epoch": 1.4824789097988318, + "grad_norm": 0.5702546419979061, + "learning_rate": 2.5674629436545857e-06, + "loss": 0.537, + "step": 9138 + }, + { + "epoch": 1.4826411421155095, + "grad_norm": 0.5855486412566095, + "learning_rate": 2.5670360656890452e-06, + "loss": 0.5444, + "step": 9139 + }, + { + "epoch": 1.482803374432187, + "grad_norm": 0.596948327540615, + "learning_rate": 2.5666091857675883e-06, + "loss": 0.5309, + "step": 9140 + }, + { + "epoch": 1.4829656067488644, + "grad_norm": 0.6147840613949254, + "learning_rate": 2.566182303902669e-06, + "loss": 0.4892, + "step": 9141 + }, + { + "epoch": 1.4831278390655418, + "grad_norm": 0.6156248328541946, + "learning_rate": 2.565755420106744e-06, + "loss": 0.5465, + "step": 9142 + }, + { + "epoch": 1.4832900713822195, + "grad_norm": 0.6405735967788605, + "learning_rate": 2.5653285343922684e-06, + "loss": 0.5387, + "step": 9143 + }, + { + "epoch": 1.483452303698897, + "grad_norm": 0.6024566553876285, + "learning_rate": 2.5649016467716958e-06, + "loss": 0.5262, + "step": 9144 + }, + { + "epoch": 1.4836145360155744, + "grad_norm": 0.6002950445366292, + "learning_rate": 2.564474757257484e-06, + "loss": 0.5592, + "step": 9145 + }, + { + "epoch": 1.4837767683322518, + "grad_norm": 0.6201092630912761, + "learning_rate": 2.564047865862086e-06, + "loss": 0.5542, + "step": 9146 + }, + { + "epoch": 1.4839390006489293, + "grad_norm": 0.5713105037458304, + "learning_rate": 2.563620972597959e-06, + "loss": 0.529, + "step": 9147 + }, + { + "epoch": 1.4841012329656067, + "grad_norm": 0.6013967578848795, + "learning_rate": 2.563194077477557e-06, + "loss": 0.4884, + "step": 9148 + }, + { + "epoch": 1.4842634652822841, + "grad_norm": 0.6070117991156287, + "learning_rate": 2.562767180513337e-06, + "loss": 0.5315, + "step": 9149 + }, + { + "epoch": 1.4844256975989616, + "grad_norm": 0.5998406745684135, + "learning_rate": 2.562340281717753e-06, + "loss": 0.5093, + "step": 9150 + }, + { + "epoch": 1.4845879299156393, + "grad_norm": 0.5868141557886614, + "learning_rate": 2.5619133811032628e-06, + "loss": 0.551, + "step": 9151 + }, + { + "epoch": 1.4847501622323167, + "grad_norm": 0.5933481350479015, + "learning_rate": 2.561486478682321e-06, + "loss": 0.503, + "step": 9152 + }, + { + "epoch": 1.4849123945489942, + "grad_norm": 0.5963792086358546, + "learning_rate": 2.561059574467382e-06, + "loss": 0.5043, + "step": 9153 + }, + { + "epoch": 1.4850746268656716, + "grad_norm": 0.6229896281228519, + "learning_rate": 2.5606326684709033e-06, + "loss": 0.5377, + "step": 9154 + }, + { + "epoch": 1.485236859182349, + "grad_norm": 0.6408197762269052, + "learning_rate": 2.560205760705341e-06, + "loss": 0.5142, + "step": 9155 + }, + { + "epoch": 1.4853990914990267, + "grad_norm": 0.5800753693311318, + "learning_rate": 2.55977885118315e-06, + "loss": 0.5146, + "step": 9156 + }, + { + "epoch": 1.4855613238157042, + "grad_norm": 0.6480533821811248, + "learning_rate": 2.5593519399167872e-06, + "loss": 0.4868, + "step": 9157 + }, + { + "epoch": 1.4857235561323816, + "grad_norm": 0.615621535325027, + "learning_rate": 2.5589250269187073e-06, + "loss": 0.511, + "step": 9158 + }, + { + "epoch": 1.485885788449059, + "grad_norm": 0.5660731231182378, + "learning_rate": 2.558498112201367e-06, + "loss": 0.5001, + "step": 9159 + }, + { + "epoch": 1.4860480207657365, + "grad_norm": 0.6060182110410183, + "learning_rate": 2.558071195777223e-06, + "loss": 0.4814, + "step": 9160 + }, + { + "epoch": 1.486210253082414, + "grad_norm": 0.656272084797765, + "learning_rate": 2.557644277658732e-06, + "loss": 0.5261, + "step": 9161 + }, + { + "epoch": 1.4863724853990914, + "grad_norm": 0.6011864428474067, + "learning_rate": 2.557217357858348e-06, + "loss": 0.5177, + "step": 9162 + }, + { + "epoch": 1.486534717715769, + "grad_norm": 0.6202580170287061, + "learning_rate": 2.556790436388529e-06, + "loss": 0.5306, + "step": 9163 + }, + { + "epoch": 1.4866969500324465, + "grad_norm": 0.6006475996048167, + "learning_rate": 2.5563635132617305e-06, + "loss": 0.5735, + "step": 9164 + }, + { + "epoch": 1.486859182349124, + "grad_norm": 0.6065415473064315, + "learning_rate": 2.55593658849041e-06, + "loss": 0.5211, + "step": 9165 + }, + { + "epoch": 1.4870214146658014, + "grad_norm": 0.5683808191348084, + "learning_rate": 2.555509662087023e-06, + "loss": 0.5201, + "step": 9166 + }, + { + "epoch": 1.4871836469824788, + "grad_norm": 0.5900964633930744, + "learning_rate": 2.555082734064026e-06, + "loss": 0.4884, + "step": 9167 + }, + { + "epoch": 1.4873458792991565, + "grad_norm": 0.5998631866772582, + "learning_rate": 2.5546558044338753e-06, + "loss": 0.5001, + "step": 9168 + }, + { + "epoch": 1.487508111615834, + "grad_norm": 0.5912292740033878, + "learning_rate": 2.5542288732090283e-06, + "loss": 0.5292, + "step": 9169 + }, + { + "epoch": 1.4876703439325114, + "grad_norm": 0.6703861565020889, + "learning_rate": 2.5538019404019415e-06, + "loss": 0.5191, + "step": 9170 + }, + { + "epoch": 1.4878325762491889, + "grad_norm": 0.6172167502704425, + "learning_rate": 2.5533750060250702e-06, + "loss": 0.5317, + "step": 9171 + }, + { + "epoch": 1.4879948085658663, + "grad_norm": 0.5973009534634116, + "learning_rate": 2.552948070090872e-06, + "loss": 0.539, + "step": 9172 + }, + { + "epoch": 1.4881570408825437, + "grad_norm": 0.6160908887538737, + "learning_rate": 2.552521132611805e-06, + "loss": 0.5096, + "step": 9173 + }, + { + "epoch": 1.4883192731992212, + "grad_norm": 0.6533745096995066, + "learning_rate": 2.5520941936003237e-06, + "loss": 0.4752, + "step": 9174 + }, + { + "epoch": 1.4884815055158986, + "grad_norm": 0.5998253460779265, + "learning_rate": 2.5516672530688864e-06, + "loss": 0.5322, + "step": 9175 + }, + { + "epoch": 1.4886437378325763, + "grad_norm": 0.5684977867469647, + "learning_rate": 2.5512403110299494e-06, + "loss": 0.5178, + "step": 9176 + }, + { + "epoch": 1.4888059701492538, + "grad_norm": 0.636806583185574, + "learning_rate": 2.550813367495969e-06, + "loss": 0.5232, + "step": 9177 + }, + { + "epoch": 1.4889682024659312, + "grad_norm": 0.5935819930401982, + "learning_rate": 2.550386422479404e-06, + "loss": 0.5234, + "step": 9178 + }, + { + "epoch": 1.4891304347826086, + "grad_norm": 0.5676841285122837, + "learning_rate": 2.5499594759927093e-06, + "loss": 0.499, + "step": 9179 + }, + { + "epoch": 1.4892926670992863, + "grad_norm": 0.6045473129347921, + "learning_rate": 2.549532528048344e-06, + "loss": 0.517, + "step": 9180 + }, + { + "epoch": 1.4894548994159638, + "grad_norm": 0.5778362106723022, + "learning_rate": 2.5491055786587638e-06, + "loss": 0.5066, + "step": 9181 + }, + { + "epoch": 1.4896171317326412, + "grad_norm": 0.6169372012496094, + "learning_rate": 2.5486786278364255e-06, + "loss": 0.5226, + "step": 9182 + }, + { + "epoch": 1.4897793640493187, + "grad_norm": 0.5713928360425934, + "learning_rate": 2.5482516755937883e-06, + "loss": 0.5241, + "step": 9183 + }, + { + "epoch": 1.489941596365996, + "grad_norm": 0.5979293611912527, + "learning_rate": 2.5478247219433065e-06, + "loss": 0.4939, + "step": 9184 + }, + { + "epoch": 1.4901038286826735, + "grad_norm": 0.5968796022941885, + "learning_rate": 2.5473977668974404e-06, + "loss": 0.5016, + "step": 9185 + }, + { + "epoch": 1.490266060999351, + "grad_norm": 0.5851854101093394, + "learning_rate": 2.5469708104686452e-06, + "loss": 0.5103, + "step": 9186 + }, + { + "epoch": 1.4904282933160284, + "grad_norm": 0.5754736717026486, + "learning_rate": 2.546543852669379e-06, + "loss": 0.5032, + "step": 9187 + }, + { + "epoch": 1.490590525632706, + "grad_norm": 0.6104954996578535, + "learning_rate": 2.5461168935120994e-06, + "loss": 0.5261, + "step": 9188 + }, + { + "epoch": 1.4907527579493836, + "grad_norm": 0.5845778804763038, + "learning_rate": 2.5456899330092626e-06, + "loss": 0.5312, + "step": 9189 + }, + { + "epoch": 1.490914990266061, + "grad_norm": 0.6155791862214127, + "learning_rate": 2.545262971173328e-06, + "loss": 0.5362, + "step": 9190 + }, + { + "epoch": 1.4910772225827384, + "grad_norm": 0.6187843270026667, + "learning_rate": 2.5448360080167513e-06, + "loss": 0.4977, + "step": 9191 + }, + { + "epoch": 1.491239454899416, + "grad_norm": 0.5778866261562636, + "learning_rate": 2.5444090435519906e-06, + "loss": 0.5132, + "step": 9192 + }, + { + "epoch": 1.4914016872160936, + "grad_norm": 0.6098946040291393, + "learning_rate": 2.5439820777915047e-06, + "loss": 0.5349, + "step": 9193 + }, + { + "epoch": 1.491563919532771, + "grad_norm": 0.5780346028797253, + "learning_rate": 2.5435551107477498e-06, + "loss": 0.5086, + "step": 9194 + }, + { + "epoch": 1.4917261518494485, + "grad_norm": 0.6171547930888327, + "learning_rate": 2.5431281424331833e-06, + "loss": 0.4971, + "step": 9195 + }, + { + "epoch": 1.491888384166126, + "grad_norm": 0.5740495165756643, + "learning_rate": 2.5427011728602643e-06, + "loss": 0.5409, + "step": 9196 + }, + { + "epoch": 1.4920506164828033, + "grad_norm": 0.5889902847047482, + "learning_rate": 2.542274202041449e-06, + "loss": 0.5119, + "step": 9197 + }, + { + "epoch": 1.4922128487994808, + "grad_norm": 0.6180881417779667, + "learning_rate": 2.5418472299891965e-06, + "loss": 0.5454, + "step": 9198 + }, + { + "epoch": 1.4923750811161582, + "grad_norm": 0.614570419913012, + "learning_rate": 2.5414202567159645e-06, + "loss": 0.5164, + "step": 9199 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5983315236641457, + "learning_rate": 2.540993282234209e-06, + "loss": 0.5358, + "step": 9200 + }, + { + "epoch": 1.4926995457495134, + "grad_norm": 0.5756307342927257, + "learning_rate": 2.5405663065563908e-06, + "loss": 0.4731, + "step": 9201 + }, + { + "epoch": 1.4928617780661908, + "grad_norm": 0.6440112603575248, + "learning_rate": 2.540139329694965e-06, + "loss": 0.5071, + "step": 9202 + }, + { + "epoch": 1.4930240103828682, + "grad_norm": 0.5893083179045924, + "learning_rate": 2.5397123516623912e-06, + "loss": 0.5426, + "step": 9203 + }, + { + "epoch": 1.4931862426995457, + "grad_norm": 0.5989786588546249, + "learning_rate": 2.5392853724711268e-06, + "loss": 0.5091, + "step": 9204 + }, + { + "epoch": 1.4933484750162234, + "grad_norm": 0.6534388789259788, + "learning_rate": 2.5388583921336307e-06, + "loss": 0.5258, + "step": 9205 + }, + { + "epoch": 1.4935107073329008, + "grad_norm": 0.627562880818258, + "learning_rate": 2.5384314106623594e-06, + "loss": 0.5135, + "step": 9206 + }, + { + "epoch": 1.4936729396495783, + "grad_norm": 0.5856984726031319, + "learning_rate": 2.5380044280697714e-06, + "loss": 0.5478, + "step": 9207 + }, + { + "epoch": 1.4938351719662557, + "grad_norm": 0.6021677565672793, + "learning_rate": 2.5375774443683263e-06, + "loss": 0.504, + "step": 9208 + }, + { + "epoch": 1.4939974042829332, + "grad_norm": 0.5748603495462962, + "learning_rate": 2.53715045957048e-06, + "loss": 0.5189, + "step": 9209 + }, + { + "epoch": 1.4941596365996106, + "grad_norm": 0.5804406823428939, + "learning_rate": 2.5367234736886925e-06, + "loss": 0.5501, + "step": 9210 + }, + { + "epoch": 1.494321868916288, + "grad_norm": 0.5729307122664505, + "learning_rate": 2.536296486735421e-06, + "loss": 0.5219, + "step": 9211 + }, + { + "epoch": 1.4944841012329655, + "grad_norm": 0.6234260107873968, + "learning_rate": 2.535869498723124e-06, + "loss": 0.5323, + "step": 9212 + }, + { + "epoch": 1.4946463335496432, + "grad_norm": 0.5791683718196003, + "learning_rate": 2.53544250966426e-06, + "loss": 0.5085, + "step": 9213 + }, + { + "epoch": 1.4948085658663206, + "grad_norm": 0.5908096426339117, + "learning_rate": 2.535015519571286e-06, + "loss": 0.4907, + "step": 9214 + }, + { + "epoch": 1.494970798182998, + "grad_norm": 0.6053578396998632, + "learning_rate": 2.5345885284566623e-06, + "loss": 0.5168, + "step": 9215 + }, + { + "epoch": 1.4951330304996755, + "grad_norm": 0.5861007117172576, + "learning_rate": 2.5341615363328463e-06, + "loss": 0.5223, + "step": 9216 + }, + { + "epoch": 1.495295262816353, + "grad_norm": 0.6142300369262874, + "learning_rate": 2.5337345432122966e-06, + "loss": 0.5166, + "step": 9217 + }, + { + "epoch": 1.4954574951330306, + "grad_norm": 0.584959629446629, + "learning_rate": 2.5333075491074704e-06, + "loss": 0.5171, + "step": 9218 + }, + { + "epoch": 1.495619727449708, + "grad_norm": 0.5871699468369226, + "learning_rate": 2.532880554030829e-06, + "loss": 0.5218, + "step": 9219 + }, + { + "epoch": 1.4957819597663855, + "grad_norm": 0.6177837021283107, + "learning_rate": 2.5324535579948274e-06, + "loss": 0.5161, + "step": 9220 + }, + { + "epoch": 1.495944192083063, + "grad_norm": 0.6323509266867344, + "learning_rate": 2.5320265610119272e-06, + "loss": 0.4955, + "step": 9221 + }, + { + "epoch": 1.4961064243997404, + "grad_norm": 0.594605643352455, + "learning_rate": 2.5315995630945838e-06, + "loss": 0.5227, + "step": 9222 + }, + { + "epoch": 1.4962686567164178, + "grad_norm": 0.606687673074831, + "learning_rate": 2.531172564255258e-06, + "loss": 0.5076, + "step": 9223 + }, + { + "epoch": 1.4964308890330953, + "grad_norm": 0.5843643126411604, + "learning_rate": 2.5307455645064083e-06, + "loss": 0.5363, + "step": 9224 + }, + { + "epoch": 1.4965931213497727, + "grad_norm": 0.589165639625964, + "learning_rate": 2.530318563860492e-06, + "loss": 0.5208, + "step": 9225 + }, + { + "epoch": 1.4967553536664504, + "grad_norm": 0.6224121684316759, + "learning_rate": 2.5298915623299695e-06, + "loss": 0.5475, + "step": 9226 + }, + { + "epoch": 1.4969175859831279, + "grad_norm": 0.6017829292191194, + "learning_rate": 2.5294645599272978e-06, + "loss": 0.4717, + "step": 9227 + }, + { + "epoch": 1.4970798182998053, + "grad_norm": 0.5753268431324606, + "learning_rate": 2.5290375566649367e-06, + "loss": 0.4874, + "step": 9228 + }, + { + "epoch": 1.4972420506164827, + "grad_norm": 0.6077963172237224, + "learning_rate": 2.5286105525553444e-06, + "loss": 0.532, + "step": 9229 + }, + { + "epoch": 1.4974042829331604, + "grad_norm": 0.572372645031951, + "learning_rate": 2.5281835476109796e-06, + "loss": 0.4897, + "step": 9230 + }, + { + "epoch": 1.4975665152498379, + "grad_norm": 0.6122512158871447, + "learning_rate": 2.5277565418443012e-06, + "loss": 0.5255, + "step": 9231 + }, + { + "epoch": 1.4977287475665153, + "grad_norm": 0.5869838398851595, + "learning_rate": 2.5273295352677675e-06, + "loss": 0.5329, + "step": 9232 + }, + { + "epoch": 1.4978909798831928, + "grad_norm": 0.6205872276382722, + "learning_rate": 2.526902527893838e-06, + "loss": 0.5212, + "step": 9233 + }, + { + "epoch": 1.4980532121998702, + "grad_norm": 0.6185869947234008, + "learning_rate": 2.5264755197349714e-06, + "loss": 0.5195, + "step": 9234 + }, + { + "epoch": 1.4982154445165476, + "grad_norm": 0.6345305827948805, + "learning_rate": 2.526048510803627e-06, + "loss": 0.5371, + "step": 9235 + }, + { + "epoch": 1.498377676833225, + "grad_norm": 0.5996633279042707, + "learning_rate": 2.5256215011122628e-06, + "loss": 0.5485, + "step": 9236 + }, + { + "epoch": 1.4985399091499025, + "grad_norm": 0.5481145277204807, + "learning_rate": 2.525194490673338e-06, + "loss": 0.5212, + "step": 9237 + }, + { + "epoch": 1.4987021414665802, + "grad_norm": 0.5957437820520931, + "learning_rate": 2.524767479499311e-06, + "loss": 0.5107, + "step": 9238 + }, + { + "epoch": 1.4988643737832577, + "grad_norm": 0.6284825795960922, + "learning_rate": 2.5243404676026416e-06, + "loss": 0.5115, + "step": 9239 + }, + { + "epoch": 1.499026606099935, + "grad_norm": 0.603155144553572, + "learning_rate": 2.5239134549957884e-06, + "loss": 0.5368, + "step": 9240 + }, + { + "epoch": 1.4991888384166125, + "grad_norm": 0.6578077755845275, + "learning_rate": 2.52348644169121e-06, + "loss": 0.5155, + "step": 9241 + }, + { + "epoch": 1.49935107073329, + "grad_norm": 0.5977684539850883, + "learning_rate": 2.523059427701367e-06, + "loss": 0.5189, + "step": 9242 + }, + { + "epoch": 1.4995133030499677, + "grad_norm": 0.5663475361963057, + "learning_rate": 2.5226324130387157e-06, + "loss": 0.5119, + "step": 9243 + }, + { + "epoch": 1.499675535366645, + "grad_norm": 0.5798229842247749, + "learning_rate": 2.5222053977157184e-06, + "loss": 0.5005, + "step": 9244 + }, + { + "epoch": 1.4998377676833226, + "grad_norm": 0.5891733836577697, + "learning_rate": 2.521778381744831e-06, + "loss": 0.5285, + "step": 9245 + }, + { + "epoch": 1.5, + "grad_norm": 0.6421161901615028, + "learning_rate": 2.5213513651385146e-06, + "loss": 0.5122, + "step": 9246 + }, + { + "epoch": 1.5001622323166774, + "grad_norm": 0.6085666432388497, + "learning_rate": 2.520924347909228e-06, + "loss": 0.5376, + "step": 9247 + }, + { + "epoch": 1.500324464633355, + "grad_norm": 0.5926950042999881, + "learning_rate": 2.5204973300694297e-06, + "loss": 0.4924, + "step": 9248 + }, + { + "epoch": 1.5004866969500323, + "grad_norm": 0.6199821284879207, + "learning_rate": 2.520070311631579e-06, + "loss": 0.5484, + "step": 9249 + }, + { + "epoch": 1.5006489292667098, + "grad_norm": 0.5869557812250631, + "learning_rate": 2.5196432926081353e-06, + "loss": 0.5508, + "step": 9250 + }, + { + "epoch": 1.5008111615833875, + "grad_norm": 0.5738103309235514, + "learning_rate": 2.519216273011557e-06, + "loss": 0.5013, + "step": 9251 + }, + { + "epoch": 1.500973393900065, + "grad_norm": 0.5679981553087913, + "learning_rate": 2.518789252854305e-06, + "loss": 0.5074, + "step": 9252 + }, + { + "epoch": 1.5011356262167423, + "grad_norm": 0.614514340979768, + "learning_rate": 2.518362232148837e-06, + "loss": 0.5067, + "step": 9253 + }, + { + "epoch": 1.50129785853342, + "grad_norm": 0.5946532402073588, + "learning_rate": 2.5179352109076133e-06, + "loss": 0.5205, + "step": 9254 + }, + { + "epoch": 1.5014600908500975, + "grad_norm": 0.6079062211093025, + "learning_rate": 2.517508189143092e-06, + "loss": 0.498, + "step": 9255 + }, + { + "epoch": 1.501622323166775, + "grad_norm": 0.5875334262197776, + "learning_rate": 2.5170811668677324e-06, + "loss": 0.5241, + "step": 9256 + }, + { + "epoch": 1.5017845554834524, + "grad_norm": 0.6096503129069952, + "learning_rate": 2.516654144093995e-06, + "loss": 0.5192, + "step": 9257 + }, + { + "epoch": 1.5019467878001298, + "grad_norm": 0.6088307098373188, + "learning_rate": 2.516227120834338e-06, + "loss": 0.5182, + "step": 9258 + }, + { + "epoch": 1.5021090201168072, + "grad_norm": 0.5790079327684713, + "learning_rate": 2.515800097101221e-06, + "loss": 0.5117, + "step": 9259 + }, + { + "epoch": 1.5022712524334847, + "grad_norm": 0.6186323043512036, + "learning_rate": 2.5153730729071035e-06, + "loss": 0.5539, + "step": 9260 + }, + { + "epoch": 1.5024334847501621, + "grad_norm": 0.638303238070279, + "learning_rate": 2.5149460482644437e-06, + "loss": 0.5215, + "step": 9261 + }, + { + "epoch": 1.5025957170668396, + "grad_norm": 0.5985326869780402, + "learning_rate": 2.514519023185703e-06, + "loss": 0.5144, + "step": 9262 + }, + { + "epoch": 1.502757949383517, + "grad_norm": 0.7522869653786454, + "learning_rate": 2.514091997683339e-06, + "loss": 0.5186, + "step": 9263 + }, + { + "epoch": 1.5029201817001947, + "grad_norm": 0.6209079455883761, + "learning_rate": 2.5136649717698118e-06, + "loss": 0.5112, + "step": 9264 + }, + { + "epoch": 1.5030824140168721, + "grad_norm": 0.610330815255279, + "learning_rate": 2.5132379454575806e-06, + "loss": 0.4601, + "step": 9265 + }, + { + "epoch": 1.5032446463335496, + "grad_norm": 0.5710405380678621, + "learning_rate": 2.5128109187591053e-06, + "loss": 0.4719, + "step": 9266 + }, + { + "epoch": 1.5034068786502273, + "grad_norm": 0.5876327151675967, + "learning_rate": 2.512383891686844e-06, + "loss": 0.5247, + "step": 9267 + }, + { + "epoch": 1.5035691109669047, + "grad_norm": 0.598629077921231, + "learning_rate": 2.511956864253258e-06, + "loss": 0.5173, + "step": 9268 + }, + { + "epoch": 1.5037313432835822, + "grad_norm": 0.6273921344884967, + "learning_rate": 2.511529836470804e-06, + "loss": 0.5168, + "step": 9269 + }, + { + "epoch": 1.5038935756002596, + "grad_norm": 0.6180918265836889, + "learning_rate": 2.511102808351944e-06, + "loss": 0.4984, + "step": 9270 + }, + { + "epoch": 1.504055807916937, + "grad_norm": 0.5961693094431335, + "learning_rate": 2.5106757799091364e-06, + "loss": 0.5201, + "step": 9271 + }, + { + "epoch": 1.5042180402336145, + "grad_norm": 0.5654088483084387, + "learning_rate": 2.5102487511548406e-06, + "loss": 0.5174, + "step": 9272 + }, + { + "epoch": 1.504380272550292, + "grad_norm": 0.5985554266846324, + "learning_rate": 2.5098217221015164e-06, + "loss": 0.538, + "step": 9273 + }, + { + "epoch": 1.5045425048669694, + "grad_norm": 0.6150493964826225, + "learning_rate": 2.5093946927616227e-06, + "loss": 0.5033, + "step": 9274 + }, + { + "epoch": 1.5047047371836468, + "grad_norm": 0.5847946859441049, + "learning_rate": 2.50896766314762e-06, + "loss": 0.5266, + "step": 9275 + }, + { + "epoch": 1.5048669695003245, + "grad_norm": 0.6222368569796749, + "learning_rate": 2.5085406332719663e-06, + "loss": 0.5378, + "step": 9276 + }, + { + "epoch": 1.505029201817002, + "grad_norm": 0.572318792256044, + "learning_rate": 2.508113603147122e-06, + "loss": 0.5035, + "step": 9277 + }, + { + "epoch": 1.5051914341336794, + "grad_norm": 0.5851656800665237, + "learning_rate": 2.507686572785547e-06, + "loss": 0.4693, + "step": 9278 + }, + { + "epoch": 1.505353666450357, + "grad_norm": 0.6311165861057717, + "learning_rate": 2.5072595421997e-06, + "loss": 0.5257, + "step": 9279 + }, + { + "epoch": 1.5055158987670345, + "grad_norm": 0.574545353250188, + "learning_rate": 2.50683251140204e-06, + "loss": 0.4805, + "step": 9280 + }, + { + "epoch": 1.505678131083712, + "grad_norm": 0.5785477744864198, + "learning_rate": 2.5064054804050277e-06, + "loss": 0.5063, + "step": 9281 + }, + { + "epoch": 1.5058403634003894, + "grad_norm": 0.6091048017032926, + "learning_rate": 2.505978449221123e-06, + "loss": 0.5088, + "step": 9282 + }, + { + "epoch": 1.5060025957170668, + "grad_norm": 0.5994170806231255, + "learning_rate": 2.505551417862783e-06, + "loss": 0.5101, + "step": 9283 + }, + { + "epoch": 1.5061648280337443, + "grad_norm": 0.6272914680046724, + "learning_rate": 2.50512438634247e-06, + "loss": 0.5112, + "step": 9284 + }, + { + "epoch": 1.5063270603504217, + "grad_norm": 0.5887185695089296, + "learning_rate": 2.5046973546726427e-06, + "loss": 0.5204, + "step": 9285 + }, + { + "epoch": 1.5064892926670992, + "grad_norm": 0.6276878534074336, + "learning_rate": 2.5042703228657596e-06, + "loss": 0.5134, + "step": 9286 + }, + { + "epoch": 1.5066515249837766, + "grad_norm": 0.5851654342565167, + "learning_rate": 2.503843290934281e-06, + "loss": 0.5354, + "step": 9287 + }, + { + "epoch": 1.5068137573004543, + "grad_norm": 0.6025909517913556, + "learning_rate": 2.5034162588906665e-06, + "loss": 0.5102, + "step": 9288 + }, + { + "epoch": 1.5069759896171318, + "grad_norm": 0.5669699598251859, + "learning_rate": 2.5029892267473754e-06, + "loss": 0.5068, + "step": 9289 + }, + { + "epoch": 1.5071382219338092, + "grad_norm": 0.5880508229249144, + "learning_rate": 2.502562194516868e-06, + "loss": 0.4765, + "step": 9290 + }, + { + "epoch": 1.5073004542504866, + "grad_norm": 0.6279913955588007, + "learning_rate": 2.5021351622116024e-06, + "loss": 0.5085, + "step": 9291 + }, + { + "epoch": 1.5074626865671643, + "grad_norm": 0.6129864571064386, + "learning_rate": 2.501708129844039e-06, + "loss": 0.5288, + "step": 9292 + }, + { + "epoch": 1.5076249188838418, + "grad_norm": 0.5834442129075663, + "learning_rate": 2.5012810974266382e-06, + "loss": 0.5189, + "step": 9293 + }, + { + "epoch": 1.5077871512005192, + "grad_norm": 0.6046271984860003, + "learning_rate": 2.5008540649718575e-06, + "loss": 0.4987, + "step": 9294 + }, + { + "epoch": 1.5079493835171967, + "grad_norm": 0.6437481595583234, + "learning_rate": 2.500427032492159e-06, + "loss": 0.5003, + "step": 9295 + }, + { + "epoch": 1.508111615833874, + "grad_norm": 0.6254997263800091, + "learning_rate": 2.5e-06, + "loss": 0.4688, + "step": 9296 + }, + { + "epoch": 1.5082738481505515, + "grad_norm": 0.5754260064830319, + "learning_rate": 2.4995729675078416e-06, + "loss": 0.5028, + "step": 9297 + }, + { + "epoch": 1.508436080467229, + "grad_norm": 0.5919550477508225, + "learning_rate": 2.499145935028142e-06, + "loss": 0.5136, + "step": 9298 + }, + { + "epoch": 1.5085983127839064, + "grad_norm": 0.5802303319103669, + "learning_rate": 2.498718902573363e-06, + "loss": 0.5129, + "step": 9299 + }, + { + "epoch": 1.5087605451005839, + "grad_norm": 0.6223233205337455, + "learning_rate": 2.4982918701559614e-06, + "loss": 0.5015, + "step": 9300 + }, + { + "epoch": 1.5089227774172616, + "grad_norm": 0.5818369650701024, + "learning_rate": 2.497864837788398e-06, + "loss": 0.5207, + "step": 9301 + }, + { + "epoch": 1.509085009733939, + "grad_norm": 0.5985190142702909, + "learning_rate": 2.497437805483133e-06, + "loss": 0.5129, + "step": 9302 + }, + { + "epoch": 1.5092472420506164, + "grad_norm": 0.5960738476083871, + "learning_rate": 2.4970107732526246e-06, + "loss": 0.4787, + "step": 9303 + }, + { + "epoch": 1.5094094743672941, + "grad_norm": 0.609096934374836, + "learning_rate": 2.4965837411093343e-06, + "loss": 0.5091, + "step": 9304 + }, + { + "epoch": 1.5095717066839716, + "grad_norm": 0.576262279591174, + "learning_rate": 2.49615670906572e-06, + "loss": 0.5257, + "step": 9305 + }, + { + "epoch": 1.509733939000649, + "grad_norm": 0.5865181106473686, + "learning_rate": 2.495729677134241e-06, + "loss": 0.5373, + "step": 9306 + }, + { + "epoch": 1.5098961713173265, + "grad_norm": 0.596445397378938, + "learning_rate": 2.495302645327358e-06, + "loss": 0.5075, + "step": 9307 + }, + { + "epoch": 1.510058403634004, + "grad_norm": 0.6021585140759915, + "learning_rate": 2.4948756136575296e-06, + "loss": 0.5181, + "step": 9308 + }, + { + "epoch": 1.5102206359506813, + "grad_norm": 0.5932955616382352, + "learning_rate": 2.4944485821372177e-06, + "loss": 0.5018, + "step": 9309 + }, + { + "epoch": 1.5103828682673588, + "grad_norm": 0.5818072054594858, + "learning_rate": 2.4940215507788783e-06, + "loss": 0.5047, + "step": 9310 + }, + { + "epoch": 1.5105451005840362, + "grad_norm": 0.5974772609340704, + "learning_rate": 2.4935945195949727e-06, + "loss": 0.514, + "step": 9311 + }, + { + "epoch": 1.5107073329007137, + "grad_norm": 0.61459569490858, + "learning_rate": 2.4931674885979603e-06, + "loss": 0.5303, + "step": 9312 + }, + { + "epoch": 1.5108695652173914, + "grad_norm": 0.6159841616025027, + "learning_rate": 2.4927404578003013e-06, + "loss": 0.5467, + "step": 9313 + }, + { + "epoch": 1.5110317975340688, + "grad_norm": 0.6212149661204924, + "learning_rate": 2.4923134272144543e-06, + "loss": 0.5243, + "step": 9314 + }, + { + "epoch": 1.5111940298507462, + "grad_norm": 0.5978252808259894, + "learning_rate": 2.4918863968528788e-06, + "loss": 0.5304, + "step": 9315 + }, + { + "epoch": 1.511356262167424, + "grad_norm": 0.614547528160985, + "learning_rate": 2.4914593667280345e-06, + "loss": 0.5307, + "step": 9316 + }, + { + "epoch": 1.5115184944841014, + "grad_norm": 0.6174761052019384, + "learning_rate": 2.4910323368523806e-06, + "loss": 0.4975, + "step": 9317 + }, + { + "epoch": 1.5116807268007788, + "grad_norm": 0.5786692397469902, + "learning_rate": 2.4906053072383773e-06, + "loss": 0.4859, + "step": 9318 + }, + { + "epoch": 1.5118429591174563, + "grad_norm": 0.6020547061891772, + "learning_rate": 2.4901782778984845e-06, + "loss": 0.5114, + "step": 9319 + }, + { + "epoch": 1.5120051914341337, + "grad_norm": 0.6314950300795849, + "learning_rate": 2.48975124884516e-06, + "loss": 0.5418, + "step": 9320 + }, + { + "epoch": 1.5121674237508111, + "grad_norm": 0.5942793060001362, + "learning_rate": 2.489324220090864e-06, + "loss": 0.5122, + "step": 9321 + }, + { + "epoch": 1.5123296560674886, + "grad_norm": 0.6127721683228519, + "learning_rate": 2.4888971916480565e-06, + "loss": 0.5408, + "step": 9322 + }, + { + "epoch": 1.512491888384166, + "grad_norm": 0.665909369241247, + "learning_rate": 2.488470163529196e-06, + "loss": 0.5025, + "step": 9323 + }, + { + "epoch": 1.5126541207008435, + "grad_norm": 0.6107600158239637, + "learning_rate": 2.488043135746743e-06, + "loss": 0.523, + "step": 9324 + }, + { + "epoch": 1.512816353017521, + "grad_norm": 0.5983287555796782, + "learning_rate": 2.487616108313157e-06, + "loss": 0.5214, + "step": 9325 + }, + { + "epoch": 1.5129785853341986, + "grad_norm": 0.6127063051234072, + "learning_rate": 2.4871890812408955e-06, + "loss": 0.5154, + "step": 9326 + }, + { + "epoch": 1.513140817650876, + "grad_norm": 0.5963703537041503, + "learning_rate": 2.48676205454242e-06, + "loss": 0.5456, + "step": 9327 + }, + { + "epoch": 1.5133030499675535, + "grad_norm": 0.630807889459575, + "learning_rate": 2.4863350282301882e-06, + "loss": 0.5125, + "step": 9328 + }, + { + "epoch": 1.5134652822842312, + "grad_norm": 0.6259584311255366, + "learning_rate": 2.4859080023166614e-06, + "loss": 0.5134, + "step": 9329 + }, + { + "epoch": 1.5136275146009086, + "grad_norm": 0.6444386571721096, + "learning_rate": 2.485480976814298e-06, + "loss": 0.5137, + "step": 9330 + }, + { + "epoch": 1.513789746917586, + "grad_norm": 0.58865752461955, + "learning_rate": 2.485053951735557e-06, + "loss": 0.5327, + "step": 9331 + }, + { + "epoch": 1.5139519792342635, + "grad_norm": 0.656126607701413, + "learning_rate": 2.4846269270928973e-06, + "loss": 0.5419, + "step": 9332 + }, + { + "epoch": 1.514114211550941, + "grad_norm": 0.6001536818642351, + "learning_rate": 2.48419990289878e-06, + "loss": 0.5359, + "step": 9333 + }, + { + "epoch": 1.5142764438676184, + "grad_norm": 0.6003919257877646, + "learning_rate": 2.4837728791656625e-06, + "loss": 0.5064, + "step": 9334 + }, + { + "epoch": 1.5144386761842958, + "grad_norm": 0.617550590006663, + "learning_rate": 2.483345855906006e-06, + "loss": 0.537, + "step": 9335 + }, + { + "epoch": 1.5146009085009733, + "grad_norm": 0.646018315304623, + "learning_rate": 2.4829188331322684e-06, + "loss": 0.5359, + "step": 9336 + }, + { + "epoch": 1.5147631408176507, + "grad_norm": 0.6539943346083259, + "learning_rate": 2.4824918108569086e-06, + "loss": 0.5234, + "step": 9337 + }, + { + "epoch": 1.5149253731343284, + "grad_norm": 0.5942653341884983, + "learning_rate": 2.4820647890923876e-06, + "loss": 0.5154, + "step": 9338 + }, + { + "epoch": 1.5150876054510058, + "grad_norm": 0.6070800986303959, + "learning_rate": 2.481637767851163e-06, + "loss": 0.5141, + "step": 9339 + }, + { + "epoch": 1.5152498377676833, + "grad_norm": 0.6191716950558132, + "learning_rate": 2.4812107471456958e-06, + "loss": 0.5431, + "step": 9340 + }, + { + "epoch": 1.515412070084361, + "grad_norm": 0.6247464869348378, + "learning_rate": 2.4807837269884435e-06, + "loss": 0.5367, + "step": 9341 + }, + { + "epoch": 1.5155743024010384, + "grad_norm": 0.5948903926259103, + "learning_rate": 2.4803567073918656e-06, + "loss": 0.5135, + "step": 9342 + }, + { + "epoch": 1.5157365347177159, + "grad_norm": 0.6168446311149077, + "learning_rate": 2.479929688368422e-06, + "loss": 0.5262, + "step": 9343 + }, + { + "epoch": 1.5158987670343933, + "grad_norm": 0.6117145294705848, + "learning_rate": 2.479502669930571e-06, + "loss": 0.5332, + "step": 9344 + }, + { + "epoch": 1.5160609993510707, + "grad_norm": 0.5868618798973081, + "learning_rate": 2.4790756520907734e-06, + "loss": 0.5257, + "step": 9345 + }, + { + "epoch": 1.5162232316677482, + "grad_norm": 0.596311532514343, + "learning_rate": 2.478648634861486e-06, + "loss": 0.5, + "step": 9346 + }, + { + "epoch": 1.5163854639844256, + "grad_norm": 0.6304406789624095, + "learning_rate": 2.4782216182551696e-06, + "loss": 0.5447, + "step": 9347 + }, + { + "epoch": 1.516547696301103, + "grad_norm": 0.5946694649095206, + "learning_rate": 2.4777946022842824e-06, + "loss": 0.507, + "step": 9348 + }, + { + "epoch": 1.5167099286177805, + "grad_norm": 0.5865068353994656, + "learning_rate": 2.4773675869612843e-06, + "loss": 0.531, + "step": 9349 + }, + { + "epoch": 1.516872160934458, + "grad_norm": 0.610979473625007, + "learning_rate": 2.4769405722986343e-06, + "loss": 0.5296, + "step": 9350 + }, + { + "epoch": 1.5170343932511356, + "grad_norm": 0.6305209562867108, + "learning_rate": 2.4765135583087903e-06, + "loss": 0.5045, + "step": 9351 + }, + { + "epoch": 1.517196625567813, + "grad_norm": 0.6145337073965264, + "learning_rate": 2.4760865450042124e-06, + "loss": 0.5355, + "step": 9352 + }, + { + "epoch": 1.5173588578844905, + "grad_norm": 0.6203954648416382, + "learning_rate": 2.475659532397359e-06, + "loss": 0.5198, + "step": 9353 + }, + { + "epoch": 1.5175210902011682, + "grad_norm": 0.6210171588387313, + "learning_rate": 2.4752325205006895e-06, + "loss": 0.5248, + "step": 9354 + }, + { + "epoch": 1.5176833225178457, + "grad_norm": 0.606582754489949, + "learning_rate": 2.4748055093266633e-06, + "loss": 0.5326, + "step": 9355 + }, + { + "epoch": 1.517845554834523, + "grad_norm": 0.5994452247280588, + "learning_rate": 2.4743784988877385e-06, + "loss": 0.5578, + "step": 9356 + }, + { + "epoch": 1.5180077871512005, + "grad_norm": 0.6102648421364104, + "learning_rate": 2.4739514891963736e-06, + "loss": 0.504, + "step": 9357 + }, + { + "epoch": 1.518170019467878, + "grad_norm": 0.5881495898038467, + "learning_rate": 2.473524480265029e-06, + "loss": 0.5376, + "step": 9358 + }, + { + "epoch": 1.5183322517845554, + "grad_norm": 0.6112043209221545, + "learning_rate": 2.473097472106162e-06, + "loss": 0.5149, + "step": 9359 + }, + { + "epoch": 1.5184944841012329, + "grad_norm": 0.6113881192393733, + "learning_rate": 2.4726704647322334e-06, + "loss": 0.5576, + "step": 9360 + }, + { + "epoch": 1.5186567164179103, + "grad_norm": 0.6061258655195392, + "learning_rate": 2.4722434581557e-06, + "loss": 0.5216, + "step": 9361 + }, + { + "epoch": 1.5188189487345878, + "grad_norm": 0.6158991340332772, + "learning_rate": 2.4718164523890212e-06, + "loss": 0.5399, + "step": 9362 + }, + { + "epoch": 1.5189811810512654, + "grad_norm": 0.6123261583309685, + "learning_rate": 2.4713894474446564e-06, + "loss": 0.5251, + "step": 9363 + }, + { + "epoch": 1.519143413367943, + "grad_norm": 0.5905328794796091, + "learning_rate": 2.4709624433350633e-06, + "loss": 0.5274, + "step": 9364 + }, + { + "epoch": 1.5193056456846203, + "grad_norm": 0.5904352068539492, + "learning_rate": 2.4705354400727035e-06, + "loss": 0.5195, + "step": 9365 + }, + { + "epoch": 1.519467878001298, + "grad_norm": 0.5734687628203919, + "learning_rate": 2.4701084376700317e-06, + "loss": 0.4818, + "step": 9366 + }, + { + "epoch": 1.5196301103179755, + "grad_norm": 0.592859225363596, + "learning_rate": 2.4696814361395087e-06, + "loss": 0.5215, + "step": 9367 + }, + { + "epoch": 1.519792342634653, + "grad_norm": 0.5789776211026073, + "learning_rate": 2.4692544354935926e-06, + "loss": 0.545, + "step": 9368 + }, + { + "epoch": 1.5199545749513304, + "grad_norm": 0.5746333844407374, + "learning_rate": 2.4688274357447424e-06, + "loss": 0.5033, + "step": 9369 + }, + { + "epoch": 1.5201168072680078, + "grad_norm": 0.6296255084285749, + "learning_rate": 2.4684004369054162e-06, + "loss": 0.5275, + "step": 9370 + }, + { + "epoch": 1.5202790395846852, + "grad_norm": 0.5868435501876553, + "learning_rate": 2.4679734389880744e-06, + "loss": 0.5221, + "step": 9371 + }, + { + "epoch": 1.5204412719013627, + "grad_norm": 0.5876040825041436, + "learning_rate": 2.4675464420051735e-06, + "loss": 0.5425, + "step": 9372 + }, + { + "epoch": 1.5206035042180401, + "grad_norm": 0.5860752154514625, + "learning_rate": 2.467119445969172e-06, + "loss": 0.5245, + "step": 9373 + }, + { + "epoch": 1.5207657365347176, + "grad_norm": 0.587192705262609, + "learning_rate": 2.4666924508925296e-06, + "loss": 0.521, + "step": 9374 + }, + { + "epoch": 1.5209279688513953, + "grad_norm": 0.6067076307019003, + "learning_rate": 2.466265456787704e-06, + "loss": 0.5261, + "step": 9375 + }, + { + "epoch": 1.5210902011680727, + "grad_norm": 0.6383421990695219, + "learning_rate": 2.465838463667154e-06, + "loss": 0.5297, + "step": 9376 + }, + { + "epoch": 1.5212524334847501, + "grad_norm": 0.595016107165332, + "learning_rate": 2.465411471543338e-06, + "loss": 0.5202, + "step": 9377 + }, + { + "epoch": 1.5214146658014276, + "grad_norm": 0.6061043599873852, + "learning_rate": 2.4649844804287144e-06, + "loss": 0.5284, + "step": 9378 + }, + { + "epoch": 1.5215768981181053, + "grad_norm": 0.6199855690343503, + "learning_rate": 2.464557490335741e-06, + "loss": 0.5025, + "step": 9379 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.6016826203364054, + "learning_rate": 2.4641305012768767e-06, + "loss": 0.5193, + "step": 9380 + }, + { + "epoch": 1.5219013627514602, + "grad_norm": 0.5930690283804108, + "learning_rate": 2.4637035132645802e-06, + "loss": 0.4932, + "step": 9381 + }, + { + "epoch": 1.5220635950681376, + "grad_norm": 0.6282066571717713, + "learning_rate": 2.4632765263113083e-06, + "loss": 0.5092, + "step": 9382 + }, + { + "epoch": 1.522225827384815, + "grad_norm": 0.6079067308940757, + "learning_rate": 2.4628495404295203e-06, + "loss": 0.4389, + "step": 9383 + }, + { + "epoch": 1.5223880597014925, + "grad_norm": 0.5671448909300008, + "learning_rate": 2.4624225556316745e-06, + "loss": 0.4876, + "step": 9384 + }, + { + "epoch": 1.52255029201817, + "grad_norm": 0.6026997253350205, + "learning_rate": 2.4619955719302285e-06, + "loss": 0.4905, + "step": 9385 + }, + { + "epoch": 1.5227125243348474, + "grad_norm": 0.6027380401910569, + "learning_rate": 2.461568589337642e-06, + "loss": 0.5106, + "step": 9386 + }, + { + "epoch": 1.5228747566515248, + "grad_norm": 0.5969385226528457, + "learning_rate": 2.46114160786637e-06, + "loss": 0.5273, + "step": 9387 + }, + { + "epoch": 1.5230369889682025, + "grad_norm": 0.6237838373156483, + "learning_rate": 2.4607146275288736e-06, + "loss": 0.5524, + "step": 9388 + }, + { + "epoch": 1.52319922128488, + "grad_norm": 0.6169340749079527, + "learning_rate": 2.460287648337609e-06, + "loss": 0.5198, + "step": 9389 + }, + { + "epoch": 1.5233614536015574, + "grad_norm": 0.5696114110631648, + "learning_rate": 2.459860670305035e-06, + "loss": 0.4834, + "step": 9390 + }, + { + "epoch": 1.523523685918235, + "grad_norm": 0.6314426167013557, + "learning_rate": 2.4594336934436105e-06, + "loss": 0.521, + "step": 9391 + }, + { + "epoch": 1.5236859182349125, + "grad_norm": 0.5888287476560679, + "learning_rate": 2.4590067177657913e-06, + "loss": 0.5132, + "step": 9392 + }, + { + "epoch": 1.52384815055159, + "grad_norm": 0.6168002096493125, + "learning_rate": 2.4585797432840364e-06, + "loss": 0.5266, + "step": 9393 + }, + { + "epoch": 1.5240103828682674, + "grad_norm": 0.6337979048128228, + "learning_rate": 2.458152770010804e-06, + "loss": 0.5049, + "step": 9394 + }, + { + "epoch": 1.5241726151849448, + "grad_norm": 0.6376603193552546, + "learning_rate": 2.457725797958551e-06, + "loss": 0.5721, + "step": 9395 + }, + { + "epoch": 1.5243348475016223, + "grad_norm": 0.5815003142840803, + "learning_rate": 2.4572988271397366e-06, + "loss": 0.5165, + "step": 9396 + }, + { + "epoch": 1.5244970798182997, + "grad_norm": 0.5811721641238278, + "learning_rate": 2.4568718575668175e-06, + "loss": 0.5007, + "step": 9397 + }, + { + "epoch": 1.5246593121349772, + "grad_norm": 0.5727471048941749, + "learning_rate": 2.456444889252251e-06, + "loss": 0.4989, + "step": 9398 + }, + { + "epoch": 1.5248215444516546, + "grad_norm": 0.598274209842575, + "learning_rate": 2.4560179222084958e-06, + "loss": 0.5311, + "step": 9399 + }, + { + "epoch": 1.5249837767683323, + "grad_norm": 0.6066777295437098, + "learning_rate": 2.455590956448009e-06, + "loss": 0.5041, + "step": 9400 + }, + { + "epoch": 1.5251460090850097, + "grad_norm": 0.5769680041822969, + "learning_rate": 2.45516399198325e-06, + "loss": 0.5304, + "step": 9401 + }, + { + "epoch": 1.5253082414016872, + "grad_norm": 0.6314389930409017, + "learning_rate": 2.4547370288266733e-06, + "loss": 0.5032, + "step": 9402 + }, + { + "epoch": 1.5254704737183649, + "grad_norm": 0.6394477539248531, + "learning_rate": 2.4543100669907383e-06, + "loss": 0.5288, + "step": 9403 + }, + { + "epoch": 1.5256327060350423, + "grad_norm": 0.6001038917484237, + "learning_rate": 2.4538831064879014e-06, + "loss": 0.4956, + "step": 9404 + }, + { + "epoch": 1.5257949383517198, + "grad_norm": 0.5940304245316327, + "learning_rate": 2.4534561473306217e-06, + "loss": 0.5274, + "step": 9405 + }, + { + "epoch": 1.5259571706683972, + "grad_norm": 0.5732935719054694, + "learning_rate": 2.453029189531356e-06, + "loss": 0.5272, + "step": 9406 + }, + { + "epoch": 1.5261194029850746, + "grad_norm": 0.5935837425068293, + "learning_rate": 2.4526022331025604e-06, + "loss": 0.4977, + "step": 9407 + }, + { + "epoch": 1.526281635301752, + "grad_norm": 0.6139380590922701, + "learning_rate": 2.452175278056694e-06, + "loss": 0.5311, + "step": 9408 + }, + { + "epoch": 1.5264438676184295, + "grad_norm": 0.581697470282464, + "learning_rate": 2.451748324406212e-06, + "loss": 0.5233, + "step": 9409 + }, + { + "epoch": 1.526606099935107, + "grad_norm": 0.5680757907569806, + "learning_rate": 2.4513213721635745e-06, + "loss": 0.5293, + "step": 9410 + }, + { + "epoch": 1.5267683322517844, + "grad_norm": 0.6322644547514918, + "learning_rate": 2.4508944213412375e-06, + "loss": 0.4769, + "step": 9411 + }, + { + "epoch": 1.5269305645684619, + "grad_norm": 0.5752386869395812, + "learning_rate": 2.4504674719516575e-06, + "loss": 0.4854, + "step": 9412 + }, + { + "epoch": 1.5270927968851395, + "grad_norm": 0.5859233021274605, + "learning_rate": 2.450040524007291e-06, + "loss": 0.538, + "step": 9413 + }, + { + "epoch": 1.527255029201817, + "grad_norm": 0.5928552357679762, + "learning_rate": 2.449613577520597e-06, + "loss": 0.4982, + "step": 9414 + }, + { + "epoch": 1.5274172615184944, + "grad_norm": 0.6547878010381155, + "learning_rate": 2.449186632504031e-06, + "loss": 0.4968, + "step": 9415 + }, + { + "epoch": 1.527579493835172, + "grad_norm": 0.6501807234735614, + "learning_rate": 2.4487596889700514e-06, + "loss": 0.5078, + "step": 9416 + }, + { + "epoch": 1.5277417261518496, + "grad_norm": 0.6326494888306181, + "learning_rate": 2.448332746931115e-06, + "loss": 0.5014, + "step": 9417 + }, + { + "epoch": 1.527903958468527, + "grad_norm": 0.608827963337378, + "learning_rate": 2.4479058063996767e-06, + "loss": 0.5312, + "step": 9418 + }, + { + "epoch": 1.5280661907852044, + "grad_norm": 0.591809872111133, + "learning_rate": 2.447478867388196e-06, + "loss": 0.5163, + "step": 9419 + }, + { + "epoch": 1.528228423101882, + "grad_norm": 0.5931813856968239, + "learning_rate": 2.447051929909128e-06, + "loss": 0.5143, + "step": 9420 + }, + { + "epoch": 1.5283906554185593, + "grad_norm": 0.6008702160902307, + "learning_rate": 2.44662499397493e-06, + "loss": 0.5275, + "step": 9421 + }, + { + "epoch": 1.5285528877352368, + "grad_norm": 0.5945298523675266, + "learning_rate": 2.44619805959806e-06, + "loss": 0.528, + "step": 9422 + }, + { + "epoch": 1.5287151200519142, + "grad_norm": 0.6066366501717021, + "learning_rate": 2.445771126790973e-06, + "loss": 0.4942, + "step": 9423 + }, + { + "epoch": 1.5288773523685917, + "grad_norm": 0.604817481976584, + "learning_rate": 2.445344195566125e-06, + "loss": 0.5511, + "step": 9424 + }, + { + "epoch": 1.5290395846852693, + "grad_norm": 0.6168341936419883, + "learning_rate": 2.4449172659359747e-06, + "loss": 0.4839, + "step": 9425 + }, + { + "epoch": 1.5292018170019468, + "grad_norm": 0.6186963794769719, + "learning_rate": 2.4444903379129775e-06, + "loss": 0.5422, + "step": 9426 + }, + { + "epoch": 1.5293640493186242, + "grad_norm": 0.5846092001050915, + "learning_rate": 2.444063411509591e-06, + "loss": 0.5148, + "step": 9427 + }, + { + "epoch": 1.529526281635302, + "grad_norm": 0.5739969015223612, + "learning_rate": 2.44363648673827e-06, + "loss": 0.5071, + "step": 9428 + }, + { + "epoch": 1.5296885139519794, + "grad_norm": 0.5832208409512122, + "learning_rate": 2.4432095636114716e-06, + "loss": 0.5326, + "step": 9429 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.6114889570322596, + "learning_rate": 2.4427826421416524e-06, + "loss": 0.5004, + "step": 9430 + }, + { + "epoch": 1.5300129785853342, + "grad_norm": 0.5940276552476984, + "learning_rate": 2.4423557223412685e-06, + "loss": 0.4883, + "step": 9431 + }, + { + "epoch": 1.5301752109020117, + "grad_norm": 0.5784495685644934, + "learning_rate": 2.4419288042227775e-06, + "loss": 0.5429, + "step": 9432 + }, + { + "epoch": 1.5303374432186891, + "grad_norm": 0.5812614547779174, + "learning_rate": 2.4415018877986335e-06, + "loss": 0.4972, + "step": 9433 + }, + { + "epoch": 1.5304996755353666, + "grad_norm": 0.5826088904197131, + "learning_rate": 2.441074973081293e-06, + "loss": 0.5058, + "step": 9434 + }, + { + "epoch": 1.530661907852044, + "grad_norm": 0.5916823325792264, + "learning_rate": 2.440648060083214e-06, + "loss": 0.5414, + "step": 9435 + }, + { + "epoch": 1.5308241401687215, + "grad_norm": 0.5672496748884891, + "learning_rate": 2.4402211488168506e-06, + "loss": 0.5048, + "step": 9436 + }, + { + "epoch": 1.530986372485399, + "grad_norm": 0.5901123354298434, + "learning_rate": 2.43979423929466e-06, + "loss": 0.4738, + "step": 9437 + }, + { + "epoch": 1.5311486048020766, + "grad_norm": 0.6227449851743515, + "learning_rate": 2.439367331529097e-06, + "loss": 0.4981, + "step": 9438 + }, + { + "epoch": 1.531310837118754, + "grad_norm": 0.634043007887586, + "learning_rate": 2.438940425532619e-06, + "loss": 0.5125, + "step": 9439 + }, + { + "epoch": 1.5314730694354315, + "grad_norm": 0.5751567160747391, + "learning_rate": 2.4385135213176804e-06, + "loss": 0.5428, + "step": 9440 + }, + { + "epoch": 1.5316353017521092, + "grad_norm": 0.6238145499488196, + "learning_rate": 2.4380866188967376e-06, + "loss": 0.5221, + "step": 9441 + }, + { + "epoch": 1.5317975340687866, + "grad_norm": 0.6262217874529226, + "learning_rate": 2.4376597182822476e-06, + "loss": 0.5383, + "step": 9442 + }, + { + "epoch": 1.531959766385464, + "grad_norm": 0.6087535748964702, + "learning_rate": 2.437232819486664e-06, + "loss": 0.5068, + "step": 9443 + }, + { + "epoch": 1.5321219987021415, + "grad_norm": 0.6121205481725328, + "learning_rate": 2.4368059225224438e-06, + "loss": 0.5429, + "step": 9444 + }, + { + "epoch": 1.532284231018819, + "grad_norm": 0.5847197891140345, + "learning_rate": 2.4363790274020416e-06, + "loss": 0.5397, + "step": 9445 + }, + { + "epoch": 1.5324464633354964, + "grad_norm": 0.6479808990079226, + "learning_rate": 2.4359521341379146e-06, + "loss": 0.5442, + "step": 9446 + }, + { + "epoch": 1.5326086956521738, + "grad_norm": 0.6232698118347147, + "learning_rate": 2.4355252427425177e-06, + "loss": 0.5452, + "step": 9447 + }, + { + "epoch": 1.5327709279688513, + "grad_norm": 0.6018365807859143, + "learning_rate": 2.4350983532283046e-06, + "loss": 0.5213, + "step": 9448 + }, + { + "epoch": 1.5329331602855287, + "grad_norm": 0.5604754556050686, + "learning_rate": 2.4346714656077325e-06, + "loss": 0.4979, + "step": 9449 + }, + { + "epoch": 1.5330953926022064, + "grad_norm": 0.6053316729016724, + "learning_rate": 2.4342445798932563e-06, + "loss": 0.5037, + "step": 9450 + }, + { + "epoch": 1.5332576249188838, + "grad_norm": 0.6080757980853285, + "learning_rate": 2.433817696097331e-06, + "loss": 0.5131, + "step": 9451 + }, + { + "epoch": 1.5334198572355613, + "grad_norm": 0.5959118575563834, + "learning_rate": 2.433390814232413e-06, + "loss": 0.5398, + "step": 9452 + }, + { + "epoch": 1.533582089552239, + "grad_norm": 0.576799262485577, + "learning_rate": 2.432963934310956e-06, + "loss": 0.5148, + "step": 9453 + }, + { + "epoch": 1.5337443218689164, + "grad_norm": 0.6106021602420004, + "learning_rate": 2.4325370563454155e-06, + "loss": 0.4822, + "step": 9454 + }, + { + "epoch": 1.5339065541855939, + "grad_norm": 0.5657615792650618, + "learning_rate": 2.432110180348247e-06, + "loss": 0.5023, + "step": 9455 + }, + { + "epoch": 1.5340687865022713, + "grad_norm": 0.6306484269546639, + "learning_rate": 2.4316833063319045e-06, + "loss": 0.4915, + "step": 9456 + }, + { + "epoch": 1.5342310188189487, + "grad_norm": 0.6122767258116191, + "learning_rate": 2.4312564343088456e-06, + "loss": 0.5108, + "step": 9457 + }, + { + "epoch": 1.5343932511356262, + "grad_norm": 0.6070633549471384, + "learning_rate": 2.430829564291521e-06, + "loss": 0.533, + "step": 9458 + }, + { + "epoch": 1.5345554834523036, + "grad_norm": 0.6583055949193503, + "learning_rate": 2.4304026962923887e-06, + "loss": 0.5087, + "step": 9459 + }, + { + "epoch": 1.534717715768981, + "grad_norm": 0.5895450520547779, + "learning_rate": 2.4299758303239022e-06, + "loss": 0.5444, + "step": 9460 + }, + { + "epoch": 1.5348799480856585, + "grad_norm": 0.5858646462422338, + "learning_rate": 2.429548966398517e-06, + "loss": 0.533, + "step": 9461 + }, + { + "epoch": 1.5350421804023362, + "grad_norm": 0.6181036928945707, + "learning_rate": 2.429122104528686e-06, + "loss": 0.5442, + "step": 9462 + }, + { + "epoch": 1.5352044127190136, + "grad_norm": 0.6272103462303983, + "learning_rate": 2.4286952447268668e-06, + "loss": 0.5086, + "step": 9463 + }, + { + "epoch": 1.535366645035691, + "grad_norm": 0.5962968003521824, + "learning_rate": 2.428268387005511e-06, + "loss": 0.5322, + "step": 9464 + }, + { + "epoch": 1.5355288773523685, + "grad_norm": 0.6334248135138865, + "learning_rate": 2.4278415313770744e-06, + "loss": 0.5455, + "step": 9465 + }, + { + "epoch": 1.5356911096690462, + "grad_norm": 0.5871989510091641, + "learning_rate": 2.4274146778540113e-06, + "loss": 0.4944, + "step": 9466 + }, + { + "epoch": 1.5358533419857237, + "grad_norm": 0.5829434471603723, + "learning_rate": 2.4269878264487757e-06, + "loss": 0.5402, + "step": 9467 + }, + { + "epoch": 1.536015574302401, + "grad_norm": 0.6220914555463646, + "learning_rate": 2.426560977173823e-06, + "loss": 0.4418, + "step": 9468 + }, + { + "epoch": 1.5361778066190785, + "grad_norm": 0.6161171821827687, + "learning_rate": 2.426134130041606e-06, + "loss": 0.5611, + "step": 9469 + }, + { + "epoch": 1.536340038935756, + "grad_norm": 0.598609962649697, + "learning_rate": 2.4257072850645795e-06, + "loss": 0.5207, + "step": 9470 + }, + { + "epoch": 1.5365022712524334, + "grad_norm": 0.5756047224087921, + "learning_rate": 2.4252804422551974e-06, + "loss": 0.4869, + "step": 9471 + }, + { + "epoch": 1.5366645035691109, + "grad_norm": 0.6138104839752552, + "learning_rate": 2.4248536016259137e-06, + "loss": 0.5267, + "step": 9472 + }, + { + "epoch": 1.5368267358857883, + "grad_norm": 0.5973591434889418, + "learning_rate": 2.4244267631891833e-06, + "loss": 0.5563, + "step": 9473 + }, + { + "epoch": 1.5369889682024658, + "grad_norm": 0.6064698201897802, + "learning_rate": 2.423999926957458e-06, + "loss": 0.5415, + "step": 9474 + }, + { + "epoch": 1.5371512005191434, + "grad_norm": 0.6240860583185519, + "learning_rate": 2.423573092943194e-06, + "loss": 0.544, + "step": 9475 + }, + { + "epoch": 1.537313432835821, + "grad_norm": 0.5871516480681412, + "learning_rate": 2.4231462611588425e-06, + "loss": 0.5203, + "step": 9476 + }, + { + "epoch": 1.5374756651524983, + "grad_norm": 0.6132845523942977, + "learning_rate": 2.42271943161686e-06, + "loss": 0.494, + "step": 9477 + }, + { + "epoch": 1.537637897469176, + "grad_norm": 0.6173777170114946, + "learning_rate": 2.4222926043296995e-06, + "loss": 0.5249, + "step": 9478 + }, + { + "epoch": 1.5378001297858535, + "grad_norm": 0.593148117240995, + "learning_rate": 2.4218657793098126e-06, + "loss": 0.5218, + "step": 9479 + }, + { + "epoch": 1.537962362102531, + "grad_norm": 0.5868974786855898, + "learning_rate": 2.4214389565696543e-06, + "loss": 0.5174, + "step": 9480 + }, + { + "epoch": 1.5381245944192083, + "grad_norm": 0.5699789730320873, + "learning_rate": 2.4210121361216777e-06, + "loss": 0.4956, + "step": 9481 + }, + { + "epoch": 1.5382868267358858, + "grad_norm": 0.5836455554618915, + "learning_rate": 2.4205853179783368e-06, + "loss": 0.5197, + "step": 9482 + }, + { + "epoch": 1.5384490590525632, + "grad_norm": 0.5855824012487634, + "learning_rate": 2.420158502152085e-06, + "loss": 0.5139, + "step": 9483 + }, + { + "epoch": 1.5386112913692407, + "grad_norm": 0.5818813250554961, + "learning_rate": 2.419731688655374e-06, + "loss": 0.4939, + "step": 9484 + }, + { + "epoch": 1.5387735236859181, + "grad_norm": 0.5879810849801798, + "learning_rate": 2.419304877500658e-06, + "loss": 0.5374, + "step": 9485 + }, + { + "epoch": 1.5389357560025956, + "grad_norm": 0.6332646170076374, + "learning_rate": 2.4188780687003903e-06, + "loss": 0.5254, + "step": 9486 + }, + { + "epoch": 1.5390979883192732, + "grad_norm": 0.6123362514028609, + "learning_rate": 2.418451262267023e-06, + "loss": 0.4672, + "step": 9487 + }, + { + "epoch": 1.5392602206359507, + "grad_norm": 0.6154762247334442, + "learning_rate": 2.4180244582130114e-06, + "loss": 0.515, + "step": 9488 + }, + { + "epoch": 1.5394224529526281, + "grad_norm": 0.5838195631878657, + "learning_rate": 2.417597656550806e-06, + "loss": 0.5279, + "step": 9489 + }, + { + "epoch": 1.5395846852693058, + "grad_norm": 0.5984105468221388, + "learning_rate": 2.4171708572928595e-06, + "loss": 0.5079, + "step": 9490 + }, + { + "epoch": 1.5397469175859833, + "grad_norm": 0.5783662179739305, + "learning_rate": 2.4167440604516264e-06, + "loss": 0.4997, + "step": 9491 + }, + { + "epoch": 1.5399091499026607, + "grad_norm": 0.6446085925999583, + "learning_rate": 2.4163172660395575e-06, + "loss": 0.4959, + "step": 9492 + }, + { + "epoch": 1.5400713822193381, + "grad_norm": 0.617337493944956, + "learning_rate": 2.4158904740691086e-06, + "loss": 0.522, + "step": 9493 + }, + { + "epoch": 1.5402336145360156, + "grad_norm": 0.5795858946924346, + "learning_rate": 2.4154636845527284e-06, + "loss": 0.4857, + "step": 9494 + }, + { + "epoch": 1.540395846852693, + "grad_norm": 0.5955777204442035, + "learning_rate": 2.4150368975028713e-06, + "loss": 0.5402, + "step": 9495 + }, + { + "epoch": 1.5405580791693705, + "grad_norm": 0.6187815272640182, + "learning_rate": 2.414610112931989e-06, + "loss": 0.5153, + "step": 9496 + }, + { + "epoch": 1.540720311486048, + "grad_norm": 0.6290807413416053, + "learning_rate": 2.414183330852535e-06, + "loss": 0.522, + "step": 9497 + }, + { + "epoch": 1.5408825438027254, + "grad_norm": 0.5958730543988461, + "learning_rate": 2.4137565512769613e-06, + "loss": 0.5099, + "step": 9498 + }, + { + "epoch": 1.5410447761194028, + "grad_norm": 0.6212623581362563, + "learning_rate": 2.4133297742177184e-06, + "loss": 0.5047, + "step": 9499 + }, + { + "epoch": 1.5412070084360805, + "grad_norm": 0.5686049394589877, + "learning_rate": 2.4129029996872603e-06, + "loss": 0.4973, + "step": 9500 + }, + { + "epoch": 1.541369240752758, + "grad_norm": 0.5994106141223825, + "learning_rate": 2.4124762276980378e-06, + "loss": 0.48, + "step": 9501 + }, + { + "epoch": 1.5415314730694354, + "grad_norm": 0.5893297307136295, + "learning_rate": 2.4120494582625036e-06, + "loss": 0.5308, + "step": 9502 + }, + { + "epoch": 1.541693705386113, + "grad_norm": 0.5939494643351254, + "learning_rate": 2.4116226913931103e-06, + "loss": 0.5406, + "step": 9503 + }, + { + "epoch": 1.5418559377027905, + "grad_norm": 0.59352828473473, + "learning_rate": 2.4111959271023085e-06, + "loss": 0.5034, + "step": 9504 + }, + { + "epoch": 1.542018170019468, + "grad_norm": 0.6324850882030181, + "learning_rate": 2.410769165402549e-06, + "loss": 0.541, + "step": 9505 + }, + { + "epoch": 1.5421804023361454, + "grad_norm": 0.5782757903715295, + "learning_rate": 2.410342406306286e-06, + "loss": 0.4936, + "step": 9506 + }, + { + "epoch": 1.5423426346528228, + "grad_norm": 0.6050802494127064, + "learning_rate": 2.409915649825969e-06, + "loss": 0.5199, + "step": 9507 + }, + { + "epoch": 1.5425048669695003, + "grad_norm": 0.5856456018128167, + "learning_rate": 2.4094888959740506e-06, + "loss": 0.5281, + "step": 9508 + }, + { + "epoch": 1.5426670992861777, + "grad_norm": 0.5992836979091789, + "learning_rate": 2.409062144762983e-06, + "loss": 0.5122, + "step": 9509 + }, + { + "epoch": 1.5428293316028552, + "grad_norm": 0.6169632438208248, + "learning_rate": 2.4086353962052147e-06, + "loss": 0.5067, + "step": 9510 + }, + { + "epoch": 1.5429915639195326, + "grad_norm": 0.5777425855421695, + "learning_rate": 2.4082086503131997e-06, + "loss": 0.4954, + "step": 9511 + }, + { + "epoch": 1.5431537962362103, + "grad_norm": 0.6050577657115467, + "learning_rate": 2.4077819070993876e-06, + "loss": 0.4977, + "step": 9512 + }, + { + "epoch": 1.5433160285528877, + "grad_norm": 0.6151632754980211, + "learning_rate": 2.407355166576231e-06, + "loss": 0.4823, + "step": 9513 + }, + { + "epoch": 1.5434782608695652, + "grad_norm": 0.5860530428818237, + "learning_rate": 2.40692842875618e-06, + "loss": 0.5025, + "step": 9514 + }, + { + "epoch": 1.5436404931862429, + "grad_norm": 0.6369285095432564, + "learning_rate": 2.4065016936516854e-06, + "loss": 0.5282, + "step": 9515 + }, + { + "epoch": 1.5438027255029203, + "grad_norm": 0.6239760484070047, + "learning_rate": 2.4060749612751987e-06, + "loss": 0.4956, + "step": 9516 + }, + { + "epoch": 1.5439649578195977, + "grad_norm": 0.6151619483901961, + "learning_rate": 2.4056482316391703e-06, + "loss": 0.5185, + "step": 9517 + }, + { + "epoch": 1.5441271901362752, + "grad_norm": 0.5908326178244021, + "learning_rate": 2.4052215047560505e-06, + "loss": 0.5485, + "step": 9518 + }, + { + "epoch": 1.5442894224529526, + "grad_norm": 0.5640610063852327, + "learning_rate": 2.4047947806382914e-06, + "loss": 0.4828, + "step": 9519 + }, + { + "epoch": 1.54445165476963, + "grad_norm": 0.6144546831640002, + "learning_rate": 2.404368059298342e-06, + "loss": 0.4976, + "step": 9520 + }, + { + "epoch": 1.5446138870863075, + "grad_norm": 0.5888310677834141, + "learning_rate": 2.403941340748653e-06, + "loss": 0.5295, + "step": 9521 + }, + { + "epoch": 1.544776119402985, + "grad_norm": 0.5900986773696028, + "learning_rate": 2.403514625001676e-06, + "loss": 0.5132, + "step": 9522 + }, + { + "epoch": 1.5449383517196624, + "grad_norm": 0.5884941593529223, + "learning_rate": 2.4030879120698596e-06, + "loss": 0.5016, + "step": 9523 + }, + { + "epoch": 1.5451005840363399, + "grad_norm": 0.6212958424756488, + "learning_rate": 2.402661201965656e-06, + "loss": 0.5049, + "step": 9524 + }, + { + "epoch": 1.5452628163530175, + "grad_norm": 0.5936858148459909, + "learning_rate": 2.402234494701514e-06, + "loss": 0.5277, + "step": 9525 + }, + { + "epoch": 1.545425048669695, + "grad_norm": 0.6371844018394036, + "learning_rate": 2.4018077902898836e-06, + "loss": 0.5337, + "step": 9526 + }, + { + "epoch": 1.5455872809863724, + "grad_norm": 0.5864628604116077, + "learning_rate": 2.4013810887432155e-06, + "loss": 0.526, + "step": 9527 + }, + { + "epoch": 1.54574951330305, + "grad_norm": 0.6027751590328093, + "learning_rate": 2.400954390073959e-06, + "loss": 0.4779, + "step": 9528 + }, + { + "epoch": 1.5459117456197276, + "grad_norm": 0.5673086440635275, + "learning_rate": 2.400527694294565e-06, + "loss": 0.5023, + "step": 9529 + }, + { + "epoch": 1.546073977936405, + "grad_norm": 0.6032332519042537, + "learning_rate": 2.400101001417481e-06, + "loss": 0.4983, + "step": 9530 + }, + { + "epoch": 1.5462362102530824, + "grad_norm": 0.5971792758903312, + "learning_rate": 2.3996743114551593e-06, + "loss": 0.4935, + "step": 9531 + }, + { + "epoch": 1.54639844256976, + "grad_norm": 0.5670498773758966, + "learning_rate": 2.3992476244200472e-06, + "loss": 0.5257, + "step": 9532 + }, + { + "epoch": 1.5465606748864373, + "grad_norm": 0.6495885230521717, + "learning_rate": 2.398820940324596e-06, + "loss": 0.5152, + "step": 9533 + }, + { + "epoch": 1.5467229072031148, + "grad_norm": 0.6173190856894455, + "learning_rate": 2.3983942591812547e-06, + "loss": 0.5244, + "step": 9534 + }, + { + "epoch": 1.5468851395197922, + "grad_norm": 0.6081668471588971, + "learning_rate": 2.3979675810024714e-06, + "loss": 0.4941, + "step": 9535 + }, + { + "epoch": 1.5470473718364697, + "grad_norm": 0.5821942051527326, + "learning_rate": 2.3975409058006964e-06, + "loss": 0.5162, + "step": 9536 + }, + { + "epoch": 1.5472096041531473, + "grad_norm": 0.621589642181087, + "learning_rate": 2.3971142335883785e-06, + "loss": 0.4934, + "step": 9537 + }, + { + "epoch": 1.5473718364698248, + "grad_norm": 0.5939651382696973, + "learning_rate": 2.396687564377967e-06, + "loss": 0.5004, + "step": 9538 + }, + { + "epoch": 1.5475340687865022, + "grad_norm": 0.6440142745772013, + "learning_rate": 2.3962608981819114e-06, + "loss": 0.4874, + "step": 9539 + }, + { + "epoch": 1.54769630110318, + "grad_norm": 0.6410719429154474, + "learning_rate": 2.395834235012659e-06, + "loss": 0.5291, + "step": 9540 + }, + { + "epoch": 1.5478585334198574, + "grad_norm": 0.6333301888017208, + "learning_rate": 2.3954075748826597e-06, + "loss": 0.5136, + "step": 9541 + }, + { + "epoch": 1.5480207657365348, + "grad_norm": 0.6100315053335387, + "learning_rate": 2.394980917804362e-06, + "loss": 0.5284, + "step": 9542 + }, + { + "epoch": 1.5481829980532122, + "grad_norm": 0.6151019364611471, + "learning_rate": 2.3945542637902137e-06, + "loss": 0.5322, + "step": 9543 + }, + { + "epoch": 1.5483452303698897, + "grad_norm": 0.5957429664446404, + "learning_rate": 2.3941276128526656e-06, + "loss": 0.5231, + "step": 9544 + }, + { + "epoch": 1.5485074626865671, + "grad_norm": 0.5974111965861103, + "learning_rate": 2.393700965004164e-06, + "loss": 0.4966, + "step": 9545 + }, + { + "epoch": 1.5486696950032446, + "grad_norm": 0.607014641486166, + "learning_rate": 2.393274320257157e-06, + "loss": 0.543, + "step": 9546 + }, + { + "epoch": 1.548831927319922, + "grad_norm": 0.6474761743925428, + "learning_rate": 2.392847678624095e-06, + "loss": 0.538, + "step": 9547 + }, + { + "epoch": 1.5489941596365995, + "grad_norm": 0.5750616029978949, + "learning_rate": 2.3924210401174236e-06, + "loss": 0.5554, + "step": 9548 + }, + { + "epoch": 1.5491563919532771, + "grad_norm": 0.6024171128385186, + "learning_rate": 2.3919944047495934e-06, + "loss": 0.5463, + "step": 9549 + }, + { + "epoch": 1.5493186242699546, + "grad_norm": 0.6333931310189123, + "learning_rate": 2.3915677725330506e-06, + "loss": 0.5167, + "step": 9550 + }, + { + "epoch": 1.549480856586632, + "grad_norm": 0.6056210906880123, + "learning_rate": 2.3911411434802437e-06, + "loss": 0.4985, + "step": 9551 + }, + { + "epoch": 1.5496430889033095, + "grad_norm": 0.60964029863013, + "learning_rate": 2.3907145176036197e-06, + "loss": 0.5147, + "step": 9552 + }, + { + "epoch": 1.5498053212199872, + "grad_norm": 0.592608846458318, + "learning_rate": 2.3902878949156275e-06, + "loss": 0.509, + "step": 9553 + }, + { + "epoch": 1.5499675535366646, + "grad_norm": 0.5796668929289762, + "learning_rate": 2.3898612754287147e-06, + "loss": 0.4983, + "step": 9554 + }, + { + "epoch": 1.550129785853342, + "grad_norm": 0.6030501814274156, + "learning_rate": 2.389434659155327e-06, + "loss": 0.5372, + "step": 9555 + }, + { + "epoch": 1.5502920181700195, + "grad_norm": 0.6208765673283992, + "learning_rate": 2.389008046107914e-06, + "loss": 0.5095, + "step": 9556 + }, + { + "epoch": 1.550454250486697, + "grad_norm": 0.5905902116171995, + "learning_rate": 2.3885814362989216e-06, + "loss": 0.5207, + "step": 9557 + }, + { + "epoch": 1.5506164828033744, + "grad_norm": 0.6012597241335037, + "learning_rate": 2.388154829740798e-06, + "loss": 0.5502, + "step": 9558 + }, + { + "epoch": 1.5507787151200518, + "grad_norm": 0.6372965035777707, + "learning_rate": 2.3877282264459892e-06, + "loss": 0.4789, + "step": 9559 + }, + { + "epoch": 1.5509409474367293, + "grad_norm": 0.6147251625344244, + "learning_rate": 2.3873016264269446e-06, + "loss": 0.5297, + "step": 9560 + }, + { + "epoch": 1.5511031797534067, + "grad_norm": 0.6128870835247086, + "learning_rate": 2.3868750296961085e-06, + "loss": 0.5116, + "step": 9561 + }, + { + "epoch": 1.5512654120700844, + "grad_norm": 0.646282279166708, + "learning_rate": 2.3864484362659286e-06, + "loss": 0.5412, + "step": 9562 + }, + { + "epoch": 1.5514276443867618, + "grad_norm": 0.6116372587857345, + "learning_rate": 2.386021846148851e-06, + "loss": 0.5207, + "step": 9563 + }, + { + "epoch": 1.5515898767034393, + "grad_norm": 0.6033319315035337, + "learning_rate": 2.3855952593573246e-06, + "loss": 0.5292, + "step": 9564 + }, + { + "epoch": 1.551752109020117, + "grad_norm": 0.5940406723965647, + "learning_rate": 2.3851686759037943e-06, + "loss": 0.5197, + "step": 9565 + }, + { + "epoch": 1.5519143413367944, + "grad_norm": 0.6005542369835115, + "learning_rate": 2.3847420958007055e-06, + "loss": 0.5511, + "step": 9566 + }, + { + "epoch": 1.5520765736534718, + "grad_norm": 0.5652115241719217, + "learning_rate": 2.384315519060507e-06, + "loss": 0.5437, + "step": 9567 + }, + { + "epoch": 1.5522388059701493, + "grad_norm": 0.5878987700325912, + "learning_rate": 2.383888945695643e-06, + "loss": 0.5279, + "step": 9568 + }, + { + "epoch": 1.5524010382868267, + "grad_norm": 0.6434517319183858, + "learning_rate": 2.383462375718561e-06, + "loss": 0.5295, + "step": 9569 + }, + { + "epoch": 1.5525632706035042, + "grad_norm": 0.6867091706705638, + "learning_rate": 2.383035809141707e-06, + "loss": 0.532, + "step": 9570 + }, + { + "epoch": 1.5527255029201816, + "grad_norm": 0.6562165406326939, + "learning_rate": 2.382609245977526e-06, + "loss": 0.4972, + "step": 9571 + }, + { + "epoch": 1.552887735236859, + "grad_norm": 0.6487725500728034, + "learning_rate": 2.3821826862384647e-06, + "loss": 0.5081, + "step": 9572 + }, + { + "epoch": 1.5530499675535365, + "grad_norm": 0.5900535201471164, + "learning_rate": 2.3817561299369677e-06, + "loss": 0.4996, + "step": 9573 + }, + { + "epoch": 1.5532121998702142, + "grad_norm": 0.6055875099605854, + "learning_rate": 2.3813295770854826e-06, + "loss": 0.5021, + "step": 9574 + }, + { + "epoch": 1.5533744321868916, + "grad_norm": 0.6379075363808372, + "learning_rate": 2.3809030276964542e-06, + "loss": 0.4945, + "step": 9575 + }, + { + "epoch": 1.553536664503569, + "grad_norm": 0.6213262177099036, + "learning_rate": 2.3804764817823273e-06, + "loss": 0.5228, + "step": 9576 + }, + { + "epoch": 1.5536988968202468, + "grad_norm": 0.5839675356066145, + "learning_rate": 2.3800499393555472e-06, + "loss": 0.5573, + "step": 9577 + }, + { + "epoch": 1.5538611291369242, + "grad_norm": 0.607919159492989, + "learning_rate": 2.37962340042856e-06, + "loss": 0.5195, + "step": 9578 + }, + { + "epoch": 1.5540233614536016, + "grad_norm": 0.6207643545708399, + "learning_rate": 2.37919686501381e-06, + "loss": 0.5065, + "step": 9579 + }, + { + "epoch": 1.554185593770279, + "grad_norm": 0.6433570410444814, + "learning_rate": 2.3787703331237437e-06, + "loss": 0.5507, + "step": 9580 + }, + { + "epoch": 1.5543478260869565, + "grad_norm": 0.5899365730715752, + "learning_rate": 2.3783438047708045e-06, + "loss": 0.5289, + "step": 9581 + }, + { + "epoch": 1.554510058403634, + "grad_norm": 0.5727583666691753, + "learning_rate": 2.3779172799674377e-06, + "loss": 0.474, + "step": 9582 + }, + { + "epoch": 1.5546722907203114, + "grad_norm": 0.6069951413799156, + "learning_rate": 2.3774907587260888e-06, + "loss": 0.5077, + "step": 9583 + }, + { + "epoch": 1.5548345230369889, + "grad_norm": 0.626558043133812, + "learning_rate": 2.377064241059201e-06, + "loss": 0.5229, + "step": 9584 + }, + { + "epoch": 1.5549967553536663, + "grad_norm": 0.615058868974659, + "learning_rate": 2.3766377269792214e-06, + "loss": 0.5127, + "step": 9585 + }, + { + "epoch": 1.5551589876703438, + "grad_norm": 0.5975422458194101, + "learning_rate": 2.376211216498591e-06, + "loss": 0.5138, + "step": 9586 + }, + { + "epoch": 1.5553212199870214, + "grad_norm": 0.5959127946073716, + "learning_rate": 2.3757847096297564e-06, + "loss": 0.5007, + "step": 9587 + }, + { + "epoch": 1.5554834523036989, + "grad_norm": 0.560945636418264, + "learning_rate": 2.3753582063851607e-06, + "loss": 0.5137, + "step": 9588 + }, + { + "epoch": 1.5556456846203763, + "grad_norm": 0.5934460789043201, + "learning_rate": 2.3749317067772492e-06, + "loss": 0.5212, + "step": 9589 + }, + { + "epoch": 1.555807916937054, + "grad_norm": 0.6048395825968536, + "learning_rate": 2.374505210818466e-06, + "loss": 0.5294, + "step": 9590 + }, + { + "epoch": 1.5559701492537314, + "grad_norm": 0.6078362036876973, + "learning_rate": 2.3740787185212534e-06, + "loss": 0.5392, + "step": 9591 + }, + { + "epoch": 1.556132381570409, + "grad_norm": 0.6246671307679589, + "learning_rate": 2.373652229898056e-06, + "loss": 0.5471, + "step": 9592 + }, + { + "epoch": 1.5562946138870863, + "grad_norm": 0.6174454494414654, + "learning_rate": 2.3732257449613174e-06, + "loss": 0.5229, + "step": 9593 + }, + { + "epoch": 1.5564568462037638, + "grad_norm": 0.6284164161014014, + "learning_rate": 2.372799263723482e-06, + "loss": 0.525, + "step": 9594 + }, + { + "epoch": 1.5566190785204412, + "grad_norm": 0.5794529025295301, + "learning_rate": 2.3723727861969933e-06, + "loss": 0.5004, + "step": 9595 + }, + { + "epoch": 1.5567813108371187, + "grad_norm": 0.598091560492034, + "learning_rate": 2.3719463123942934e-06, + "loss": 0.4649, + "step": 9596 + }, + { + "epoch": 1.5569435431537961, + "grad_norm": 0.6271879803302981, + "learning_rate": 2.3715198423278257e-06, + "loss": 0.5258, + "step": 9597 + }, + { + "epoch": 1.5571057754704736, + "grad_norm": 0.6046393917313245, + "learning_rate": 2.3710933760100345e-06, + "loss": 0.5189, + "step": 9598 + }, + { + "epoch": 1.5572680077871512, + "grad_norm": 0.5627003909705093, + "learning_rate": 2.3706669134533614e-06, + "loss": 0.5147, + "step": 9599 + }, + { + "epoch": 1.5574302401038287, + "grad_norm": 0.6019223790240651, + "learning_rate": 2.370240454670251e-06, + "loss": 0.4925, + "step": 9600 + }, + { + "epoch": 1.5575924724205061, + "grad_norm": 0.609893220394005, + "learning_rate": 2.3698139996731454e-06, + "loss": 0.5422, + "step": 9601 + }, + { + "epoch": 1.5577547047371838, + "grad_norm": 0.6043154493641367, + "learning_rate": 2.369387548474486e-06, + "loss": 0.5412, + "step": 9602 + }, + { + "epoch": 1.5579169370538613, + "grad_norm": 0.611928663235029, + "learning_rate": 2.3689611010867175e-06, + "loss": 0.5401, + "step": 9603 + }, + { + "epoch": 1.5580791693705387, + "grad_norm": 0.6104258333503724, + "learning_rate": 2.368534657522281e-06, + "loss": 0.55, + "step": 9604 + }, + { + "epoch": 1.5582414016872161, + "grad_norm": 0.6300517294655905, + "learning_rate": 2.3681082177936194e-06, + "loss": 0.5238, + "step": 9605 + }, + { + "epoch": 1.5584036340038936, + "grad_norm": 0.6370010619752666, + "learning_rate": 2.3676817819131757e-06, + "loss": 0.5152, + "step": 9606 + }, + { + "epoch": 1.558565866320571, + "grad_norm": 0.6409883231269734, + "learning_rate": 2.36725534989339e-06, + "loss": 0.4939, + "step": 9607 + }, + { + "epoch": 1.5587280986372485, + "grad_norm": 0.6067397470024295, + "learning_rate": 2.3668289217467063e-06, + "loss": 0.5335, + "step": 9608 + }, + { + "epoch": 1.558890330953926, + "grad_norm": 0.5983611906320938, + "learning_rate": 2.366402497485566e-06, + "loss": 0.51, + "step": 9609 + }, + { + "epoch": 1.5590525632706034, + "grad_norm": 0.6461723187102341, + "learning_rate": 2.36597607712241e-06, + "loss": 0.4983, + "step": 9610 + }, + { + "epoch": 1.5592147955872808, + "grad_norm": 0.6172403532995955, + "learning_rate": 2.365549660669682e-06, + "loss": 0.5296, + "step": 9611 + }, + { + "epoch": 1.5593770279039585, + "grad_norm": 0.6526987029325276, + "learning_rate": 2.365123248139821e-06, + "loss": 0.51, + "step": 9612 + }, + { + "epoch": 1.559539260220636, + "grad_norm": 0.5777105185406909, + "learning_rate": 2.3646968395452703e-06, + "loss": 0.517, + "step": 9613 + }, + { + "epoch": 1.5597014925373134, + "grad_norm": 0.5941270861637805, + "learning_rate": 2.364270434898471e-06, + "loss": 0.4976, + "step": 9614 + }, + { + "epoch": 1.559863724853991, + "grad_norm": 0.6142103239599891, + "learning_rate": 2.363844034211863e-06, + "loss": 0.5317, + "step": 9615 + }, + { + "epoch": 1.5600259571706685, + "grad_norm": 0.6171063481000603, + "learning_rate": 2.36341763749789e-06, + "loss": 0.5053, + "step": 9616 + }, + { + "epoch": 1.560188189487346, + "grad_norm": 0.6021977080682296, + "learning_rate": 2.362991244768991e-06, + "loss": 0.5171, + "step": 9617 + }, + { + "epoch": 1.5603504218040234, + "grad_norm": 0.5916406377215001, + "learning_rate": 2.362564856037607e-06, + "loss": 0.5274, + "step": 9618 + }, + { + "epoch": 1.5605126541207008, + "grad_norm": 0.647779188906249, + "learning_rate": 2.3621384713161793e-06, + "loss": 0.5367, + "step": 9619 + }, + { + "epoch": 1.5606748864373783, + "grad_norm": 0.5850029420351426, + "learning_rate": 2.3617120906171484e-06, + "loss": 0.5088, + "step": 9620 + }, + { + "epoch": 1.5608371187540557, + "grad_norm": 0.5742320555967424, + "learning_rate": 2.3612857139529564e-06, + "loss": 0.5322, + "step": 9621 + }, + { + "epoch": 1.5609993510707332, + "grad_norm": 0.6084942316547708, + "learning_rate": 2.3608593413360403e-06, + "loss": 0.5126, + "step": 9622 + }, + { + "epoch": 1.5611615833874106, + "grad_norm": 0.5940266539638193, + "learning_rate": 2.3604329727788435e-06, + "loss": 0.5004, + "step": 9623 + }, + { + "epoch": 1.5613238157040883, + "grad_norm": 0.5939405390184492, + "learning_rate": 2.360006608293804e-06, + "loss": 0.4928, + "step": 9624 + }, + { + "epoch": 1.5614860480207657, + "grad_norm": 0.6143001144868845, + "learning_rate": 2.3595802478933634e-06, + "loss": 0.4909, + "step": 9625 + }, + { + "epoch": 1.5616482803374432, + "grad_norm": 0.6316167099060862, + "learning_rate": 2.359153891589962e-06, + "loss": 0.5632, + "step": 9626 + }, + { + "epoch": 1.5618105126541209, + "grad_norm": 0.586453350597033, + "learning_rate": 2.3587275393960375e-06, + "loss": 0.5401, + "step": 9627 + }, + { + "epoch": 1.5619727449707983, + "grad_norm": 0.5949310269171048, + "learning_rate": 2.3583011913240317e-06, + "loss": 0.5649, + "step": 9628 + }, + { + "epoch": 1.5621349772874757, + "grad_norm": 0.5801507055983948, + "learning_rate": 2.357874847386383e-06, + "loss": 0.5128, + "step": 9629 + }, + { + "epoch": 1.5622972096041532, + "grad_norm": 0.6423342509565015, + "learning_rate": 2.357448507595531e-06, + "loss": 0.5511, + "step": 9630 + }, + { + "epoch": 1.5624594419208306, + "grad_norm": 0.5882571590710568, + "learning_rate": 2.357022171963917e-06, + "loss": 0.5319, + "step": 9631 + }, + { + "epoch": 1.562621674237508, + "grad_norm": 0.5904648562477168, + "learning_rate": 2.3565958405039767e-06, + "loss": 0.5513, + "step": 9632 + }, + { + "epoch": 1.5627839065541855, + "grad_norm": 0.5910554169171645, + "learning_rate": 2.3561695132281515e-06, + "loss": 0.5499, + "step": 9633 + }, + { + "epoch": 1.562946138870863, + "grad_norm": 0.5999483523157885, + "learning_rate": 2.35574319014888e-06, + "loss": 0.5511, + "step": 9634 + }, + { + "epoch": 1.5631083711875404, + "grad_norm": 0.5960270719667147, + "learning_rate": 2.3553168712786006e-06, + "loss": 0.5261, + "step": 9635 + }, + { + "epoch": 1.563270603504218, + "grad_norm": 0.617241687812595, + "learning_rate": 2.3548905566297535e-06, + "loss": 0.5268, + "step": 9636 + }, + { + "epoch": 1.5634328358208955, + "grad_norm": 0.5955636303506813, + "learning_rate": 2.3544642462147756e-06, + "loss": 0.5092, + "step": 9637 + }, + { + "epoch": 1.563595068137573, + "grad_norm": 0.6192773833808194, + "learning_rate": 2.354037940046106e-06, + "loss": 0.5232, + "step": 9638 + }, + { + "epoch": 1.5637573004542504, + "grad_norm": 0.609013993270328, + "learning_rate": 2.3536116381361833e-06, + "loss": 0.533, + "step": 9639 + }, + { + "epoch": 1.563919532770928, + "grad_norm": 0.5879043682660777, + "learning_rate": 2.3531853404974446e-06, + "loss": 0.5455, + "step": 9640 + }, + { + "epoch": 1.5640817650876055, + "grad_norm": 0.797875025006444, + "learning_rate": 2.3527590471423307e-06, + "loss": 0.5268, + "step": 9641 + }, + { + "epoch": 1.564243997404283, + "grad_norm": 0.5961376206205404, + "learning_rate": 2.352332758083277e-06, + "loss": 0.516, + "step": 9642 + }, + { + "epoch": 1.5644062297209604, + "grad_norm": 0.6043662406911862, + "learning_rate": 2.3519064733327225e-06, + "loss": 0.4889, + "step": 9643 + }, + { + "epoch": 1.5645684620376379, + "grad_norm": 0.6227455596913706, + "learning_rate": 2.351480192903104e-06, + "loss": 0.5453, + "step": 9644 + }, + { + "epoch": 1.5647306943543153, + "grad_norm": 0.629940827991726, + "learning_rate": 2.3510539168068602e-06, + "loss": 0.4911, + "step": 9645 + }, + { + "epoch": 1.5648929266709928, + "grad_norm": 0.5950976035793615, + "learning_rate": 2.3506276450564287e-06, + "loss": 0.5063, + "step": 9646 + }, + { + "epoch": 1.5650551589876702, + "grad_norm": 0.6059531091558009, + "learning_rate": 2.3502013776642455e-06, + "loss": 0.5251, + "step": 9647 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.6057603521801561, + "learning_rate": 2.3497751146427494e-06, + "loss": 0.4807, + "step": 9648 + }, + { + "epoch": 1.5653796236210253, + "grad_norm": 0.613393094957925, + "learning_rate": 2.349348856004376e-06, + "loss": 0.5256, + "step": 9649 + }, + { + "epoch": 1.5655418559377028, + "grad_norm": 0.585213292193197, + "learning_rate": 2.348922601761564e-06, + "loss": 0.5172, + "step": 9650 + }, + { + "epoch": 1.5657040882543802, + "grad_norm": 0.6205100592142309, + "learning_rate": 2.3484963519267485e-06, + "loss": 0.5124, + "step": 9651 + }, + { + "epoch": 1.565866320571058, + "grad_norm": 0.645244197272917, + "learning_rate": 2.3480701065123683e-06, + "loss": 0.553, + "step": 9652 + }, + { + "epoch": 1.5660285528877353, + "grad_norm": 0.5837919157973478, + "learning_rate": 2.347643865530858e-06, + "loss": 0.5272, + "step": 9653 + }, + { + "epoch": 1.5661907852044128, + "grad_norm": 0.6057746767062798, + "learning_rate": 2.3472176289946544e-06, + "loss": 0.5103, + "step": 9654 + }, + { + "epoch": 1.5663530175210902, + "grad_norm": 0.6086732521847127, + "learning_rate": 2.3467913969161947e-06, + "loss": 0.4982, + "step": 9655 + }, + { + "epoch": 1.5665152498377677, + "grad_norm": 0.6222042407299423, + "learning_rate": 2.346365169307915e-06, + "loss": 0.5234, + "step": 9656 + }, + { + "epoch": 1.5666774821544451, + "grad_norm": 0.5922530930261021, + "learning_rate": 2.3459389461822517e-06, + "loss": 0.5182, + "step": 9657 + }, + { + "epoch": 1.5668397144711226, + "grad_norm": 0.6088923632222726, + "learning_rate": 2.3455127275516386e-06, + "loss": 0.481, + "step": 9658 + }, + { + "epoch": 1.5670019467878, + "grad_norm": 0.6180121757665207, + "learning_rate": 2.345086513428514e-06, + "loss": 0.5335, + "step": 9659 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.6152562311095444, + "learning_rate": 2.3446603038253115e-06, + "loss": 0.5296, + "step": 9660 + }, + { + "epoch": 1.5673264114211551, + "grad_norm": 0.5906092714606958, + "learning_rate": 2.344234098754469e-06, + "loss": 0.5095, + "step": 9661 + }, + { + "epoch": 1.5674886437378326, + "grad_norm": 0.5765202644452356, + "learning_rate": 2.3438078982284206e-06, + "loss": 0.5193, + "step": 9662 + }, + { + "epoch": 1.56765087605451, + "grad_norm": 0.606053871961849, + "learning_rate": 2.343381702259601e-06, + "loss": 0.5245, + "step": 9663 + }, + { + "epoch": 1.5678131083711877, + "grad_norm": 0.5940996060095944, + "learning_rate": 2.342955510860446e-06, + "loss": 0.5196, + "step": 9664 + }, + { + "epoch": 1.5679753406878651, + "grad_norm": 0.6084505207873521, + "learning_rate": 2.3425293240433906e-06, + "loss": 0.517, + "step": 9665 + }, + { + "epoch": 1.5681375730045426, + "grad_norm": 0.5997962330774166, + "learning_rate": 2.34210314182087e-06, + "loss": 0.4912, + "step": 9666 + }, + { + "epoch": 1.56829980532122, + "grad_norm": 0.5766936214583921, + "learning_rate": 2.3416769642053193e-06, + "loss": 0.5232, + "step": 9667 + }, + { + "epoch": 1.5684620376378975, + "grad_norm": 0.5714032939473245, + "learning_rate": 2.341250791209172e-06, + "loss": 0.5081, + "step": 9668 + }, + { + "epoch": 1.568624269954575, + "grad_norm": 0.6375607324334935, + "learning_rate": 2.3408246228448627e-06, + "loss": 0.5089, + "step": 9669 + }, + { + "epoch": 1.5687865022712524, + "grad_norm": 0.6143385393601644, + "learning_rate": 2.3403984591248265e-06, + "loss": 0.5384, + "step": 9670 + }, + { + "epoch": 1.5689487345879298, + "grad_norm": 0.622339019607823, + "learning_rate": 2.3399723000614966e-06, + "loss": 0.5375, + "step": 9671 + }, + { + "epoch": 1.5691109669046073, + "grad_norm": 0.6069151696477436, + "learning_rate": 2.3395461456673093e-06, + "loss": 0.5124, + "step": 9672 + }, + { + "epoch": 1.5692731992212847, + "grad_norm": 0.579515354738125, + "learning_rate": 2.3391199959546963e-06, + "loss": 0.4852, + "step": 9673 + }, + { + "epoch": 1.5694354315379624, + "grad_norm": 0.6154211075046813, + "learning_rate": 2.338693850936091e-06, + "loss": 0.4891, + "step": 9674 + }, + { + "epoch": 1.5695976638546398, + "grad_norm": 0.5825552064836537, + "learning_rate": 2.3382677106239292e-06, + "loss": 0.5145, + "step": 9675 + }, + { + "epoch": 1.5697598961713173, + "grad_norm": 0.5981663631320341, + "learning_rate": 2.3378415750306423e-06, + "loss": 0.5178, + "step": 9676 + }, + { + "epoch": 1.569922128487995, + "grad_norm": 0.5977701562434212, + "learning_rate": 2.3374154441686667e-06, + "loss": 0.5318, + "step": 9677 + }, + { + "epoch": 1.5700843608046724, + "grad_norm": 0.613356236398246, + "learning_rate": 2.336989318050432e-06, + "loss": 0.5387, + "step": 9678 + }, + { + "epoch": 1.5702465931213498, + "grad_norm": 0.6054652895897717, + "learning_rate": 2.3365631966883736e-06, + "loss": 0.5348, + "step": 9679 + }, + { + "epoch": 1.5704088254380273, + "grad_norm": 0.5796003968164585, + "learning_rate": 2.3361370800949234e-06, + "loss": 0.5243, + "step": 9680 + }, + { + "epoch": 1.5705710577547047, + "grad_norm": 0.5917546876581671, + "learning_rate": 2.3357109682825155e-06, + "loss": 0.509, + "step": 9681 + }, + { + "epoch": 1.5707332900713822, + "grad_norm": 0.6233642364710628, + "learning_rate": 2.335284861263582e-06, + "loss": 0.5339, + "step": 9682 + }, + { + "epoch": 1.5708955223880596, + "grad_norm": 0.5959203173419723, + "learning_rate": 2.3348587590505546e-06, + "loss": 0.495, + "step": 9683 + }, + { + "epoch": 1.571057754704737, + "grad_norm": 0.5965814556896004, + "learning_rate": 2.3344326616558668e-06, + "loss": 0.5248, + "step": 9684 + }, + { + "epoch": 1.5712199870214145, + "grad_norm": 0.6009695069224408, + "learning_rate": 2.3340065690919493e-06, + "loss": 0.5294, + "step": 9685 + }, + { + "epoch": 1.5713822193380922, + "grad_norm": 0.6249841712256442, + "learning_rate": 2.333580481371237e-06, + "loss": 0.5675, + "step": 9686 + }, + { + "epoch": 1.5715444516547696, + "grad_norm": 0.6167275980877531, + "learning_rate": 2.33315439850616e-06, + "loss": 0.5225, + "step": 9687 + }, + { + "epoch": 1.571706683971447, + "grad_norm": 0.6248294468544638, + "learning_rate": 2.33272832050915e-06, + "loss": 0.4867, + "step": 9688 + }, + { + "epoch": 1.5718689162881248, + "grad_norm": 0.6012954672773562, + "learning_rate": 2.3323022473926395e-06, + "loss": 0.528, + "step": 9689 + }, + { + "epoch": 1.5720311486048022, + "grad_norm": 0.6160417696998365, + "learning_rate": 2.33187617916906e-06, + "loss": 0.4699, + "step": 9690 + }, + { + "epoch": 1.5721933809214796, + "grad_norm": 0.6228712762449112, + "learning_rate": 2.3314501158508417e-06, + "loss": 0.5159, + "step": 9691 + }, + { + "epoch": 1.572355613238157, + "grad_norm": 0.6169787903515311, + "learning_rate": 2.3310240574504184e-06, + "loss": 0.5074, + "step": 9692 + }, + { + "epoch": 1.5725178455548345, + "grad_norm": 0.5882827456150634, + "learning_rate": 2.330598003980219e-06, + "loss": 0.526, + "step": 9693 + }, + { + "epoch": 1.572680077871512, + "grad_norm": 0.6480020152377751, + "learning_rate": 2.330171955452675e-06, + "loss": 0.5128, + "step": 9694 + }, + { + "epoch": 1.5728423101881894, + "grad_norm": 0.5949389595332112, + "learning_rate": 2.3297459118802173e-06, + "loss": 0.5244, + "step": 9695 + }, + { + "epoch": 1.5730045425048669, + "grad_norm": 0.6022871485800112, + "learning_rate": 2.3293198732752762e-06, + "loss": 0.5079, + "step": 9696 + }, + { + "epoch": 1.5731667748215443, + "grad_norm": 0.6056353409173286, + "learning_rate": 2.328893839650284e-06, + "loss": 0.4984, + "step": 9697 + }, + { + "epoch": 1.5733290071382218, + "grad_norm": 0.668963763964466, + "learning_rate": 2.3284678110176704e-06, + "loss": 0.5144, + "step": 9698 + }, + { + "epoch": 1.5734912394548994, + "grad_norm": 0.5811096031222274, + "learning_rate": 2.328041787389864e-06, + "loss": 0.5498, + "step": 9699 + }, + { + "epoch": 1.5736534717715769, + "grad_norm": 0.6232645136075338, + "learning_rate": 2.3276157687792966e-06, + "loss": 0.5363, + "step": 9700 + }, + { + "epoch": 1.5738157040882543, + "grad_norm": 0.5982529507357883, + "learning_rate": 2.3271897551983977e-06, + "loss": 0.4982, + "step": 9701 + }, + { + "epoch": 1.573977936404932, + "grad_norm": 0.5972302354638769, + "learning_rate": 2.3267637466595967e-06, + "loss": 0.5015, + "step": 9702 + }, + { + "epoch": 1.5741401687216094, + "grad_norm": 0.5916780846890489, + "learning_rate": 2.3263377431753247e-06, + "loss": 0.5135, + "step": 9703 + }, + { + "epoch": 1.574302401038287, + "grad_norm": 0.5634362200467385, + "learning_rate": 2.32591174475801e-06, + "loss": 0.503, + "step": 9704 + }, + { + "epoch": 1.5744646333549643, + "grad_norm": 0.5719227708450176, + "learning_rate": 2.3254857514200814e-06, + "loss": 0.4772, + "step": 9705 + }, + { + "epoch": 1.5746268656716418, + "grad_norm": 0.5724421263929892, + "learning_rate": 2.32505976317397e-06, + "loss": 0.5074, + "step": 9706 + }, + { + "epoch": 1.5747890979883192, + "grad_norm": 0.5979198702941313, + "learning_rate": 2.3246337800321033e-06, + "loss": 0.5301, + "step": 9707 + }, + { + "epoch": 1.5749513303049967, + "grad_norm": 0.6253933915024048, + "learning_rate": 2.3242078020069116e-06, + "loss": 0.556, + "step": 9708 + }, + { + "epoch": 1.5751135626216741, + "grad_norm": 0.649696914042647, + "learning_rate": 2.3237818291108225e-06, + "loss": 0.5416, + "step": 9709 + }, + { + "epoch": 1.5752757949383516, + "grad_norm": 0.5803218658326117, + "learning_rate": 2.3233558613562645e-06, + "loss": 0.4957, + "step": 9710 + }, + { + "epoch": 1.5754380272550292, + "grad_norm": 0.6177152115944299, + "learning_rate": 2.3229298987556674e-06, + "loss": 0.5409, + "step": 9711 + }, + { + "epoch": 1.5756002595717067, + "grad_norm": 0.5836121019109365, + "learning_rate": 2.3225039413214583e-06, + "loss": 0.5, + "step": 9712 + }, + { + "epoch": 1.5757624918883841, + "grad_norm": 0.6059775413677226, + "learning_rate": 2.322077989066067e-06, + "loss": 0.5095, + "step": 9713 + }, + { + "epoch": 1.5759247242050618, + "grad_norm": 0.6032706841735481, + "learning_rate": 2.3216520420019194e-06, + "loss": 0.5117, + "step": 9714 + }, + { + "epoch": 1.5760869565217392, + "grad_norm": 0.5790518628759115, + "learning_rate": 2.3212261001414453e-06, + "loss": 0.4951, + "step": 9715 + }, + { + "epoch": 1.5762491888384167, + "grad_norm": 0.6252780302355647, + "learning_rate": 2.3208001634970704e-06, + "loss": 0.5038, + "step": 9716 + }, + { + "epoch": 1.5764114211550941, + "grad_norm": 0.5899725561425069, + "learning_rate": 2.320374232081224e-06, + "loss": 0.4969, + "step": 9717 + }, + { + "epoch": 1.5765736534717716, + "grad_norm": 0.591704277018916, + "learning_rate": 2.319948305906334e-06, + "loss": 0.521, + "step": 9718 + }, + { + "epoch": 1.576735885788449, + "grad_norm": 0.6218988227868818, + "learning_rate": 2.3195223849848256e-06, + "loss": 0.5317, + "step": 9719 + }, + { + "epoch": 1.5768981181051265, + "grad_norm": 0.6106412039346243, + "learning_rate": 2.3190964693291275e-06, + "loss": 0.5092, + "step": 9720 + }, + { + "epoch": 1.577060350421804, + "grad_norm": 0.6352758094535045, + "learning_rate": 2.3186705589516657e-06, + "loss": 0.5127, + "step": 9721 + }, + { + "epoch": 1.5772225827384814, + "grad_norm": 0.5886267613506071, + "learning_rate": 2.318244653864868e-06, + "loss": 0.4867, + "step": 9722 + }, + { + "epoch": 1.577384815055159, + "grad_norm": 0.5935581534528153, + "learning_rate": 2.3178187540811607e-06, + "loss": 0.5486, + "step": 9723 + }, + { + "epoch": 1.5775470473718365, + "grad_norm": 0.5833035136201329, + "learning_rate": 2.3173928596129703e-06, + "loss": 0.5189, + "step": 9724 + }, + { + "epoch": 1.577709279688514, + "grad_norm": 0.610973995543089, + "learning_rate": 2.316966970472722e-06, + "loss": 0.4978, + "step": 9725 + }, + { + "epoch": 1.5778715120051914, + "grad_norm": 0.606419963188429, + "learning_rate": 2.316541086672844e-06, + "loss": 0.5172, + "step": 9726 + }, + { + "epoch": 1.578033744321869, + "grad_norm": 0.5916817829850742, + "learning_rate": 2.316115208225761e-06, + "loss": 0.5357, + "step": 9727 + }, + { + "epoch": 1.5781959766385465, + "grad_norm": 0.597020696890349, + "learning_rate": 2.3156893351439e-06, + "loss": 0.5083, + "step": 9728 + }, + { + "epoch": 1.578358208955224, + "grad_norm": 0.6053464700535091, + "learning_rate": 2.3152634674396857e-06, + "loss": 0.5019, + "step": 9729 + }, + { + "epoch": 1.5785204412719014, + "grad_norm": 0.5930758825432468, + "learning_rate": 2.314837605125543e-06, + "loss": 0.5126, + "step": 9730 + }, + { + "epoch": 1.5786826735885788, + "grad_norm": 0.6058758267150129, + "learning_rate": 2.3144117482138996e-06, + "loss": 0.5301, + "step": 9731 + }, + { + "epoch": 1.5788449059052563, + "grad_norm": 0.6185229146681019, + "learning_rate": 2.3139858967171784e-06, + "loss": 0.5098, + "step": 9732 + }, + { + "epoch": 1.5790071382219337, + "grad_norm": 0.5993424517261816, + "learning_rate": 2.3135600506478073e-06, + "loss": 0.4854, + "step": 9733 + }, + { + "epoch": 1.5791693705386112, + "grad_norm": 0.5766067044052607, + "learning_rate": 2.3131342100182084e-06, + "loss": 0.5198, + "step": 9734 + }, + { + "epoch": 1.5793316028552886, + "grad_norm": 0.6278388355485754, + "learning_rate": 2.312708374840808e-06, + "loss": 0.5511, + "step": 9735 + }, + { + "epoch": 1.5794938351719663, + "grad_norm": 0.5968430489855852, + "learning_rate": 2.3122825451280294e-06, + "loss": 0.5386, + "step": 9736 + }, + { + "epoch": 1.5796560674886437, + "grad_norm": 0.5945674980550905, + "learning_rate": 2.311856720892299e-06, + "loss": 0.531, + "step": 9737 + }, + { + "epoch": 1.5798182998053212, + "grad_norm": 0.5996664154401036, + "learning_rate": 2.3114309021460404e-06, + "loss": 0.5331, + "step": 9738 + }, + { + "epoch": 1.5799805321219988, + "grad_norm": 0.6057188717131491, + "learning_rate": 2.3110050889016765e-06, + "loss": 0.5452, + "step": 9739 + }, + { + "epoch": 1.5801427644386763, + "grad_norm": 0.6029060652989929, + "learning_rate": 2.3105792811716327e-06, + "loss": 0.5315, + "step": 9740 + }, + { + "epoch": 1.5803049967553537, + "grad_norm": 0.6720074572388317, + "learning_rate": 2.310153478968332e-06, + "loss": 0.5063, + "step": 9741 + }, + { + "epoch": 1.5804672290720312, + "grad_norm": 0.6112130687892333, + "learning_rate": 2.309727682304199e-06, + "loss": 0.5229, + "step": 9742 + }, + { + "epoch": 1.5806294613887086, + "grad_norm": 0.5888143572707331, + "learning_rate": 2.309301891191656e-06, + "loss": 0.5042, + "step": 9743 + }, + { + "epoch": 1.580791693705386, + "grad_norm": 0.5674555814831399, + "learning_rate": 2.308876105643128e-06, + "loss": 0.4894, + "step": 9744 + }, + { + "epoch": 1.5809539260220635, + "grad_norm": 0.5837271895070402, + "learning_rate": 2.3084503256710366e-06, + "loss": 0.514, + "step": 9745 + }, + { + "epoch": 1.581116158338741, + "grad_norm": 0.5762073942255882, + "learning_rate": 2.308024551287805e-06, + "loss": 0.5073, + "step": 9746 + }, + { + "epoch": 1.5812783906554184, + "grad_norm": 0.5698768819855821, + "learning_rate": 2.307598782505857e-06, + "loss": 0.5225, + "step": 9747 + }, + { + "epoch": 1.581440622972096, + "grad_norm": 0.5860340907712432, + "learning_rate": 2.307173019337615e-06, + "loss": 0.5221, + "step": 9748 + }, + { + "epoch": 1.5816028552887735, + "grad_norm": 0.5952338379329194, + "learning_rate": 2.3067472617955013e-06, + "loss": 0.5325, + "step": 9749 + }, + { + "epoch": 1.581765087605451, + "grad_norm": 0.6449664079817626, + "learning_rate": 2.3063215098919374e-06, + "loss": 0.53, + "step": 9750 + }, + { + "epoch": 1.5819273199221286, + "grad_norm": 0.5912970092868592, + "learning_rate": 2.3058957636393466e-06, + "loss": 0.5192, + "step": 9751 + }, + { + "epoch": 1.582089552238806, + "grad_norm": 0.6244514612625008, + "learning_rate": 2.3054700230501503e-06, + "loss": 0.4717, + "step": 9752 + }, + { + "epoch": 1.5822517845554835, + "grad_norm": 0.6236923007776319, + "learning_rate": 2.3050442881367714e-06, + "loss": 0.504, + "step": 9753 + }, + { + "epoch": 1.582414016872161, + "grad_norm": 0.6333925037639366, + "learning_rate": 2.304618558911631e-06, + "loss": 0.498, + "step": 9754 + }, + { + "epoch": 1.5825762491888384, + "grad_norm": 0.5882478196090458, + "learning_rate": 2.3041928353871502e-06, + "loss": 0.5117, + "step": 9755 + }, + { + "epoch": 1.5827384815055159, + "grad_norm": 0.6333436923833199, + "learning_rate": 2.303767117575751e-06, + "loss": 0.5302, + "step": 9756 + }, + { + "epoch": 1.5829007138221933, + "grad_norm": 0.6385701748974068, + "learning_rate": 2.3033414054898533e-06, + "loss": 0.5146, + "step": 9757 + }, + { + "epoch": 1.5830629461388708, + "grad_norm": 0.6262933109837268, + "learning_rate": 2.30291569914188e-06, + "loss": 0.5198, + "step": 9758 + }, + { + "epoch": 1.5832251784555482, + "grad_norm": 0.5775190775679512, + "learning_rate": 2.3024899985442525e-06, + "loss": 0.5256, + "step": 9759 + }, + { + "epoch": 1.5833874107722257, + "grad_norm": 0.6121202754365856, + "learning_rate": 2.3020643037093886e-06, + "loss": 0.5399, + "step": 9760 + }, + { + "epoch": 1.5835496430889033, + "grad_norm": 0.5841998283160258, + "learning_rate": 2.30163861464971e-06, + "loss": 0.5344, + "step": 9761 + }, + { + "epoch": 1.5837118754055808, + "grad_norm": 0.6375737456160623, + "learning_rate": 2.301212931377638e-06, + "loss": 0.5142, + "step": 9762 + }, + { + "epoch": 1.5838741077222582, + "grad_norm": 0.5987070429494634, + "learning_rate": 2.3007872539055916e-06, + "loss": 0.5326, + "step": 9763 + }, + { + "epoch": 1.584036340038936, + "grad_norm": 0.6044754826696747, + "learning_rate": 2.300361582245993e-06, + "loss": 0.4795, + "step": 9764 + }, + { + "epoch": 1.5841985723556133, + "grad_norm": 0.6030865845742618, + "learning_rate": 2.2999359164112595e-06, + "loss": 0.5129, + "step": 9765 + }, + { + "epoch": 1.5843608046722908, + "grad_norm": 0.623127766977438, + "learning_rate": 2.2995102564138113e-06, + "loss": 0.5231, + "step": 9766 + }, + { + "epoch": 1.5845230369889682, + "grad_norm": 0.6326589526254357, + "learning_rate": 2.299084602266069e-06, + "loss": 0.5144, + "step": 9767 + }, + { + "epoch": 1.5846852693056457, + "grad_norm": 0.5981825882162859, + "learning_rate": 2.298658953980451e-06, + "loss": 0.4891, + "step": 9768 + }, + { + "epoch": 1.5848475016223231, + "grad_norm": 0.607883951916392, + "learning_rate": 2.298233311569378e-06, + "loss": 0.4934, + "step": 9769 + }, + { + "epoch": 1.5850097339390006, + "grad_norm": 0.5934333675852184, + "learning_rate": 2.297807675045267e-06, + "loss": 0.5059, + "step": 9770 + }, + { + "epoch": 1.585171966255678, + "grad_norm": 0.6639185967047275, + "learning_rate": 2.2973820444205374e-06, + "loss": 0.5676, + "step": 9771 + }, + { + "epoch": 1.5853341985723555, + "grad_norm": 0.6179607640622641, + "learning_rate": 2.296956419707608e-06, + "loss": 0.5182, + "step": 9772 + }, + { + "epoch": 1.5854964308890331, + "grad_norm": 0.6264872724676984, + "learning_rate": 2.296530800918898e-06, + "loss": 0.5156, + "step": 9773 + }, + { + "epoch": 1.5856586632057106, + "grad_norm": 0.6056625050150678, + "learning_rate": 2.2961051880668255e-06, + "loss": 0.5058, + "step": 9774 + }, + { + "epoch": 1.585820895522388, + "grad_norm": 0.5811583255395547, + "learning_rate": 2.295679581163807e-06, + "loss": 0.5392, + "step": 9775 + }, + { + "epoch": 1.5859831278390657, + "grad_norm": 0.6098534653337774, + "learning_rate": 2.2952539802222625e-06, + "loss": 0.4871, + "step": 9776 + }, + { + "epoch": 1.5861453601557431, + "grad_norm": 0.602433298042042, + "learning_rate": 2.2948283852546083e-06, + "loss": 0.5513, + "step": 9777 + }, + { + "epoch": 1.5863075924724206, + "grad_norm": 0.607853320211931, + "learning_rate": 2.294402796273264e-06, + "loss": 0.477, + "step": 9778 + }, + { + "epoch": 1.586469824789098, + "grad_norm": 0.6019888306718517, + "learning_rate": 2.2939772132906456e-06, + "loss": 0.5043, + "step": 9779 + }, + { + "epoch": 1.5866320571057755, + "grad_norm": 0.6954912042630662, + "learning_rate": 2.2935516363191695e-06, + "loss": 0.5105, + "step": 9780 + }, + { + "epoch": 1.586794289422453, + "grad_norm": 0.6636406382748109, + "learning_rate": 2.2931260653712547e-06, + "loss": 0.5576, + "step": 9781 + }, + { + "epoch": 1.5869565217391304, + "grad_norm": 0.6124664549149514, + "learning_rate": 2.292700500459317e-06, + "loss": 0.4934, + "step": 9782 + }, + { + "epoch": 1.5871187540558078, + "grad_norm": 0.583784010901274, + "learning_rate": 2.292274941595773e-06, + "loss": 0.517, + "step": 9783 + }, + { + "epoch": 1.5872809863724853, + "grad_norm": 0.5995906372217021, + "learning_rate": 2.2918493887930403e-06, + "loss": 0.5414, + "step": 9784 + }, + { + "epoch": 1.5874432186891627, + "grad_norm": 0.6159546117178604, + "learning_rate": 2.2914238420635348e-06, + "loss": 0.4995, + "step": 9785 + }, + { + "epoch": 1.5876054510058404, + "grad_norm": 0.6325518101398078, + "learning_rate": 2.2909983014196716e-06, + "loss": 0.5609, + "step": 9786 + }, + { + "epoch": 1.5877676833225178, + "grad_norm": 0.6173642285963649, + "learning_rate": 2.2905727668738685e-06, + "loss": 0.5235, + "step": 9787 + }, + { + "epoch": 1.5879299156391953, + "grad_norm": 0.6110003185455112, + "learning_rate": 2.29014723843854e-06, + "loss": 0.5177, + "step": 9788 + }, + { + "epoch": 1.588092147955873, + "grad_norm": 0.6027306971150128, + "learning_rate": 2.2897217161261027e-06, + "loss": 0.4934, + "step": 9789 + }, + { + "epoch": 1.5882543802725504, + "grad_norm": 1.043327213266391, + "learning_rate": 2.2892961999489722e-06, + "loss": 0.5215, + "step": 9790 + }, + { + "epoch": 1.5884166125892278, + "grad_norm": 0.5799135909628665, + "learning_rate": 2.2888706899195628e-06, + "loss": 0.5028, + "step": 9791 + }, + { + "epoch": 1.5885788449059053, + "grad_norm": 0.6002194969517488, + "learning_rate": 2.2884451860502902e-06, + "loss": 0.5211, + "step": 9792 + }, + { + "epoch": 1.5887410772225827, + "grad_norm": 0.6566310899607045, + "learning_rate": 2.2880196883535687e-06, + "loss": 0.4959, + "step": 9793 + }, + { + "epoch": 1.5889033095392602, + "grad_norm": 0.5745053970000957, + "learning_rate": 2.287594196841815e-06, + "loss": 0.5081, + "step": 9794 + }, + { + "epoch": 1.5890655418559376, + "grad_norm": 0.6105555991961724, + "learning_rate": 2.2871687115274423e-06, + "loss": 0.5263, + "step": 9795 + }, + { + "epoch": 1.589227774172615, + "grad_norm": 0.6262912421439807, + "learning_rate": 2.286743232422865e-06, + "loss": 0.5459, + "step": 9796 + }, + { + "epoch": 1.5893900064892925, + "grad_norm": 0.6574985071966525, + "learning_rate": 2.2863177595404966e-06, + "loss": 0.5487, + "step": 9797 + }, + { + "epoch": 1.5895522388059702, + "grad_norm": 0.6055315165756509, + "learning_rate": 2.285892292892753e-06, + "loss": 0.5164, + "step": 9798 + }, + { + "epoch": 1.5897144711226476, + "grad_norm": 0.5891480538596583, + "learning_rate": 2.285466832492046e-06, + "loss": 0.5084, + "step": 9799 + }, + { + "epoch": 1.589876703439325, + "grad_norm": 0.6537724907189385, + "learning_rate": 2.285041378350792e-06, + "loss": 0.5513, + "step": 9800 + }, + { + "epoch": 1.5900389357560027, + "grad_norm": 0.6159308430014953, + "learning_rate": 2.2846159304814024e-06, + "loss": 0.5237, + "step": 9801 + }, + { + "epoch": 1.5902011680726802, + "grad_norm": 0.6208396106847307, + "learning_rate": 2.2841904888962903e-06, + "loss": 0.503, + "step": 9802 + }, + { + "epoch": 1.5903634003893576, + "grad_norm": 0.5974174144407457, + "learning_rate": 2.28376505360787e-06, + "loss": 0.488, + "step": 9803 + }, + { + "epoch": 1.590525632706035, + "grad_norm": 0.5674318907112952, + "learning_rate": 2.2833396246285535e-06, + "loss": 0.5241, + "step": 9804 + }, + { + "epoch": 1.5906878650227125, + "grad_norm": 0.5725159009832438, + "learning_rate": 2.282914201970756e-06, + "loss": 0.525, + "step": 9805 + }, + { + "epoch": 1.59085009733939, + "grad_norm": 0.5746572628872867, + "learning_rate": 2.2824887856468867e-06, + "loss": 0.5214, + "step": 9806 + }, + { + "epoch": 1.5910123296560674, + "grad_norm": 0.5853718969131234, + "learning_rate": 2.28206337566936e-06, + "loss": 0.498, + "step": 9807 + }, + { + "epoch": 1.5911745619727449, + "grad_norm": 0.8881089519967096, + "learning_rate": 2.281637972050587e-06, + "loss": 0.5008, + "step": 9808 + }, + { + "epoch": 1.5913367942894223, + "grad_norm": 0.6367376712271183, + "learning_rate": 2.281212574802981e-06, + "loss": 0.5226, + "step": 9809 + }, + { + "epoch": 1.5914990266061, + "grad_norm": 0.6066992909215885, + "learning_rate": 2.280787183938954e-06, + "loss": 0.5254, + "step": 9810 + }, + { + "epoch": 1.5916612589227774, + "grad_norm": 0.6226029696103733, + "learning_rate": 2.280361799470915e-06, + "loss": 0.5297, + "step": 9811 + }, + { + "epoch": 1.5918234912394549, + "grad_norm": 0.6175051454115116, + "learning_rate": 2.2799364214112784e-06, + "loss": 0.5078, + "step": 9812 + }, + { + "epoch": 1.5919857235561323, + "grad_norm": 0.6068460043992493, + "learning_rate": 2.279511049772454e-06, + "loss": 0.5102, + "step": 9813 + }, + { + "epoch": 1.59214795587281, + "grad_norm": 0.6089009566855262, + "learning_rate": 2.279085684566854e-06, + "loss": 0.5085, + "step": 9814 + }, + { + "epoch": 1.5923101881894874, + "grad_norm": 0.5796021911310446, + "learning_rate": 2.278660325806889e-06, + "loss": 0.5088, + "step": 9815 + }, + { + "epoch": 1.5924724205061649, + "grad_norm": 0.5868617818005603, + "learning_rate": 2.278234973504969e-06, + "loss": 0.4841, + "step": 9816 + }, + { + "epoch": 1.5926346528228423, + "grad_norm": 0.6058578193812587, + "learning_rate": 2.277809627673504e-06, + "loss": 0.5157, + "step": 9817 + }, + { + "epoch": 1.5927968851395198, + "grad_norm": 0.5773163959265427, + "learning_rate": 2.2773842883249063e-06, + "loss": 0.5001, + "step": 9818 + }, + { + "epoch": 1.5929591174561972, + "grad_norm": 0.6440014075755602, + "learning_rate": 2.2769589554715846e-06, + "loss": 0.5188, + "step": 9819 + }, + { + "epoch": 1.5931213497728747, + "grad_norm": 0.5615581306536175, + "learning_rate": 2.27653362912595e-06, + "loss": 0.5211, + "step": 9820 + }, + { + "epoch": 1.5932835820895521, + "grad_norm": 0.5730795046488687, + "learning_rate": 2.2761083093004117e-06, + "loss": 0.4955, + "step": 9821 + }, + { + "epoch": 1.5934458144062296, + "grad_norm": 0.6283215149319809, + "learning_rate": 2.275682996007378e-06, + "loss": 0.5486, + "step": 9822 + }, + { + "epoch": 1.5936080467229072, + "grad_norm": 0.6087604235750662, + "learning_rate": 2.2752576892592605e-06, + "loss": 0.5534, + "step": 9823 + }, + { + "epoch": 1.5937702790395847, + "grad_norm": 0.5926530652897348, + "learning_rate": 2.2748323890684664e-06, + "loss": 0.5218, + "step": 9824 + }, + { + "epoch": 1.5939325113562621, + "grad_norm": 0.6184055720312887, + "learning_rate": 2.2744070954474072e-06, + "loss": 0.5335, + "step": 9825 + }, + { + "epoch": 1.5940947436729398, + "grad_norm": 0.6077199977067379, + "learning_rate": 2.2739818084084895e-06, + "loss": 0.5396, + "step": 9826 + }, + { + "epoch": 1.5942569759896172, + "grad_norm": 0.5810202716375282, + "learning_rate": 2.273556527964122e-06, + "loss": 0.5328, + "step": 9827 + }, + { + "epoch": 1.5944192083062947, + "grad_norm": 0.6107733973268465, + "learning_rate": 2.2731312541267144e-06, + "loss": 0.5081, + "step": 9828 + }, + { + "epoch": 1.5945814406229721, + "grad_norm": 0.5788696388173192, + "learning_rate": 2.2727059869086747e-06, + "loss": 0.5132, + "step": 9829 + }, + { + "epoch": 1.5947436729396496, + "grad_norm": 0.6436631207905928, + "learning_rate": 2.2722807263224112e-06, + "loss": 0.5258, + "step": 9830 + }, + { + "epoch": 1.594905905256327, + "grad_norm": 0.5934297164035421, + "learning_rate": 2.2718554723803297e-06, + "loss": 0.521, + "step": 9831 + }, + { + "epoch": 1.5950681375730045, + "grad_norm": 0.6757421536671597, + "learning_rate": 2.27143022509484e-06, + "loss": 0.5296, + "step": 9832 + }, + { + "epoch": 1.595230369889682, + "grad_norm": 0.593567002742758, + "learning_rate": 2.2710049844783485e-06, + "loss": 0.5343, + "step": 9833 + }, + { + "epoch": 1.5953926022063594, + "grad_norm": 0.5884068268214889, + "learning_rate": 2.2705797505432636e-06, + "loss": 0.5134, + "step": 9834 + }, + { + "epoch": 1.595554834523037, + "grad_norm": 0.6160099129206977, + "learning_rate": 2.270154523301992e-06, + "loss": 0.4974, + "step": 9835 + }, + { + "epoch": 1.5957170668397145, + "grad_norm": 0.5792147774590811, + "learning_rate": 2.2697293027669395e-06, + "loss": 0.5084, + "step": 9836 + }, + { + "epoch": 1.595879299156392, + "grad_norm": 0.5867946917754309, + "learning_rate": 2.2693040889505137e-06, + "loss": 0.5469, + "step": 9837 + }, + { + "epoch": 1.5960415314730696, + "grad_norm": 0.5944225353753066, + "learning_rate": 2.2688788818651204e-06, + "loss": 0.4873, + "step": 9838 + }, + { + "epoch": 1.596203763789747, + "grad_norm": 0.5909692938540191, + "learning_rate": 2.2684536815231673e-06, + "loss": 0.5017, + "step": 9839 + }, + { + "epoch": 1.5963659961064245, + "grad_norm": 0.6135672328677759, + "learning_rate": 2.2680284879370597e-06, + "loss": 0.5181, + "step": 9840 + }, + { + "epoch": 1.596528228423102, + "grad_norm": 0.5924972529234813, + "learning_rate": 2.2676033011192043e-06, + "loss": 0.5056, + "step": 9841 + }, + { + "epoch": 1.5966904607397794, + "grad_norm": 0.6201343312805292, + "learning_rate": 2.2671781210820045e-06, + "loss": 0.5239, + "step": 9842 + }, + { + "epoch": 1.5968526930564568, + "grad_norm": 0.6252694850159666, + "learning_rate": 2.2667529478378678e-06, + "loss": 0.5411, + "step": 9843 + }, + { + "epoch": 1.5970149253731343, + "grad_norm": 0.6353400576027164, + "learning_rate": 2.2663277813991987e-06, + "loss": 0.4909, + "step": 9844 + }, + { + "epoch": 1.5971771576898117, + "grad_norm": 1.273965398202696, + "learning_rate": 2.2659026217784032e-06, + "loss": 0.538, + "step": 9845 + }, + { + "epoch": 1.5973393900064892, + "grad_norm": 0.5827072490808062, + "learning_rate": 2.2654774689878862e-06, + "loss": 0.4783, + "step": 9846 + }, + { + "epoch": 1.5975016223231666, + "grad_norm": 0.6647401765576525, + "learning_rate": 2.2650523230400508e-06, + "loss": 0.5142, + "step": 9847 + }, + { + "epoch": 1.5976638546398443, + "grad_norm": 0.6182184909698706, + "learning_rate": 2.264627183947303e-06, + "loss": 0.4692, + "step": 9848 + }, + { + "epoch": 1.5978260869565217, + "grad_norm": 0.6218951629785241, + "learning_rate": 2.264202051722046e-06, + "loss": 0.5283, + "step": 9849 + }, + { + "epoch": 1.5979883192731992, + "grad_norm": 0.6006035888516639, + "learning_rate": 2.2637769263766855e-06, + "loss": 0.5147, + "step": 9850 + }, + { + "epoch": 1.5981505515898768, + "grad_norm": 0.6288740602689135, + "learning_rate": 2.2633518079236246e-06, + "loss": 0.5061, + "step": 9851 + }, + { + "epoch": 1.5983127839065543, + "grad_norm": 0.6169910902676163, + "learning_rate": 2.2629266963752668e-06, + "loss": 0.5059, + "step": 9852 + }, + { + "epoch": 1.5984750162232317, + "grad_norm": 0.5982355426688212, + "learning_rate": 2.2625015917440154e-06, + "loss": 0.5161, + "step": 9853 + }, + { + "epoch": 1.5986372485399092, + "grad_norm": 0.6007800806711938, + "learning_rate": 2.262076494042274e-06, + "loss": 0.5372, + "step": 9854 + }, + { + "epoch": 1.5987994808565866, + "grad_norm": 0.6226418167269866, + "learning_rate": 2.261651403282446e-06, + "loss": 0.5029, + "step": 9855 + }, + { + "epoch": 1.598961713173264, + "grad_norm": 0.6277248950142817, + "learning_rate": 2.261226319476935e-06, + "loss": 0.5604, + "step": 9856 + }, + { + "epoch": 1.5991239454899415, + "grad_norm": 0.6003283728430867, + "learning_rate": 2.260801242638142e-06, + "loss": 0.5167, + "step": 9857 + }, + { + "epoch": 1.599286177806619, + "grad_norm": 0.5760651165894735, + "learning_rate": 2.2603761727784702e-06, + "loss": 0.5039, + "step": 9858 + }, + { + "epoch": 1.5994484101232964, + "grad_norm": 0.5758712873289424, + "learning_rate": 2.259951109910322e-06, + "loss": 0.5328, + "step": 9859 + }, + { + "epoch": 1.599610642439974, + "grad_norm": 0.6489437866408214, + "learning_rate": 2.2595260540460993e-06, + "loss": 0.5091, + "step": 9860 + }, + { + "epoch": 1.5997728747566515, + "grad_norm": 0.6091226119674062, + "learning_rate": 2.259101005198205e-06, + "loss": 0.5049, + "step": 9861 + }, + { + "epoch": 1.599935107073329, + "grad_norm": 0.6170569147750176, + "learning_rate": 2.25867596337904e-06, + "loss": 0.5217, + "step": 9862 + }, + { + "epoch": 1.6000973393900066, + "grad_norm": 0.6102780980991852, + "learning_rate": 2.258250928601005e-06, + "loss": 0.5279, + "step": 9863 + }, + { + "epoch": 1.600259571706684, + "grad_norm": 0.627057081304537, + "learning_rate": 2.257825900876502e-06, + "loss": 0.5359, + "step": 9864 + }, + { + "epoch": 1.6004218040233615, + "grad_norm": 0.6112015149783483, + "learning_rate": 2.2574008802179323e-06, + "loss": 0.5047, + "step": 9865 + }, + { + "epoch": 1.600584036340039, + "grad_norm": 0.6169526790288423, + "learning_rate": 2.2569758666376973e-06, + "loss": 0.5288, + "step": 9866 + }, + { + "epoch": 1.6007462686567164, + "grad_norm": 0.5887113107318863, + "learning_rate": 2.256550860148196e-06, + "loss": 0.5273, + "step": 9867 + }, + { + "epoch": 1.6009085009733939, + "grad_norm": 0.6460804876604634, + "learning_rate": 2.2561258607618296e-06, + "loss": 0.5221, + "step": 9868 + }, + { + "epoch": 1.6010707332900713, + "grad_norm": 0.6109170712099945, + "learning_rate": 2.2557008684909988e-06, + "loss": 0.5009, + "step": 9869 + }, + { + "epoch": 1.6012329656067488, + "grad_norm": 0.5822266658481364, + "learning_rate": 2.2552758833481035e-06, + "loss": 0.4895, + "step": 9870 + }, + { + "epoch": 1.6013951979234262, + "grad_norm": 0.6216358410501539, + "learning_rate": 2.2548509053455436e-06, + "loss": 0.5036, + "step": 9871 + }, + { + "epoch": 1.6015574302401037, + "grad_norm": 0.5939897955445183, + "learning_rate": 2.254425934495718e-06, + "loss": 0.5222, + "step": 9872 + }, + { + "epoch": 1.6017196625567813, + "grad_norm": 0.5736987120634145, + "learning_rate": 2.2540009708110267e-06, + "loss": 0.494, + "step": 9873 + }, + { + "epoch": 1.6018818948734588, + "grad_norm": 0.5953409244552174, + "learning_rate": 2.2535760143038688e-06, + "loss": 0.5018, + "step": 9874 + }, + { + "epoch": 1.6020441271901362, + "grad_norm": 0.5735343399184479, + "learning_rate": 2.253151064986643e-06, + "loss": 0.5142, + "step": 9875 + }, + { + "epoch": 1.602206359506814, + "grad_norm": 0.5862269160432287, + "learning_rate": 2.252726122871749e-06, + "loss": 0.5037, + "step": 9876 + }, + { + "epoch": 1.6023685918234913, + "grad_norm": 0.623867879855646, + "learning_rate": 2.2523011879715847e-06, + "loss": 0.5307, + "step": 9877 + }, + { + "epoch": 1.6025308241401688, + "grad_norm": 0.593420659489619, + "learning_rate": 2.2518762602985476e-06, + "loss": 0.5057, + "step": 9878 + }, + { + "epoch": 1.6026930564568462, + "grad_norm": 0.5892457255271902, + "learning_rate": 2.251451339865037e-06, + "loss": 0.508, + "step": 9879 + }, + { + "epoch": 1.6028552887735237, + "grad_norm": 0.5863056539783417, + "learning_rate": 2.2510264266834506e-06, + "loss": 0.5534, + "step": 9880 + }, + { + "epoch": 1.6030175210902011, + "grad_norm": 0.6193526265225335, + "learning_rate": 2.250601520766187e-06, + "loss": 0.529, + "step": 9881 + }, + { + "epoch": 1.6031797534068786, + "grad_norm": 0.5795472524839271, + "learning_rate": 2.2501766221256423e-06, + "loss": 0.5096, + "step": 9882 + }, + { + "epoch": 1.603341985723556, + "grad_norm": 0.6097836865374228, + "learning_rate": 2.2497517307742136e-06, + "loss": 0.4982, + "step": 9883 + }, + { + "epoch": 1.6035042180402335, + "grad_norm": 0.6119654706393677, + "learning_rate": 2.249326846724299e-06, + "loss": 0.5009, + "step": 9884 + }, + { + "epoch": 1.6036664503569111, + "grad_norm": 0.591101339343229, + "learning_rate": 2.248901969988295e-06, + "loss": 0.5085, + "step": 9885 + }, + { + "epoch": 1.6038286826735886, + "grad_norm": 0.5912286449003511, + "learning_rate": 2.248477100578599e-06, + "loss": 0.5084, + "step": 9886 + }, + { + "epoch": 1.603990914990266, + "grad_norm": 0.6464923175903271, + "learning_rate": 2.248052238507607e-06, + "loss": 0.5207, + "step": 9887 + }, + { + "epoch": 1.6041531473069437, + "grad_norm": 0.5886026812843127, + "learning_rate": 2.2476273837877146e-06, + "loss": 0.526, + "step": 9888 + }, + { + "epoch": 1.6043153796236211, + "grad_norm": 0.6106426116573226, + "learning_rate": 2.247202536431318e-06, + "loss": 0.5205, + "step": 9889 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.6134816099236227, + "learning_rate": 2.246777696450813e-06, + "loss": 0.5138, + "step": 9890 + }, + { + "epoch": 1.604639844256976, + "grad_norm": 0.589116640303564, + "learning_rate": 2.2463528638585954e-06, + "loss": 0.5066, + "step": 9891 + }, + { + "epoch": 1.6048020765736535, + "grad_norm": 0.5985973033186899, + "learning_rate": 2.245928038667062e-06, + "loss": 0.5374, + "step": 9892 + }, + { + "epoch": 1.604964308890331, + "grad_norm": 0.6296315486532438, + "learning_rate": 2.2455032208886055e-06, + "loss": 0.5141, + "step": 9893 + }, + { + "epoch": 1.6051265412070084, + "grad_norm": 0.6031559544016384, + "learning_rate": 2.2450784105356214e-06, + "loss": 0.5404, + "step": 9894 + }, + { + "epoch": 1.6052887735236858, + "grad_norm": 0.6286572445038977, + "learning_rate": 2.2446536076205057e-06, + "loss": 0.5153, + "step": 9895 + }, + { + "epoch": 1.6054510058403633, + "grad_norm": 0.5760048265265527, + "learning_rate": 2.244228812155651e-06, + "loss": 0.5023, + "step": 9896 + }, + { + "epoch": 1.605613238157041, + "grad_norm": 0.5989696766499478, + "learning_rate": 2.2438040241534555e-06, + "loss": 0.4985, + "step": 9897 + }, + { + "epoch": 1.6057754704737184, + "grad_norm": 0.5944021850000861, + "learning_rate": 2.243379243626308e-06, + "loss": 0.5078, + "step": 9898 + }, + { + "epoch": 1.6059377027903958, + "grad_norm": 0.5810806755217507, + "learning_rate": 2.2429544705866056e-06, + "loss": 0.5291, + "step": 9899 + }, + { + "epoch": 1.6060999351070735, + "grad_norm": 0.598927900454816, + "learning_rate": 2.2425297050467408e-06, + "loss": 0.5058, + "step": 9900 + }, + { + "epoch": 1.606262167423751, + "grad_norm": 0.6071404896723179, + "learning_rate": 2.2421049470191077e-06, + "loss": 0.4675, + "step": 9901 + }, + { + "epoch": 1.6064243997404284, + "grad_norm": 0.6043464322569448, + "learning_rate": 2.2416801965161e-06, + "loss": 0.5332, + "step": 9902 + }, + { + "epoch": 1.6065866320571058, + "grad_norm": 0.6109307491906301, + "learning_rate": 2.2412554535501086e-06, + "loss": 0.5018, + "step": 9903 + }, + { + "epoch": 1.6067488643737833, + "grad_norm": 0.6214874072175639, + "learning_rate": 2.240830718133529e-06, + "loss": 0.5265, + "step": 9904 + }, + { + "epoch": 1.6069110966904607, + "grad_norm": 0.5854042887385107, + "learning_rate": 2.2404059902787506e-06, + "loss": 0.5249, + "step": 9905 + }, + { + "epoch": 1.6070733290071382, + "grad_norm": 0.5996875231843927, + "learning_rate": 2.2399812699981685e-06, + "loss": 0.5346, + "step": 9906 + }, + { + "epoch": 1.6072355613238156, + "grad_norm": 0.6148873220090931, + "learning_rate": 2.239556557304174e-06, + "loss": 0.5288, + "step": 9907 + }, + { + "epoch": 1.607397793640493, + "grad_norm": 0.5886134744248659, + "learning_rate": 2.2391318522091585e-06, + "loss": 0.514, + "step": 9908 + }, + { + "epoch": 1.6075600259571705, + "grad_norm": 0.6173306828116584, + "learning_rate": 2.238707154725513e-06, + "loss": 0.5205, + "step": 9909 + }, + { + "epoch": 1.6077222582738482, + "grad_norm": 0.6200373167892177, + "learning_rate": 2.2382824648656306e-06, + "loss": 0.5118, + "step": 9910 + }, + { + "epoch": 1.6078844905905256, + "grad_norm": 0.6154468902339908, + "learning_rate": 2.2378577826419008e-06, + "loss": 0.5022, + "step": 9911 + }, + { + "epoch": 1.608046722907203, + "grad_norm": 0.5778455320016, + "learning_rate": 2.2374331080667168e-06, + "loss": 0.501, + "step": 9912 + }, + { + "epoch": 1.6082089552238807, + "grad_norm": 0.610288931413626, + "learning_rate": 2.237008441152467e-06, + "loss": 0.5132, + "step": 9913 + }, + { + "epoch": 1.6083711875405582, + "grad_norm": 0.6043510040552706, + "learning_rate": 2.2365837819115432e-06, + "loss": 0.5125, + "step": 9914 + }, + { + "epoch": 1.6085334198572356, + "grad_norm": 0.608612668839807, + "learning_rate": 2.236159130356336e-06, + "loss": 0.5198, + "step": 9915 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 0.5863212109115317, + "learning_rate": 2.235734486499234e-06, + "loss": 0.5152, + "step": 9916 + }, + { + "epoch": 1.6088578844905905, + "grad_norm": 0.6227774718335979, + "learning_rate": 2.235309850352629e-06, + "loss": 0.529, + "step": 9917 + }, + { + "epoch": 1.609020116807268, + "grad_norm": 0.5968290437038763, + "learning_rate": 2.2348852219289097e-06, + "loss": 0.5086, + "step": 9918 + }, + { + "epoch": 1.6091823491239454, + "grad_norm": 0.5796727202172642, + "learning_rate": 2.2344606012404647e-06, + "loss": 0.5163, + "step": 9919 + }, + { + "epoch": 1.6093445814406229, + "grad_norm": 0.5899597877086595, + "learning_rate": 2.2340359882996847e-06, + "loss": 0.5404, + "step": 9920 + }, + { + "epoch": 1.6095068137573003, + "grad_norm": 0.5924070642070793, + "learning_rate": 2.2336113831189577e-06, + "loss": 0.5243, + "step": 9921 + }, + { + "epoch": 1.609669046073978, + "grad_norm": 0.620295120328647, + "learning_rate": 2.2331867857106732e-06, + "loss": 0.5517, + "step": 9922 + }, + { + "epoch": 1.6098312783906554, + "grad_norm": 0.625346385884348, + "learning_rate": 2.2327621960872187e-06, + "loss": 0.4911, + "step": 9923 + }, + { + "epoch": 1.6099935107073329, + "grad_norm": 0.6131402187974305, + "learning_rate": 2.232337614260983e-06, + "loss": 0.5176, + "step": 9924 + }, + { + "epoch": 1.6101557430240105, + "grad_norm": 0.6195537272321705, + "learning_rate": 2.231913040244354e-06, + "loss": 0.5012, + "step": 9925 + }, + { + "epoch": 1.610317975340688, + "grad_norm": 0.584370134674849, + "learning_rate": 2.23148847404972e-06, + "loss": 0.5538, + "step": 9926 + }, + { + "epoch": 1.6104802076573654, + "grad_norm": 0.6240043712744495, + "learning_rate": 2.231063915689469e-06, + "loss": 0.5578, + "step": 9927 + }, + { + "epoch": 1.6106424399740429, + "grad_norm": 0.6118720544764705, + "learning_rate": 2.2306393651759866e-06, + "loss": 0.5257, + "step": 9928 + }, + { + "epoch": 1.6108046722907203, + "grad_norm": 0.5835771165352847, + "learning_rate": 2.2302148225216614e-06, + "loss": 0.51, + "step": 9929 + }, + { + "epoch": 1.6109669046073978, + "grad_norm": 0.5595922863608094, + "learning_rate": 2.2297902877388794e-06, + "loss": 0.4906, + "step": 9930 + }, + { + "epoch": 1.6111291369240752, + "grad_norm": 0.5975574041032123, + "learning_rate": 2.2293657608400285e-06, + "loss": 0.5339, + "step": 9931 + }, + { + "epoch": 1.6112913692407527, + "grad_norm": 0.6125129284853311, + "learning_rate": 2.228941241837494e-06, + "loss": 0.4962, + "step": 9932 + }, + { + "epoch": 1.6114536015574301, + "grad_norm": 0.6490890111828024, + "learning_rate": 2.228516730743664e-06, + "loss": 0.5431, + "step": 9933 + }, + { + "epoch": 1.6116158338741076, + "grad_norm": 0.6104117711682852, + "learning_rate": 2.2280922275709216e-06, + "loss": 0.4874, + "step": 9934 + }, + { + "epoch": 1.6117780661907852, + "grad_norm": 0.6199255886930585, + "learning_rate": 2.2276677323316546e-06, + "loss": 0.5375, + "step": 9935 + }, + { + "epoch": 1.6119402985074627, + "grad_norm": 0.6063733194082601, + "learning_rate": 2.227243245038247e-06, + "loss": 0.4829, + "step": 9936 + }, + { + "epoch": 1.6121025308241401, + "grad_norm": 0.6196797755316642, + "learning_rate": 2.2268187657030867e-06, + "loss": 0.4999, + "step": 9937 + }, + { + "epoch": 1.6122647631408178, + "grad_norm": 0.5903052739022506, + "learning_rate": 2.2263942943385568e-06, + "loss": 0.545, + "step": 9938 + }, + { + "epoch": 1.6124269954574952, + "grad_norm": 0.5911445854108865, + "learning_rate": 2.2259698309570418e-06, + "loss": 0.551, + "step": 9939 + }, + { + "epoch": 1.6125892277741727, + "grad_norm": 0.6188980282885136, + "learning_rate": 2.2255453755709274e-06, + "loss": 0.532, + "step": 9940 + }, + { + "epoch": 1.6127514600908501, + "grad_norm": 0.5892835484556118, + "learning_rate": 2.225120928192597e-06, + "loss": 0.477, + "step": 9941 + }, + { + "epoch": 1.6129136924075276, + "grad_norm": 0.5887897014085297, + "learning_rate": 2.224696488834436e-06, + "loss": 0.5462, + "step": 9942 + }, + { + "epoch": 1.613075924724205, + "grad_norm": 0.612464968091039, + "learning_rate": 2.2242720575088277e-06, + "loss": 0.5136, + "step": 9943 + }, + { + "epoch": 1.6132381570408825, + "grad_norm": 0.6294705780071986, + "learning_rate": 2.2238476342281555e-06, + "loss": 0.5021, + "step": 9944 + }, + { + "epoch": 1.61340038935756, + "grad_norm": 0.6234429569598418, + "learning_rate": 2.223423219004802e-06, + "loss": 0.5162, + "step": 9945 + }, + { + "epoch": 1.6135626216742374, + "grad_norm": 0.6258018827502602, + "learning_rate": 2.2229988118511527e-06, + "loss": 0.5321, + "step": 9946 + }, + { + "epoch": 1.613724853990915, + "grad_norm": 0.586577290722102, + "learning_rate": 2.2225744127795884e-06, + "loss": 0.4999, + "step": 9947 + }, + { + "epoch": 1.6138870863075925, + "grad_norm": 0.6438954132482129, + "learning_rate": 2.2221500218024934e-06, + "loss": 0.5284, + "step": 9948 + }, + { + "epoch": 1.61404931862427, + "grad_norm": 0.6027134654166892, + "learning_rate": 2.2217256389322494e-06, + "loss": 0.5114, + "step": 9949 + }, + { + "epoch": 1.6142115509409476, + "grad_norm": 0.6085444278713512, + "learning_rate": 2.221301264181238e-06, + "loss": 0.5198, + "step": 9950 + }, + { + "epoch": 1.614373783257625, + "grad_norm": 0.5969468827506647, + "learning_rate": 2.2208768975618426e-06, + "loss": 0.4731, + "step": 9951 + }, + { + "epoch": 1.6145360155743025, + "grad_norm": 0.591419778099826, + "learning_rate": 2.2204525390864436e-06, + "loss": 0.5387, + "step": 9952 + }, + { + "epoch": 1.61469824789098, + "grad_norm": 0.5769651697902085, + "learning_rate": 2.2200281887674242e-06, + "loss": 0.5099, + "step": 9953 + }, + { + "epoch": 1.6148604802076574, + "grad_norm": 0.6122123910080066, + "learning_rate": 2.219603846617165e-06, + "loss": 0.5011, + "step": 9954 + }, + { + "epoch": 1.6150227125243348, + "grad_norm": 0.6114221501310217, + "learning_rate": 2.2191795126480464e-06, + "loss": 0.4996, + "step": 9955 + }, + { + "epoch": 1.6151849448410123, + "grad_norm": 0.5943396447470741, + "learning_rate": 2.2187551868724487e-06, + "loss": 0.5141, + "step": 9956 + }, + { + "epoch": 1.6153471771576897, + "grad_norm": 0.6207793966383836, + "learning_rate": 2.218330869302754e-06, + "loss": 0.5056, + "step": 9957 + }, + { + "epoch": 1.6155094094743672, + "grad_norm": 0.5987743278495389, + "learning_rate": 2.217906559951343e-06, + "loss": 0.5511, + "step": 9958 + }, + { + "epoch": 1.6156716417910446, + "grad_norm": 0.5810813541483193, + "learning_rate": 2.217482258830594e-06, + "loss": 0.5047, + "step": 9959 + }, + { + "epoch": 1.6158338741077223, + "grad_norm": 0.6178344170942498, + "learning_rate": 2.2170579659528877e-06, + "loss": 0.52, + "step": 9960 + }, + { + "epoch": 1.6159961064243997, + "grad_norm": 0.6201094108174879, + "learning_rate": 2.216633681330604e-06, + "loss": 0.5322, + "step": 9961 + }, + { + "epoch": 1.6161583387410772, + "grad_norm": 0.5955088678606513, + "learning_rate": 2.216209404976122e-06, + "loss": 0.5067, + "step": 9962 + }, + { + "epoch": 1.6163205710577548, + "grad_norm": 0.5779501916919838, + "learning_rate": 2.215785136901822e-06, + "loss": 0.4809, + "step": 9963 + }, + { + "epoch": 1.6164828033744323, + "grad_norm": 0.6256983595136281, + "learning_rate": 2.2153608771200803e-06, + "loss": 0.5046, + "step": 9964 + }, + { + "epoch": 1.6166450356911097, + "grad_norm": 0.5906951831131838, + "learning_rate": 2.2149366256432774e-06, + "loss": 0.4856, + "step": 9965 + }, + { + "epoch": 1.6168072680077872, + "grad_norm": 0.6074624028931209, + "learning_rate": 2.2145123824837913e-06, + "loss": 0.475, + "step": 9966 + }, + { + "epoch": 1.6169695003244646, + "grad_norm": 0.5974610157223615, + "learning_rate": 2.214088147654001e-06, + "loss": 0.5046, + "step": 9967 + }, + { + "epoch": 1.617131732641142, + "grad_norm": 0.6000879066808206, + "learning_rate": 2.213663921166284e-06, + "loss": 0.5101, + "step": 9968 + }, + { + "epoch": 1.6172939649578195, + "grad_norm": 0.6068138016411317, + "learning_rate": 2.2132397030330173e-06, + "loss": 0.5074, + "step": 9969 + }, + { + "epoch": 1.617456197274497, + "grad_norm": 0.6051436680763033, + "learning_rate": 2.2128154932665778e-06, + "loss": 0.5265, + "step": 9970 + }, + { + "epoch": 1.6176184295911744, + "grad_norm": 0.5660951037423424, + "learning_rate": 2.2123912918793443e-06, + "loss": 0.4738, + "step": 9971 + }, + { + "epoch": 1.617780661907852, + "grad_norm": 0.5773644586287902, + "learning_rate": 2.211967098883693e-06, + "loss": 0.5191, + "step": 9972 + }, + { + "epoch": 1.6179428942245295, + "grad_norm": 0.6068565752353117, + "learning_rate": 2.211542914292002e-06, + "loss": 0.5252, + "step": 9973 + }, + { + "epoch": 1.618105126541207, + "grad_norm": 0.6092668780076747, + "learning_rate": 2.2111187381166452e-06, + "loss": 0.5338, + "step": 9974 + }, + { + "epoch": 1.6182673588578846, + "grad_norm": 0.630728902808123, + "learning_rate": 2.21069457037e-06, + "loss": 0.5145, + "step": 9975 + }, + { + "epoch": 1.618429591174562, + "grad_norm": 0.604767184448902, + "learning_rate": 2.210270411064443e-06, + "loss": 0.5278, + "step": 9976 + }, + { + "epoch": 1.6185918234912395, + "grad_norm": 0.6460360256357008, + "learning_rate": 2.2098462602123487e-06, + "loss": 0.5371, + "step": 9977 + }, + { + "epoch": 1.618754055807917, + "grad_norm": 0.6488168983639439, + "learning_rate": 2.209422117826094e-06, + "loss": 0.4849, + "step": 9978 + }, + { + "epoch": 1.6189162881245944, + "grad_norm": 0.6100767745925553, + "learning_rate": 2.208997983918054e-06, + "loss": 0.5202, + "step": 9979 + }, + { + "epoch": 1.6190785204412719, + "grad_norm": 0.6667230740120226, + "learning_rate": 2.2085738585006026e-06, + "loss": 0.509, + "step": 9980 + }, + { + "epoch": 1.6192407527579493, + "grad_norm": 0.6603296842311389, + "learning_rate": 2.2081497415861146e-06, + "loss": 0.515, + "step": 9981 + }, + { + "epoch": 1.6194029850746268, + "grad_norm": 0.5864120500501289, + "learning_rate": 2.2077256331869653e-06, + "loss": 0.5368, + "step": 9982 + }, + { + "epoch": 1.6195652173913042, + "grad_norm": 0.5877499684199794, + "learning_rate": 2.2073015333155282e-06, + "loss": 0.5653, + "step": 9983 + }, + { + "epoch": 1.6197274497079819, + "grad_norm": 0.585794226542824, + "learning_rate": 2.206877441984179e-06, + "loss": 0.508, + "step": 9984 + }, + { + "epoch": 1.6198896820246593, + "grad_norm": 0.5836515796090996, + "learning_rate": 2.2064533592052893e-06, + "loss": 0.5046, + "step": 9985 + }, + { + "epoch": 1.6200519143413368, + "grad_norm": 0.5644623578383589, + "learning_rate": 2.2060292849912327e-06, + "loss": 0.5156, + "step": 9986 + }, + { + "epoch": 1.6202141466580144, + "grad_norm": 0.6035209588815166, + "learning_rate": 2.2056052193543843e-06, + "loss": 0.4727, + "step": 9987 + }, + { + "epoch": 1.6203763789746919, + "grad_norm": 0.6034754554639132, + "learning_rate": 2.2051811623071152e-06, + "loss": 0.5472, + "step": 9988 + }, + { + "epoch": 1.6205386112913693, + "grad_norm": 0.6058701571563014, + "learning_rate": 2.2047571138618008e-06, + "loss": 0.5073, + "step": 9989 + }, + { + "epoch": 1.6207008436080468, + "grad_norm": 0.6208658812962776, + "learning_rate": 2.2043330740308097e-06, + "loss": 0.5245, + "step": 9990 + }, + { + "epoch": 1.6208630759247242, + "grad_norm": 0.5772006036843345, + "learning_rate": 2.203909042826517e-06, + "loss": 0.5305, + "step": 9991 + }, + { + "epoch": 1.6210253082414017, + "grad_norm": 0.6352119512330846, + "learning_rate": 2.2034850202612936e-06, + "loss": 0.528, + "step": 9992 + }, + { + "epoch": 1.6211875405580791, + "grad_norm": 0.6166942717896388, + "learning_rate": 2.2030610063475113e-06, + "loss": 0.4702, + "step": 9993 + }, + { + "epoch": 1.6213497728747566, + "grad_norm": 0.5794488539109867, + "learning_rate": 2.2026370010975428e-06, + "loss": 0.4978, + "step": 9994 + }, + { + "epoch": 1.621512005191434, + "grad_norm": 0.6279729105533302, + "learning_rate": 2.2022130045237573e-06, + "loss": 0.5226, + "step": 9995 + }, + { + "epoch": 1.6216742375081115, + "grad_norm": 0.6228528433860739, + "learning_rate": 2.2017890166385275e-06, + "loss": 0.5108, + "step": 9996 + }, + { + "epoch": 1.6218364698247891, + "grad_norm": 0.5463858360174019, + "learning_rate": 2.2013650374542226e-06, + "loss": 0.5028, + "step": 9997 + }, + { + "epoch": 1.6219987021414666, + "grad_norm": 0.6065079204974262, + "learning_rate": 2.2009410669832147e-06, + "loss": 0.5248, + "step": 9998 + }, + { + "epoch": 1.622160934458144, + "grad_norm": 0.5958586526078389, + "learning_rate": 2.2005171052378736e-06, + "loss": 0.4795, + "step": 9999 + }, + { + "epoch": 1.6223231667748217, + "grad_norm": 0.6155351074152835, + "learning_rate": 2.200093152230568e-06, + "loss": 0.532, + "step": 10000 + }, + { + "epoch": 1.6224853990914991, + "grad_norm": 0.6193027405260118, + "learning_rate": 2.199669207973669e-06, + "loss": 0.4671, + "step": 10001 + }, + { + "epoch": 1.6226476314081766, + "grad_norm": 0.5902889047414298, + "learning_rate": 2.1992452724795455e-06, + "loss": 0.4885, + "step": 10002 + }, + { + "epoch": 1.622809863724854, + "grad_norm": 0.6211994243125817, + "learning_rate": 2.198821345760566e-06, + "loss": 0.4919, + "step": 10003 + }, + { + "epoch": 1.6229720960415315, + "grad_norm": 0.6015079352935995, + "learning_rate": 2.1983974278291018e-06, + "loss": 0.5188, + "step": 10004 + }, + { + "epoch": 1.623134328358209, + "grad_norm": 0.6112256707592657, + "learning_rate": 2.197973518697519e-06, + "loss": 0.5327, + "step": 10005 + }, + { + "epoch": 1.6232965606748864, + "grad_norm": 0.6181814471097271, + "learning_rate": 2.197549618378187e-06, + "loss": 0.5397, + "step": 10006 + }, + { + "epoch": 1.6234587929915638, + "grad_norm": 0.5749113031836245, + "learning_rate": 2.197125726883474e-06, + "loss": 0.5297, + "step": 10007 + }, + { + "epoch": 1.6236210253082413, + "grad_norm": 0.5959387967468569, + "learning_rate": 2.196701844225748e-06, + "loss": 0.5734, + "step": 10008 + }, + { + "epoch": 1.623783257624919, + "grad_norm": 0.5952739672039838, + "learning_rate": 2.196277970417377e-06, + "loss": 0.5158, + "step": 10009 + }, + { + "epoch": 1.6239454899415964, + "grad_norm": 0.6218814417100531, + "learning_rate": 2.195854105470728e-06, + "loss": 0.5247, + "step": 10010 + }, + { + "epoch": 1.6241077222582738, + "grad_norm": 0.5916737585205148, + "learning_rate": 2.1954302493981674e-06, + "loss": 0.5292, + "step": 10011 + }, + { + "epoch": 1.6242699545749515, + "grad_norm": 0.614241021903467, + "learning_rate": 2.1950064022120635e-06, + "loss": 0.5113, + "step": 10012 + }, + { + "epoch": 1.624432186891629, + "grad_norm": 0.589022694836849, + "learning_rate": 2.1945825639247818e-06, + "loss": 0.5371, + "step": 10013 + }, + { + "epoch": 1.6245944192083064, + "grad_norm": 0.6181619343178901, + "learning_rate": 2.1941587345486903e-06, + "loss": 0.5475, + "step": 10014 + }, + { + "epoch": 1.6247566515249838, + "grad_norm": 0.5907924835346452, + "learning_rate": 2.1937349140961523e-06, + "loss": 0.5581, + "step": 10015 + }, + { + "epoch": 1.6249188838416613, + "grad_norm": 0.607002493724937, + "learning_rate": 2.193311102579536e-06, + "loss": 0.5112, + "step": 10016 + }, + { + "epoch": 1.6250811161583387, + "grad_norm": 0.5884251593955119, + "learning_rate": 2.1928873000112057e-06, + "loss": 0.5305, + "step": 10017 + }, + { + "epoch": 1.6252433484750162, + "grad_norm": 0.5908643981141822, + "learning_rate": 2.1924635064035275e-06, + "loss": 0.5275, + "step": 10018 + }, + { + "epoch": 1.6254055807916936, + "grad_norm": 0.618710636074328, + "learning_rate": 2.192039721768867e-06, + "loss": 0.532, + "step": 10019 + }, + { + "epoch": 1.625567813108371, + "grad_norm": 0.6100158323394016, + "learning_rate": 2.1916159461195874e-06, + "loss": 0.5434, + "step": 10020 + }, + { + "epoch": 1.6257300454250485, + "grad_norm": 0.6352735969907948, + "learning_rate": 2.191192179468054e-06, + "loss": 0.5086, + "step": 10021 + }, + { + "epoch": 1.6258922777417262, + "grad_norm": 0.5827567455879435, + "learning_rate": 2.190768421826631e-06, + "loss": 0.5283, + "step": 10022 + }, + { + "epoch": 1.6260545100584036, + "grad_norm": 0.5857117178872179, + "learning_rate": 2.190344673207683e-06, + "loss": 0.5187, + "step": 10023 + }, + { + "epoch": 1.626216742375081, + "grad_norm": 0.5900586237962363, + "learning_rate": 2.1899209336235725e-06, + "loss": 0.5043, + "step": 10024 + }, + { + "epoch": 1.6263789746917587, + "grad_norm": 0.669446680219409, + "learning_rate": 2.1894972030866655e-06, + "loss": 0.5197, + "step": 10025 + }, + { + "epoch": 1.6265412070084362, + "grad_norm": 0.5941528327077027, + "learning_rate": 2.1890734816093222e-06, + "loss": 0.498, + "step": 10026 + }, + { + "epoch": 1.6267034393251136, + "grad_norm": 0.5718870820729609, + "learning_rate": 2.1886497692039068e-06, + "loss": 0.5106, + "step": 10027 + }, + { + "epoch": 1.626865671641791, + "grad_norm": 0.6014966466266135, + "learning_rate": 2.188226065882782e-06, + "loss": 0.5143, + "step": 10028 + }, + { + "epoch": 1.6270279039584685, + "grad_norm": 0.6116739330541714, + "learning_rate": 2.1878023716583105e-06, + "loss": 0.5102, + "step": 10029 + }, + { + "epoch": 1.627190136275146, + "grad_norm": 0.5807290899403281, + "learning_rate": 2.187378686542855e-06, + "loss": 0.5165, + "step": 10030 + }, + { + "epoch": 1.6273523685918234, + "grad_norm": 0.6114866728937604, + "learning_rate": 2.1869550105487753e-06, + "loss": 0.5403, + "step": 10031 + }, + { + "epoch": 1.6275146009085009, + "grad_norm": 0.6270345555110431, + "learning_rate": 2.1865313436884345e-06, + "loss": 0.5325, + "step": 10032 + }, + { + "epoch": 1.6276768332251783, + "grad_norm": 0.5839299239703658, + "learning_rate": 2.186107685974194e-06, + "loss": 0.5148, + "step": 10033 + }, + { + "epoch": 1.627839065541856, + "grad_norm": 0.6146951231111704, + "learning_rate": 2.185684037418415e-06, + "loss": 0.4893, + "step": 10034 + }, + { + "epoch": 1.6280012978585334, + "grad_norm": 0.6161669977379128, + "learning_rate": 2.1852603980334584e-06, + "loss": 0.4863, + "step": 10035 + }, + { + "epoch": 1.6281635301752109, + "grad_norm": 0.587286309490229, + "learning_rate": 2.184836767831684e-06, + "loss": 0.4903, + "step": 10036 + }, + { + "epoch": 1.6283257624918885, + "grad_norm": 0.6203977968033261, + "learning_rate": 2.1844131468254516e-06, + "loss": 0.5481, + "step": 10037 + }, + { + "epoch": 1.628487994808566, + "grad_norm": 0.6320986506165801, + "learning_rate": 2.183989535027123e-06, + "loss": 0.4844, + "step": 10038 + }, + { + "epoch": 1.6286502271252434, + "grad_norm": 0.6203726626595752, + "learning_rate": 2.1835659324490564e-06, + "loss": 0.5319, + "step": 10039 + }, + { + "epoch": 1.6288124594419209, + "grad_norm": 0.600180334786578, + "learning_rate": 2.183142339103613e-06, + "loss": 0.5106, + "step": 10040 + }, + { + "epoch": 1.6289746917585983, + "grad_norm": 0.6023760587843219, + "learning_rate": 2.1827187550031504e-06, + "loss": 0.4785, + "step": 10041 + }, + { + "epoch": 1.6291369240752758, + "grad_norm": 0.5804687726385696, + "learning_rate": 2.1822951801600275e-06, + "loss": 0.526, + "step": 10042 + }, + { + "epoch": 1.6292991563919532, + "grad_norm": 0.6276214574626665, + "learning_rate": 2.181871614586604e-06, + "loss": 0.5235, + "step": 10043 + }, + { + "epoch": 1.6294613887086307, + "grad_norm": 0.5792855243879348, + "learning_rate": 2.1814480582952376e-06, + "loss": 0.4892, + "step": 10044 + }, + { + "epoch": 1.6296236210253081, + "grad_norm": 0.5941155171691699, + "learning_rate": 2.181024511298288e-06, + "loss": 0.5005, + "step": 10045 + }, + { + "epoch": 1.6297858533419856, + "grad_norm": 0.582296417216057, + "learning_rate": 2.180600973608111e-06, + "loss": 0.5148, + "step": 10046 + }, + { + "epoch": 1.6299480856586632, + "grad_norm": 0.6438963099654968, + "learning_rate": 2.180177445237065e-06, + "loss": 0.5161, + "step": 10047 + }, + { + "epoch": 1.6301103179753407, + "grad_norm": 0.5657185646709939, + "learning_rate": 2.1797539261975067e-06, + "loss": 0.5003, + "step": 10048 + }, + { + "epoch": 1.6302725502920181, + "grad_norm": 0.5974053185787146, + "learning_rate": 2.179330416501794e-06, + "loss": 0.4992, + "step": 10049 + }, + { + "epoch": 1.6304347826086958, + "grad_norm": 0.6033868600412623, + "learning_rate": 2.1789069161622844e-06, + "loss": 0.5003, + "step": 10050 + }, + { + "epoch": 1.6305970149253732, + "grad_norm": 0.6149602868944063, + "learning_rate": 2.1784834251913323e-06, + "loss": 0.5253, + "step": 10051 + }, + { + "epoch": 1.6307592472420507, + "grad_norm": 0.6215699975387143, + "learning_rate": 2.1780599436012956e-06, + "loss": 0.5318, + "step": 10052 + }, + { + "epoch": 1.6309214795587281, + "grad_norm": 0.5962079062258894, + "learning_rate": 2.1776364714045294e-06, + "loss": 0.5291, + "step": 10053 + }, + { + "epoch": 1.6310837118754056, + "grad_norm": 0.5954151000198026, + "learning_rate": 2.1772130086133898e-06, + "loss": 0.4952, + "step": 10054 + }, + { + "epoch": 1.631245944192083, + "grad_norm": 0.5950840866783101, + "learning_rate": 2.176789555240233e-06, + "loss": 0.5024, + "step": 10055 + }, + { + "epoch": 1.6314081765087605, + "grad_norm": 0.7348929786886902, + "learning_rate": 2.1763661112974115e-06, + "loss": 0.5363, + "step": 10056 + }, + { + "epoch": 1.631570408825438, + "grad_norm": 0.5889824772001409, + "learning_rate": 2.1759426767972833e-06, + "loss": 0.5091, + "step": 10057 + }, + { + "epoch": 1.6317326411421154, + "grad_norm": 0.6091526884030005, + "learning_rate": 2.1755192517522006e-06, + "loss": 0.5173, + "step": 10058 + }, + { + "epoch": 1.631894873458793, + "grad_norm": 0.5970595122269491, + "learning_rate": 2.175095836174519e-06, + "loss": 0.5157, + "step": 10059 + }, + { + "epoch": 1.6320571057754705, + "grad_norm": 0.5981038667910493, + "learning_rate": 2.1746724300765932e-06, + "loss": 0.5402, + "step": 10060 + }, + { + "epoch": 1.632219338092148, + "grad_norm": 0.5738140011030484, + "learning_rate": 2.174249033470775e-06, + "loss": 0.5373, + "step": 10061 + }, + { + "epoch": 1.6323815704088256, + "grad_norm": 0.5999712800030362, + "learning_rate": 2.173825646369419e-06, + "loss": 0.5361, + "step": 10062 + }, + { + "epoch": 1.632543802725503, + "grad_norm": 0.592326403356629, + "learning_rate": 2.173402268784878e-06, + "loss": 0.5073, + "step": 10063 + }, + { + "epoch": 1.6327060350421805, + "grad_norm": 0.5739961721600129, + "learning_rate": 2.1729789007295053e-06, + "loss": 0.5265, + "step": 10064 + }, + { + "epoch": 1.632868267358858, + "grad_norm": 0.6021683674764223, + "learning_rate": 2.1725555422156542e-06, + "loss": 0.5291, + "step": 10065 + }, + { + "epoch": 1.6330304996755354, + "grad_norm": 0.5990436183576864, + "learning_rate": 2.1721321932556753e-06, + "loss": 0.5, + "step": 10066 + }, + { + "epoch": 1.6331927319922128, + "grad_norm": 0.6223422842185571, + "learning_rate": 2.171708853861922e-06, + "loss": 0.4832, + "step": 10067 + }, + { + "epoch": 1.6333549643088903, + "grad_norm": 0.6139626270033205, + "learning_rate": 2.1712855240467455e-06, + "loss": 0.5044, + "step": 10068 + }, + { + "epoch": 1.6335171966255677, + "grad_norm": 0.5697775781406568, + "learning_rate": 2.1708622038224974e-06, + "loss": 0.5176, + "step": 10069 + }, + { + "epoch": 1.6336794289422452, + "grad_norm": 0.5982058195949522, + "learning_rate": 2.1704388932015307e-06, + "loss": 0.4971, + "step": 10070 + }, + { + "epoch": 1.6338416612589228, + "grad_norm": 0.6196259845785751, + "learning_rate": 2.1700155921961934e-06, + "loss": 0.5093, + "step": 10071 + }, + { + "epoch": 1.6340038935756003, + "grad_norm": 0.619628585866038, + "learning_rate": 2.169592300818838e-06, + "loss": 0.522, + "step": 10072 + }, + { + "epoch": 1.6341661258922777, + "grad_norm": 0.5875650924734896, + "learning_rate": 2.169169019081814e-06, + "loss": 0.5288, + "step": 10073 + }, + { + "epoch": 1.6343283582089554, + "grad_norm": 0.5862421952674808, + "learning_rate": 2.1687457469974726e-06, + "loss": 0.5303, + "step": 10074 + }, + { + "epoch": 1.6344905905256328, + "grad_norm": 0.6109379330987338, + "learning_rate": 2.1683224845781625e-06, + "loss": 0.5335, + "step": 10075 + }, + { + "epoch": 1.6346528228423103, + "grad_norm": 0.6016959811035013, + "learning_rate": 2.167899231836235e-06, + "loss": 0.5351, + "step": 10076 + }, + { + "epoch": 1.6348150551589877, + "grad_norm": 0.5986482686986234, + "learning_rate": 2.1674759887840376e-06, + "loss": 0.5385, + "step": 10077 + }, + { + "epoch": 1.6349772874756652, + "grad_norm": 0.5983385497860754, + "learning_rate": 2.167052755433919e-06, + "loss": 0.5176, + "step": 10078 + }, + { + "epoch": 1.6351395197923426, + "grad_norm": 0.5700031130513761, + "learning_rate": 2.16662953179823e-06, + "loss": 0.5354, + "step": 10079 + }, + { + "epoch": 1.63530175210902, + "grad_norm": 0.6032110231984126, + "learning_rate": 2.166206317889317e-06, + "loss": 0.495, + "step": 10080 + }, + { + "epoch": 1.6354639844256975, + "grad_norm": 0.5878507600544847, + "learning_rate": 2.16578311371953e-06, + "loss": 0.5445, + "step": 10081 + }, + { + "epoch": 1.635626216742375, + "grad_norm": 0.619437094183742, + "learning_rate": 2.1653599193012155e-06, + "loss": 0.521, + "step": 10082 + }, + { + "epoch": 1.6357884490590524, + "grad_norm": 0.5915206810703396, + "learning_rate": 2.1649367346467214e-06, + "loss": 0.4848, + "step": 10083 + }, + { + "epoch": 1.63595068137573, + "grad_norm": 0.6153057154806725, + "learning_rate": 2.1645135597683943e-06, + "loss": 0.4939, + "step": 10084 + }, + { + "epoch": 1.6361129136924075, + "grad_norm": 0.5963072344815745, + "learning_rate": 2.164090394678583e-06, + "loss": 0.5471, + "step": 10085 + }, + { + "epoch": 1.636275146009085, + "grad_norm": 0.6341170034251755, + "learning_rate": 2.1636672393896334e-06, + "loss": 0.563, + "step": 10086 + }, + { + "epoch": 1.6364373783257626, + "grad_norm": 0.6310185915218213, + "learning_rate": 2.1632440939138906e-06, + "loss": 0.5, + "step": 10087 + }, + { + "epoch": 1.63659961064244, + "grad_norm": 0.6020267020343374, + "learning_rate": 2.1628209582637024e-06, + "loss": 0.4908, + "step": 10088 + }, + { + "epoch": 1.6367618429591175, + "grad_norm": 0.6063811681476101, + "learning_rate": 2.1623978324514135e-06, + "loss": 0.4941, + "step": 10089 + }, + { + "epoch": 1.636924075275795, + "grad_norm": 0.6340063540144925, + "learning_rate": 2.1619747164893703e-06, + "loss": 0.523, + "step": 10090 + }, + { + "epoch": 1.6370863075924724, + "grad_norm": 0.5850371627727392, + "learning_rate": 2.161551610389919e-06, + "loss": 0.5002, + "step": 10091 + }, + { + "epoch": 1.6372485399091499, + "grad_norm": 0.5827471708601369, + "learning_rate": 2.161128514165402e-06, + "loss": 0.4676, + "step": 10092 + }, + { + "epoch": 1.6374107722258273, + "grad_norm": 0.5803310285772235, + "learning_rate": 2.1607054278281664e-06, + "loss": 0.5466, + "step": 10093 + }, + { + "epoch": 1.6375730045425048, + "grad_norm": 0.5869731146097567, + "learning_rate": 2.1602823513905554e-06, + "loss": 0.5189, + "step": 10094 + }, + { + "epoch": 1.6377352368591822, + "grad_norm": 0.5908025252320566, + "learning_rate": 2.159859284864913e-06, + "loss": 0.5115, + "step": 10095 + }, + { + "epoch": 1.6378974691758599, + "grad_norm": 0.597038554330522, + "learning_rate": 2.1594362282635845e-06, + "loss": 0.4911, + "step": 10096 + }, + { + "epoch": 1.6380597014925373, + "grad_norm": 0.6071888546539292, + "learning_rate": 2.1590131815989123e-06, + "loss": 0.571, + "step": 10097 + }, + { + "epoch": 1.6382219338092148, + "grad_norm": 0.5883053031335687, + "learning_rate": 2.1585901448832387e-06, + "loss": 0.4907, + "step": 10098 + }, + { + "epoch": 1.6383841661258924, + "grad_norm": 0.6104701414097528, + "learning_rate": 2.158167118128909e-06, + "loss": 0.5263, + "step": 10099 + }, + { + "epoch": 1.6385463984425699, + "grad_norm": 0.5914536114149412, + "learning_rate": 2.1577441013482634e-06, + "loss": 0.4811, + "step": 10100 + }, + { + "epoch": 1.6387086307592473, + "grad_norm": 0.624119279769647, + "learning_rate": 2.157321094553647e-06, + "loss": 0.5351, + "step": 10101 + }, + { + "epoch": 1.6388708630759248, + "grad_norm": 0.6161336619974119, + "learning_rate": 2.1568980977574e-06, + "loss": 0.5104, + "step": 10102 + }, + { + "epoch": 1.6390330953926022, + "grad_norm": 0.5993648075586647, + "learning_rate": 2.156475110971864e-06, + "loss": 0.5077, + "step": 10103 + }, + { + "epoch": 1.6391953277092797, + "grad_norm": 0.5782753065547618, + "learning_rate": 2.156052134209382e-06, + "loss": 0.493, + "step": 10104 + }, + { + "epoch": 1.6393575600259571, + "grad_norm": 0.6298622655448853, + "learning_rate": 2.155629167482294e-06, + "loss": 0.5135, + "step": 10105 + }, + { + "epoch": 1.6395197923426346, + "grad_norm": 0.6185075260362666, + "learning_rate": 2.1552062108029428e-06, + "loss": 0.4948, + "step": 10106 + }, + { + "epoch": 1.639682024659312, + "grad_norm": 0.5638489271832549, + "learning_rate": 2.154783264183666e-06, + "loss": 0.4677, + "step": 10107 + }, + { + "epoch": 1.6398442569759895, + "grad_norm": 0.6408677517031461, + "learning_rate": 2.154360327636806e-06, + "loss": 0.4953, + "step": 10108 + }, + { + "epoch": 1.6400064892926671, + "grad_norm": 0.5875770536025723, + "learning_rate": 2.1539374011747017e-06, + "loss": 0.5295, + "step": 10109 + }, + { + "epoch": 1.6401687216093446, + "grad_norm": 0.6019652746557631, + "learning_rate": 2.1535144848096943e-06, + "loss": 0.4918, + "step": 10110 + }, + { + "epoch": 1.640330953926022, + "grad_norm": 0.6064765988535785, + "learning_rate": 2.153091578554123e-06, + "loss": 0.4849, + "step": 10111 + }, + { + "epoch": 1.6404931862426997, + "grad_norm": 0.5905168630768426, + "learning_rate": 2.152668682420326e-06, + "loss": 0.5371, + "step": 10112 + }, + { + "epoch": 1.6406554185593771, + "grad_norm": 0.5829781872186349, + "learning_rate": 2.152245796420642e-06, + "loss": 0.5086, + "step": 10113 + }, + { + "epoch": 1.6408176508760546, + "grad_norm": 0.6440409129386552, + "learning_rate": 2.151822920567411e-06, + "loss": 0.5086, + "step": 10114 + }, + { + "epoch": 1.640979883192732, + "grad_norm": 0.611543237546865, + "learning_rate": 2.15140005487297e-06, + "loss": 0.5219, + "step": 10115 + }, + { + "epoch": 1.6411421155094095, + "grad_norm": 0.6017611765853944, + "learning_rate": 2.150977199349659e-06, + "loss": 0.5456, + "step": 10116 + }, + { + "epoch": 1.641304347826087, + "grad_norm": 0.6299376243862875, + "learning_rate": 2.1505543540098133e-06, + "loss": 0.5477, + "step": 10117 + }, + { + "epoch": 1.6414665801427644, + "grad_norm": 0.6099292024273419, + "learning_rate": 2.1501315188657703e-06, + "loss": 0.5223, + "step": 10118 + }, + { + "epoch": 1.6416288124594418, + "grad_norm": 0.6167923498978645, + "learning_rate": 2.149708693929869e-06, + "loss": 0.4925, + "step": 10119 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5691430619104437, + "learning_rate": 2.1492858792144446e-06, + "loss": 0.5282, + "step": 10120 + }, + { + "epoch": 1.641953277092797, + "grad_norm": 0.6145857161330699, + "learning_rate": 2.1488630747318347e-06, + "loss": 0.501, + "step": 10121 + }, + { + "epoch": 1.6421155094094744, + "grad_norm": 0.6045483956106001, + "learning_rate": 2.1484402804943755e-06, + "loss": 0.4904, + "step": 10122 + }, + { + "epoch": 1.6422777417261518, + "grad_norm": 0.6395024020225919, + "learning_rate": 2.1480174965144014e-06, + "loss": 0.5481, + "step": 10123 + }, + { + "epoch": 1.6424399740428295, + "grad_norm": 0.5947097042754425, + "learning_rate": 2.1475947228042494e-06, + "loss": 0.5286, + "step": 10124 + }, + { + "epoch": 1.642602206359507, + "grad_norm": 0.6118202949791914, + "learning_rate": 2.1471719593762537e-06, + "loss": 0.5144, + "step": 10125 + }, + { + "epoch": 1.6427644386761844, + "grad_norm": 0.5923047305291549, + "learning_rate": 2.146749206242751e-06, + "loss": 0.5198, + "step": 10126 + }, + { + "epoch": 1.6429266709928618, + "grad_norm": 0.5870986804082521, + "learning_rate": 2.1463264634160754e-06, + "loss": 0.5233, + "step": 10127 + }, + { + "epoch": 1.6430889033095393, + "grad_norm": 0.589947060135714, + "learning_rate": 2.14590373090856e-06, + "loss": 0.5138, + "step": 10128 + }, + { + "epoch": 1.6432511356262167, + "grad_norm": 0.6078166378546735, + "learning_rate": 2.1454810087325397e-06, + "loss": 0.5182, + "step": 10129 + }, + { + "epoch": 1.6434133679428942, + "grad_norm": 0.5990266871296573, + "learning_rate": 2.1450582969003485e-06, + "loss": 0.5001, + "step": 10130 + }, + { + "epoch": 1.6435756002595716, + "grad_norm": 0.5779474205506705, + "learning_rate": 2.1446355954243197e-06, + "loss": 0.4955, + "step": 10131 + }, + { + "epoch": 1.643737832576249, + "grad_norm": 0.6005454200226463, + "learning_rate": 2.1442129043167877e-06, + "loss": 0.5298, + "step": 10132 + }, + { + "epoch": 1.6439000648929265, + "grad_norm": 0.6349233709535359, + "learning_rate": 2.1437902235900837e-06, + "loss": 0.5223, + "step": 10133 + }, + { + "epoch": 1.6440622972096042, + "grad_norm": 0.5590908532879538, + "learning_rate": 2.14336755325654e-06, + "loss": 0.5222, + "step": 10134 + }, + { + "epoch": 1.6442245295262816, + "grad_norm": 0.5975107371598753, + "learning_rate": 2.1429448933284908e-06, + "loss": 0.4817, + "step": 10135 + }, + { + "epoch": 1.644386761842959, + "grad_norm": 0.6148533175252754, + "learning_rate": 2.142522243818266e-06, + "loss": 0.4926, + "step": 10136 + }, + { + "epoch": 1.6445489941596367, + "grad_norm": 0.6236567207471992, + "learning_rate": 2.1420996047382e-06, + "loss": 0.5581, + "step": 10137 + }, + { + "epoch": 1.6447112264763142, + "grad_norm": 0.6014850064901627, + "learning_rate": 2.1416769761006217e-06, + "loss": 0.4955, + "step": 10138 + }, + { + "epoch": 1.6448734587929916, + "grad_norm": 0.6072286153180049, + "learning_rate": 2.1412543579178623e-06, + "loss": 0.5219, + "step": 10139 + }, + { + "epoch": 1.645035691109669, + "grad_norm": 0.5908361617848338, + "learning_rate": 2.1408317502022538e-06, + "loss": 0.5455, + "step": 10140 + }, + { + "epoch": 1.6451979234263465, + "grad_norm": 0.5796328298756503, + "learning_rate": 2.1404091529661262e-06, + "loss": 0.5051, + "step": 10141 + }, + { + "epoch": 1.645360155743024, + "grad_norm": 0.5844544310828544, + "learning_rate": 2.13998656622181e-06, + "loss": 0.4905, + "step": 10142 + }, + { + "epoch": 1.6455223880597014, + "grad_norm": 0.6152058062457549, + "learning_rate": 2.139563989981633e-06, + "loss": 0.5422, + "step": 10143 + }, + { + "epoch": 1.6456846203763789, + "grad_norm": 0.5986697305070041, + "learning_rate": 2.1391414242579278e-06, + "loss": 0.4974, + "step": 10144 + }, + { + "epoch": 1.6458468526930563, + "grad_norm": 0.5983545438344137, + "learning_rate": 2.1387188690630215e-06, + "loss": 0.5208, + "step": 10145 + }, + { + "epoch": 1.646009085009734, + "grad_norm": 0.6346549379435266, + "learning_rate": 2.1382963244092437e-06, + "loss": 0.4998, + "step": 10146 + }, + { + "epoch": 1.6461713173264114, + "grad_norm": 0.6067580176422158, + "learning_rate": 2.1378737903089242e-06, + "loss": 0.5105, + "step": 10147 + }, + { + "epoch": 1.6463335496430889, + "grad_norm": 0.5922576544746513, + "learning_rate": 2.1374512667743883e-06, + "loss": 0.5291, + "step": 10148 + }, + { + "epoch": 1.6464957819597665, + "grad_norm": 0.5838319444368905, + "learning_rate": 2.137028753817967e-06, + "loss": 0.5137, + "step": 10149 + }, + { + "epoch": 1.646658014276444, + "grad_norm": 0.6253184083559645, + "learning_rate": 2.136606251451986e-06, + "loss": 0.5254, + "step": 10150 + }, + { + "epoch": 1.6468202465931214, + "grad_norm": 0.6150327011250285, + "learning_rate": 2.136183759688774e-06, + "loss": 0.5189, + "step": 10151 + }, + { + "epoch": 1.6469824789097989, + "grad_norm": 0.5858137201784459, + "learning_rate": 2.1357612785406585e-06, + "loss": 0.5135, + "step": 10152 + }, + { + "epoch": 1.6471447112264763, + "grad_norm": 0.6046312718996989, + "learning_rate": 2.135338808019965e-06, + "loss": 0.5117, + "step": 10153 + }, + { + "epoch": 1.6473069435431538, + "grad_norm": 0.5760363375553943, + "learning_rate": 2.134916348139019e-06, + "loss": 0.5209, + "step": 10154 + }, + { + "epoch": 1.6474691758598312, + "grad_norm": 0.6095947963277895, + "learning_rate": 2.134493898910149e-06, + "loss": 0.5356, + "step": 10155 + }, + { + "epoch": 1.6476314081765087, + "grad_norm": 0.6216793957168537, + "learning_rate": 2.134071460345679e-06, + "loss": 0.508, + "step": 10156 + }, + { + "epoch": 1.647793640493186, + "grad_norm": 0.6431426261826819, + "learning_rate": 2.1336490324579363e-06, + "loss": 0.5225, + "step": 10157 + }, + { + "epoch": 1.6479558728098638, + "grad_norm": 0.6453117968862695, + "learning_rate": 2.1332266152592453e-06, + "loss": 0.5349, + "step": 10158 + }, + { + "epoch": 1.6481181051265412, + "grad_norm": 0.5753157729083677, + "learning_rate": 2.1328042087619296e-06, + "loss": 0.5064, + "step": 10159 + }, + { + "epoch": 1.6482803374432187, + "grad_norm": 0.5986483387334568, + "learning_rate": 2.1323818129783152e-06, + "loss": 0.5086, + "step": 10160 + }, + { + "epoch": 1.6484425697598963, + "grad_norm": 0.5964229630957748, + "learning_rate": 2.131959427920726e-06, + "loss": 0.5241, + "step": 10161 + }, + { + "epoch": 1.6486048020765738, + "grad_norm": 0.5906927750141513, + "learning_rate": 2.1315370536014877e-06, + "loss": 0.4709, + "step": 10162 + }, + { + "epoch": 1.6487670343932512, + "grad_norm": 0.5985342398949083, + "learning_rate": 2.1311146900329203e-06, + "loss": 0.4925, + "step": 10163 + }, + { + "epoch": 1.6489292667099287, + "grad_norm": 0.5973873281595168, + "learning_rate": 2.13069233722735e-06, + "loss": 0.4849, + "step": 10164 + }, + { + "epoch": 1.6490914990266061, + "grad_norm": 0.5854830706001045, + "learning_rate": 2.1302699951970985e-06, + "loss": 0.5214, + "step": 10165 + }, + { + "epoch": 1.6492537313432836, + "grad_norm": 0.5858621247207086, + "learning_rate": 2.129847663954489e-06, + "loss": 0.5066, + "step": 10166 + }, + { + "epoch": 1.649415963659961, + "grad_norm": 0.6261754939867213, + "learning_rate": 2.129425343511844e-06, + "loss": 0.5113, + "step": 10167 + }, + { + "epoch": 1.6495781959766385, + "grad_norm": 0.5715741944577003, + "learning_rate": 2.129003033881486e-06, + "loss": 0.5388, + "step": 10168 + }, + { + "epoch": 1.649740428293316, + "grad_norm": 0.6082348894042794, + "learning_rate": 2.128580735075736e-06, + "loss": 0.5071, + "step": 10169 + }, + { + "epoch": 1.6499026606099934, + "grad_norm": 0.5747517610075433, + "learning_rate": 2.128158447106915e-06, + "loss": 0.5031, + "step": 10170 + }, + { + "epoch": 1.650064892926671, + "grad_norm": 0.6242421689660786, + "learning_rate": 2.127736169987345e-06, + "loss": 0.4958, + "step": 10171 + }, + { + "epoch": 1.6502271252433485, + "grad_norm": 0.5999676129298558, + "learning_rate": 2.1273139037293467e-06, + "loss": 0.5211, + "step": 10172 + }, + { + "epoch": 1.650389357560026, + "grad_norm": 0.606858628054559, + "learning_rate": 2.126891648345241e-06, + "loss": 0.522, + "step": 10173 + }, + { + "epoch": 1.6505515898767036, + "grad_norm": 0.5897564317854261, + "learning_rate": 2.126469403847347e-06, + "loss": 0.5287, + "step": 10174 + }, + { + "epoch": 1.650713822193381, + "grad_norm": 0.611147307994256, + "learning_rate": 2.1260471702479857e-06, + "loss": 0.5235, + "step": 10175 + }, + { + "epoch": 1.6508760545100585, + "grad_norm": 0.6052730761847462, + "learning_rate": 2.125624947559475e-06, + "loss": 0.489, + "step": 10176 + }, + { + "epoch": 1.651038286826736, + "grad_norm": 0.6125690620622374, + "learning_rate": 2.125202735794136e-06, + "loss": 0.5141, + "step": 10177 + }, + { + "epoch": 1.6512005191434134, + "grad_norm": 0.5773112423125898, + "learning_rate": 2.124780534964287e-06, + "loss": 0.4999, + "step": 10178 + }, + { + "epoch": 1.6513627514600908, + "grad_norm": 0.6093226099832382, + "learning_rate": 2.124358345082246e-06, + "loss": 0.4938, + "step": 10179 + }, + { + "epoch": 1.6515249837767683, + "grad_norm": 0.5954309724359285, + "learning_rate": 2.1239361661603317e-06, + "loss": 0.517, + "step": 10180 + }, + { + "epoch": 1.6516872160934457, + "grad_norm": 0.6113327324349246, + "learning_rate": 2.1235139982108615e-06, + "loss": 0.4879, + "step": 10181 + }, + { + "epoch": 1.6518494484101232, + "grad_norm": 0.6051167961280144, + "learning_rate": 2.123091841246154e-06, + "loss": 0.5256, + "step": 10182 + }, + { + "epoch": 1.6520116807268008, + "grad_norm": 0.5907526605473467, + "learning_rate": 2.1226696952785268e-06, + "loss": 0.5272, + "step": 10183 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.5977991230008697, + "learning_rate": 2.122247560320295e-06, + "loss": 0.5053, + "step": 10184 + }, + { + "epoch": 1.6523361453601557, + "grad_norm": 0.5953207277523034, + "learning_rate": 2.1218254363837768e-06, + "loss": 0.5172, + "step": 10185 + }, + { + "epoch": 1.6524983776768334, + "grad_norm": 0.6498051915307915, + "learning_rate": 2.121403323481288e-06, + "loss": 0.5201, + "step": 10186 + }, + { + "epoch": 1.6526606099935108, + "grad_norm": 0.6035572041324373, + "learning_rate": 2.1209812216251447e-06, + "loss": 0.5233, + "step": 10187 + }, + { + "epoch": 1.6528228423101883, + "grad_norm": 0.6005077277883896, + "learning_rate": 2.1205591308276637e-06, + "loss": 0.5355, + "step": 10188 + }, + { + "epoch": 1.6529850746268657, + "grad_norm": 0.5942374717901333, + "learning_rate": 2.1201370511011583e-06, + "loss": 0.5542, + "step": 10189 + }, + { + "epoch": 1.6531473069435432, + "grad_norm": 0.5971350733206653, + "learning_rate": 2.1197149824579443e-06, + "loss": 0.5417, + "step": 10190 + }, + { + "epoch": 1.6533095392602206, + "grad_norm": 0.6044291405106318, + "learning_rate": 2.119292924910337e-06, + "loss": 0.4909, + "step": 10191 + }, + { + "epoch": 1.653471771576898, + "grad_norm": 0.5997557379375938, + "learning_rate": 2.1188708784706507e-06, + "loss": 0.5318, + "step": 10192 + }, + { + "epoch": 1.6536340038935755, + "grad_norm": 0.6053708661994122, + "learning_rate": 2.1184488431511998e-06, + "loss": 0.5291, + "step": 10193 + }, + { + "epoch": 1.653796236210253, + "grad_norm": 0.6182307186458492, + "learning_rate": 2.118026818964297e-06, + "loss": 0.5179, + "step": 10194 + }, + { + "epoch": 1.6539584685269304, + "grad_norm": 0.599931450223617, + "learning_rate": 2.117604805922256e-06, + "loss": 0.5142, + "step": 10195 + }, + { + "epoch": 1.654120700843608, + "grad_norm": 0.6478185488194232, + "learning_rate": 2.117182804037391e-06, + "loss": 0.523, + "step": 10196 + }, + { + "epoch": 1.6542829331602855, + "grad_norm": 0.5834143456489796, + "learning_rate": 2.116760813322013e-06, + "loss": 0.5165, + "step": 10197 + }, + { + "epoch": 1.654445165476963, + "grad_norm": 0.5846015673130881, + "learning_rate": 2.116338833788437e-06, + "loss": 0.5112, + "step": 10198 + }, + { + "epoch": 1.6546073977936406, + "grad_norm": 0.5889637259009075, + "learning_rate": 2.1159168654489724e-06, + "loss": 0.504, + "step": 10199 + }, + { + "epoch": 1.654769630110318, + "grad_norm": 0.6109808866532631, + "learning_rate": 2.1154949083159328e-06, + "loss": 0.502, + "step": 10200 + }, + { + "epoch": 1.6549318624269955, + "grad_norm": 0.572364844454868, + "learning_rate": 2.115072962401628e-06, + "loss": 0.486, + "step": 10201 + }, + { + "epoch": 1.655094094743673, + "grad_norm": 0.601236663401065, + "learning_rate": 2.1146510277183714e-06, + "loss": 0.5076, + "step": 10202 + }, + { + "epoch": 1.6552563270603504, + "grad_norm": 0.5872958847436364, + "learning_rate": 2.1142291042784724e-06, + "loss": 0.5308, + "step": 10203 + }, + { + "epoch": 1.6554185593770279, + "grad_norm": 0.594283231934558, + "learning_rate": 2.113807192094241e-06, + "loss": 0.4956, + "step": 10204 + }, + { + "epoch": 1.6555807916937053, + "grad_norm": 0.6067490532182165, + "learning_rate": 2.113385291177989e-06, + "loss": 0.5175, + "step": 10205 + }, + { + "epoch": 1.6557430240103828, + "grad_norm": 0.5628240513114906, + "learning_rate": 2.112963401542024e-06, + "loss": 0.4926, + "step": 10206 + }, + { + "epoch": 1.6559052563270602, + "grad_norm": 0.6740119837045008, + "learning_rate": 2.112541523198658e-06, + "loss": 0.502, + "step": 10207 + }, + { + "epoch": 1.6560674886437379, + "grad_norm": 0.5628173059023243, + "learning_rate": 2.1121196561601996e-06, + "loss": 0.5244, + "step": 10208 + }, + { + "epoch": 1.6562297209604153, + "grad_norm": 0.5989450405219291, + "learning_rate": 2.111697800438956e-06, + "loss": 0.5128, + "step": 10209 + }, + { + "epoch": 1.6563919532770928, + "grad_norm": 0.5851109381591204, + "learning_rate": 2.111275956047237e-06, + "loss": 0.5341, + "step": 10210 + }, + { + "epoch": 1.6565541855937704, + "grad_norm": 0.6360145742650822, + "learning_rate": 2.11085412299735e-06, + "loss": 0.52, + "step": 10211 + }, + { + "epoch": 1.6567164179104479, + "grad_norm": 0.5884560253384962, + "learning_rate": 2.1104323013016038e-06, + "loss": 0.4986, + "step": 10212 + }, + { + "epoch": 1.6568786502271253, + "grad_norm": 0.590702434571486, + "learning_rate": 2.110010490972306e-06, + "loss": 0.4948, + "step": 10213 + }, + { + "epoch": 1.6570408825438028, + "grad_norm": 0.6324953494194677, + "learning_rate": 2.109588692021764e-06, + "loss": 0.515, + "step": 10214 + }, + { + "epoch": 1.6572031148604802, + "grad_norm": 0.5912053560666636, + "learning_rate": 2.109166904462282e-06, + "loss": 0.5037, + "step": 10215 + }, + { + "epoch": 1.6573653471771577, + "grad_norm": 0.5716045699480868, + "learning_rate": 2.10874512830617e-06, + "loss": 0.5206, + "step": 10216 + }, + { + "epoch": 1.6575275794938351, + "grad_norm": 0.6113625307974455, + "learning_rate": 2.108323363565732e-06, + "loss": 0.4732, + "step": 10217 + }, + { + "epoch": 1.6576898118105126, + "grad_norm": 0.6063357263689824, + "learning_rate": 2.107901610253275e-06, + "loss": 0.5116, + "step": 10218 + }, + { + "epoch": 1.65785204412719, + "grad_norm": 0.6216816161923892, + "learning_rate": 2.1074798683811047e-06, + "loss": 0.5044, + "step": 10219 + }, + { + "epoch": 1.6580142764438677, + "grad_norm": 0.5969121896369447, + "learning_rate": 2.1070581379615253e-06, + "loss": 0.5326, + "step": 10220 + }, + { + "epoch": 1.6581765087605451, + "grad_norm": 0.6143219688872805, + "learning_rate": 2.106636419006842e-06, + "loss": 0.5299, + "step": 10221 + }, + { + "epoch": 1.6583387410772226, + "grad_norm": 0.5753474934363254, + "learning_rate": 2.106214711529359e-06, + "loss": 0.4766, + "step": 10222 + }, + { + "epoch": 1.6585009733939, + "grad_norm": 0.6238653715610472, + "learning_rate": 2.1057930155413807e-06, + "loss": 0.524, + "step": 10223 + }, + { + "epoch": 1.6586632057105777, + "grad_norm": 0.9118421244109725, + "learning_rate": 2.1053713310552122e-06, + "loss": 0.5425, + "step": 10224 + }, + { + "epoch": 1.6588254380272551, + "grad_norm": 0.6004708736979564, + "learning_rate": 2.1049496580831556e-06, + "loss": 0.4965, + "step": 10225 + }, + { + "epoch": 1.6589876703439326, + "grad_norm": 0.5831485927321084, + "learning_rate": 2.104527996637514e-06, + "loss": 0.4946, + "step": 10226 + }, + { + "epoch": 1.65914990266061, + "grad_norm": 0.5891898939732867, + "learning_rate": 2.1041063467305916e-06, + "loss": 0.5271, + "step": 10227 + }, + { + "epoch": 1.6593121349772875, + "grad_norm": 0.5865619581828891, + "learning_rate": 2.1036847083746887e-06, + "loss": 0.4995, + "step": 10228 + }, + { + "epoch": 1.659474367293965, + "grad_norm": 0.579964790611479, + "learning_rate": 2.1032630815821103e-06, + "loss": 0.5218, + "step": 10229 + }, + { + "epoch": 1.6596365996106424, + "grad_norm": 0.5920027814403237, + "learning_rate": 2.102841466365156e-06, + "loss": 0.525, + "step": 10230 + }, + { + "epoch": 1.6597988319273198, + "grad_norm": 0.5937680110882482, + "learning_rate": 2.102419862736128e-06, + "loss": 0.4951, + "step": 10231 + }, + { + "epoch": 1.6599610642439973, + "grad_norm": 0.614372244714741, + "learning_rate": 2.101998270707328e-06, + "loss": 0.5326, + "step": 10232 + }, + { + "epoch": 1.660123296560675, + "grad_norm": 0.5994338583551926, + "learning_rate": 2.101576690291056e-06, + "loss": 0.5281, + "step": 10233 + }, + { + "epoch": 1.6602855288773524, + "grad_norm": 0.6150961041263087, + "learning_rate": 2.1011551214996135e-06, + "loss": 0.4875, + "step": 10234 + }, + { + "epoch": 1.6604477611940298, + "grad_norm": 0.5917655364623235, + "learning_rate": 2.1007335643452988e-06, + "loss": 0.4927, + "step": 10235 + }, + { + "epoch": 1.6606099935107075, + "grad_norm": 0.623594819750394, + "learning_rate": 2.100312018840413e-06, + "loss": 0.5154, + "step": 10236 + }, + { + "epoch": 1.660772225827385, + "grad_norm": 0.5901000542918644, + "learning_rate": 2.0998904849972557e-06, + "loss": 0.4872, + "step": 10237 + }, + { + "epoch": 1.6609344581440624, + "grad_norm": 0.6058767907193311, + "learning_rate": 2.0994689628281257e-06, + "loss": 0.5223, + "step": 10238 + }, + { + "epoch": 1.6610966904607398, + "grad_norm": 0.5956942969923708, + "learning_rate": 2.099047452345323e-06, + "loss": 0.4985, + "step": 10239 + }, + { + "epoch": 1.6612589227774173, + "grad_norm": 0.6228580382084878, + "learning_rate": 2.098625953561144e-06, + "loss": 0.533, + "step": 10240 + }, + { + "epoch": 1.6614211550940947, + "grad_norm": 0.622620684627598, + "learning_rate": 2.098204466487887e-06, + "loss": 0.533, + "step": 10241 + }, + { + "epoch": 1.6615833874107722, + "grad_norm": 0.5850124917647034, + "learning_rate": 2.0977829911378507e-06, + "loss": 0.5195, + "step": 10242 + }, + { + "epoch": 1.6617456197274496, + "grad_norm": 0.5944866417520783, + "learning_rate": 2.097361527523333e-06, + "loss": 0.5179, + "step": 10243 + }, + { + "epoch": 1.661907852044127, + "grad_norm": 0.5907371172062292, + "learning_rate": 2.096940075656631e-06, + "loss": 0.5416, + "step": 10244 + }, + { + "epoch": 1.6620700843608047, + "grad_norm": 0.6313368736727263, + "learning_rate": 2.0965186355500396e-06, + "loss": 0.5227, + "step": 10245 + }, + { + "epoch": 1.6622323166774822, + "grad_norm": 0.5962119619570853, + "learning_rate": 2.096097207215856e-06, + "loss": 0.5563, + "step": 10246 + }, + { + "epoch": 1.6623945489941596, + "grad_norm": 0.6312500440425666, + "learning_rate": 2.095675790666377e-06, + "loss": 0.5102, + "step": 10247 + }, + { + "epoch": 1.6625567813108373, + "grad_norm": 0.5837346785435543, + "learning_rate": 2.095254385913897e-06, + "loss": 0.5613, + "step": 10248 + }, + { + "epoch": 1.6627190136275147, + "grad_norm": 0.6192766457921731, + "learning_rate": 2.094832992970714e-06, + "loss": 0.4769, + "step": 10249 + }, + { + "epoch": 1.6628812459441922, + "grad_norm": 0.6188137081192195, + "learning_rate": 2.09441161184912e-06, + "loss": 0.5285, + "step": 10250 + }, + { + "epoch": 1.6630434782608696, + "grad_norm": 0.6032479525795369, + "learning_rate": 2.0939902425614106e-06, + "loss": 0.528, + "step": 10251 + }, + { + "epoch": 1.663205710577547, + "grad_norm": 0.5978755341593555, + "learning_rate": 2.093568885119881e-06, + "loss": 0.5239, + "step": 10252 + }, + { + "epoch": 1.6633679428942245, + "grad_norm": 0.6329502822514985, + "learning_rate": 2.093147539536824e-06, + "loss": 0.4954, + "step": 10253 + }, + { + "epoch": 1.663530175210902, + "grad_norm": 0.5955836488685972, + "learning_rate": 2.0927262058245353e-06, + "loss": 0.5085, + "step": 10254 + }, + { + "epoch": 1.6636924075275794, + "grad_norm": 0.612179613486732, + "learning_rate": 2.092304883995306e-06, + "loss": 0.5195, + "step": 10255 + }, + { + "epoch": 1.6638546398442569, + "grad_norm": 0.5815787485020472, + "learning_rate": 2.0918835740614295e-06, + "loss": 0.4971, + "step": 10256 + }, + { + "epoch": 1.6640168721609343, + "grad_norm": 0.6346992204585996, + "learning_rate": 2.0914622760351978e-06, + "loss": 0.5278, + "step": 10257 + }, + { + "epoch": 1.664179104477612, + "grad_norm": 0.5863738280547486, + "learning_rate": 2.0910409899289054e-06, + "loss": 0.5175, + "step": 10258 + }, + { + "epoch": 1.6643413367942894, + "grad_norm": 0.6582528907680457, + "learning_rate": 2.0906197157548414e-06, + "loss": 0.5423, + "step": 10259 + }, + { + "epoch": 1.6645035691109669, + "grad_norm": 0.570911688709582, + "learning_rate": 2.0901984535253007e-06, + "loss": 0.5505, + "step": 10260 + }, + { + "epoch": 1.6646658014276445, + "grad_norm": 0.6031077082202018, + "learning_rate": 2.0897772032525715e-06, + "loss": 0.5257, + "step": 10261 + }, + { + "epoch": 1.664828033744322, + "grad_norm": 0.6109828361071024, + "learning_rate": 2.089355964948945e-06, + "loss": 0.5743, + "step": 10262 + }, + { + "epoch": 1.6649902660609994, + "grad_norm": 0.5965527667573928, + "learning_rate": 2.0889347386267134e-06, + "loss": 0.5445, + "step": 10263 + }, + { + "epoch": 1.6651524983776769, + "grad_norm": 0.6349821137742943, + "learning_rate": 2.088513524298165e-06, + "loss": 0.5268, + "step": 10264 + }, + { + "epoch": 1.6653147306943543, + "grad_norm": 0.6154167256717402, + "learning_rate": 2.0880923219755918e-06, + "loss": 0.5155, + "step": 10265 + }, + { + "epoch": 1.6654769630110318, + "grad_norm": 0.5964435899025899, + "learning_rate": 2.087671131671281e-06, + "loss": 0.5039, + "step": 10266 + }, + { + "epoch": 1.6656391953277092, + "grad_norm": 0.5820161427811419, + "learning_rate": 2.087249953397523e-06, + "loss": 0.498, + "step": 10267 + }, + { + "epoch": 1.6658014276443867, + "grad_norm": 0.6255034300813431, + "learning_rate": 2.0868287871666055e-06, + "loss": 0.5144, + "step": 10268 + }, + { + "epoch": 1.665963659961064, + "grad_norm": 0.5913151725859102, + "learning_rate": 2.0864076329908183e-06, + "loss": 0.5747, + "step": 10269 + }, + { + "epoch": 1.6661258922777418, + "grad_norm": 0.6334610337353084, + "learning_rate": 2.0859864908824486e-06, + "loss": 0.5086, + "step": 10270 + }, + { + "epoch": 1.6662881245944192, + "grad_norm": 0.6081923035236451, + "learning_rate": 2.085565360853784e-06, + "loss": 0.5224, + "step": 10271 + }, + { + "epoch": 1.6664503569110967, + "grad_norm": 0.5995777711388018, + "learning_rate": 2.085144242917112e-06, + "loss": 0.5579, + "step": 10272 + }, + { + "epoch": 1.6666125892277743, + "grad_norm": 0.600541746127217, + "learning_rate": 2.0847231370847195e-06, + "loss": 0.511, + "step": 10273 + }, + { + "epoch": 1.6667748215444518, + "grad_norm": 0.6301744705816462, + "learning_rate": 2.0843020433688934e-06, + "loss": 0.5051, + "step": 10274 + }, + { + "epoch": 1.6669370538611292, + "grad_norm": 0.6301830801094982, + "learning_rate": 2.083880961781921e-06, + "loss": 0.4999, + "step": 10275 + }, + { + "epoch": 1.6670992861778067, + "grad_norm": 0.6160392456938251, + "learning_rate": 2.0834598923360855e-06, + "loss": 0.4959, + "step": 10276 + }, + { + "epoch": 1.6672615184944841, + "grad_norm": 0.5976396143033352, + "learning_rate": 2.083038835043675e-06, + "loss": 0.5044, + "step": 10277 + }, + { + "epoch": 1.6674237508111616, + "grad_norm": 0.6224683814476161, + "learning_rate": 2.082617789916973e-06, + "loss": 0.4879, + "step": 10278 + }, + { + "epoch": 1.667585983127839, + "grad_norm": 0.6268249302187442, + "learning_rate": 2.0821967569682656e-06, + "loss": 0.5205, + "step": 10279 + }, + { + "epoch": 1.6677482154445165, + "grad_norm": 0.594523847849389, + "learning_rate": 2.081775736209838e-06, + "loss": 0.5235, + "step": 10280 + }, + { + "epoch": 1.667910447761194, + "grad_norm": 0.6641994848672075, + "learning_rate": 2.0813547276539724e-06, + "loss": 0.513, + "step": 10281 + }, + { + "epoch": 1.6680726800778714, + "grad_norm": 0.6007799434567785, + "learning_rate": 2.0809337313129528e-06, + "loss": 0.494, + "step": 10282 + }, + { + "epoch": 1.668234912394549, + "grad_norm": 0.5901641963137562, + "learning_rate": 2.080512747199064e-06, + "loss": 0.4951, + "step": 10283 + }, + { + "epoch": 1.6683971447112265, + "grad_norm": 0.5847427134040545, + "learning_rate": 2.080091775324588e-06, + "loss": 0.4932, + "step": 10284 + }, + { + "epoch": 1.668559377027904, + "grad_norm": 0.6007374881926589, + "learning_rate": 2.0796708157018086e-06, + "loss": 0.5289, + "step": 10285 + }, + { + "epoch": 1.6687216093445816, + "grad_norm": 0.5899840978854411, + "learning_rate": 2.0792498683430072e-06, + "loss": 0.5051, + "step": 10286 + }, + { + "epoch": 1.668883841661259, + "grad_norm": 0.5941985455725098, + "learning_rate": 2.0788289332604653e-06, + "loss": 0.5074, + "step": 10287 + }, + { + "epoch": 1.6690460739779365, + "grad_norm": 0.6165154656461991, + "learning_rate": 2.078408010466466e-06, + "loss": 0.5409, + "step": 10288 + }, + { + "epoch": 1.669208306294614, + "grad_norm": 0.6048409000805751, + "learning_rate": 2.0779870999732897e-06, + "loss": 0.5252, + "step": 10289 + }, + { + "epoch": 1.6693705386112914, + "grad_norm": 0.5977000554569362, + "learning_rate": 2.0775662017932187e-06, + "loss": 0.5065, + "step": 10290 + }, + { + "epoch": 1.6695327709279688, + "grad_norm": 0.6215054945484267, + "learning_rate": 2.0771453159385313e-06, + "loss": 0.5281, + "step": 10291 + }, + { + "epoch": 1.6696950032446463, + "grad_norm": 0.6135383131644184, + "learning_rate": 2.0767244424215093e-06, + "loss": 0.4901, + "step": 10292 + }, + { + "epoch": 1.6698572355613237, + "grad_norm": 0.570881109680136, + "learning_rate": 2.0763035812544314e-06, + "loss": 0.5192, + "step": 10293 + }, + { + "epoch": 1.6700194678780012, + "grad_norm": 0.6400279920337706, + "learning_rate": 2.0758827324495788e-06, + "loss": 0.5298, + "step": 10294 + }, + { + "epoch": 1.6701817001946788, + "grad_norm": 0.6019818208106279, + "learning_rate": 2.07546189601923e-06, + "loss": 0.5078, + "step": 10295 + }, + { + "epoch": 1.6703439325113563, + "grad_norm": 0.6181522702398007, + "learning_rate": 2.075041071975662e-06, + "loss": 0.5264, + "step": 10296 + }, + { + "epoch": 1.6705061648280337, + "grad_norm": 0.6073576731639375, + "learning_rate": 2.0746202603311562e-06, + "loss": 0.5452, + "step": 10297 + }, + { + "epoch": 1.6706683971447114, + "grad_norm": 0.6202269696641431, + "learning_rate": 2.0741994610979878e-06, + "loss": 0.5194, + "step": 10298 + }, + { + "epoch": 1.6708306294613888, + "grad_norm": 0.6078246385412825, + "learning_rate": 2.073778674288436e-06, + "loss": 0.4892, + "step": 10299 + }, + { + "epoch": 1.6709928617780663, + "grad_norm": 0.6041426896309459, + "learning_rate": 2.0733578999147797e-06, + "loss": 0.5302, + "step": 10300 + }, + { + "epoch": 1.6711550940947437, + "grad_norm": 0.6534712310794869, + "learning_rate": 2.072937137989293e-06, + "loss": 0.5237, + "step": 10301 + }, + { + "epoch": 1.6713173264114212, + "grad_norm": 0.6051086709836907, + "learning_rate": 2.072516388524253e-06, + "loss": 0.5248, + "step": 10302 + }, + { + "epoch": 1.6714795587280986, + "grad_norm": 0.5804918105494593, + "learning_rate": 2.0720956515319376e-06, + "loss": 0.5046, + "step": 10303 + }, + { + "epoch": 1.671641791044776, + "grad_norm": 0.588408539291992, + "learning_rate": 2.0716749270246205e-06, + "loss": 0.508, + "step": 10304 + }, + { + "epoch": 1.6718040233614535, + "grad_norm": 0.6232743289794835, + "learning_rate": 2.071254215014579e-06, + "loss": 0.532, + "step": 10305 + }, + { + "epoch": 1.671966255678131, + "grad_norm": 0.6168275115238496, + "learning_rate": 2.0708335155140883e-06, + "loss": 0.5007, + "step": 10306 + }, + { + "epoch": 1.6721284879948086, + "grad_norm": 0.615256692625505, + "learning_rate": 2.0704128285354214e-06, + "loss": 0.5302, + "step": 10307 + }, + { + "epoch": 1.672290720311486, + "grad_norm": 0.5812729420156009, + "learning_rate": 2.0699921540908542e-06, + "loss": 0.5245, + "step": 10308 + }, + { + "epoch": 1.6724529526281635, + "grad_norm": 0.5650352779380127, + "learning_rate": 2.06957149219266e-06, + "loss": 0.542, + "step": 10309 + }, + { + "epoch": 1.672615184944841, + "grad_norm": 0.5796131542769178, + "learning_rate": 2.0691508428531136e-06, + "loss": 0.5097, + "step": 10310 + }, + { + "epoch": 1.6727774172615186, + "grad_norm": 0.663136248359482, + "learning_rate": 2.0687302060844876e-06, + "loss": 0.5086, + "step": 10311 + }, + { + "epoch": 1.672939649578196, + "grad_norm": 0.6366521139893955, + "learning_rate": 2.0683095818990544e-06, + "loss": 0.5254, + "step": 10312 + }, + { + "epoch": 1.6731018818948735, + "grad_norm": 0.5808650470223605, + "learning_rate": 2.0678889703090876e-06, + "loss": 0.5381, + "step": 10313 + }, + { + "epoch": 1.673264114211551, + "grad_norm": 0.594801969759165, + "learning_rate": 2.0674683713268584e-06, + "loss": 0.5119, + "step": 10314 + }, + { + "epoch": 1.6734263465282284, + "grad_norm": 0.6354220252338983, + "learning_rate": 2.0670477849646392e-06, + "loss": 0.514, + "step": 10315 + }, + { + "epoch": 1.6735885788449059, + "grad_norm": 0.5950124870059195, + "learning_rate": 2.066627211234702e-06, + "loss": 0.5049, + "step": 10316 + }, + { + "epoch": 1.6737508111615833, + "grad_norm": 0.5726938402954235, + "learning_rate": 2.066206650149317e-06, + "loss": 0.5099, + "step": 10317 + }, + { + "epoch": 1.6739130434782608, + "grad_norm": 0.6200265555054737, + "learning_rate": 2.0657861017207552e-06, + "loss": 0.5094, + "step": 10318 + }, + { + "epoch": 1.6740752757949382, + "grad_norm": 0.5908544938302621, + "learning_rate": 2.065365565961287e-06, + "loss": 0.5137, + "step": 10319 + }, + { + "epoch": 1.6742375081116159, + "grad_norm": 0.5976552960857312, + "learning_rate": 2.0649450428831827e-06, + "loss": 0.5089, + "step": 10320 + }, + { + "epoch": 1.6743997404282933, + "grad_norm": 0.5609914459887763, + "learning_rate": 2.064524532498712e-06, + "loss": 0.503, + "step": 10321 + }, + { + "epoch": 1.6745619727449708, + "grad_norm": 0.5873291029156299, + "learning_rate": 2.064104034820144e-06, + "loss": 0.5162, + "step": 10322 + }, + { + "epoch": 1.6747242050616484, + "grad_norm": 0.6239252761632944, + "learning_rate": 2.0636835498597467e-06, + "loss": 0.5236, + "step": 10323 + }, + { + "epoch": 1.6748864373783259, + "grad_norm": 0.6430424550920077, + "learning_rate": 2.06326307762979e-06, + "loss": 0.5064, + "step": 10324 + }, + { + "epoch": 1.6750486696950033, + "grad_norm": 0.6436251141768986, + "learning_rate": 2.062842618142542e-06, + "loss": 0.5216, + "step": 10325 + }, + { + "epoch": 1.6752109020116808, + "grad_norm": 0.6053509418228403, + "learning_rate": 2.0624221714102697e-06, + "loss": 0.479, + "step": 10326 + }, + { + "epoch": 1.6753731343283582, + "grad_norm": 0.5978575760062289, + "learning_rate": 2.06200173744524e-06, + "loss": 0.5133, + "step": 10327 + }, + { + "epoch": 1.6755353666450357, + "grad_norm": 0.6739446845600566, + "learning_rate": 2.0615813162597213e-06, + "loss": 0.5316, + "step": 10328 + }, + { + "epoch": 1.675697598961713, + "grad_norm": 0.590145244932701, + "learning_rate": 2.061160907865979e-06, + "loss": 0.5305, + "step": 10329 + }, + { + "epoch": 1.6758598312783906, + "grad_norm": 0.5868684974019751, + "learning_rate": 2.0607405122762806e-06, + "loss": 0.5263, + "step": 10330 + }, + { + "epoch": 1.676022063595068, + "grad_norm": 0.5876449805638221, + "learning_rate": 2.060320129502892e-06, + "loss": 0.5219, + "step": 10331 + }, + { + "epoch": 1.6761842959117457, + "grad_norm": 0.6168797269258903, + "learning_rate": 2.059899759558077e-06, + "loss": 0.5073, + "step": 10332 + }, + { + "epoch": 1.6763465282284231, + "grad_norm": 0.5980672580106035, + "learning_rate": 2.059479402454103e-06, + "loss": 0.5391, + "step": 10333 + }, + { + "epoch": 1.6765087605451006, + "grad_norm": 0.594523772717932, + "learning_rate": 2.0590590582032323e-06, + "loss": 0.5501, + "step": 10334 + }, + { + "epoch": 1.6766709928617782, + "grad_norm": 0.6108375633949252, + "learning_rate": 2.058638726817732e-06, + "loss": 0.5065, + "step": 10335 + }, + { + "epoch": 1.6768332251784557, + "grad_norm": 0.6174377583709664, + "learning_rate": 2.0582184083098657e-06, + "loss": 0.4985, + "step": 10336 + }, + { + "epoch": 1.6769954574951331, + "grad_norm": 0.5971784544539984, + "learning_rate": 2.057798102691895e-06, + "loss": 0.5448, + "step": 10337 + }, + { + "epoch": 1.6771576898118106, + "grad_norm": 0.6154271890334947, + "learning_rate": 2.057377809976084e-06, + "loss": 0.5154, + "step": 10338 + }, + { + "epoch": 1.677319922128488, + "grad_norm": 0.6157987140876208, + "learning_rate": 2.0569575301746974e-06, + "loss": 0.5329, + "step": 10339 + }, + { + "epoch": 1.6774821544451655, + "grad_norm": 0.5865300247999576, + "learning_rate": 2.0565372632999957e-06, + "loss": 0.4974, + "step": 10340 + }, + { + "epoch": 1.677644386761843, + "grad_norm": 0.6273981571295688, + "learning_rate": 2.0561170093642422e-06, + "loss": 0.4909, + "step": 10341 + }, + { + "epoch": 1.6778066190785204, + "grad_norm": 0.5959312489770576, + "learning_rate": 2.0556967683796984e-06, + "loss": 0.5097, + "step": 10342 + }, + { + "epoch": 1.6779688513951978, + "grad_norm": 0.5694695749266354, + "learning_rate": 2.055276540358625e-06, + "loss": 0.5336, + "step": 10343 + }, + { + "epoch": 1.6781310837118752, + "grad_norm": 0.5960168241294987, + "learning_rate": 2.054856325313284e-06, + "loss": 0.4949, + "step": 10344 + }, + { + "epoch": 1.678293316028553, + "grad_norm": 0.6045095327025378, + "learning_rate": 2.054436123255935e-06, + "loss": 0.5429, + "step": 10345 + }, + { + "epoch": 1.6784555483452304, + "grad_norm": 0.6146953113512114, + "learning_rate": 2.05401593419884e-06, + "loss": 0.5025, + "step": 10346 + }, + { + "epoch": 1.6786177806619078, + "grad_norm": 0.5983256919743635, + "learning_rate": 2.053595758154257e-06, + "loss": 0.4996, + "step": 10347 + }, + { + "epoch": 1.6787800129785855, + "grad_norm": 0.6024784184038189, + "learning_rate": 2.053175595134447e-06, + "loss": 0.5003, + "step": 10348 + }, + { + "epoch": 1.678942245295263, + "grad_norm": 0.6375397351619866, + "learning_rate": 2.0527554451516677e-06, + "loss": 0.4956, + "step": 10349 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.6403342587396, + "learning_rate": 2.052335308218179e-06, + "loss": 0.5479, + "step": 10350 + }, + { + "epoch": 1.6792667099286178, + "grad_norm": 0.6734271684417853, + "learning_rate": 2.0519151843462393e-06, + "loss": 0.541, + "step": 10351 + }, + { + "epoch": 1.6794289422452953, + "grad_norm": 0.6231440840770267, + "learning_rate": 2.0514950735481053e-06, + "loss": 0.5425, + "step": 10352 + }, + { + "epoch": 1.6795911745619727, + "grad_norm": 0.6339346865517193, + "learning_rate": 2.051074975836036e-06, + "loss": 0.5069, + "step": 10353 + }, + { + "epoch": 1.6797534068786502, + "grad_norm": 0.6079717577533057, + "learning_rate": 2.0506548912222878e-06, + "loss": 0.5232, + "step": 10354 + }, + { + "epoch": 1.6799156391953276, + "grad_norm": 0.5810215048015176, + "learning_rate": 2.050234819719118e-06, + "loss": 0.5278, + "step": 10355 + }, + { + "epoch": 1.680077871512005, + "grad_norm": 0.6071756675596606, + "learning_rate": 2.049814761338782e-06, + "loss": 0.5216, + "step": 10356 + }, + { + "epoch": 1.6802401038286827, + "grad_norm": 0.6862990387461341, + "learning_rate": 2.049394716093539e-06, + "loss": 0.5374, + "step": 10357 + }, + { + "epoch": 1.6804023361453602, + "grad_norm": 0.6066652141499819, + "learning_rate": 2.048974683995641e-06, + "loss": 0.5278, + "step": 10358 + }, + { + "epoch": 1.6805645684620376, + "grad_norm": 0.5781358573432662, + "learning_rate": 2.048554665057345e-06, + "loss": 0.5032, + "step": 10359 + }, + { + "epoch": 1.6807268007787153, + "grad_norm": 0.5878071645980273, + "learning_rate": 2.048134659290905e-06, + "loss": 0.4955, + "step": 10360 + }, + { + "epoch": 1.6808890330953927, + "grad_norm": 0.6196060288074162, + "learning_rate": 2.047714666708577e-06, + "loss": 0.5076, + "step": 10361 + }, + { + "epoch": 1.6810512654120702, + "grad_norm": 0.5882350209085708, + "learning_rate": 2.0472946873226147e-06, + "loss": 0.5176, + "step": 10362 + }, + { + "epoch": 1.6812134977287476, + "grad_norm": 0.5860238799152147, + "learning_rate": 2.046874721145271e-06, + "loss": 0.5353, + "step": 10363 + }, + { + "epoch": 1.681375730045425, + "grad_norm": 0.5826813100928172, + "learning_rate": 2.0464547681888e-06, + "loss": 0.5044, + "step": 10364 + }, + { + "epoch": 1.6815379623621025, + "grad_norm": 0.6399602178812103, + "learning_rate": 2.046034828465454e-06, + "loss": 0.5104, + "step": 10365 + }, + { + "epoch": 1.68170019467878, + "grad_norm": 0.6455912872778135, + "learning_rate": 2.0456149019874868e-06, + "loss": 0.545, + "step": 10366 + }, + { + "epoch": 1.6818624269954574, + "grad_norm": 0.584019958409863, + "learning_rate": 2.045194988767151e-06, + "loss": 0.5078, + "step": 10367 + }, + { + "epoch": 1.6820246593121349, + "grad_norm": 0.5855427873540268, + "learning_rate": 2.0447750888166958e-06, + "loss": 0.5186, + "step": 10368 + }, + { + "epoch": 1.6821868916288123, + "grad_norm": 0.6126460559602114, + "learning_rate": 2.044355202148375e-06, + "loss": 0.5085, + "step": 10369 + }, + { + "epoch": 1.68234912394549, + "grad_norm": 0.5799566825364235, + "learning_rate": 2.0439353287744383e-06, + "loss": 0.512, + "step": 10370 + }, + { + "epoch": 1.6825113562621674, + "grad_norm": 0.59989891653021, + "learning_rate": 2.0435154687071376e-06, + "loss": 0.5005, + "step": 10371 + }, + { + "epoch": 1.6826735885788449, + "grad_norm": 0.6042714775054969, + "learning_rate": 2.043095621958723e-06, + "loss": 0.498, + "step": 10372 + }, + { + "epoch": 1.6828358208955225, + "grad_norm": 0.6075130519497718, + "learning_rate": 2.042675788541444e-06, + "loss": 0.524, + "step": 10373 + }, + { + "epoch": 1.6829980532122, + "grad_norm": 0.6058917414234317, + "learning_rate": 2.0422559684675498e-06, + "loss": 0.5079, + "step": 10374 + }, + { + "epoch": 1.6831602855288774, + "grad_norm": 0.6071143980515796, + "learning_rate": 2.04183616174929e-06, + "loss": 0.5328, + "step": 10375 + }, + { + "epoch": 1.6833225178455549, + "grad_norm": 0.6115649336811924, + "learning_rate": 2.0414163683989124e-06, + "loss": 0.5493, + "step": 10376 + }, + { + "epoch": 1.6834847501622323, + "grad_norm": 0.6135213672967992, + "learning_rate": 2.0409965884286676e-06, + "loss": 0.5045, + "step": 10377 + }, + { + "epoch": 1.6836469824789098, + "grad_norm": 0.5843013775372234, + "learning_rate": 2.0405768218508015e-06, + "loss": 0.5379, + "step": 10378 + }, + { + "epoch": 1.6838092147955872, + "grad_norm": 0.5741039722335641, + "learning_rate": 2.0401570686775616e-06, + "loss": 0.4708, + "step": 10379 + }, + { + "epoch": 1.6839714471122647, + "grad_norm": 0.61839459077008, + "learning_rate": 2.039737328921196e-06, + "loss": 0.539, + "step": 10380 + }, + { + "epoch": 1.684133679428942, + "grad_norm": 0.6135227076454813, + "learning_rate": 2.0393176025939516e-06, + "loss": 0.5307, + "step": 10381 + }, + { + "epoch": 1.6842959117456198, + "grad_norm": 0.6291129616082333, + "learning_rate": 2.0388978897080754e-06, + "loss": 0.4613, + "step": 10382 + }, + { + "epoch": 1.6844581440622972, + "grad_norm": 0.5569633555159257, + "learning_rate": 2.038478190275811e-06, + "loss": 0.4772, + "step": 10383 + }, + { + "epoch": 1.6846203763789747, + "grad_norm": 0.6038417351168979, + "learning_rate": 2.0380585043094057e-06, + "loss": 0.5073, + "step": 10384 + }, + { + "epoch": 1.6847826086956523, + "grad_norm": 0.5911520822675836, + "learning_rate": 2.037638831821104e-06, + "loss": 0.5258, + "step": 10385 + }, + { + "epoch": 1.6849448410123298, + "grad_norm": 0.5945284840582091, + "learning_rate": 2.0372191728231524e-06, + "loss": 0.4839, + "step": 10386 + }, + { + "epoch": 1.6851070733290072, + "grad_norm": 0.6234151961644427, + "learning_rate": 2.0367995273277935e-06, + "loss": 0.5359, + "step": 10387 + }, + { + "epoch": 1.6852693056456847, + "grad_norm": 0.5809872910687585, + "learning_rate": 2.0363798953472713e-06, + "loss": 0.4983, + "step": 10388 + }, + { + "epoch": 1.6854315379623621, + "grad_norm": 0.5914962631698747, + "learning_rate": 2.0359602768938306e-06, + "loss": 0.5297, + "step": 10389 + }, + { + "epoch": 1.6855937702790396, + "grad_norm": 0.592538274725961, + "learning_rate": 2.0355406719797135e-06, + "loss": 0.4978, + "step": 10390 + }, + { + "epoch": 1.685756002595717, + "grad_norm": 0.6027449633758175, + "learning_rate": 2.035121080617164e-06, + "loss": 0.4908, + "step": 10391 + }, + { + "epoch": 1.6859182349123945, + "grad_norm": 0.5808877979285932, + "learning_rate": 2.0347015028184243e-06, + "loss": 0.5259, + "step": 10392 + }, + { + "epoch": 1.686080467229072, + "grad_norm": 0.5708188799511779, + "learning_rate": 2.0342819385957353e-06, + "loss": 0.4842, + "step": 10393 + }, + { + "epoch": 1.6862426995457496, + "grad_norm": 0.5962860277420562, + "learning_rate": 2.03386238796134e-06, + "loss": 0.5136, + "step": 10394 + }, + { + "epoch": 1.686404931862427, + "grad_norm": 0.56752108998662, + "learning_rate": 2.033442850927479e-06, + "loss": 0.4906, + "step": 10395 + }, + { + "epoch": 1.6865671641791045, + "grad_norm": 0.5828171096053225, + "learning_rate": 2.033023327506393e-06, + "loss": 0.5088, + "step": 10396 + }, + { + "epoch": 1.686729396495782, + "grad_norm": 0.625423172460867, + "learning_rate": 2.0326038177103235e-06, + "loss": 0.5194, + "step": 10397 + }, + { + "epoch": 1.6868916288124596, + "grad_norm": 0.6353201464625654, + "learning_rate": 2.0321843215515095e-06, + "loss": 0.5302, + "step": 10398 + }, + { + "epoch": 1.687053861129137, + "grad_norm": 0.559708474976567, + "learning_rate": 2.0317648390421905e-06, + "loss": 0.4976, + "step": 10399 + }, + { + "epoch": 1.6872160934458145, + "grad_norm": 0.5948718911421728, + "learning_rate": 2.031345370194607e-06, + "loss": 0.525, + "step": 10400 + }, + { + "epoch": 1.687378325762492, + "grad_norm": 0.5830097240857928, + "learning_rate": 2.0309259150209962e-06, + "loss": 0.548, + "step": 10401 + }, + { + "epoch": 1.6875405580791694, + "grad_norm": 0.617709906495333, + "learning_rate": 2.030506473533599e-06, + "loss": 0.5149, + "step": 10402 + }, + { + "epoch": 1.6877027903958468, + "grad_norm": 0.600173898555395, + "learning_rate": 2.030087045744652e-06, + "loss": 0.5233, + "step": 10403 + }, + { + "epoch": 1.6878650227125243, + "grad_norm": 0.5874223339651843, + "learning_rate": 2.0296676316663915e-06, + "loss": 0.5057, + "step": 10404 + }, + { + "epoch": 1.6880272550292017, + "grad_norm": 0.564544360657708, + "learning_rate": 2.029248231311057e-06, + "loss": 0.5112, + "step": 10405 + }, + { + "epoch": 1.6881894873458791, + "grad_norm": 0.6059998285754937, + "learning_rate": 2.0288288446908845e-06, + "loss": 0.5347, + "step": 10406 + }, + { + "epoch": 1.6883517196625568, + "grad_norm": 0.6123194904789301, + "learning_rate": 2.0284094718181102e-06, + "loss": 0.5343, + "step": 10407 + }, + { + "epoch": 1.6885139519792343, + "grad_norm": 0.6173353829266087, + "learning_rate": 2.0279901127049716e-06, + "loss": 0.5224, + "step": 10408 + }, + { + "epoch": 1.6886761842959117, + "grad_norm": 0.6033882144354097, + "learning_rate": 2.027570767363702e-06, + "loss": 0.4987, + "step": 10409 + }, + { + "epoch": 1.6888384166125894, + "grad_norm": 0.6278358523126686, + "learning_rate": 2.027151435806539e-06, + "loss": 0.5096, + "step": 10410 + }, + { + "epoch": 1.6890006489292668, + "grad_norm": 0.6143572655435329, + "learning_rate": 2.0267321180457155e-06, + "loss": 0.5121, + "step": 10411 + }, + { + "epoch": 1.6891628812459443, + "grad_norm": 0.6025413985973205, + "learning_rate": 2.026312814093467e-06, + "loss": 0.5496, + "step": 10412 + }, + { + "epoch": 1.6893251135626217, + "grad_norm": 0.6145911799821805, + "learning_rate": 2.0258935239620285e-06, + "loss": 0.489, + "step": 10413 + }, + { + "epoch": 1.6894873458792992, + "grad_norm": 0.5986906988591175, + "learning_rate": 2.0254742476636323e-06, + "loss": 0.5314, + "step": 10414 + }, + { + "epoch": 1.6896495781959766, + "grad_norm": 0.6244489397903629, + "learning_rate": 2.025054985210511e-06, + "loss": 0.5204, + "step": 10415 + }, + { + "epoch": 1.689811810512654, + "grad_norm": 0.600583253562359, + "learning_rate": 2.0246357366148994e-06, + "loss": 0.5326, + "step": 10416 + }, + { + "epoch": 1.6899740428293315, + "grad_norm": 0.5597296279106639, + "learning_rate": 2.024216501889028e-06, + "loss": 0.5191, + "step": 10417 + }, + { + "epoch": 1.690136275146009, + "grad_norm": 0.6091063906755061, + "learning_rate": 2.023797281045132e-06, + "loss": 0.5032, + "step": 10418 + }, + { + "epoch": 1.6902985074626866, + "grad_norm": 0.6236353850590416, + "learning_rate": 2.023378074095439e-06, + "loss": 0.5095, + "step": 10419 + }, + { + "epoch": 1.690460739779364, + "grad_norm": 0.6074844788322357, + "learning_rate": 2.0229588810521825e-06, + "loss": 0.4789, + "step": 10420 + }, + { + "epoch": 1.6906229720960415, + "grad_norm": 0.594672201112322, + "learning_rate": 2.0225397019275926e-06, + "loss": 0.5189, + "step": 10421 + }, + { + "epoch": 1.6907852044127192, + "grad_norm": 0.594921447973167, + "learning_rate": 2.0221205367339003e-06, + "loss": 0.5248, + "step": 10422 + }, + { + "epoch": 1.6909474367293966, + "grad_norm": 0.594180765975877, + "learning_rate": 2.0217013854833364e-06, + "loss": 0.5299, + "step": 10423 + }, + { + "epoch": 1.691109669046074, + "grad_norm": 0.6248588248139196, + "learning_rate": 2.021282248188128e-06, + "loss": 0.5097, + "step": 10424 + }, + { + "epoch": 1.6912719013627515, + "grad_norm": 0.6377669648936888, + "learning_rate": 2.0208631248605063e-06, + "loss": 0.487, + "step": 10425 + }, + { + "epoch": 1.691434133679429, + "grad_norm": 0.5903975896134517, + "learning_rate": 2.0204440155126996e-06, + "loss": 0.5109, + "step": 10426 + }, + { + "epoch": 1.6915963659961064, + "grad_norm": 0.6073928025460077, + "learning_rate": 2.0200249201569365e-06, + "loss": 0.5276, + "step": 10427 + }, + { + "epoch": 1.6917585983127839, + "grad_norm": 0.6040002131231405, + "learning_rate": 2.0196058388054452e-06, + "loss": 0.5249, + "step": 10428 + }, + { + "epoch": 1.6919208306294613, + "grad_norm": 0.6248024092315867, + "learning_rate": 2.0191867714704523e-06, + "loss": 0.5117, + "step": 10429 + }, + { + "epoch": 1.6920830629461387, + "grad_norm": 0.603134633392932, + "learning_rate": 2.018767718164185e-06, + "loss": 0.5281, + "step": 10430 + }, + { + "epoch": 1.6922452952628162, + "grad_norm": 0.5853201302504154, + "learning_rate": 2.018348678898871e-06, + "loss": 0.4873, + "step": 10431 + }, + { + "epoch": 1.6924075275794939, + "grad_norm": 0.6133911462626075, + "learning_rate": 2.017929653686736e-06, + "loss": 0.5134, + "step": 10432 + }, + { + "epoch": 1.6925697598961713, + "grad_norm": 0.5931037858997167, + "learning_rate": 2.017510642540007e-06, + "loss": 0.5081, + "step": 10433 + }, + { + "epoch": 1.6927319922128488, + "grad_norm": 0.5518939437152557, + "learning_rate": 2.0170916454709078e-06, + "loss": 0.5127, + "step": 10434 + }, + { + "epoch": 1.6928942245295264, + "grad_norm": 0.5695134903147395, + "learning_rate": 2.016672662491664e-06, + "loss": 0.5007, + "step": 10435 + }, + { + "epoch": 1.6930564568462039, + "grad_norm": 0.6090288123248101, + "learning_rate": 2.016253693614501e-06, + "loss": 0.4906, + "step": 10436 + }, + { + "epoch": 1.6932186891628813, + "grad_norm": 0.5697687637859006, + "learning_rate": 2.0158347388516423e-06, + "loss": 0.4999, + "step": 10437 + }, + { + "epoch": 1.6933809214795588, + "grad_norm": 0.633952473089111, + "learning_rate": 2.0154157982153134e-06, + "loss": 0.4912, + "step": 10438 + }, + { + "epoch": 1.6935431537962362, + "grad_norm": 0.6273516084879814, + "learning_rate": 2.0149968717177363e-06, + "loss": 0.4725, + "step": 10439 + }, + { + "epoch": 1.6937053861129137, + "grad_norm": 0.6502225257540171, + "learning_rate": 2.014577959371134e-06, + "loss": 0.498, + "step": 10440 + }, + { + "epoch": 1.693867618429591, + "grad_norm": 0.5979796369235575, + "learning_rate": 2.014159061187729e-06, + "loss": 0.4752, + "step": 10441 + }, + { + "epoch": 1.6940298507462686, + "grad_norm": 0.6282942026228475, + "learning_rate": 2.0137401771797445e-06, + "loss": 0.5241, + "step": 10442 + }, + { + "epoch": 1.694192083062946, + "grad_norm": 0.6017534460120513, + "learning_rate": 2.0133213073594026e-06, + "loss": 0.4943, + "step": 10443 + }, + { + "epoch": 1.6943543153796237, + "grad_norm": 0.5918583699664421, + "learning_rate": 2.0129024517389222e-06, + "loss": 0.5087, + "step": 10444 + }, + { + "epoch": 1.6945165476963011, + "grad_norm": 0.5790602180782648, + "learning_rate": 2.012483610330527e-06, + "loss": 0.5353, + "step": 10445 + }, + { + "epoch": 1.6946787800129786, + "grad_norm": 0.6435716449350274, + "learning_rate": 2.012064783146436e-06, + "loss": 0.54, + "step": 10446 + }, + { + "epoch": 1.6948410123296562, + "grad_norm": 0.6048536363788863, + "learning_rate": 2.0116459701988707e-06, + "loss": 0.4851, + "step": 10447 + }, + { + "epoch": 1.6950032446463337, + "grad_norm": 0.5715337523283848, + "learning_rate": 2.0112271715000488e-06, + "loss": 0.5191, + "step": 10448 + }, + { + "epoch": 1.6951654769630111, + "grad_norm": 0.6272097283675027, + "learning_rate": 2.0108083870621925e-06, + "loss": 0.5202, + "step": 10449 + }, + { + "epoch": 1.6953277092796886, + "grad_norm": 0.6102522602952776, + "learning_rate": 2.010389616897518e-06, + "loss": 0.5324, + "step": 10450 + }, + { + "epoch": 1.695489941596366, + "grad_norm": 0.6037189974255179, + "learning_rate": 2.009970861018245e-06, + "loss": 0.4993, + "step": 10451 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.6019591037219881, + "learning_rate": 2.0095521194365915e-06, + "loss": 0.5326, + "step": 10452 + }, + { + "epoch": 1.695814406229721, + "grad_norm": 0.5729718525031277, + "learning_rate": 2.0091333921647754e-06, + "loss": 0.5208, + "step": 10453 + }, + { + "epoch": 1.6959766385463984, + "grad_norm": 0.606945725171024, + "learning_rate": 2.008714679215014e-06, + "loss": 0.5356, + "step": 10454 + }, + { + "epoch": 1.6961388708630758, + "grad_norm": 0.5981444594073219, + "learning_rate": 2.008295980599523e-06, + "loss": 0.5167, + "step": 10455 + }, + { + "epoch": 1.6963011031797532, + "grad_norm": 0.6347989419677329, + "learning_rate": 2.00787729633052e-06, + "loss": 0.4956, + "step": 10456 + }, + { + "epoch": 1.696463335496431, + "grad_norm": 0.6108283416811288, + "learning_rate": 2.0074586264202193e-06, + "loss": 0.5232, + "step": 10457 + }, + { + "epoch": 1.6966255678131084, + "grad_norm": 0.6093952009036221, + "learning_rate": 2.0070399708808393e-06, + "loss": 0.5218, + "step": 10458 + }, + { + "epoch": 1.6967878001297858, + "grad_norm": 0.5945663171484068, + "learning_rate": 2.0066213297245937e-06, + "loss": 0.5214, + "step": 10459 + }, + { + "epoch": 1.6969500324464635, + "grad_norm": 0.6138271471944446, + "learning_rate": 2.006202702963696e-06, + "loss": 0.5145, + "step": 10460 + }, + { + "epoch": 1.697112264763141, + "grad_norm": 0.6299460789983098, + "learning_rate": 2.005784090610362e-06, + "loss": 0.5396, + "step": 10461 + }, + { + "epoch": 1.6972744970798184, + "grad_norm": 0.6422790213014485, + "learning_rate": 2.0053654926768044e-06, + "loss": 0.4796, + "step": 10462 + }, + { + "epoch": 1.6974367293964958, + "grad_norm": 0.561714496078477, + "learning_rate": 2.0049469091752383e-06, + "loss": 0.4843, + "step": 10463 + }, + { + "epoch": 1.6975989617131733, + "grad_norm": 0.611374008923924, + "learning_rate": 2.0045283401178764e-06, + "loss": 0.513, + "step": 10464 + }, + { + "epoch": 1.6977611940298507, + "grad_norm": 0.6049449312923155, + "learning_rate": 2.0041097855169305e-06, + "loss": 0.5118, + "step": 10465 + }, + { + "epoch": 1.6979234263465282, + "grad_norm": 0.6227765080529202, + "learning_rate": 2.003691245384612e-06, + "loss": 0.5131, + "step": 10466 + }, + { + "epoch": 1.6980856586632056, + "grad_norm": 0.5881915299266951, + "learning_rate": 2.0032727197331347e-06, + "loss": 0.5225, + "step": 10467 + }, + { + "epoch": 1.698247890979883, + "grad_norm": 0.6212406601614849, + "learning_rate": 2.0028542085747087e-06, + "loss": 0.4973, + "step": 10468 + }, + { + "epoch": 1.6984101232965607, + "grad_norm": 0.5726084293750694, + "learning_rate": 2.002435711921546e-06, + "loss": 0.5347, + "step": 10469 + }, + { + "epoch": 1.6985723556132382, + "grad_norm": 0.6160007333277653, + "learning_rate": 2.002017229785856e-06, + "loss": 0.4922, + "step": 10470 + }, + { + "epoch": 1.6987345879299156, + "grad_norm": 0.6207431806413105, + "learning_rate": 2.001598762179849e-06, + "loss": 0.5186, + "step": 10471 + }, + { + "epoch": 1.6988968202465933, + "grad_norm": 0.6131706062544616, + "learning_rate": 2.001180309115735e-06, + "loss": 0.528, + "step": 10472 + }, + { + "epoch": 1.6990590525632707, + "grad_norm": 0.5785742520793857, + "learning_rate": 2.0007618706057225e-06, + "loss": 0.5016, + "step": 10473 + }, + { + "epoch": 1.6992212848799482, + "grad_norm": 0.5695548963511866, + "learning_rate": 2.0003434466620227e-06, + "loss": 0.5129, + "step": 10474 + }, + { + "epoch": 1.6993835171966256, + "grad_norm": 0.6315243971243875, + "learning_rate": 1.9999250372968404e-06, + "loss": 0.5071, + "step": 10475 + }, + { + "epoch": 1.699545749513303, + "grad_norm": 0.6046311910449793, + "learning_rate": 1.999506642522386e-06, + "loss": 0.5193, + "step": 10476 + }, + { + "epoch": 1.6997079818299805, + "grad_norm": 0.6300245240942935, + "learning_rate": 1.9990882623508656e-06, + "loss": 0.5375, + "step": 10477 + }, + { + "epoch": 1.699870214146658, + "grad_norm": 0.596022261181638, + "learning_rate": 1.9986698967944878e-06, + "loss": 0.4885, + "step": 10478 + }, + { + "epoch": 1.7000324464633354, + "grad_norm": 0.5957707174821621, + "learning_rate": 1.9982515458654596e-06, + "loss": 0.5127, + "step": 10479 + }, + { + "epoch": 1.7001946787800128, + "grad_norm": 0.6355431523530598, + "learning_rate": 1.9978332095759843e-06, + "loss": 0.5038, + "step": 10480 + }, + { + "epoch": 1.7003569110966905, + "grad_norm": 0.5840927531673735, + "learning_rate": 1.997414887938271e-06, + "loss": 0.5153, + "step": 10481 + }, + { + "epoch": 1.700519143413368, + "grad_norm": 0.5991789301239999, + "learning_rate": 1.9969965809645223e-06, + "loss": 0.5089, + "step": 10482 + }, + { + "epoch": 1.7006813757300454, + "grad_norm": 0.605953229172655, + "learning_rate": 1.996578288666946e-06, + "loss": 0.5298, + "step": 10483 + }, + { + "epoch": 1.7008436080467229, + "grad_norm": 0.6081161847436236, + "learning_rate": 1.996160011057746e-06, + "loss": 0.5323, + "step": 10484 + }, + { + "epoch": 1.7010058403634005, + "grad_norm": 0.59028482108232, + "learning_rate": 1.995741748149124e-06, + "loss": 0.5116, + "step": 10485 + }, + { + "epoch": 1.701168072680078, + "grad_norm": 0.5987418124229841, + "learning_rate": 1.995323499953286e-06, + "loss": 0.5648, + "step": 10486 + }, + { + "epoch": 1.7013303049967554, + "grad_norm": 0.587802029488385, + "learning_rate": 1.9949052664824352e-06, + "loss": 0.5185, + "step": 10487 + }, + { + "epoch": 1.7014925373134329, + "grad_norm": 0.6424197016977449, + "learning_rate": 1.994487047748773e-06, + "loss": 0.5017, + "step": 10488 + }, + { + "epoch": 1.7016547696301103, + "grad_norm": 0.622761328440621, + "learning_rate": 1.994068843764504e-06, + "loss": 0.5057, + "step": 10489 + }, + { + "epoch": 1.7018170019467878, + "grad_norm": 0.635059028359695, + "learning_rate": 1.993650654541828e-06, + "loss": 0.491, + "step": 10490 + }, + { + "epoch": 1.7019792342634652, + "grad_norm": 0.6674301074864221, + "learning_rate": 1.993232480092947e-06, + "loss": 0.5313, + "step": 10491 + }, + { + "epoch": 1.7021414665801426, + "grad_norm": 0.586833763800073, + "learning_rate": 1.9928143204300633e-06, + "loss": 0.5091, + "step": 10492 + }, + { + "epoch": 1.70230369889682, + "grad_norm": 0.5784876080214909, + "learning_rate": 1.992396175565376e-06, + "loss": 0.4932, + "step": 10493 + }, + { + "epoch": 1.7024659312134978, + "grad_norm": 0.6058212644156822, + "learning_rate": 1.9919780455110866e-06, + "loss": 0.4992, + "step": 10494 + }, + { + "epoch": 1.7026281635301752, + "grad_norm": 0.6084113656722991, + "learning_rate": 1.991559930279395e-06, + "loss": 0.5229, + "step": 10495 + }, + { + "epoch": 1.7027903958468527, + "grad_norm": 0.6134644920478274, + "learning_rate": 1.991141829882499e-06, + "loss": 0.5078, + "step": 10496 + }, + { + "epoch": 1.7029526281635303, + "grad_norm": 0.6406010925861846, + "learning_rate": 1.9907237443325993e-06, + "loss": 0.5278, + "step": 10497 + }, + { + "epoch": 1.7031148604802078, + "grad_norm": 0.5909632393011875, + "learning_rate": 1.990305673641894e-06, + "loss": 0.4934, + "step": 10498 + }, + { + "epoch": 1.7032770927968852, + "grad_norm": 0.6256367210450674, + "learning_rate": 1.98988761782258e-06, + "loss": 0.5012, + "step": 10499 + }, + { + "epoch": 1.7034393251135627, + "grad_norm": 0.6065465121883035, + "learning_rate": 1.989469576886857e-06, + "loss": 0.5274, + "step": 10500 + }, + { + "epoch": 1.70360155743024, + "grad_norm": 0.5804728345038512, + "learning_rate": 1.9890515508469202e-06, + "loss": 0.516, + "step": 10501 + }, + { + "epoch": 1.7037637897469176, + "grad_norm": 0.5840125342587719, + "learning_rate": 1.988633539714967e-06, + "loss": 0.544, + "step": 10502 + }, + { + "epoch": 1.703926022063595, + "grad_norm": 0.6152550787123886, + "learning_rate": 1.9882155435031944e-06, + "loss": 0.5037, + "step": 10503 + }, + { + "epoch": 1.7040882543802724, + "grad_norm": 0.5863360654518417, + "learning_rate": 1.9877975622237976e-06, + "loss": 0.5091, + "step": 10504 + }, + { + "epoch": 1.70425048669695, + "grad_norm": 0.6076009103322284, + "learning_rate": 1.987379595888973e-06, + "loss": 0.4964, + "step": 10505 + }, + { + "epoch": 1.7044127190136276, + "grad_norm": 0.6013734599565322, + "learning_rate": 1.9869616445109146e-06, + "loss": 0.4829, + "step": 10506 + }, + { + "epoch": 1.704574951330305, + "grad_norm": 0.5744170137119617, + "learning_rate": 1.9865437081018173e-06, + "loss": 0.5098, + "step": 10507 + }, + { + "epoch": 1.7047371836469825, + "grad_norm": 0.5996200514285718, + "learning_rate": 1.9861257866738753e-06, + "loss": 0.5089, + "step": 10508 + }, + { + "epoch": 1.7048994159636601, + "grad_norm": 0.5902371698284467, + "learning_rate": 1.9857078802392823e-06, + "loss": 0.5129, + "step": 10509 + }, + { + "epoch": 1.7050616482803376, + "grad_norm": 0.5897231174692018, + "learning_rate": 1.985289988810233e-06, + "loss": 0.5399, + "step": 10510 + }, + { + "epoch": 1.705223880597015, + "grad_norm": 0.5885262646574996, + "learning_rate": 1.9848721123989173e-06, + "loss": 0.5083, + "step": 10511 + }, + { + "epoch": 1.7053861129136925, + "grad_norm": 0.5702056898328954, + "learning_rate": 1.98445425101753e-06, + "loss": 0.4865, + "step": 10512 + }, + { + "epoch": 1.70554834523037, + "grad_norm": 0.5990136959312036, + "learning_rate": 1.984036404678262e-06, + "loss": 0.5234, + "step": 10513 + }, + { + "epoch": 1.7057105775470474, + "grad_norm": 0.6189450417360803, + "learning_rate": 1.983618573393305e-06, + "loss": 0.5222, + "step": 10514 + }, + { + "epoch": 1.7058728098637248, + "grad_norm": 0.6118101080258342, + "learning_rate": 1.9832007571748513e-06, + "loss": 0.5403, + "step": 10515 + }, + { + "epoch": 1.7060350421804023, + "grad_norm": 0.5905141894425856, + "learning_rate": 1.982782956035089e-06, + "loss": 0.5039, + "step": 10516 + }, + { + "epoch": 1.7061972744970797, + "grad_norm": 0.6119544589501267, + "learning_rate": 1.982365169986211e-06, + "loss": 0.5188, + "step": 10517 + }, + { + "epoch": 1.7063595068137571, + "grad_norm": 0.6168716956496203, + "learning_rate": 1.981947399040405e-06, + "loss": 0.5342, + "step": 10518 + }, + { + "epoch": 1.7065217391304348, + "grad_norm": 0.6074206321500054, + "learning_rate": 1.981529643209862e-06, + "loss": 0.5138, + "step": 10519 + }, + { + "epoch": 1.7066839714471123, + "grad_norm": 0.6151857464450532, + "learning_rate": 1.9811119025067702e-06, + "loss": 0.5605, + "step": 10520 + }, + { + "epoch": 1.7068462037637897, + "grad_norm": 0.5859474951473738, + "learning_rate": 1.9806941769433177e-06, + "loss": 0.5436, + "step": 10521 + }, + { + "epoch": 1.7070084360804674, + "grad_norm": 0.5758642132822799, + "learning_rate": 1.980276466531692e-06, + "loss": 0.4616, + "step": 10522 + }, + { + "epoch": 1.7071706683971448, + "grad_norm": 0.6139978154481333, + "learning_rate": 1.979858771284083e-06, + "loss": 0.5314, + "step": 10523 + }, + { + "epoch": 1.7073329007138223, + "grad_norm": 0.5716499176097436, + "learning_rate": 1.979441091212675e-06, + "loss": 0.4899, + "step": 10524 + }, + { + "epoch": 1.7074951330304997, + "grad_norm": 0.6080718231694456, + "learning_rate": 1.979023426329657e-06, + "loss": 0.5061, + "step": 10525 + }, + { + "epoch": 1.7076573653471772, + "grad_norm": 0.592581438527118, + "learning_rate": 1.9786057766472134e-06, + "loss": 0.493, + "step": 10526 + }, + { + "epoch": 1.7078195976638546, + "grad_norm": 0.5892030734301443, + "learning_rate": 1.978188142177531e-06, + "loss": 0.524, + "step": 10527 + }, + { + "epoch": 1.707981829980532, + "grad_norm": 0.6069633081533201, + "learning_rate": 1.9777705229327954e-06, + "loss": 0.4973, + "step": 10528 + }, + { + "epoch": 1.7081440622972095, + "grad_norm": 0.634000328502927, + "learning_rate": 1.97735291892519e-06, + "loss": 0.5281, + "step": 10529 + }, + { + "epoch": 1.708306294613887, + "grad_norm": 0.5765914123544663, + "learning_rate": 1.976935330166902e-06, + "loss": 0.521, + "step": 10530 + }, + { + "epoch": 1.7084685269305646, + "grad_norm": 0.6085273996388754, + "learning_rate": 1.976517756670113e-06, + "loss": 0.5036, + "step": 10531 + }, + { + "epoch": 1.708630759247242, + "grad_norm": 0.6086657017752171, + "learning_rate": 1.976100198447007e-06, + "loss": 0.4883, + "step": 10532 + }, + { + "epoch": 1.7087929915639195, + "grad_norm": 0.6184944894963775, + "learning_rate": 1.9756826555097676e-06, + "loss": 0.5342, + "step": 10533 + }, + { + "epoch": 1.7089552238805972, + "grad_norm": 0.5926842102065368, + "learning_rate": 1.9752651278705777e-06, + "loss": 0.5051, + "step": 10534 + }, + { + "epoch": 1.7091174561972746, + "grad_norm": 0.5883476502535908, + "learning_rate": 1.9748476155416197e-06, + "loss": 0.5202, + "step": 10535 + }, + { + "epoch": 1.709279688513952, + "grad_norm": 0.5833545541562186, + "learning_rate": 1.9744301185350736e-06, + "loss": 0.503, + "step": 10536 + }, + { + "epoch": 1.7094419208306295, + "grad_norm": 0.5984165132704004, + "learning_rate": 1.974012636863123e-06, + "loss": 0.4865, + "step": 10537 + }, + { + "epoch": 1.709604153147307, + "grad_norm": 0.5733047826606821, + "learning_rate": 1.973595170537947e-06, + "loss": 0.512, + "step": 10538 + }, + { + "epoch": 1.7097663854639844, + "grad_norm": 0.5994589882195659, + "learning_rate": 1.9731777195717275e-06, + "loss": 0.5123, + "step": 10539 + }, + { + "epoch": 1.7099286177806619, + "grad_norm": 0.5867182063479941, + "learning_rate": 1.9727602839766434e-06, + "loss": 0.5261, + "step": 10540 + }, + { + "epoch": 1.7100908500973393, + "grad_norm": 0.6151887839438008, + "learning_rate": 1.9723428637648757e-06, + "loss": 0.5329, + "step": 10541 + }, + { + "epoch": 1.7102530824140167, + "grad_norm": 0.6251747999978077, + "learning_rate": 1.971925458948602e-06, + "loss": 0.4864, + "step": 10542 + }, + { + "epoch": 1.7104153147306942, + "grad_norm": 0.6749547711108203, + "learning_rate": 1.971508069540001e-06, + "loss": 0.4746, + "step": 10543 + }, + { + "epoch": 1.7105775470473719, + "grad_norm": 0.5925043376371192, + "learning_rate": 1.9710906955512515e-06, + "loss": 0.496, + "step": 10544 + }, + { + "epoch": 1.7107397793640493, + "grad_norm": 0.5901517103653618, + "learning_rate": 1.9706733369945318e-06, + "loss": 0.5126, + "step": 10545 + }, + { + "epoch": 1.7109020116807268, + "grad_norm": 0.5943589563989266, + "learning_rate": 1.9702559938820187e-06, + "loss": 0.5038, + "step": 10546 + }, + { + "epoch": 1.7110642439974044, + "grad_norm": 0.6410894993702275, + "learning_rate": 1.9698386662258876e-06, + "loss": 0.4619, + "step": 10547 + }, + { + "epoch": 1.7112264763140819, + "grad_norm": 0.6064151345011461, + "learning_rate": 1.9694213540383176e-06, + "loss": 0.5036, + "step": 10548 + }, + { + "epoch": 1.7113887086307593, + "grad_norm": 0.6087948585640961, + "learning_rate": 1.9690040573314824e-06, + "loss": 0.4907, + "step": 10549 + }, + { + "epoch": 1.7115509409474368, + "grad_norm": 0.6123081420062917, + "learning_rate": 1.9685867761175584e-06, + "loss": 0.5347, + "step": 10550 + }, + { + "epoch": 1.7117131732641142, + "grad_norm": 0.5858054661214205, + "learning_rate": 1.9681695104087216e-06, + "loss": 0.498, + "step": 10551 + }, + { + "epoch": 1.7118754055807917, + "grad_norm": 0.6033974643996985, + "learning_rate": 1.967752260217144e-06, + "loss": 0.5456, + "step": 10552 + }, + { + "epoch": 1.712037637897469, + "grad_norm": 0.5945280005321155, + "learning_rate": 1.9673350255550023e-06, + "loss": 0.4681, + "step": 10553 + }, + { + "epoch": 1.7121998702141465, + "grad_norm": 0.6286743183020255, + "learning_rate": 1.966917806434469e-06, + "loss": 0.5325, + "step": 10554 + }, + { + "epoch": 1.712362102530824, + "grad_norm": 0.6277550968357813, + "learning_rate": 1.9665006028677175e-06, + "loss": 0.4856, + "step": 10555 + }, + { + "epoch": 1.7125243348475017, + "grad_norm": 0.5921827570547, + "learning_rate": 1.9660834148669216e-06, + "loss": 0.5004, + "step": 10556 + }, + { + "epoch": 1.712686567164179, + "grad_norm": 0.6497416503057326, + "learning_rate": 1.9656662424442517e-06, + "loss": 0.5139, + "step": 10557 + }, + { + "epoch": 1.7128487994808566, + "grad_norm": 0.5940812070742555, + "learning_rate": 1.9652490856118804e-06, + "loss": 0.5496, + "step": 10558 + }, + { + "epoch": 1.7130110317975342, + "grad_norm": 0.5882439385298917, + "learning_rate": 1.96483194438198e-06, + "loss": 0.4947, + "step": 10559 + }, + { + "epoch": 1.7131732641142117, + "grad_norm": 0.5893021168943398, + "learning_rate": 1.96441481876672e-06, + "loss": 0.5139, + "step": 10560 + }, + { + "epoch": 1.7133354964308891, + "grad_norm": 0.5874919903641372, + "learning_rate": 1.9639977087782733e-06, + "loss": 0.4833, + "step": 10561 + }, + { + "epoch": 1.7134977287475666, + "grad_norm": 0.6085637299145196, + "learning_rate": 1.963580614428808e-06, + "loss": 0.5165, + "step": 10562 + }, + { + "epoch": 1.713659961064244, + "grad_norm": 0.5903007512158226, + "learning_rate": 1.9631635357304927e-06, + "loss": 0.5277, + "step": 10563 + }, + { + "epoch": 1.7138221933809215, + "grad_norm": 0.5577287899610035, + "learning_rate": 1.962746472695499e-06, + "loss": 0.5212, + "step": 10564 + }, + { + "epoch": 1.713984425697599, + "grad_norm": 0.6154639359822626, + "learning_rate": 1.9623294253359936e-06, + "loss": 0.484, + "step": 10565 + }, + { + "epoch": 1.7141466580142763, + "grad_norm": 0.6385438643854316, + "learning_rate": 1.9619123936641473e-06, + "loss": 0.5342, + "step": 10566 + }, + { + "epoch": 1.7143088903309538, + "grad_norm": 0.6195961668297092, + "learning_rate": 1.961495377692125e-06, + "loss": 0.5196, + "step": 10567 + }, + { + "epoch": 1.7144711226476315, + "grad_norm": 0.6014037087665582, + "learning_rate": 1.9610783774320956e-06, + "loss": 0.5407, + "step": 10568 + }, + { + "epoch": 1.714633354964309, + "grad_norm": 0.6115082260326011, + "learning_rate": 1.9606613928962247e-06, + "loss": 0.5373, + "step": 10569 + }, + { + "epoch": 1.7147955872809864, + "grad_norm": 0.6103398031437841, + "learning_rate": 1.9602444240966804e-06, + "loss": 0.5336, + "step": 10570 + }, + { + "epoch": 1.7149578195976638, + "grad_norm": 0.6191817996307316, + "learning_rate": 1.9598274710456285e-06, + "loss": 0.4763, + "step": 10571 + }, + { + "epoch": 1.7151200519143415, + "grad_norm": 0.5779956970164899, + "learning_rate": 1.959410533755232e-06, + "loss": 0.5198, + "step": 10572 + }, + { + "epoch": 1.715282284231019, + "grad_norm": 0.6175612609482367, + "learning_rate": 1.9589936122376587e-06, + "loss": 0.5435, + "step": 10573 + }, + { + "epoch": 1.7154445165476964, + "grad_norm": 0.602319899779068, + "learning_rate": 1.9585767065050715e-06, + "loss": 0.504, + "step": 10574 + }, + { + "epoch": 1.7156067488643738, + "grad_norm": 0.5940616161392354, + "learning_rate": 1.9581598165696357e-06, + "loss": 0.5085, + "step": 10575 + }, + { + "epoch": 1.7157689811810513, + "grad_norm": 0.5939265568427812, + "learning_rate": 1.957742942443515e-06, + "loss": 0.524, + "step": 10576 + }, + { + "epoch": 1.7159312134977287, + "grad_norm": 0.6338589215982323, + "learning_rate": 1.9573260841388707e-06, + "loss": 0.5366, + "step": 10577 + }, + { + "epoch": 1.7160934458144061, + "grad_norm": 0.6106952579097579, + "learning_rate": 1.956909241667867e-06, + "loss": 0.5295, + "step": 10578 + }, + { + "epoch": 1.7162556781310836, + "grad_norm": 0.6111215751714032, + "learning_rate": 1.9564924150426663e-06, + "loss": 0.4784, + "step": 10579 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.6143215425842457, + "learning_rate": 1.956075604275429e-06, + "loss": 0.5052, + "step": 10580 + }, + { + "epoch": 1.7165801427644387, + "grad_norm": 0.5673493831035298, + "learning_rate": 1.9556588093783185e-06, + "loss": 0.508, + "step": 10581 + }, + { + "epoch": 1.7167423750811162, + "grad_norm": 0.6422230394727302, + "learning_rate": 1.955242030363494e-06, + "loss": 0.5052, + "step": 10582 + }, + { + "epoch": 1.7169046073977936, + "grad_norm": 0.6070205447576393, + "learning_rate": 1.954825267243116e-06, + "loss": 0.5112, + "step": 10583 + }, + { + "epoch": 1.7170668397144713, + "grad_norm": 0.6199001927232823, + "learning_rate": 1.9544085200293457e-06, + "loss": 0.5095, + "step": 10584 + }, + { + "epoch": 1.7172290720311487, + "grad_norm": 0.5877437056424835, + "learning_rate": 1.953991788734341e-06, + "loss": 0.514, + "step": 10585 + }, + { + "epoch": 1.7173913043478262, + "grad_norm": 0.6226125870676381, + "learning_rate": 1.953575073370262e-06, + "loss": 0.5102, + "step": 10586 + }, + { + "epoch": 1.7175535366645036, + "grad_norm": 0.6483326086298353, + "learning_rate": 1.953158373949267e-06, + "loss": 0.533, + "step": 10587 + }, + { + "epoch": 1.717715768981181, + "grad_norm": 0.6547009503185476, + "learning_rate": 1.9527416904835133e-06, + "loss": 0.5009, + "step": 10588 + }, + { + "epoch": 1.7178780012978585, + "grad_norm": 0.5540923179831069, + "learning_rate": 1.9523250229851597e-06, + "loss": 0.4942, + "step": 10589 + }, + { + "epoch": 1.718040233614536, + "grad_norm": 0.6280610646050045, + "learning_rate": 1.951908371466362e-06, + "loss": 0.5201, + "step": 10590 + }, + { + "epoch": 1.7182024659312134, + "grad_norm": 0.6066506213106532, + "learning_rate": 1.9514917359392783e-06, + "loss": 0.5194, + "step": 10591 + }, + { + "epoch": 1.7183646982478908, + "grad_norm": 0.6074462223134126, + "learning_rate": 1.951075116416065e-06, + "loss": 0.4994, + "step": 10592 + }, + { + "epoch": 1.7185269305645685, + "grad_norm": 0.6004995551333101, + "learning_rate": 1.950658512908876e-06, + "loss": 0.5234, + "step": 10593 + }, + { + "epoch": 1.718689162881246, + "grad_norm": 0.5959016193503002, + "learning_rate": 1.9502419254298674e-06, + "loss": 0.5121, + "step": 10594 + }, + { + "epoch": 1.7188513951979234, + "grad_norm": 0.6020303093120035, + "learning_rate": 1.9498253539911945e-06, + "loss": 0.5082, + "step": 10595 + }, + { + "epoch": 1.719013627514601, + "grad_norm": 0.5720090827370478, + "learning_rate": 1.949408798605011e-06, + "loss": 0.5194, + "step": 10596 + }, + { + "epoch": 1.7191758598312785, + "grad_norm": 0.627268365068001, + "learning_rate": 1.948992259283472e-06, + "loss": 0.4924, + "step": 10597 + }, + { + "epoch": 1.719338092147956, + "grad_norm": 0.6400447632774032, + "learning_rate": 1.9485757360387297e-06, + "loss": 0.4784, + "step": 10598 + }, + { + "epoch": 1.7195003244646334, + "grad_norm": 0.6031901704811519, + "learning_rate": 1.9481592288829367e-06, + "loss": 0.4856, + "step": 10599 + }, + { + "epoch": 1.7196625567813109, + "grad_norm": 0.6151129560960211, + "learning_rate": 1.947742737828246e-06, + "loss": 0.5322, + "step": 10600 + }, + { + "epoch": 1.7198247890979883, + "grad_norm": 0.6288136932748655, + "learning_rate": 1.9473262628868098e-06, + "loss": 0.5696, + "step": 10601 + }, + { + "epoch": 1.7199870214146658, + "grad_norm": 0.6022391962928255, + "learning_rate": 1.946909804070781e-06, + "loss": 0.5022, + "step": 10602 + }, + { + "epoch": 1.7201492537313432, + "grad_norm": 0.6063215003064331, + "learning_rate": 1.9464933613923074e-06, + "loss": 0.5267, + "step": 10603 + }, + { + "epoch": 1.7203114860480206, + "grad_norm": 0.6158263999167257, + "learning_rate": 1.9460769348635423e-06, + "loss": 0.4934, + "step": 10604 + }, + { + "epoch": 1.720473718364698, + "grad_norm": 0.6119382964663432, + "learning_rate": 1.945660524496634e-06, + "loss": 0.5294, + "step": 10605 + }, + { + "epoch": 1.7206359506813758, + "grad_norm": 0.6081321577946511, + "learning_rate": 1.9452441303037335e-06, + "loss": 0.5048, + "step": 10606 + }, + { + "epoch": 1.7207981829980532, + "grad_norm": 0.6056080420623698, + "learning_rate": 1.9448277522969895e-06, + "loss": 0.5415, + "step": 10607 + }, + { + "epoch": 1.7209604153147307, + "grad_norm": 0.6247714357124159, + "learning_rate": 1.9444113904885504e-06, + "loss": 0.501, + "step": 10608 + }, + { + "epoch": 1.7211226476314083, + "grad_norm": 0.6533834424675928, + "learning_rate": 1.9439950448905647e-06, + "loss": 0.5375, + "step": 10609 + }, + { + "epoch": 1.7212848799480858, + "grad_norm": 0.6354350313394146, + "learning_rate": 1.9435787155151797e-06, + "loss": 0.5381, + "step": 10610 + }, + { + "epoch": 1.7214471122647632, + "grad_norm": 0.6058642416925923, + "learning_rate": 1.9431624023745436e-06, + "loss": 0.5121, + "step": 10611 + }, + { + "epoch": 1.7216093445814407, + "grad_norm": 0.6367112020086634, + "learning_rate": 1.9427461054808038e-06, + "loss": 0.5224, + "step": 10612 + }, + { + "epoch": 1.721771576898118, + "grad_norm": 0.6734118761816961, + "learning_rate": 1.9423298248461044e-06, + "loss": 0.5267, + "step": 10613 + }, + { + "epoch": 1.7219338092147956, + "grad_norm": 0.6253752286979309, + "learning_rate": 1.9419135604825916e-06, + "loss": 0.4873, + "step": 10614 + }, + { + "epoch": 1.722096041531473, + "grad_norm": 0.6247120192328668, + "learning_rate": 1.9414973124024126e-06, + "loss": 0.5211, + "step": 10615 + }, + { + "epoch": 1.7222582738481504, + "grad_norm": 0.6114436682985446, + "learning_rate": 1.9410810806177105e-06, + "loss": 0.4923, + "step": 10616 + }, + { + "epoch": 1.722420506164828, + "grad_norm": 0.6266559635795943, + "learning_rate": 1.940664865140632e-06, + "loss": 0.5185, + "step": 10617 + }, + { + "epoch": 1.7225827384815056, + "grad_norm": 0.6112031338006977, + "learning_rate": 1.9402486659833183e-06, + "loss": 0.4641, + "step": 10618 + }, + { + "epoch": 1.722744970798183, + "grad_norm": 0.6169385972292462, + "learning_rate": 1.939832483157914e-06, + "loss": 0.5244, + "step": 10619 + }, + { + "epoch": 1.7229072031148605, + "grad_norm": 0.588373868256728, + "learning_rate": 1.9394163166765624e-06, + "loss": 0.496, + "step": 10620 + }, + { + "epoch": 1.7230694354315381, + "grad_norm": 0.6113807838364954, + "learning_rate": 1.9390001665514057e-06, + "loss": 0.5191, + "step": 10621 + }, + { + "epoch": 1.7232316677482156, + "grad_norm": 0.6115010213847114, + "learning_rate": 1.9385840327945873e-06, + "loss": 0.5146, + "step": 10622 + }, + { + "epoch": 1.723393900064893, + "grad_norm": 0.5981717285030419, + "learning_rate": 1.9381679154182463e-06, + "loss": 0.5144, + "step": 10623 + }, + { + "epoch": 1.7235561323815705, + "grad_norm": 0.6059263371406464, + "learning_rate": 1.937751814434525e-06, + "loss": 0.5176, + "step": 10624 + }, + { + "epoch": 1.723718364698248, + "grad_norm": 0.6735670400920217, + "learning_rate": 1.9373357298555647e-06, + "loss": 0.5429, + "step": 10625 + }, + { + "epoch": 1.7238805970149254, + "grad_norm": 0.58726996253836, + "learning_rate": 1.9369196616935045e-06, + "loss": 0.5318, + "step": 10626 + }, + { + "epoch": 1.7240428293316028, + "grad_norm": 0.6000003327999202, + "learning_rate": 1.9365036099604853e-06, + "loss": 0.5048, + "step": 10627 + }, + { + "epoch": 1.7242050616482802, + "grad_norm": 0.6431432661792955, + "learning_rate": 1.936087574668644e-06, + "loss": 0.5354, + "step": 10628 + }, + { + "epoch": 1.7243672939649577, + "grad_norm": 0.6037382088262747, + "learning_rate": 1.9356715558301213e-06, + "loss": 0.4912, + "step": 10629 + }, + { + "epoch": 1.7245295262816351, + "grad_norm": 0.6184528471490692, + "learning_rate": 1.9352555534570547e-06, + "loss": 0.5464, + "step": 10630 + }, + { + "epoch": 1.7246917585983128, + "grad_norm": 0.6124074677931718, + "learning_rate": 1.934839567561582e-06, + "loss": 0.4956, + "step": 10631 + }, + { + "epoch": 1.7248539909149903, + "grad_norm": 0.594587765643979, + "learning_rate": 1.9344235981558417e-06, + "loss": 0.5285, + "step": 10632 + }, + { + "epoch": 1.7250162232316677, + "grad_norm": 0.586362648330674, + "learning_rate": 1.934007645251968e-06, + "loss": 0.5278, + "step": 10633 + }, + { + "epoch": 1.7251784555483454, + "grad_norm": 0.6346858233696197, + "learning_rate": 1.933591708862099e-06, + "loss": 0.5367, + "step": 10634 + }, + { + "epoch": 1.7253406878650228, + "grad_norm": 0.5768461826139633, + "learning_rate": 1.93317578899837e-06, + "loss": 0.4998, + "step": 10635 + }, + { + "epoch": 1.7255029201817003, + "grad_norm": 0.6211111493696266, + "learning_rate": 1.9327598856729164e-06, + "loss": 0.5412, + "step": 10636 + }, + { + "epoch": 1.7256651524983777, + "grad_norm": 0.6035265857370078, + "learning_rate": 1.932343998897873e-06, + "loss": 0.5255, + "step": 10637 + }, + { + "epoch": 1.7258273848150552, + "grad_norm": 0.6452136243304954, + "learning_rate": 1.931928128685375e-06, + "loss": 0.5606, + "step": 10638 + }, + { + "epoch": 1.7259896171317326, + "grad_norm": 0.5930159195675911, + "learning_rate": 1.9315122750475548e-06, + "loss": 0.5065, + "step": 10639 + }, + { + "epoch": 1.72615184944841, + "grad_norm": 0.61359056354601, + "learning_rate": 1.9310964379965467e-06, + "loss": 0.503, + "step": 10640 + }, + { + "epoch": 1.7263140817650875, + "grad_norm": 0.6034011265410079, + "learning_rate": 1.930680617544483e-06, + "loss": 0.5188, + "step": 10641 + }, + { + "epoch": 1.726476314081765, + "grad_norm": 0.6207997781878566, + "learning_rate": 1.930264813703497e-06, + "loss": 0.5074, + "step": 10642 + }, + { + "epoch": 1.7266385463984426, + "grad_norm": 0.5960920145079509, + "learning_rate": 1.9298490264857203e-06, + "loss": 0.5267, + "step": 10643 + }, + { + "epoch": 1.72680077871512, + "grad_norm": 0.6258844043393536, + "learning_rate": 1.9294332559032837e-06, + "loss": 0.496, + "step": 10644 + }, + { + "epoch": 1.7269630110317975, + "grad_norm": 0.5926414988688328, + "learning_rate": 1.9290175019683187e-06, + "loss": 0.5025, + "step": 10645 + }, + { + "epoch": 1.7271252433484752, + "grad_norm": 0.598982575210554, + "learning_rate": 1.9286017646929556e-06, + "loss": 0.5245, + "step": 10646 + }, + { + "epoch": 1.7272874756651526, + "grad_norm": 0.6317607972899266, + "learning_rate": 1.9281860440893253e-06, + "loss": 0.5007, + "step": 10647 + }, + { + "epoch": 1.72744970798183, + "grad_norm": 0.6300510619809876, + "learning_rate": 1.927770340169557e-06, + "loss": 0.5252, + "step": 10648 + }, + { + "epoch": 1.7276119402985075, + "grad_norm": 0.6862869494979804, + "learning_rate": 1.9273546529457786e-06, + "loss": 0.5272, + "step": 10649 + }, + { + "epoch": 1.727774172615185, + "grad_norm": 0.5878326526092355, + "learning_rate": 1.926938982430119e-06, + "loss": 0.5017, + "step": 10650 + }, + { + "epoch": 1.7279364049318624, + "grad_norm": 0.6410115727962591, + "learning_rate": 1.926523328634707e-06, + "loss": 0.5026, + "step": 10651 + }, + { + "epoch": 1.7280986372485398, + "grad_norm": 0.6016318851927809, + "learning_rate": 1.926107691571669e-06, + "loss": 0.4937, + "step": 10652 + }, + { + "epoch": 1.7282608695652173, + "grad_norm": 0.5744849633451992, + "learning_rate": 1.9256920712531347e-06, + "loss": 0.5146, + "step": 10653 + }, + { + "epoch": 1.7284231018818947, + "grad_norm": 0.6021568103089572, + "learning_rate": 1.925276467691228e-06, + "loss": 0.4959, + "step": 10654 + }, + { + "epoch": 1.7285853341985724, + "grad_norm": 0.6125561331776104, + "learning_rate": 1.9248608808980747e-06, + "loss": 0.5134, + "step": 10655 + }, + { + "epoch": 1.7287475665152499, + "grad_norm": 0.6185380479418531, + "learning_rate": 1.924445310885803e-06, + "loss": 0.5365, + "step": 10656 + }, + { + "epoch": 1.7289097988319273, + "grad_norm": 0.5882023124005549, + "learning_rate": 1.9240297576665353e-06, + "loss": 0.5039, + "step": 10657 + }, + { + "epoch": 1.7290720311486047, + "grad_norm": 0.6001234410441559, + "learning_rate": 1.9236142212523988e-06, + "loss": 0.5083, + "step": 10658 + }, + { + "epoch": 1.7292342634652824, + "grad_norm": 0.6120390439994644, + "learning_rate": 1.9231987016555157e-06, + "loss": 0.5271, + "step": 10659 + }, + { + "epoch": 1.7293964957819599, + "grad_norm": 0.591174535956858, + "learning_rate": 1.922783198888011e-06, + "loss": 0.4921, + "step": 10660 + }, + { + "epoch": 1.7295587280986373, + "grad_norm": 0.6039334216443686, + "learning_rate": 1.922367712962006e-06, + "loss": 0.5317, + "step": 10661 + }, + { + "epoch": 1.7297209604153148, + "grad_norm": 0.6304159923006374, + "learning_rate": 1.921952243889625e-06, + "loss": 0.5034, + "step": 10662 + }, + { + "epoch": 1.7298831927319922, + "grad_norm": 0.6239083127862987, + "learning_rate": 1.92153679168299e-06, + "loss": 0.5154, + "step": 10663 + }, + { + "epoch": 1.7300454250486696, + "grad_norm": 0.618651571131533, + "learning_rate": 1.921121356354222e-06, + "loss": 0.5175, + "step": 10664 + }, + { + "epoch": 1.730207657365347, + "grad_norm": 0.5719657199714075, + "learning_rate": 1.9207059379154426e-06, + "loss": 0.5307, + "step": 10665 + }, + { + "epoch": 1.7303698896820245, + "grad_norm": 0.61907868425766, + "learning_rate": 1.920290536378772e-06, + "loss": 0.5274, + "step": 10666 + }, + { + "epoch": 1.730532121998702, + "grad_norm": 0.6170117896693, + "learning_rate": 1.919875151756332e-06, + "loss": 0.544, + "step": 10667 + }, + { + "epoch": 1.7306943543153797, + "grad_norm": 0.5920920482778527, + "learning_rate": 1.919459784060241e-06, + "loss": 0.5208, + "step": 10668 + }, + { + "epoch": 1.730856586632057, + "grad_norm": 0.6071757065089871, + "learning_rate": 1.9190444333026177e-06, + "loss": 0.5449, + "step": 10669 + }, + { + "epoch": 1.7310188189487346, + "grad_norm": 0.5916309947450964, + "learning_rate": 1.9186290994955815e-06, + "loss": 0.5297, + "step": 10670 + }, + { + "epoch": 1.7311810512654122, + "grad_norm": 0.6943640464634017, + "learning_rate": 1.9182137826512505e-06, + "loss": 0.5391, + "step": 10671 + }, + { + "epoch": 1.7313432835820897, + "grad_norm": 0.640742271037101, + "learning_rate": 1.917798482781743e-06, + "loss": 0.4765, + "step": 10672 + }, + { + "epoch": 1.7315055158987671, + "grad_norm": 0.6169464764514208, + "learning_rate": 1.9173831998991765e-06, + "loss": 0.5017, + "step": 10673 + }, + { + "epoch": 1.7316677482154446, + "grad_norm": 0.58397666695397, + "learning_rate": 1.9169679340156663e-06, + "loss": 0.4998, + "step": 10674 + }, + { + "epoch": 1.731829980532122, + "grad_norm": 0.6215713771680046, + "learning_rate": 1.916552685143329e-06, + "loss": 0.5225, + "step": 10675 + }, + { + "epoch": 1.7319922128487995, + "grad_norm": 0.5806359643648683, + "learning_rate": 1.9161374532942812e-06, + "loss": 0.5171, + "step": 10676 + }, + { + "epoch": 1.732154445165477, + "grad_norm": 0.5778820921315221, + "learning_rate": 1.915722238480637e-06, + "loss": 0.5578, + "step": 10677 + }, + { + "epoch": 1.7323166774821543, + "grad_norm": 0.6004054313051246, + "learning_rate": 1.9153070407145136e-06, + "loss": 0.4978, + "step": 10678 + }, + { + "epoch": 1.7324789097988318, + "grad_norm": 0.5638774526300238, + "learning_rate": 1.9148918600080224e-06, + "loss": 0.5153, + "step": 10679 + }, + { + "epoch": 1.7326411421155095, + "grad_norm": 0.5938253466420218, + "learning_rate": 1.914476696373278e-06, + "loss": 0.5322, + "step": 10680 + }, + { + "epoch": 1.732803374432187, + "grad_norm": 0.6092685255415529, + "learning_rate": 1.9140615498223942e-06, + "loss": 0.5022, + "step": 10681 + }, + { + "epoch": 1.7329656067488644, + "grad_norm": 0.5693833926441242, + "learning_rate": 1.913646420367483e-06, + "loss": 0.5191, + "step": 10682 + }, + { + "epoch": 1.733127839065542, + "grad_norm": 0.6040353892449875, + "learning_rate": 1.9132313080206577e-06, + "loss": 0.4825, + "step": 10683 + }, + { + "epoch": 1.7332900713822195, + "grad_norm": 0.6174700018373115, + "learning_rate": 1.9128162127940304e-06, + "loss": 0.5417, + "step": 10684 + }, + { + "epoch": 1.733452303698897, + "grad_norm": 0.5888527613774954, + "learning_rate": 1.912401134699711e-06, + "loss": 0.5393, + "step": 10685 + }, + { + "epoch": 1.7336145360155744, + "grad_norm": 0.661377523552462, + "learning_rate": 1.9119860737498095e-06, + "loss": 0.5361, + "step": 10686 + }, + { + "epoch": 1.7337767683322518, + "grad_norm": 0.6210693501277015, + "learning_rate": 1.9115710299564386e-06, + "loss": 0.5335, + "step": 10687 + }, + { + "epoch": 1.7339390006489293, + "grad_norm": 0.5932975116316466, + "learning_rate": 1.9111560033317063e-06, + "loss": 0.5188, + "step": 10688 + }, + { + "epoch": 1.7341012329656067, + "grad_norm": 0.6104897303579736, + "learning_rate": 1.910740993887723e-06, + "loss": 0.5105, + "step": 10689 + }, + { + "epoch": 1.7342634652822841, + "grad_norm": 0.5982513609535562, + "learning_rate": 1.9103260016365973e-06, + "loss": 0.5135, + "step": 10690 + }, + { + "epoch": 1.7344256975989616, + "grad_norm": 0.5927323621784615, + "learning_rate": 1.909911026590436e-06, + "loss": 0.5024, + "step": 10691 + }, + { + "epoch": 1.734587929915639, + "grad_norm": 0.6075237139704216, + "learning_rate": 1.9094960687613486e-06, + "loss": 0.4998, + "step": 10692 + }, + { + "epoch": 1.7347501622323167, + "grad_norm": 0.618398322546204, + "learning_rate": 1.909081128161441e-06, + "loss": 0.4871, + "step": 10693 + }, + { + "epoch": 1.7349123945489942, + "grad_norm": 0.592712264164141, + "learning_rate": 1.9086662048028223e-06, + "loss": 0.5, + "step": 10694 + }, + { + "epoch": 1.7350746268656716, + "grad_norm": 0.6147293085727019, + "learning_rate": 1.9082512986975953e-06, + "loss": 0.4998, + "step": 10695 + }, + { + "epoch": 1.7352368591823493, + "grad_norm": 0.5797970807863612, + "learning_rate": 1.9078364098578686e-06, + "loss": 0.5244, + "step": 10696 + }, + { + "epoch": 1.7353990914990267, + "grad_norm": 0.6240448053948361, + "learning_rate": 1.9074215382957455e-06, + "loss": 0.5051, + "step": 10697 + }, + { + "epoch": 1.7355613238157042, + "grad_norm": 0.6223115835815525, + "learning_rate": 1.9070066840233326e-06, + "loss": 0.5235, + "step": 10698 + }, + { + "epoch": 1.7357235561323816, + "grad_norm": 0.5879500013308779, + "learning_rate": 1.9065918470527334e-06, + "loss": 0.496, + "step": 10699 + }, + { + "epoch": 1.735885788449059, + "grad_norm": 0.6310741176661054, + "learning_rate": 1.9061770273960506e-06, + "loss": 0.4823, + "step": 10700 + }, + { + "epoch": 1.7360480207657365, + "grad_norm": 0.5709038914347133, + "learning_rate": 1.9057622250653885e-06, + "loss": 0.5084, + "step": 10701 + }, + { + "epoch": 1.736210253082414, + "grad_norm": 0.6024963833547974, + "learning_rate": 1.9053474400728492e-06, + "loss": 0.5359, + "step": 10702 + }, + { + "epoch": 1.7363724853990914, + "grad_norm": 0.613339576122543, + "learning_rate": 1.9049326724305355e-06, + "loss": 0.527, + "step": 10703 + }, + { + "epoch": 1.7365347177157688, + "grad_norm": 0.6020354933649318, + "learning_rate": 1.9045179221505497e-06, + "loss": 0.516, + "step": 10704 + }, + { + "epoch": 1.7366969500324465, + "grad_norm": 0.559863057623298, + "learning_rate": 1.9041031892449913e-06, + "loss": 0.4964, + "step": 10705 + }, + { + "epoch": 1.736859182349124, + "grad_norm": 0.6012980527842602, + "learning_rate": 1.903688473725962e-06, + "loss": 0.4922, + "step": 10706 + }, + { + "epoch": 1.7370214146658014, + "grad_norm": 0.5741849041349978, + "learning_rate": 1.9032737756055622e-06, + "loss": 0.5383, + "step": 10707 + }, + { + "epoch": 1.737183646982479, + "grad_norm": 0.5782552703156362, + "learning_rate": 1.9028590948958905e-06, + "loss": 0.4886, + "step": 10708 + }, + { + "epoch": 1.7373458792991565, + "grad_norm": 0.5936386753944262, + "learning_rate": 1.902444431609048e-06, + "loss": 0.5344, + "step": 10709 + }, + { + "epoch": 1.737508111615834, + "grad_norm": 0.60377661687679, + "learning_rate": 1.9020297857571317e-06, + "loss": 0.5232, + "step": 10710 + }, + { + "epoch": 1.7376703439325114, + "grad_norm": 0.5847426777375484, + "learning_rate": 1.9016151573522395e-06, + "loss": 0.4774, + "step": 10711 + }, + { + "epoch": 1.7378325762491889, + "grad_norm": 0.5897931997341351, + "learning_rate": 1.9012005464064705e-06, + "loss": 0.5073, + "step": 10712 + }, + { + "epoch": 1.7379948085658663, + "grad_norm": 0.6055066821579423, + "learning_rate": 1.9007859529319206e-06, + "loss": 0.5297, + "step": 10713 + }, + { + "epoch": 1.7381570408825437, + "grad_norm": 0.6136937971101208, + "learning_rate": 1.900371376940688e-06, + "loss": 0.5224, + "step": 10714 + }, + { + "epoch": 1.7383192731992212, + "grad_norm": 0.6186637547278107, + "learning_rate": 1.8999568184448675e-06, + "loss": 0.531, + "step": 10715 + }, + { + "epoch": 1.7384815055158986, + "grad_norm": 0.6041882177166062, + "learning_rate": 1.899542277456554e-06, + "loss": 0.5013, + "step": 10716 + }, + { + "epoch": 1.738643737832576, + "grad_norm": 0.5845665175057094, + "learning_rate": 1.8991277539878445e-06, + "loss": 0.535, + "step": 10717 + }, + { + "epoch": 1.7388059701492538, + "grad_norm": 0.6025089807718993, + "learning_rate": 1.8987132480508323e-06, + "loss": 0.5123, + "step": 10718 + }, + { + "epoch": 1.7389682024659312, + "grad_norm": 0.629868024525255, + "learning_rate": 1.8982987596576124e-06, + "loss": 0.5324, + "step": 10719 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.6288024532612677, + "learning_rate": 1.897884288820277e-06, + "loss": 0.5133, + "step": 10720 + }, + { + "epoch": 1.7392926670992863, + "grad_norm": 0.6442621727957909, + "learning_rate": 1.8974698355509202e-06, + "loss": 0.4976, + "step": 10721 + }, + { + "epoch": 1.7394548994159638, + "grad_norm": 0.6043108916321447, + "learning_rate": 1.8970553998616336e-06, + "loss": 0.4959, + "step": 10722 + }, + { + "epoch": 1.7396171317326412, + "grad_norm": 0.6094175151333082, + "learning_rate": 1.8966409817645108e-06, + "loss": 0.5279, + "step": 10723 + }, + { + "epoch": 1.7397793640493187, + "grad_norm": 0.6123338230017318, + "learning_rate": 1.896226581271643e-06, + "loss": 0.5092, + "step": 10724 + }, + { + "epoch": 1.739941596365996, + "grad_norm": 0.627300289465032, + "learning_rate": 1.8958121983951194e-06, + "loss": 0.516, + "step": 10725 + }, + { + "epoch": 1.7401038286826735, + "grad_norm": 0.6052100320204026, + "learning_rate": 1.8953978331470322e-06, + "loss": 0.4776, + "step": 10726 + }, + { + "epoch": 1.740266060999351, + "grad_norm": 0.5680512458638378, + "learning_rate": 1.8949834855394702e-06, + "loss": 0.5276, + "step": 10727 + }, + { + "epoch": 1.7404282933160284, + "grad_norm": 0.5720258546695869, + "learning_rate": 1.8945691555845242e-06, + "loss": 0.4613, + "step": 10728 + }, + { + "epoch": 1.7405905256327059, + "grad_norm": 0.6142617991284763, + "learning_rate": 1.894154843294282e-06, + "loss": 0.5392, + "step": 10729 + }, + { + "epoch": 1.7407527579493836, + "grad_norm": 0.6149289744094368, + "learning_rate": 1.8937405486808337e-06, + "loss": 0.5155, + "step": 10730 + }, + { + "epoch": 1.740914990266061, + "grad_norm": 0.6059016441676139, + "learning_rate": 1.8933262717562647e-06, + "loss": 0.5157, + "step": 10731 + }, + { + "epoch": 1.7410772225827384, + "grad_norm": 0.6271558868127544, + "learning_rate": 1.8929120125326642e-06, + "loss": 0.5034, + "step": 10732 + }, + { + "epoch": 1.7412394548994161, + "grad_norm": 0.6007709925624656, + "learning_rate": 1.8924977710221179e-06, + "loss": 0.5443, + "step": 10733 + }, + { + "epoch": 1.7414016872160936, + "grad_norm": 0.6001891209097692, + "learning_rate": 1.8920835472367133e-06, + "loss": 0.5194, + "step": 10734 + }, + { + "epoch": 1.741563919532771, + "grad_norm": 0.5982661027128348, + "learning_rate": 1.8916693411885365e-06, + "loss": 0.5005, + "step": 10735 + }, + { + "epoch": 1.7417261518494485, + "grad_norm": 0.6210438923819261, + "learning_rate": 1.8912551528896708e-06, + "loss": 0.5282, + "step": 10736 + }, + { + "epoch": 1.741888384166126, + "grad_norm": 0.5736756154832926, + "learning_rate": 1.8908409823522028e-06, + "loss": 0.5174, + "step": 10737 + }, + { + "epoch": 1.7420506164828033, + "grad_norm": 0.6026190562622709, + "learning_rate": 1.8904268295882158e-06, + "loss": 0.511, + "step": 10738 + }, + { + "epoch": 1.7422128487994808, + "grad_norm": 0.6162317267557715, + "learning_rate": 1.8900126946097947e-06, + "loss": 0.5088, + "step": 10739 + }, + { + "epoch": 1.7423750811161582, + "grad_norm": 0.5866499374080555, + "learning_rate": 1.889598577429022e-06, + "loss": 0.483, + "step": 10740 + }, + { + "epoch": 1.7425373134328357, + "grad_norm": 0.5922052258238726, + "learning_rate": 1.8891844780579808e-06, + "loss": 0.4928, + "step": 10741 + }, + { + "epoch": 1.7426995457495134, + "grad_norm": 0.5930030436476846, + "learning_rate": 1.888770396508752e-06, + "loss": 0.5051, + "step": 10742 + }, + { + "epoch": 1.7428617780661908, + "grad_norm": 0.5868408005477332, + "learning_rate": 1.888356332793419e-06, + "loss": 0.5381, + "step": 10743 + }, + { + "epoch": 1.7430240103828682, + "grad_norm": 0.5940776833886463, + "learning_rate": 1.8879422869240615e-06, + "loss": 0.5153, + "step": 10744 + }, + { + "epoch": 1.7431862426995457, + "grad_norm": 0.6351772290408169, + "learning_rate": 1.8875282589127622e-06, + "loss": 0.5325, + "step": 10745 + }, + { + "epoch": 1.7433484750162234, + "grad_norm": 0.5621223440550573, + "learning_rate": 1.8871142487715993e-06, + "loss": 0.5066, + "step": 10746 + }, + { + "epoch": 1.7435107073329008, + "grad_norm": 0.6106895913893514, + "learning_rate": 1.8867002565126527e-06, + "loss": 0.4828, + "step": 10747 + }, + { + "epoch": 1.7436729396495783, + "grad_norm": 0.6008221003306834, + "learning_rate": 1.8862862821480023e-06, + "loss": 0.4777, + "step": 10748 + }, + { + "epoch": 1.7438351719662557, + "grad_norm": 0.6208333543575794, + "learning_rate": 1.8858723256897257e-06, + "loss": 0.4895, + "step": 10749 + }, + { + "epoch": 1.7439974042829332, + "grad_norm": 0.5837199052508211, + "learning_rate": 1.8854583871499027e-06, + "loss": 0.5285, + "step": 10750 + }, + { + "epoch": 1.7441596365996106, + "grad_norm": 0.6370153006537809, + "learning_rate": 1.8850444665406087e-06, + "loss": 0.5053, + "step": 10751 + }, + { + "epoch": 1.744321868916288, + "grad_norm": 0.6020618013549317, + "learning_rate": 1.8846305638739217e-06, + "loss": 0.5249, + "step": 10752 + }, + { + "epoch": 1.7444841012329655, + "grad_norm": 0.5901631386336201, + "learning_rate": 1.8842166791619177e-06, + "loss": 0.4818, + "step": 10753 + }, + { + "epoch": 1.744646333549643, + "grad_norm": 0.607488449900029, + "learning_rate": 1.883802812416673e-06, + "loss": 0.5137, + "step": 10754 + }, + { + "epoch": 1.7448085658663206, + "grad_norm": 0.6310741575132104, + "learning_rate": 1.883388963650264e-06, + "loss": 0.4885, + "step": 10755 + }, + { + "epoch": 1.744970798182998, + "grad_norm": 0.6130108378459751, + "learning_rate": 1.8829751328747633e-06, + "loss": 0.4859, + "step": 10756 + }, + { + "epoch": 1.7451330304996755, + "grad_norm": 0.6013264024645172, + "learning_rate": 1.8825613201022475e-06, + "loss": 0.5011, + "step": 10757 + }, + { + "epoch": 1.7452952628163532, + "grad_norm": 0.6050076362303487, + "learning_rate": 1.8821475253447884e-06, + "loss": 0.5371, + "step": 10758 + }, + { + "epoch": 1.7454574951330306, + "grad_norm": 0.6171585989949657, + "learning_rate": 1.8817337486144614e-06, + "loss": 0.5421, + "step": 10759 + }, + { + "epoch": 1.745619727449708, + "grad_norm": 0.5776475415638456, + "learning_rate": 1.8813199899233386e-06, + "loss": 0.5091, + "step": 10760 + }, + { + "epoch": 1.7457819597663855, + "grad_norm": 0.5998586688068303, + "learning_rate": 1.880906249283491e-06, + "loss": 0.5232, + "step": 10761 + }, + { + "epoch": 1.745944192083063, + "grad_norm": 0.5828116413579902, + "learning_rate": 1.8804925267069924e-06, + "loss": 0.5257, + "step": 10762 + }, + { + "epoch": 1.7461064243997404, + "grad_norm": 0.5958752218798206, + "learning_rate": 1.8800788222059119e-06, + "loss": 0.5054, + "step": 10763 + }, + { + "epoch": 1.7462686567164178, + "grad_norm": 0.5969330777970586, + "learning_rate": 1.8796651357923218e-06, + "loss": 0.5195, + "step": 10764 + }, + { + "epoch": 1.7464308890330953, + "grad_norm": 0.5936367792939594, + "learning_rate": 1.8792514674782924e-06, + "loss": 0.4856, + "step": 10765 + }, + { + "epoch": 1.7465931213497727, + "grad_norm": 0.5988822099601347, + "learning_rate": 1.8788378172758922e-06, + "loss": 0.4966, + "step": 10766 + }, + { + "epoch": 1.7467553536664504, + "grad_norm": 0.6136709840425056, + "learning_rate": 1.8784241851971902e-06, + "loss": 0.5261, + "step": 10767 + }, + { + "epoch": 1.7469175859831279, + "grad_norm": 0.6265516183512798, + "learning_rate": 1.8780105712542562e-06, + "loss": 0.5287, + "step": 10768 + }, + { + "epoch": 1.7470798182998053, + "grad_norm": 0.5966696985173855, + "learning_rate": 1.8775969754591568e-06, + "loss": 0.4953, + "step": 10769 + }, + { + "epoch": 1.747242050616483, + "grad_norm": 0.5723828604284692, + "learning_rate": 1.8771833978239615e-06, + "loss": 0.5057, + "step": 10770 + }, + { + "epoch": 1.7474042829331604, + "grad_norm": 0.5926515163956957, + "learning_rate": 1.8767698383607355e-06, + "loss": 0.5187, + "step": 10771 + }, + { + "epoch": 1.7475665152498379, + "grad_norm": 0.6091237409487443, + "learning_rate": 1.8763562970815455e-06, + "loss": 0.4939, + "step": 10772 + }, + { + "epoch": 1.7477287475665153, + "grad_norm": 0.5991104799824158, + "learning_rate": 1.8759427739984582e-06, + "loss": 0.5016, + "step": 10773 + }, + { + "epoch": 1.7478909798831928, + "grad_norm": 0.6332555683165563, + "learning_rate": 1.875529269123538e-06, + "loss": 0.5338, + "step": 10774 + }, + { + "epoch": 1.7480532121998702, + "grad_norm": 0.6451167540796232, + "learning_rate": 1.8751157824688509e-06, + "loss": 0.5177, + "step": 10775 + }, + { + "epoch": 1.7482154445165476, + "grad_norm": 0.6246917600213296, + "learning_rate": 1.874702314046461e-06, + "loss": 0.5133, + "step": 10776 + }, + { + "epoch": 1.748377676833225, + "grad_norm": 0.5922541784949708, + "learning_rate": 1.8742888638684315e-06, + "loss": 0.5314, + "step": 10777 + }, + { + "epoch": 1.7485399091499025, + "grad_norm": 0.598489518508214, + "learning_rate": 1.8738754319468256e-06, + "loss": 0.4893, + "step": 10778 + }, + { + "epoch": 1.74870214146658, + "grad_norm": 0.6159766743129405, + "learning_rate": 1.8734620182937066e-06, + "loss": 0.5322, + "step": 10779 + }, + { + "epoch": 1.7488643737832577, + "grad_norm": 0.608023605207391, + "learning_rate": 1.873048622921136e-06, + "loss": 0.4825, + "step": 10780 + }, + { + "epoch": 1.749026606099935, + "grad_norm": 0.6081954973437501, + "learning_rate": 1.872635245841177e-06, + "loss": 0.5191, + "step": 10781 + }, + { + "epoch": 1.7491888384166125, + "grad_norm": 0.6433332334102386, + "learning_rate": 1.8722218870658894e-06, + "loss": 0.5186, + "step": 10782 + }, + { + "epoch": 1.7493510707332902, + "grad_norm": 0.6040169269160178, + "learning_rate": 1.8718085466073333e-06, + "loss": 0.5233, + "step": 10783 + }, + { + "epoch": 1.7495133030499677, + "grad_norm": 0.6190787190995847, + "learning_rate": 1.8713952244775701e-06, + "loss": 0.5207, + "step": 10784 + }, + { + "epoch": 1.749675535366645, + "grad_norm": 0.6177518258315816, + "learning_rate": 1.8709819206886581e-06, + "loss": 0.5292, + "step": 10785 + }, + { + "epoch": 1.7498377676833226, + "grad_norm": 0.6255474798258391, + "learning_rate": 1.8705686352526589e-06, + "loss": 0.4927, + "step": 10786 + }, + { + "epoch": 1.75, + "grad_norm": 0.6343104826963981, + "learning_rate": 1.8701553681816276e-06, + "loss": 0.5049, + "step": 10787 + }, + { + "epoch": 1.7501622323166774, + "grad_norm": 0.5932136769840097, + "learning_rate": 1.8697421194876236e-06, + "loss": 0.5099, + "step": 10788 + }, + { + "epoch": 1.750324464633355, + "grad_norm": 0.6366170052022028, + "learning_rate": 1.869328889182704e-06, + "loss": 0.5041, + "step": 10789 + }, + { + "epoch": 1.7504866969500323, + "grad_norm": 0.6247274744218981, + "learning_rate": 1.8689156772789265e-06, + "loss": 0.5334, + "step": 10790 + }, + { + "epoch": 1.7506489292667098, + "grad_norm": 0.5762377616064136, + "learning_rate": 1.8685024837883475e-06, + "loss": 0.5101, + "step": 10791 + }, + { + "epoch": 1.7508111615833875, + "grad_norm": 0.5880321476062323, + "learning_rate": 1.8680893087230207e-06, + "loss": 0.544, + "step": 10792 + }, + { + "epoch": 1.750973393900065, + "grad_norm": 0.6087772031234856, + "learning_rate": 1.8676761520950037e-06, + "loss": 0.5366, + "step": 10793 + }, + { + "epoch": 1.7511356262167423, + "grad_norm": 0.6114224916873763, + "learning_rate": 1.8672630139163495e-06, + "loss": 0.5166, + "step": 10794 + }, + { + "epoch": 1.75129785853342, + "grad_norm": 0.631152293580286, + "learning_rate": 1.8668498941991137e-06, + "loss": 0.5013, + "step": 10795 + }, + { + "epoch": 1.7514600908500975, + "grad_norm": 0.638821876266032, + "learning_rate": 1.8664367929553495e-06, + "loss": 0.519, + "step": 10796 + }, + { + "epoch": 1.751622323166775, + "grad_norm": 0.5970007977784303, + "learning_rate": 1.8660237101971088e-06, + "loss": 0.4895, + "step": 10797 + }, + { + "epoch": 1.7517845554834524, + "grad_norm": 0.6242576073663483, + "learning_rate": 1.8656106459364457e-06, + "loss": 0.5172, + "step": 10798 + }, + { + "epoch": 1.7519467878001298, + "grad_norm": 0.6332135461812602, + "learning_rate": 1.8651976001854116e-06, + "loss": 0.496, + "step": 10799 + }, + { + "epoch": 1.7521090201168072, + "grad_norm": 0.6036051931163666, + "learning_rate": 1.8647845729560576e-06, + "loss": 0.5445, + "step": 10800 + }, + { + "epoch": 1.7522712524334847, + "grad_norm": 0.603050680657094, + "learning_rate": 1.8643715642604356e-06, + "loss": 0.5453, + "step": 10801 + }, + { + "epoch": 1.7524334847501621, + "grad_norm": 0.5986773948181587, + "learning_rate": 1.8639585741105953e-06, + "loss": 0.4892, + "step": 10802 + }, + { + "epoch": 1.7525957170668396, + "grad_norm": 0.6802491567044513, + "learning_rate": 1.863545602518586e-06, + "loss": 0.512, + "step": 10803 + }, + { + "epoch": 1.752757949383517, + "grad_norm": 0.6118821220171426, + "learning_rate": 1.863132649496458e-06, + "loss": 0.5469, + "step": 10804 + }, + { + "epoch": 1.7529201817001947, + "grad_norm": 0.5817480799080041, + "learning_rate": 1.8627197150562593e-06, + "loss": 0.5203, + "step": 10805 + }, + { + "epoch": 1.7530824140168721, + "grad_norm": 0.654843082233156, + "learning_rate": 1.8623067992100398e-06, + "loss": 0.5435, + "step": 10806 + }, + { + "epoch": 1.7532446463335496, + "grad_norm": 0.6177801066317852, + "learning_rate": 1.861893901969845e-06, + "loss": 0.4544, + "step": 10807 + }, + { + "epoch": 1.7534068786502273, + "grad_norm": 0.6088092697818042, + "learning_rate": 1.8614810233477227e-06, + "loss": 0.4986, + "step": 10808 + }, + { + "epoch": 1.7535691109669047, + "grad_norm": 0.6043874228746345, + "learning_rate": 1.86106816335572e-06, + "loss": 0.4818, + "step": 10809 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.6391592285848755, + "learning_rate": 1.8606553220058828e-06, + "loss": 0.5166, + "step": 10810 + }, + { + "epoch": 1.7538935756002596, + "grad_norm": 0.6118527768939808, + "learning_rate": 1.8602424993102573e-06, + "loss": 0.5332, + "step": 10811 + }, + { + "epoch": 1.754055807916937, + "grad_norm": 0.6271551294847195, + "learning_rate": 1.8598296952808865e-06, + "loss": 0.5194, + "step": 10812 + }, + { + "epoch": 1.7542180402336145, + "grad_norm": 0.6197439292464785, + "learning_rate": 1.859416909929816e-06, + "loss": 0.543, + "step": 10813 + }, + { + "epoch": 1.754380272550292, + "grad_norm": 0.607270282159024, + "learning_rate": 1.8590041432690895e-06, + "loss": 0.521, + "step": 10814 + }, + { + "epoch": 1.7545425048669694, + "grad_norm": 0.6007497198807972, + "learning_rate": 1.8585913953107509e-06, + "loss": 0.4952, + "step": 10815 + }, + { + "epoch": 1.7547047371836468, + "grad_norm": 0.5914558552619704, + "learning_rate": 1.8581786660668434e-06, + "loss": 0.5251, + "step": 10816 + }, + { + "epoch": 1.7548669695003245, + "grad_norm": 0.5958014686396772, + "learning_rate": 1.8577659555494066e-06, + "loss": 0.5104, + "step": 10817 + }, + { + "epoch": 1.755029201817002, + "grad_norm": 0.6124451796113011, + "learning_rate": 1.857353263770485e-06, + "loss": 0.5385, + "step": 10818 + }, + { + "epoch": 1.7551914341336794, + "grad_norm": 0.6203335213377443, + "learning_rate": 1.8569405907421182e-06, + "loss": 0.52, + "step": 10819 + }, + { + "epoch": 1.755353666450357, + "grad_norm": 0.6136937860509426, + "learning_rate": 1.8565279364763479e-06, + "loss": 0.5082, + "step": 10820 + }, + { + "epoch": 1.7555158987670345, + "grad_norm": 0.5937640397704674, + "learning_rate": 1.8561153009852125e-06, + "loss": 0.5336, + "step": 10821 + }, + { + "epoch": 1.755678131083712, + "grad_norm": 0.5994923694349119, + "learning_rate": 1.8557026842807547e-06, + "loss": 0.4986, + "step": 10822 + }, + { + "epoch": 1.7558403634003894, + "grad_norm": 0.5651663079770987, + "learning_rate": 1.8552900863750095e-06, + "loss": 0.5083, + "step": 10823 + }, + { + "epoch": 1.7560025957170668, + "grad_norm": 0.5914367279408548, + "learning_rate": 1.8548775072800173e-06, + "loss": 0.5263, + "step": 10824 + }, + { + "epoch": 1.7561648280337443, + "grad_norm": 0.6239641993941843, + "learning_rate": 1.8544649470078158e-06, + "loss": 0.5325, + "step": 10825 + }, + { + "epoch": 1.7563270603504217, + "grad_norm": 0.6066828119977582, + "learning_rate": 1.8540524055704424e-06, + "loss": 0.5418, + "step": 10826 + }, + { + "epoch": 1.7564892926670992, + "grad_norm": 0.6268212893938844, + "learning_rate": 1.8536398829799343e-06, + "loss": 0.4919, + "step": 10827 + }, + { + "epoch": 1.7566515249837766, + "grad_norm": 0.5940126826097987, + "learning_rate": 1.8532273792483263e-06, + "loss": 0.5183, + "step": 10828 + }, + { + "epoch": 1.7568137573004543, + "grad_norm": 0.5834358335083882, + "learning_rate": 1.8528148943876551e-06, + "loss": 0.4903, + "step": 10829 + }, + { + "epoch": 1.7569759896171318, + "grad_norm": 0.6042547285229335, + "learning_rate": 1.852402428409955e-06, + "loss": 0.5098, + "step": 10830 + }, + { + "epoch": 1.7571382219338092, + "grad_norm": 0.5911330671179379, + "learning_rate": 1.8519899813272618e-06, + "loss": 0.5344, + "step": 10831 + }, + { + "epoch": 1.7573004542504866, + "grad_norm": 0.6365097300980019, + "learning_rate": 1.8515775531516096e-06, + "loss": 0.5428, + "step": 10832 + }, + { + "epoch": 1.7574626865671643, + "grad_norm": 0.6145821116347755, + "learning_rate": 1.85116514389503e-06, + "loss": 0.5348, + "step": 10833 + }, + { + "epoch": 1.7576249188838418, + "grad_norm": 0.6085504561021048, + "learning_rate": 1.8507527535695568e-06, + "loss": 0.4815, + "step": 10834 + }, + { + "epoch": 1.7577871512005192, + "grad_norm": 0.6308768805950178, + "learning_rate": 1.850340382187223e-06, + "loss": 0.5355, + "step": 10835 + }, + { + "epoch": 1.7579493835171967, + "grad_norm": 0.6217178717304948, + "learning_rate": 1.8499280297600594e-06, + "loss": 0.501, + "step": 10836 + }, + { + "epoch": 1.758111615833874, + "grad_norm": 0.5844601642745072, + "learning_rate": 1.8495156963000987e-06, + "loss": 0.5206, + "step": 10837 + }, + { + "epoch": 1.7582738481505515, + "grad_norm": 0.5776305081209083, + "learning_rate": 1.8491033818193704e-06, + "loss": 0.5328, + "step": 10838 + }, + { + "epoch": 1.758436080467229, + "grad_norm": 0.5781785513232945, + "learning_rate": 1.848691086329904e-06, + "loss": 0.5104, + "step": 10839 + }, + { + "epoch": 1.7585983127839064, + "grad_norm": 0.6936314706491291, + "learning_rate": 1.8482788098437306e-06, + "loss": 0.5033, + "step": 10840 + }, + { + "epoch": 1.7587605451005839, + "grad_norm": 0.5902764303718204, + "learning_rate": 1.8478665523728784e-06, + "loss": 0.5145, + "step": 10841 + }, + { + "epoch": 1.7589227774172616, + "grad_norm": 0.6645713585582631, + "learning_rate": 1.8474543139293765e-06, + "loss": 0.5268, + "step": 10842 + }, + { + "epoch": 1.759085009733939, + "grad_norm": 0.5985750988780413, + "learning_rate": 1.847042094525252e-06, + "loss": 0.4821, + "step": 10843 + }, + { + "epoch": 1.7592472420506164, + "grad_norm": 0.5880922952157307, + "learning_rate": 1.8466298941725324e-06, + "loss": 0.5194, + "step": 10844 + }, + { + "epoch": 1.7594094743672941, + "grad_norm": 0.6170877307416824, + "learning_rate": 1.8462177128832452e-06, + "loss": 0.5379, + "step": 10845 + }, + { + "epoch": 1.7595717066839716, + "grad_norm": 0.6049494709002925, + "learning_rate": 1.8458055506694158e-06, + "loss": 0.4969, + "step": 10846 + }, + { + "epoch": 1.759733939000649, + "grad_norm": 0.5974637381880266, + "learning_rate": 1.845393407543071e-06, + "loss": 0.5034, + "step": 10847 + }, + { + "epoch": 1.7598961713173265, + "grad_norm": 0.5850206304179566, + "learning_rate": 1.8449812835162343e-06, + "loss": 0.5158, + "step": 10848 + }, + { + "epoch": 1.760058403634004, + "grad_norm": 0.5670923382458275, + "learning_rate": 1.8445691786009312e-06, + "loss": 0.5144, + "step": 10849 + }, + { + "epoch": 1.7602206359506813, + "grad_norm": 0.5902343802861445, + "learning_rate": 1.8441570928091857e-06, + "loss": 0.5158, + "step": 10850 + }, + { + "epoch": 1.7603828682673588, + "grad_norm": 0.6217756759682428, + "learning_rate": 1.8437450261530215e-06, + "loss": 0.5157, + "step": 10851 + }, + { + "epoch": 1.7605451005840362, + "grad_norm": 0.5579323318525009, + "learning_rate": 1.843332978644462e-06, + "loss": 0.5268, + "step": 10852 + }, + { + "epoch": 1.7607073329007137, + "grad_norm": 0.5950070365705439, + "learning_rate": 1.8429209502955276e-06, + "loss": 0.5044, + "step": 10853 + }, + { + "epoch": 1.7608695652173914, + "grad_norm": 0.6081648044461379, + "learning_rate": 1.8425089411182414e-06, + "loss": 0.4971, + "step": 10854 + }, + { + "epoch": 1.7610317975340688, + "grad_norm": 0.5971394782518065, + "learning_rate": 1.8420969511246246e-06, + "loss": 0.4961, + "step": 10855 + }, + { + "epoch": 1.7611940298507462, + "grad_norm": 0.6089643378196996, + "learning_rate": 1.8416849803266979e-06, + "loss": 0.4934, + "step": 10856 + }, + { + "epoch": 1.761356262167424, + "grad_norm": 0.586015023965277, + "learning_rate": 1.8412730287364819e-06, + "loss": 0.5313, + "step": 10857 + }, + { + "epoch": 1.7615184944841014, + "grad_norm": 0.589354708613768, + "learning_rate": 1.840861096365995e-06, + "loss": 0.5225, + "step": 10858 + }, + { + "epoch": 1.7616807268007788, + "grad_norm": 0.5759609560849123, + "learning_rate": 1.8404491832272562e-06, + "loss": 0.5212, + "step": 10859 + }, + { + "epoch": 1.7618429591174563, + "grad_norm": 0.6196132790839215, + "learning_rate": 1.840037289332285e-06, + "loss": 0.4966, + "step": 10860 + }, + { + "epoch": 1.7620051914341337, + "grad_norm": 0.5784714599565841, + "learning_rate": 1.8396254146930978e-06, + "loss": 0.4947, + "step": 10861 + }, + { + "epoch": 1.7621674237508111, + "grad_norm": 0.6084155161366586, + "learning_rate": 1.8392135593217142e-06, + "loss": 0.4726, + "step": 10862 + }, + { + "epoch": 1.7623296560674886, + "grad_norm": 0.5876212406636366, + "learning_rate": 1.8388017232301487e-06, + "loss": 0.5185, + "step": 10863 + }, + { + "epoch": 1.762491888384166, + "grad_norm": 0.5995629171461102, + "learning_rate": 1.8383899064304177e-06, + "loss": 0.5222, + "step": 10864 + }, + { + "epoch": 1.7626541207008435, + "grad_norm": 0.6424276684070801, + "learning_rate": 1.8379781089345382e-06, + "loss": 0.5348, + "step": 10865 + }, + { + "epoch": 1.762816353017521, + "grad_norm": 0.629821048404346, + "learning_rate": 1.837566330754524e-06, + "loss": 0.5043, + "step": 10866 + }, + { + "epoch": 1.7629785853341986, + "grad_norm": 0.6350570210482961, + "learning_rate": 1.8371545719023917e-06, + "loss": 0.5058, + "step": 10867 + }, + { + "epoch": 1.763140817650876, + "grad_norm": 0.5642333128183257, + "learning_rate": 1.8367428323901517e-06, + "loss": 0.4946, + "step": 10868 + }, + { + "epoch": 1.7633030499675535, + "grad_norm": 0.6285141797086056, + "learning_rate": 1.83633111222982e-06, + "loss": 0.5125, + "step": 10869 + }, + { + "epoch": 1.7634652822842312, + "grad_norm": 0.5951976597797367, + "learning_rate": 1.8359194114334078e-06, + "loss": 0.5372, + "step": 10870 + }, + { + "epoch": 1.7636275146009086, + "grad_norm": 0.6122187153685864, + "learning_rate": 1.8355077300129286e-06, + "loss": 0.5312, + "step": 10871 + }, + { + "epoch": 1.763789746917586, + "grad_norm": 0.5980628958331773, + "learning_rate": 1.8350960679803935e-06, + "loss": 0.5234, + "step": 10872 + }, + { + "epoch": 1.7639519792342635, + "grad_norm": 0.5918715058945642, + "learning_rate": 1.834684425347814e-06, + "loss": 0.5004, + "step": 10873 + }, + { + "epoch": 1.764114211550941, + "grad_norm": 0.6216560254521508, + "learning_rate": 1.8342728021272004e-06, + "loss": 0.4885, + "step": 10874 + }, + { + "epoch": 1.7642764438676184, + "grad_norm": 0.6170994739171823, + "learning_rate": 1.833861198330562e-06, + "loss": 0.4863, + "step": 10875 + }, + { + "epoch": 1.7644386761842958, + "grad_norm": 0.5971847035082628, + "learning_rate": 1.8334496139699092e-06, + "loss": 0.4822, + "step": 10876 + }, + { + "epoch": 1.7646009085009733, + "grad_norm": 0.5927012382967715, + "learning_rate": 1.83303804905725e-06, + "loss": 0.4844, + "step": 10877 + }, + { + "epoch": 1.7647631408176507, + "grad_norm": 0.6264278155566441, + "learning_rate": 1.832626503604594e-06, + "loss": 0.485, + "step": 10878 + }, + { + "epoch": 1.7649253731343284, + "grad_norm": 0.5964712819397096, + "learning_rate": 1.8322149776239476e-06, + "loss": 0.5344, + "step": 10879 + }, + { + "epoch": 1.7650876054510058, + "grad_norm": 0.6247155941166948, + "learning_rate": 1.8318034711273181e-06, + "loss": 0.5353, + "step": 10880 + }, + { + "epoch": 1.7652498377676833, + "grad_norm": 0.6054747154248463, + "learning_rate": 1.831391984126712e-06, + "loss": 0.4872, + "step": 10881 + }, + { + "epoch": 1.765412070084361, + "grad_norm": 0.6011350569058057, + "learning_rate": 1.8309805166341354e-06, + "loss": 0.5483, + "step": 10882 + }, + { + "epoch": 1.7655743024010384, + "grad_norm": 0.5698257222047826, + "learning_rate": 1.8305690686615951e-06, + "loss": 0.5083, + "step": 10883 + }, + { + "epoch": 1.7657365347177159, + "grad_norm": 0.6017782182906584, + "learning_rate": 1.8301576402210935e-06, + "loss": 0.4989, + "step": 10884 + }, + { + "epoch": 1.7658987670343933, + "grad_norm": 0.6069495955531325, + "learning_rate": 1.8297462313246365e-06, + "loss": 0.5434, + "step": 10885 + }, + { + "epoch": 1.7660609993510707, + "grad_norm": 0.6176617690731051, + "learning_rate": 1.829334841984227e-06, + "loss": 0.5443, + "step": 10886 + }, + { + "epoch": 1.7662232316677482, + "grad_norm": 0.5745120067112789, + "learning_rate": 1.8289234722118688e-06, + "loss": 0.4802, + "step": 10887 + }, + { + "epoch": 1.7663854639844256, + "grad_norm": 0.5990854629965718, + "learning_rate": 1.828512122019565e-06, + "loss": 0.4872, + "step": 10888 + }, + { + "epoch": 1.766547696301103, + "grad_norm": 0.5838264882722061, + "learning_rate": 1.8281007914193155e-06, + "loss": 0.5218, + "step": 10889 + }, + { + "epoch": 1.7667099286177805, + "grad_norm": 0.6001749307974529, + "learning_rate": 1.8276894804231237e-06, + "loss": 0.506, + "step": 10890 + }, + { + "epoch": 1.766872160934458, + "grad_norm": 0.5841747289154957, + "learning_rate": 1.82727818904299e-06, + "loss": 0.516, + "step": 10891 + }, + { + "epoch": 1.7670343932511356, + "grad_norm": 0.6014654693269815, + "learning_rate": 1.8268669172909137e-06, + "loss": 0.5537, + "step": 10892 + }, + { + "epoch": 1.767196625567813, + "grad_norm": 0.6140847954435907, + "learning_rate": 1.8264556651788965e-06, + "loss": 0.551, + "step": 10893 + }, + { + "epoch": 1.7673588578844905, + "grad_norm": 0.6196255467882008, + "learning_rate": 1.8260444327189355e-06, + "loss": 0.5398, + "step": 10894 + }, + { + "epoch": 1.7675210902011682, + "grad_norm": 0.6004477302812091, + "learning_rate": 1.8256332199230299e-06, + "loss": 0.5059, + "step": 10895 + }, + { + "epoch": 1.7676833225178457, + "grad_norm": 0.6142804959526624, + "learning_rate": 1.8252220268031782e-06, + "loss": 0.4937, + "step": 10896 + }, + { + "epoch": 1.767845554834523, + "grad_norm": 0.5982590802987788, + "learning_rate": 1.8248108533713771e-06, + "loss": 0.5072, + "step": 10897 + }, + { + "epoch": 1.7680077871512005, + "grad_norm": 0.6129309391312017, + "learning_rate": 1.8243996996396246e-06, + "loss": 0.4819, + "step": 10898 + }, + { + "epoch": 1.768170019467878, + "grad_norm": 0.5805098897527731, + "learning_rate": 1.8239885656199158e-06, + "loss": 0.5247, + "step": 10899 + }, + { + "epoch": 1.7683322517845554, + "grad_norm": 0.5933057207268914, + "learning_rate": 1.8235774513242464e-06, + "loss": 0.5132, + "step": 10900 + }, + { + "epoch": 1.7684944841012329, + "grad_norm": 0.5890945142594342, + "learning_rate": 1.823166356764612e-06, + "loss": 0.5032, + "step": 10901 + }, + { + "epoch": 1.7686567164179103, + "grad_norm": 0.6281406756584496, + "learning_rate": 1.822755281953007e-06, + "loss": 0.5581, + "step": 10902 + }, + { + "epoch": 1.7688189487345878, + "grad_norm": 0.609343387765827, + "learning_rate": 1.8223442269014269e-06, + "loss": 0.4935, + "step": 10903 + }, + { + "epoch": 1.7689811810512654, + "grad_norm": 0.5863973962192187, + "learning_rate": 1.8219331916218616e-06, + "loss": 0.4909, + "step": 10904 + }, + { + "epoch": 1.769143413367943, + "grad_norm": 0.6443553247919578, + "learning_rate": 1.8215221761263068e-06, + "loss": 0.5204, + "step": 10905 + }, + { + "epoch": 1.7693056456846203, + "grad_norm": 0.6110738833239823, + "learning_rate": 1.8211111804267531e-06, + "loss": 0.5079, + "step": 10906 + }, + { + "epoch": 1.769467878001298, + "grad_norm": 0.6144971852403205, + "learning_rate": 1.8207002045351934e-06, + "loss": 0.5172, + "step": 10907 + }, + { + "epoch": 1.7696301103179755, + "grad_norm": 0.5983410941334184, + "learning_rate": 1.820289248463619e-06, + "loss": 0.5119, + "step": 10908 + }, + { + "epoch": 1.769792342634653, + "grad_norm": 0.6222261639946416, + "learning_rate": 1.8198783122240182e-06, + "loss": 0.5144, + "step": 10909 + }, + { + "epoch": 1.7699545749513304, + "grad_norm": 0.589821512677025, + "learning_rate": 1.819467395828383e-06, + "loss": 0.5298, + "step": 10910 + }, + { + "epoch": 1.7701168072680078, + "grad_norm": 0.6095337293621866, + "learning_rate": 1.819056499288702e-06, + "loss": 0.4962, + "step": 10911 + }, + { + "epoch": 1.7702790395846852, + "grad_norm": 0.6018975146627897, + "learning_rate": 1.8186456226169641e-06, + "loss": 0.5043, + "step": 10912 + }, + { + "epoch": 1.7704412719013627, + "grad_norm": 0.615080571993278, + "learning_rate": 1.8182347658251583e-06, + "loss": 0.5244, + "step": 10913 + }, + { + "epoch": 1.7706035042180401, + "grad_norm": 0.6294809084483975, + "learning_rate": 1.8178239289252708e-06, + "loss": 0.5238, + "step": 10914 + }, + { + "epoch": 1.7707657365347176, + "grad_norm": 0.6089366610827297, + "learning_rate": 1.8174131119292887e-06, + "loss": 0.5105, + "step": 10915 + }, + { + "epoch": 1.7709279688513953, + "grad_norm": 0.5800186117585447, + "learning_rate": 1.817002314849199e-06, + "loss": 0.4974, + "step": 10916 + }, + { + "epoch": 1.7710902011680727, + "grad_norm": 0.7220335211265348, + "learning_rate": 1.8165915376969872e-06, + "loss": 0.5179, + "step": 10917 + }, + { + "epoch": 1.7712524334847501, + "grad_norm": 0.6180697825747383, + "learning_rate": 1.8161807804846399e-06, + "loss": 0.5291, + "step": 10918 + }, + { + "epoch": 1.7714146658014276, + "grad_norm": 0.5988080786228076, + "learning_rate": 1.8157700432241407e-06, + "loss": 0.5199, + "step": 10919 + }, + { + "epoch": 1.7715768981181053, + "grad_norm": 0.5994312502518483, + "learning_rate": 1.8153593259274727e-06, + "loss": 0.5502, + "step": 10920 + }, + { + "epoch": 1.7717391304347827, + "grad_norm": 0.6208855882368096, + "learning_rate": 1.8149486286066213e-06, + "loss": 0.536, + "step": 10921 + }, + { + "epoch": 1.7719013627514602, + "grad_norm": 0.6221852195988301, + "learning_rate": 1.8145379512735678e-06, + "loss": 0.517, + "step": 10922 + }, + { + "epoch": 1.7720635950681376, + "grad_norm": 0.6201861271304893, + "learning_rate": 1.8141272939402959e-06, + "loss": 0.5218, + "step": 10923 + }, + { + "epoch": 1.772225827384815, + "grad_norm": 0.5991882674495895, + "learning_rate": 1.813716656618788e-06, + "loss": 0.5366, + "step": 10924 + }, + { + "epoch": 1.7723880597014925, + "grad_norm": 0.5950969481741906, + "learning_rate": 1.8133060393210234e-06, + "loss": 0.4876, + "step": 10925 + }, + { + "epoch": 1.77255029201817, + "grad_norm": 0.648798546778688, + "learning_rate": 1.8128954420589825e-06, + "loss": 0.5081, + "step": 10926 + }, + { + "epoch": 1.7727125243348474, + "grad_norm": 0.6316482429880411, + "learning_rate": 1.8124848648446474e-06, + "loss": 0.5084, + "step": 10927 + }, + { + "epoch": 1.7728747566515248, + "grad_norm": 0.6216683677290793, + "learning_rate": 1.8120743076899954e-06, + "loss": 0.5437, + "step": 10928 + }, + { + "epoch": 1.7730369889682025, + "grad_norm": 0.5670523783733822, + "learning_rate": 1.8116637706070082e-06, + "loss": 0.5307, + "step": 10929 + }, + { + "epoch": 1.77319922128488, + "grad_norm": 0.5758908116028969, + "learning_rate": 1.8112532536076613e-06, + "loss": 0.5111, + "step": 10930 + }, + { + "epoch": 1.7733614536015574, + "grad_norm": 0.5810324221291048, + "learning_rate": 1.810842756703933e-06, + "loss": 0.493, + "step": 10931 + }, + { + "epoch": 1.773523685918235, + "grad_norm": 0.6204313552676618, + "learning_rate": 1.8104322799078014e-06, + "loss": 0.4869, + "step": 10932 + }, + { + "epoch": 1.7736859182349125, + "grad_norm": 0.580709781738026, + "learning_rate": 1.8100218232312416e-06, + "loss": 0.5199, + "step": 10933 + }, + { + "epoch": 1.77384815055159, + "grad_norm": 0.5832642993247601, + "learning_rate": 1.809611386686232e-06, + "loss": 0.5251, + "step": 10934 + }, + { + "epoch": 1.7740103828682674, + "grad_norm": 0.6181061432448863, + "learning_rate": 1.8092009702847453e-06, + "loss": 0.4861, + "step": 10935 + }, + { + "epoch": 1.7741726151849448, + "grad_norm": 0.585130089562608, + "learning_rate": 1.8087905740387565e-06, + "loss": 0.5005, + "step": 10936 + }, + { + "epoch": 1.7743348475016223, + "grad_norm": 0.6190427955341895, + "learning_rate": 1.8083801979602412e-06, + "loss": 0.5224, + "step": 10937 + }, + { + "epoch": 1.7744970798182997, + "grad_norm": 0.6244435930044759, + "learning_rate": 1.8079698420611725e-06, + "loss": 0.4982, + "step": 10938 + }, + { + "epoch": 1.7746593121349772, + "grad_norm": 0.5657622707481026, + "learning_rate": 1.8075595063535233e-06, + "loss": 0.5293, + "step": 10939 + }, + { + "epoch": 1.7748215444516546, + "grad_norm": 0.6030987249975098, + "learning_rate": 1.807149190849265e-06, + "loss": 0.4951, + "step": 10940 + }, + { + "epoch": 1.7749837767683323, + "grad_norm": 0.6491508918581114, + "learning_rate": 1.806738895560371e-06, + "loss": 0.5189, + "step": 10941 + }, + { + "epoch": 1.7751460090850097, + "grad_norm": 0.6125774130011307, + "learning_rate": 1.8063286204988112e-06, + "loss": 0.4904, + "step": 10942 + }, + { + "epoch": 1.7753082414016872, + "grad_norm": 0.613652896167686, + "learning_rate": 1.8059183656765571e-06, + "loss": 0.5206, + "step": 10943 + }, + { + "epoch": 1.7754704737183649, + "grad_norm": 0.5625020341871494, + "learning_rate": 1.805508131105579e-06, + "loss": 0.5226, + "step": 10944 + }, + { + "epoch": 1.7756327060350423, + "grad_norm": 0.6103851734369713, + "learning_rate": 1.805097916797845e-06, + "loss": 0.4801, + "step": 10945 + }, + { + "epoch": 1.7757949383517198, + "grad_norm": 0.6744725953662349, + "learning_rate": 1.8046877227653248e-06, + "loss": 0.4954, + "step": 10946 + }, + { + "epoch": 1.7759571706683972, + "grad_norm": 0.599972564268881, + "learning_rate": 1.8042775490199866e-06, + "loss": 0.4992, + "step": 10947 + }, + { + "epoch": 1.7761194029850746, + "grad_norm": 0.6416207513605626, + "learning_rate": 1.8038673955737983e-06, + "loss": 0.496, + "step": 10948 + }, + { + "epoch": 1.776281635301752, + "grad_norm": 0.6128284582445052, + "learning_rate": 1.8034572624387274e-06, + "loss": 0.5025, + "step": 10949 + }, + { + "epoch": 1.7764438676184295, + "grad_norm": 0.5974688457952403, + "learning_rate": 1.8030471496267393e-06, + "loss": 0.4837, + "step": 10950 + }, + { + "epoch": 1.776606099935107, + "grad_norm": 0.6021266562295851, + "learning_rate": 1.8026370571497996e-06, + "loss": 0.5422, + "step": 10951 + }, + { + "epoch": 1.7767683322517844, + "grad_norm": 0.6206626793138939, + "learning_rate": 1.802226985019875e-06, + "loss": 0.5352, + "step": 10952 + }, + { + "epoch": 1.7769305645684619, + "grad_norm": 0.6116061891824421, + "learning_rate": 1.8018169332489294e-06, + "loss": 0.547, + "step": 10953 + }, + { + "epoch": 1.7770927968851395, + "grad_norm": 0.5882472737703323, + "learning_rate": 1.8014069018489277e-06, + "loss": 0.4987, + "step": 10954 + }, + { + "epoch": 1.777255029201817, + "grad_norm": 0.5903643200107478, + "learning_rate": 1.800996890831832e-06, + "loss": 0.5273, + "step": 10955 + }, + { + "epoch": 1.7774172615184944, + "grad_norm": 0.5875754011702118, + "learning_rate": 1.800586900209606e-06, + "loss": 0.5273, + "step": 10956 + }, + { + "epoch": 1.777579493835172, + "grad_norm": 0.5927420721880696, + "learning_rate": 1.8001769299942123e-06, + "loss": 0.4944, + "step": 10957 + }, + { + "epoch": 1.7777417261518496, + "grad_norm": 0.5912804063312824, + "learning_rate": 1.799766980197612e-06, + "loss": 0.4824, + "step": 10958 + }, + { + "epoch": 1.777903958468527, + "grad_norm": 0.6368152500549402, + "learning_rate": 1.799357050831768e-06, + "loss": 0.5124, + "step": 10959 + }, + { + "epoch": 1.7780661907852044, + "grad_norm": 0.5968637397290457, + "learning_rate": 1.7989471419086381e-06, + "loss": 0.4717, + "step": 10960 + }, + { + "epoch": 1.778228423101882, + "grad_norm": 0.5971640038260734, + "learning_rate": 1.7985372534401841e-06, + "loss": 0.5449, + "step": 10961 + }, + { + "epoch": 1.7783906554185593, + "grad_norm": 0.5733823602426231, + "learning_rate": 1.7981273854383644e-06, + "loss": 0.5237, + "step": 10962 + }, + { + "epoch": 1.7785528877352368, + "grad_norm": 0.5789177183142545, + "learning_rate": 1.7977175379151386e-06, + "loss": 0.514, + "step": 10963 + }, + { + "epoch": 1.7787151200519142, + "grad_norm": 0.5760278257949538, + "learning_rate": 1.7973077108824639e-06, + "loss": 0.4751, + "step": 10964 + }, + { + "epoch": 1.7788773523685917, + "grad_norm": 0.6235976290445023, + "learning_rate": 1.7968979043522993e-06, + "loss": 0.5032, + "step": 10965 + }, + { + "epoch": 1.7790395846852693, + "grad_norm": 0.5799337525907139, + "learning_rate": 1.7964881183366007e-06, + "loss": 0.4894, + "step": 10966 + }, + { + "epoch": 1.7792018170019468, + "grad_norm": 0.5831380448745205, + "learning_rate": 1.7960783528473238e-06, + "loss": 0.5073, + "step": 10967 + }, + { + "epoch": 1.7793640493186242, + "grad_norm": 0.5990486078399915, + "learning_rate": 1.7956686078964257e-06, + "loss": 0.5099, + "step": 10968 + }, + { + "epoch": 1.779526281635302, + "grad_norm": 0.6034837418524818, + "learning_rate": 1.795258883495861e-06, + "loss": 0.523, + "step": 10969 + }, + { + "epoch": 1.7796885139519794, + "grad_norm": 0.6092072767540271, + "learning_rate": 1.794849179657585e-06, + "loss": 0.5423, + "step": 10970 + }, + { + "epoch": 1.7798507462686568, + "grad_norm": 0.6292284177715273, + "learning_rate": 1.7944394963935504e-06, + "loss": 0.467, + "step": 10971 + }, + { + "epoch": 1.7800129785853342, + "grad_norm": 0.6052670151798589, + "learning_rate": 1.7940298337157113e-06, + "loss": 0.5047, + "step": 10972 + }, + { + "epoch": 1.7801752109020117, + "grad_norm": 0.582784796208354, + "learning_rate": 1.7936201916360196e-06, + "loss": 0.5173, + "step": 10973 + }, + { + "epoch": 1.7803374432186891, + "grad_norm": 0.6038943655830192, + "learning_rate": 1.7932105701664287e-06, + "loss": 0.5099, + "step": 10974 + }, + { + "epoch": 1.7804996755353666, + "grad_norm": 0.6195038247197889, + "learning_rate": 1.7928009693188902e-06, + "loss": 0.5279, + "step": 10975 + }, + { + "epoch": 1.780661907852044, + "grad_norm": 0.5836355477894121, + "learning_rate": 1.7923913891053535e-06, + "loss": 0.5177, + "step": 10976 + }, + { + "epoch": 1.7808241401687215, + "grad_norm": 0.5895752600270637, + "learning_rate": 1.7919818295377706e-06, + "loss": 0.4792, + "step": 10977 + }, + { + "epoch": 1.780986372485399, + "grad_norm": 0.6278199344944372, + "learning_rate": 1.7915722906280897e-06, + "loss": 0.5316, + "step": 10978 + }, + { + "epoch": 1.7811486048020766, + "grad_norm": 0.6123276292899863, + "learning_rate": 1.7911627723882616e-06, + "loss": 0.5384, + "step": 10979 + }, + { + "epoch": 1.781310837118754, + "grad_norm": 0.6098637564731461, + "learning_rate": 1.7907532748302346e-06, + "loss": 0.4992, + "step": 10980 + }, + { + "epoch": 1.7814730694354315, + "grad_norm": 0.6100690260371114, + "learning_rate": 1.7903437979659549e-06, + "loss": 0.5074, + "step": 10981 + }, + { + "epoch": 1.7816353017521092, + "grad_norm": 0.6310240338799672, + "learning_rate": 1.7899343418073716e-06, + "loss": 0.5161, + "step": 10982 + }, + { + "epoch": 1.7817975340687866, + "grad_norm": 0.590405806692425, + "learning_rate": 1.7895249063664306e-06, + "loss": 0.517, + "step": 10983 + }, + { + "epoch": 1.781959766385464, + "grad_norm": 0.623222615991778, + "learning_rate": 1.7891154916550785e-06, + "loss": 0.5147, + "step": 10984 + }, + { + "epoch": 1.7821219987021415, + "grad_norm": 0.6352346575816948, + "learning_rate": 1.7887060976852614e-06, + "loss": 0.507, + "step": 10985 + }, + { + "epoch": 1.782284231018819, + "grad_norm": 0.6657727093291117, + "learning_rate": 1.7882967244689228e-06, + "loss": 0.5178, + "step": 10986 + }, + { + "epoch": 1.7824464633354964, + "grad_norm": 0.6255449579238159, + "learning_rate": 1.7878873720180073e-06, + "loss": 0.5381, + "step": 10987 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.621751277022663, + "learning_rate": 1.7874780403444595e-06, + "loss": 0.5237, + "step": 10988 + }, + { + "epoch": 1.7827709279688513, + "grad_norm": 0.5980891526246378, + "learning_rate": 1.7870687294602213e-06, + "loss": 0.4934, + "step": 10989 + }, + { + "epoch": 1.7829331602855287, + "grad_norm": 0.6240658103094322, + "learning_rate": 1.7866594393772375e-06, + "loss": 0.5091, + "step": 10990 + }, + { + "epoch": 1.7830953926022064, + "grad_norm": 0.5853898708494881, + "learning_rate": 1.7862501701074472e-06, + "loss": 0.501, + "step": 10991 + }, + { + "epoch": 1.7832576249188838, + "grad_norm": 0.5874854376839669, + "learning_rate": 1.7858409216627925e-06, + "loss": 0.5202, + "step": 10992 + }, + { + "epoch": 1.7834198572355613, + "grad_norm": 0.6131075940321385, + "learning_rate": 1.7854316940552152e-06, + "loss": 0.5231, + "step": 10993 + }, + { + "epoch": 1.783582089552239, + "grad_norm": 0.5957397963562814, + "learning_rate": 1.7850224872966538e-06, + "loss": 0.5013, + "step": 10994 + }, + { + "epoch": 1.7837443218689164, + "grad_norm": 0.5952020425339467, + "learning_rate": 1.7846133013990502e-06, + "loss": 0.4931, + "step": 10995 + }, + { + "epoch": 1.7839065541855939, + "grad_norm": 0.6130473831181076, + "learning_rate": 1.78420413637434e-06, + "loss": 0.5247, + "step": 10996 + }, + { + "epoch": 1.7840687865022713, + "grad_norm": 0.6259629281036544, + "learning_rate": 1.7837949922344638e-06, + "loss": 0.4979, + "step": 10997 + }, + { + "epoch": 1.7842310188189487, + "grad_norm": 0.6353030664670962, + "learning_rate": 1.783385868991358e-06, + "loss": 0.5233, + "step": 10998 + }, + { + "epoch": 1.7843932511356262, + "grad_norm": 0.6178240956911487, + "learning_rate": 1.7829767666569604e-06, + "loss": 0.502, + "step": 10999 + }, + { + "epoch": 1.7845554834523036, + "grad_norm": 0.5944297528897359, + "learning_rate": 1.7825676852432078e-06, + "loss": 0.5084, + "step": 11000 + }, + { + "epoch": 1.784717715768981, + "grad_norm": 0.6011846024632826, + "learning_rate": 1.7821586247620344e-06, + "loss": 0.5271, + "step": 11001 + }, + { + "epoch": 1.7848799480856585, + "grad_norm": 0.612087733600209, + "learning_rate": 1.7817495852253764e-06, + "loss": 0.4728, + "step": 11002 + }, + { + "epoch": 1.7850421804023362, + "grad_norm": 0.5911156296097055, + "learning_rate": 1.781340566645168e-06, + "loss": 0.5039, + "step": 11003 + }, + { + "epoch": 1.7852044127190136, + "grad_norm": 0.6274739924248601, + "learning_rate": 1.780931569033344e-06, + "loss": 0.4725, + "step": 11004 + }, + { + "epoch": 1.785366645035691, + "grad_norm": 0.6404020625531585, + "learning_rate": 1.7805225924018377e-06, + "loss": 0.5463, + "step": 11005 + }, + { + "epoch": 1.7855288773523685, + "grad_norm": 0.5579309662835362, + "learning_rate": 1.780113636762581e-06, + "loss": 0.5045, + "step": 11006 + }, + { + "epoch": 1.7856911096690462, + "grad_norm": 0.5924172248756788, + "learning_rate": 1.7797047021275056e-06, + "loss": 0.4832, + "step": 11007 + }, + { + "epoch": 1.7858533419857237, + "grad_norm": 0.5820342774188196, + "learning_rate": 1.7792957885085444e-06, + "loss": 0.4923, + "step": 11008 + }, + { + "epoch": 1.786015574302401, + "grad_norm": 0.581207563957015, + "learning_rate": 1.7788868959176275e-06, + "loss": 0.4708, + "step": 11009 + }, + { + "epoch": 1.7861778066190785, + "grad_norm": 0.6032864081960493, + "learning_rate": 1.7784780243666855e-06, + "loss": 0.5084, + "step": 11010 + }, + { + "epoch": 1.786340038935756, + "grad_norm": 0.5878109714528267, + "learning_rate": 1.7780691738676487e-06, + "loss": 0.5502, + "step": 11011 + }, + { + "epoch": 1.7865022712524334, + "grad_norm": 0.5960133850446713, + "learning_rate": 1.7776603444324445e-06, + "loss": 0.511, + "step": 11012 + }, + { + "epoch": 1.7866645035691109, + "grad_norm": 0.5857109798151682, + "learning_rate": 1.7772515360730025e-06, + "loss": 0.5268, + "step": 11013 + }, + { + "epoch": 1.7868267358857883, + "grad_norm": 0.5722419431916393, + "learning_rate": 1.7768427488012501e-06, + "loss": 0.5015, + "step": 11014 + }, + { + "epoch": 1.7869889682024658, + "grad_norm": 0.6077825411571247, + "learning_rate": 1.7764339826291156e-06, + "loss": 0.5081, + "step": 11015 + }, + { + "epoch": 1.7871512005191434, + "grad_norm": 0.6205599162302047, + "learning_rate": 1.7760252375685245e-06, + "loss": 0.4861, + "step": 11016 + }, + { + "epoch": 1.787313432835821, + "grad_norm": 0.5946829596321881, + "learning_rate": 1.7756165136314024e-06, + "loss": 0.504, + "step": 11017 + }, + { + "epoch": 1.7874756651524983, + "grad_norm": 0.5837160561365655, + "learning_rate": 1.7752078108296759e-06, + "loss": 0.527, + "step": 11018 + }, + { + "epoch": 1.787637897469176, + "grad_norm": 0.580127014012292, + "learning_rate": 1.7747991291752692e-06, + "loss": 0.5253, + "step": 11019 + }, + { + "epoch": 1.7878001297858535, + "grad_norm": 0.5818966104565649, + "learning_rate": 1.7743904686801055e-06, + "loss": 0.4849, + "step": 11020 + }, + { + "epoch": 1.787962362102531, + "grad_norm": 0.602088856717632, + "learning_rate": 1.7739818293561106e-06, + "loss": 0.5411, + "step": 11021 + }, + { + "epoch": 1.7881245944192083, + "grad_norm": 0.6030487871302387, + "learning_rate": 1.7735732112152054e-06, + "loss": 0.5287, + "step": 11022 + }, + { + "epoch": 1.7882868267358858, + "grad_norm": 0.6123905393806276, + "learning_rate": 1.7731646142693124e-06, + "loss": 0.5038, + "step": 11023 + }, + { + "epoch": 1.7884490590525632, + "grad_norm": 0.5877786368230958, + "learning_rate": 1.7727560385303542e-06, + "loss": 0.533, + "step": 11024 + }, + { + "epoch": 1.7886112913692407, + "grad_norm": 0.5866017551699796, + "learning_rate": 1.7723474840102506e-06, + "loss": 0.5151, + "step": 11025 + }, + { + "epoch": 1.7887735236859181, + "grad_norm": 0.59987766397332, + "learning_rate": 1.771938950720924e-06, + "loss": 0.5261, + "step": 11026 + }, + { + "epoch": 1.7889357560025956, + "grad_norm": 0.5807165930766892, + "learning_rate": 1.771530438674292e-06, + "loss": 0.519, + "step": 11027 + }, + { + "epoch": 1.7890979883192732, + "grad_norm": 0.5550258722408988, + "learning_rate": 1.771121947882275e-06, + "loss": 0.5211, + "step": 11028 + }, + { + "epoch": 1.7892602206359507, + "grad_norm": 0.6394208762374486, + "learning_rate": 1.7707134783567915e-06, + "loss": 0.5162, + "step": 11029 + }, + { + "epoch": 1.7894224529526281, + "grad_norm": 0.6177067702772288, + "learning_rate": 1.770305030109759e-06, + "loss": 0.4943, + "step": 11030 + }, + { + "epoch": 1.7895846852693058, + "grad_norm": 0.5828629999945039, + "learning_rate": 1.7698966031530955e-06, + "loss": 0.5155, + "step": 11031 + }, + { + "epoch": 1.7897469175859833, + "grad_norm": 0.6165694519765577, + "learning_rate": 1.7694881974987165e-06, + "loss": 0.494, + "step": 11032 + }, + { + "epoch": 1.7899091499026607, + "grad_norm": 0.6134454135905361, + "learning_rate": 1.7690798131585397e-06, + "loss": 0.4743, + "step": 11033 + }, + { + "epoch": 1.7900713822193381, + "grad_norm": 0.583103634864894, + "learning_rate": 1.7686714501444791e-06, + "loss": 0.4884, + "step": 11034 + }, + { + "epoch": 1.7902336145360156, + "grad_norm": 0.5982274444177645, + "learning_rate": 1.7682631084684504e-06, + "loss": 0.5131, + "step": 11035 + }, + { + "epoch": 1.790395846852693, + "grad_norm": 0.5805336251870821, + "learning_rate": 1.7678547881423686e-06, + "loss": 0.4866, + "step": 11036 + }, + { + "epoch": 1.7905580791693705, + "grad_norm": 0.6051404244571709, + "learning_rate": 1.7674464891781448e-06, + "loss": 0.516, + "step": 11037 + }, + { + "epoch": 1.790720311486048, + "grad_norm": 0.628928358562578, + "learning_rate": 1.7670382115876941e-06, + "loss": 0.5074, + "step": 11038 + }, + { + "epoch": 1.7908825438027254, + "grad_norm": 0.5946274193625386, + "learning_rate": 1.7666299553829278e-06, + "loss": 0.5331, + "step": 11039 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.616420300578228, + "learning_rate": 1.7662217205757587e-06, + "loss": 0.5381, + "step": 11040 + }, + { + "epoch": 1.7912070084360805, + "grad_norm": 0.6184864862079295, + "learning_rate": 1.7658135071780974e-06, + "loss": 0.5482, + "step": 11041 + }, + { + "epoch": 1.791369240752758, + "grad_norm": 0.6171157396031955, + "learning_rate": 1.7654053152018539e-06, + "loss": 0.5255, + "step": 11042 + }, + { + "epoch": 1.7915314730694354, + "grad_norm": 0.5872181584063348, + "learning_rate": 1.764997144658938e-06, + "loss": 0.5219, + "step": 11043 + }, + { + "epoch": 1.791693705386113, + "grad_norm": 0.6133935967231211, + "learning_rate": 1.7645889955612595e-06, + "loss": 0.5423, + "step": 11044 + }, + { + "epoch": 1.7918559377027905, + "grad_norm": 0.5689966081019721, + "learning_rate": 1.7641808679207262e-06, + "loss": 0.5241, + "step": 11045 + }, + { + "epoch": 1.792018170019468, + "grad_norm": 0.615509564890001, + "learning_rate": 1.763772761749248e-06, + "loss": 0.5219, + "step": 11046 + }, + { + "epoch": 1.7921804023361454, + "grad_norm": 0.5948453108693145, + "learning_rate": 1.7633646770587303e-06, + "loss": 0.5254, + "step": 11047 + }, + { + "epoch": 1.7923426346528228, + "grad_norm": 0.5939933274565036, + "learning_rate": 1.7629566138610799e-06, + "loss": 0.4591, + "step": 11048 + }, + { + "epoch": 1.7925048669695003, + "grad_norm": 0.5874593182006932, + "learning_rate": 1.7625485721682039e-06, + "loss": 0.5266, + "step": 11049 + }, + { + "epoch": 1.7926670992861777, + "grad_norm": 0.6052892293179857, + "learning_rate": 1.7621405519920072e-06, + "loss": 0.5266, + "step": 11050 + }, + { + "epoch": 1.7928293316028552, + "grad_norm": 0.5991249929979743, + "learning_rate": 1.7617325533443952e-06, + "loss": 0.5252, + "step": 11051 + }, + { + "epoch": 1.7929915639195326, + "grad_norm": 0.602196849913055, + "learning_rate": 1.761324576237271e-06, + "loss": 0.5128, + "step": 11052 + }, + { + "epoch": 1.7931537962362103, + "grad_norm": 0.6426818756425792, + "learning_rate": 1.7609166206825392e-06, + "loss": 0.5017, + "step": 11053 + }, + { + "epoch": 1.7933160285528877, + "grad_norm": 0.6047835847282075, + "learning_rate": 1.7605086866921018e-06, + "loss": 0.4957, + "step": 11054 + }, + { + "epoch": 1.7934782608695652, + "grad_norm": 0.6151973083124515, + "learning_rate": 1.7601007742778623e-06, + "loss": 0.493, + "step": 11055 + }, + { + "epoch": 1.7936404931862429, + "grad_norm": 0.5911655435914078, + "learning_rate": 1.759692883451721e-06, + "loss": 0.5192, + "step": 11056 + }, + { + "epoch": 1.7938027255029203, + "grad_norm": 0.5735134318742189, + "learning_rate": 1.759285014225581e-06, + "loss": 0.5048, + "step": 11057 + }, + { + "epoch": 1.7939649578195977, + "grad_norm": 0.6022697706522722, + "learning_rate": 1.7588771666113407e-06, + "loss": 0.5266, + "step": 11058 + }, + { + "epoch": 1.7941271901362752, + "grad_norm": 0.6145568859152917, + "learning_rate": 1.7584693406209005e-06, + "loss": 0.5219, + "step": 11059 + }, + { + "epoch": 1.7942894224529526, + "grad_norm": 0.5861046871633291, + "learning_rate": 1.7580615362661602e-06, + "loss": 0.5349, + "step": 11060 + }, + { + "epoch": 1.79445165476963, + "grad_norm": 0.5996038732863288, + "learning_rate": 1.7576537535590172e-06, + "loss": 0.5221, + "step": 11061 + }, + { + "epoch": 1.7946138870863075, + "grad_norm": 0.5770635484597181, + "learning_rate": 1.757245992511371e-06, + "loss": 0.5154, + "step": 11062 + }, + { + "epoch": 1.794776119402985, + "grad_norm": 0.6179409011916642, + "learning_rate": 1.756838253135118e-06, + "loss": 0.5258, + "step": 11063 + }, + { + "epoch": 1.7949383517196624, + "grad_norm": 0.6190145844272904, + "learning_rate": 1.7564305354421545e-06, + "loss": 0.4888, + "step": 11064 + }, + { + "epoch": 1.7951005840363399, + "grad_norm": 0.5843157106120602, + "learning_rate": 1.7560228394443763e-06, + "loss": 0.4921, + "step": 11065 + }, + { + "epoch": 1.7952628163530175, + "grad_norm": 0.6268036202658458, + "learning_rate": 1.7556151651536798e-06, + "loss": 0.5139, + "step": 11066 + }, + { + "epoch": 1.795425048669695, + "grad_norm": 0.588240857500453, + "learning_rate": 1.7552075125819598e-06, + "loss": 0.5017, + "step": 11067 + }, + { + "epoch": 1.7955872809863724, + "grad_norm": 0.6043295000209041, + "learning_rate": 1.7547998817411087e-06, + "loss": 0.5235, + "step": 11068 + }, + { + "epoch": 1.79574951330305, + "grad_norm": 0.5843258044443411, + "learning_rate": 1.7543922726430221e-06, + "loss": 0.5426, + "step": 11069 + }, + { + "epoch": 1.7959117456197276, + "grad_norm": 0.5933953367972628, + "learning_rate": 1.7539846852995912e-06, + "loss": 0.5458, + "step": 11070 + }, + { + "epoch": 1.796073977936405, + "grad_norm": 0.6426036674706044, + "learning_rate": 1.7535771197227092e-06, + "loss": 0.524, + "step": 11071 + }, + { + "epoch": 1.7962362102530824, + "grad_norm": 0.6202766361900851, + "learning_rate": 1.7531695759242678e-06, + "loss": 0.5248, + "step": 11072 + }, + { + "epoch": 1.79639844256976, + "grad_norm": 0.6078575004975716, + "learning_rate": 1.7527620539161568e-06, + "loss": 0.5054, + "step": 11073 + }, + { + "epoch": 1.7965606748864373, + "grad_norm": 0.6144000747124952, + "learning_rate": 1.7523545537102676e-06, + "loss": 0.4953, + "step": 11074 + }, + { + "epoch": 1.7967229072031148, + "grad_norm": 0.6221630219663774, + "learning_rate": 1.7519470753184891e-06, + "loss": 0.5082, + "step": 11075 + }, + { + "epoch": 1.7968851395197922, + "grad_norm": 0.642928691695931, + "learning_rate": 1.7515396187527112e-06, + "loss": 0.5067, + "step": 11076 + }, + { + "epoch": 1.7970473718364697, + "grad_norm": 0.5916186826242951, + "learning_rate": 1.7511321840248222e-06, + "loss": 0.5189, + "step": 11077 + }, + { + "epoch": 1.7972096041531473, + "grad_norm": 0.596386362339387, + "learning_rate": 1.750724771146709e-06, + "loss": 0.5153, + "step": 11078 + }, + { + "epoch": 1.7973718364698248, + "grad_norm": 0.6042787869972731, + "learning_rate": 1.7503173801302586e-06, + "loss": 0.5034, + "step": 11079 + }, + { + "epoch": 1.7975340687865022, + "grad_norm": 0.5814485044400937, + "learning_rate": 1.7499100109873584e-06, + "loss": 0.4878, + "step": 11080 + }, + { + "epoch": 1.79769630110318, + "grad_norm": 0.6097391020860325, + "learning_rate": 1.749502663729894e-06, + "loss": 0.5454, + "step": 11081 + }, + { + "epoch": 1.7978585334198574, + "grad_norm": 0.6181747952586483, + "learning_rate": 1.749095338369751e-06, + "loss": 0.5318, + "step": 11082 + }, + { + "epoch": 1.7980207657365348, + "grad_norm": 0.575697108138682, + "learning_rate": 1.748688034918813e-06, + "loss": 0.4994, + "step": 11083 + }, + { + "epoch": 1.7981829980532122, + "grad_norm": 0.5928912832425384, + "learning_rate": 1.748280753388964e-06, + "loss": 0.5132, + "step": 11084 + }, + { + "epoch": 1.7983452303698897, + "grad_norm": 0.6057538812838732, + "learning_rate": 1.7478734937920882e-06, + "loss": 0.5166, + "step": 11085 + }, + { + "epoch": 1.7985074626865671, + "grad_norm": 0.593447632923153, + "learning_rate": 1.747466256140067e-06, + "loss": 0.5257, + "step": 11086 + }, + { + "epoch": 1.7986696950032446, + "grad_norm": 0.6301871757899234, + "learning_rate": 1.747059040444785e-06, + "loss": 0.5056, + "step": 11087 + }, + { + "epoch": 1.798831927319922, + "grad_norm": 0.56481512597528, + "learning_rate": 1.7466518467181198e-06, + "loss": 0.5167, + "step": 11088 + }, + { + "epoch": 1.7989941596365995, + "grad_norm": 0.5698047828504149, + "learning_rate": 1.7462446749719548e-06, + "loss": 0.5143, + "step": 11089 + }, + { + "epoch": 1.7991563919532771, + "grad_norm": 0.6036424732012469, + "learning_rate": 1.7458375252181687e-06, + "loss": 0.5229, + "step": 11090 + }, + { + "epoch": 1.7993186242699546, + "grad_norm": 0.5887097798611786, + "learning_rate": 1.745430397468642e-06, + "loss": 0.5268, + "step": 11091 + }, + { + "epoch": 1.799480856586632, + "grad_norm": 0.5986847101337562, + "learning_rate": 1.745023291735254e-06, + "loss": 0.5002, + "step": 11092 + }, + { + "epoch": 1.7996430889033095, + "grad_norm": 0.6288856633684972, + "learning_rate": 1.74461620802988e-06, + "loss": 0.5029, + "step": 11093 + }, + { + "epoch": 1.7998053212199872, + "grad_norm": 0.5866991400844457, + "learning_rate": 1.7442091463644007e-06, + "loss": 0.5016, + "step": 11094 + }, + { + "epoch": 1.7999675535366646, + "grad_norm": 0.5837098973890287, + "learning_rate": 1.7438021067506912e-06, + "loss": 0.5103, + "step": 11095 + }, + { + "epoch": 1.800129785853342, + "grad_norm": 0.5942415370857544, + "learning_rate": 1.743395089200628e-06, + "loss": 0.5379, + "step": 11096 + }, + { + "epoch": 1.8002920181700195, + "grad_norm": 0.628872109091631, + "learning_rate": 1.7429880937260878e-06, + "loss": 0.4895, + "step": 11097 + }, + { + "epoch": 1.800454250486697, + "grad_norm": 0.600472404559577, + "learning_rate": 1.7425811203389442e-06, + "loss": 0.5047, + "step": 11098 + }, + { + "epoch": 1.8006164828033744, + "grad_norm": 0.6061714306778704, + "learning_rate": 1.7421741690510712e-06, + "loss": 0.4943, + "step": 11099 + }, + { + "epoch": 1.8007787151200518, + "grad_norm": 0.5939910762296071, + "learning_rate": 1.741767239874344e-06, + "loss": 0.5327, + "step": 11100 + }, + { + "epoch": 1.8009409474367293, + "grad_norm": 0.5971749183544101, + "learning_rate": 1.7413603328206342e-06, + "loss": 0.4933, + "step": 11101 + }, + { + "epoch": 1.8011031797534067, + "grad_norm": 0.6344839527844731, + "learning_rate": 1.7409534479018158e-06, + "loss": 0.5089, + "step": 11102 + }, + { + "epoch": 1.8012654120700844, + "grad_norm": 0.5763338490292421, + "learning_rate": 1.7405465851297587e-06, + "loss": 0.4801, + "step": 11103 + }, + { + "epoch": 1.8014276443867618, + "grad_norm": 0.6357565787827292, + "learning_rate": 1.7401397445163342e-06, + "loss": 0.5288, + "step": 11104 + }, + { + "epoch": 1.8015898767034393, + "grad_norm": 0.628270897447266, + "learning_rate": 1.739732926073414e-06, + "loss": 0.5226, + "step": 11105 + }, + { + "epoch": 1.801752109020117, + "grad_norm": 0.5979809348244901, + "learning_rate": 1.7393261298128666e-06, + "loss": 0.5237, + "step": 11106 + }, + { + "epoch": 1.8019143413367944, + "grad_norm": 0.5942228218531197, + "learning_rate": 1.7389193557465621e-06, + "loss": 0.5487, + "step": 11107 + }, + { + "epoch": 1.8020765736534718, + "grad_norm": 0.5941678810585316, + "learning_rate": 1.7385126038863692e-06, + "loss": 0.5097, + "step": 11108 + }, + { + "epoch": 1.8022388059701493, + "grad_norm": 0.6060538656509276, + "learning_rate": 1.7381058742441537e-06, + "loss": 0.5469, + "step": 11109 + }, + { + "epoch": 1.8024010382868267, + "grad_norm": 0.5860316478455309, + "learning_rate": 1.737699166831785e-06, + "loss": 0.5244, + "step": 11110 + }, + { + "epoch": 1.8025632706035042, + "grad_norm": 0.6221327863593114, + "learning_rate": 1.7372924816611283e-06, + "loss": 0.5074, + "step": 11111 + }, + { + "epoch": 1.8027255029201816, + "grad_norm": 0.6254061028987477, + "learning_rate": 1.7368858187440497e-06, + "loss": 0.517, + "step": 11112 + }, + { + "epoch": 1.802887735236859, + "grad_norm": 0.588981430046556, + "learning_rate": 1.7364791780924158e-06, + "loss": 0.4778, + "step": 11113 + }, + { + "epoch": 1.8030499675535365, + "grad_norm": 0.6034031106917029, + "learning_rate": 1.7360725597180896e-06, + "loss": 0.5007, + "step": 11114 + }, + { + "epoch": 1.8032121998702142, + "grad_norm": 0.627123651486411, + "learning_rate": 1.7356659636329348e-06, + "loss": 0.5008, + "step": 11115 + }, + { + "epoch": 1.8033744321868916, + "grad_norm": 0.6183907459752218, + "learning_rate": 1.7352593898488163e-06, + "loss": 0.516, + "step": 11116 + }, + { + "epoch": 1.803536664503569, + "grad_norm": 0.5926744319991795, + "learning_rate": 1.7348528383775951e-06, + "loss": 0.5047, + "step": 11117 + }, + { + "epoch": 1.8036988968202468, + "grad_norm": 0.6323817790325441, + "learning_rate": 1.7344463092311348e-06, + "loss": 0.514, + "step": 11118 + }, + { + "epoch": 1.8038611291369242, + "grad_norm": 0.6183876745218845, + "learning_rate": 1.7340398024212956e-06, + "loss": 0.5075, + "step": 11119 + }, + { + "epoch": 1.8040233614536016, + "grad_norm": 0.5901695268188616, + "learning_rate": 1.7336333179599378e-06, + "loss": 0.5374, + "step": 11120 + }, + { + "epoch": 1.804185593770279, + "grad_norm": 0.5922605670068921, + "learning_rate": 1.7332268558589226e-06, + "loss": 0.4955, + "step": 11121 + }, + { + "epoch": 1.8043478260869565, + "grad_norm": 0.608481187894606, + "learning_rate": 1.7328204161301084e-06, + "loss": 0.5077, + "step": 11122 + }, + { + "epoch": 1.804510058403634, + "grad_norm": 0.6025337064388411, + "learning_rate": 1.7324139987853561e-06, + "loss": 0.4801, + "step": 11123 + }, + { + "epoch": 1.8046722907203114, + "grad_norm": 0.5931073976146954, + "learning_rate": 1.7320076038365203e-06, + "loss": 0.5149, + "step": 11124 + }, + { + "epoch": 1.8048345230369889, + "grad_norm": 0.6285545060819151, + "learning_rate": 1.7316012312954605e-06, + "loss": 0.5161, + "step": 11125 + }, + { + "epoch": 1.8049967553536663, + "grad_norm": 0.6207806957732485, + "learning_rate": 1.7311948811740329e-06, + "loss": 0.4895, + "step": 11126 + }, + { + "epoch": 1.8051589876703438, + "grad_norm": 0.6560207910800305, + "learning_rate": 1.730788553484094e-06, + "loss": 0.4975, + "step": 11127 + }, + { + "epoch": 1.8053212199870214, + "grad_norm": 0.6244627376466064, + "learning_rate": 1.7303822482375e-06, + "loss": 0.5287, + "step": 11128 + }, + { + "epoch": 1.8054834523036989, + "grad_norm": 0.6045976813924042, + "learning_rate": 1.7299759654461038e-06, + "loss": 0.5247, + "step": 11129 + }, + { + "epoch": 1.8056456846203763, + "grad_norm": 0.5989622349982814, + "learning_rate": 1.7295697051217608e-06, + "loss": 0.5388, + "step": 11130 + }, + { + "epoch": 1.805807916937054, + "grad_norm": 0.5835919515474807, + "learning_rate": 1.7291634672763238e-06, + "loss": 0.5287, + "step": 11131 + }, + { + "epoch": 1.8059701492537314, + "grad_norm": 0.6465765483123275, + "learning_rate": 1.7287572519216467e-06, + "loss": 0.5324, + "step": 11132 + }, + { + "epoch": 1.806132381570409, + "grad_norm": 0.5900563137600932, + "learning_rate": 1.7283510590695813e-06, + "loss": 0.5079, + "step": 11133 + }, + { + "epoch": 1.8062946138870863, + "grad_norm": 0.6366288995517975, + "learning_rate": 1.7279448887319786e-06, + "loss": 0.5089, + "step": 11134 + }, + { + "epoch": 1.8064568462037638, + "grad_norm": 0.6008672979514482, + "learning_rate": 1.727538740920689e-06, + "loss": 0.5025, + "step": 11135 + }, + { + "epoch": 1.8066190785204412, + "grad_norm": 0.6055968467377479, + "learning_rate": 1.7271326156475643e-06, + "loss": 0.526, + "step": 11136 + }, + { + "epoch": 1.8067813108371187, + "grad_norm": 0.5926378048126744, + "learning_rate": 1.7267265129244523e-06, + "loss": 0.5175, + "step": 11137 + }, + { + "epoch": 1.8069435431537961, + "grad_norm": 0.6001224100744703, + "learning_rate": 1.7263204327632039e-06, + "loss": 0.4823, + "step": 11138 + }, + { + "epoch": 1.8071057754704736, + "grad_norm": 0.6384244675432909, + "learning_rate": 1.7259143751756658e-06, + "loss": 0.4588, + "step": 11139 + }, + { + "epoch": 1.8072680077871512, + "grad_norm": 0.6212163061805346, + "learning_rate": 1.7255083401736856e-06, + "loss": 0.5023, + "step": 11140 + }, + { + "epoch": 1.8074302401038287, + "grad_norm": 0.6136454658746009, + "learning_rate": 1.7251023277691109e-06, + "loss": 0.4898, + "step": 11141 + }, + { + "epoch": 1.8075924724205061, + "grad_norm": 0.6134882159521644, + "learning_rate": 1.724696337973787e-06, + "loss": 0.5059, + "step": 11142 + }, + { + "epoch": 1.8077547047371838, + "grad_norm": 0.605713268314913, + "learning_rate": 1.7242903707995614e-06, + "loss": 0.4969, + "step": 11143 + }, + { + "epoch": 1.8079169370538613, + "grad_norm": 0.5961817028299448, + "learning_rate": 1.723884426258277e-06, + "loss": 0.4964, + "step": 11144 + }, + { + "epoch": 1.8080791693705387, + "grad_norm": 0.5920640923350378, + "learning_rate": 1.7234785043617795e-06, + "loss": 0.5053, + "step": 11145 + }, + { + "epoch": 1.8082414016872161, + "grad_norm": 0.5968347598420045, + "learning_rate": 1.723072605121911e-06, + "loss": 0.5351, + "step": 11146 + }, + { + "epoch": 1.8084036340038936, + "grad_norm": 0.6205691384365232, + "learning_rate": 1.7226667285505158e-06, + "loss": 0.4943, + "step": 11147 + }, + { + "epoch": 1.808565866320571, + "grad_norm": 0.5833929438690924, + "learning_rate": 1.7222608746594363e-06, + "loss": 0.4997, + "step": 11148 + }, + { + "epoch": 1.8087280986372485, + "grad_norm": 0.6060645315702464, + "learning_rate": 1.7218550434605124e-06, + "loss": 0.4764, + "step": 11149 + }, + { + "epoch": 1.808890330953926, + "grad_norm": 0.5941852044953188, + "learning_rate": 1.721449234965587e-06, + "loss": 0.5151, + "step": 11150 + }, + { + "epoch": 1.8090525632706034, + "grad_norm": 0.6030465770788077, + "learning_rate": 1.7210434491864992e-06, + "loss": 0.4772, + "step": 11151 + }, + { + "epoch": 1.8092147955872808, + "grad_norm": 0.6027826381656849, + "learning_rate": 1.7206376861350893e-06, + "loss": 0.5118, + "step": 11152 + }, + { + "epoch": 1.8093770279039585, + "grad_norm": 0.6321120358864417, + "learning_rate": 1.7202319458231955e-06, + "loss": 0.4833, + "step": 11153 + }, + { + "epoch": 1.809539260220636, + "grad_norm": 0.6106278897527704, + "learning_rate": 1.719826228262658e-06, + "loss": 0.5154, + "step": 11154 + }, + { + "epoch": 1.8097014925373134, + "grad_norm": 0.6066815166802926, + "learning_rate": 1.7194205334653125e-06, + "loss": 0.5222, + "step": 11155 + }, + { + "epoch": 1.809863724853991, + "grad_norm": 0.6050275290684359, + "learning_rate": 1.719014861442996e-06, + "loss": 0.5381, + "step": 11156 + }, + { + "epoch": 1.8100259571706685, + "grad_norm": 0.6145050102442621, + "learning_rate": 1.718609212207546e-06, + "loss": 0.5336, + "step": 11157 + }, + { + "epoch": 1.810188189487346, + "grad_norm": 0.6123898181491286, + "learning_rate": 1.718203585770798e-06, + "loss": 0.5328, + "step": 11158 + }, + { + "epoch": 1.8103504218040234, + "grad_norm": 0.590322769440609, + "learning_rate": 1.7177979821445868e-06, + "loss": 0.5192, + "step": 11159 + }, + { + "epoch": 1.8105126541207008, + "grad_norm": 0.6118087233635592, + "learning_rate": 1.7173924013407454e-06, + "loss": 0.5254, + "step": 11160 + }, + { + "epoch": 1.8106748864373783, + "grad_norm": 0.572283203118896, + "learning_rate": 1.716986843371109e-06, + "loss": 0.5073, + "step": 11161 + }, + { + "epoch": 1.8108371187540557, + "grad_norm": 0.5941527938283785, + "learning_rate": 1.7165813082475102e-06, + "loss": 0.5065, + "step": 11162 + }, + { + "epoch": 1.8109993510707332, + "grad_norm": 0.6126672676781456, + "learning_rate": 1.7161757959817814e-06, + "loss": 0.5066, + "step": 11163 + }, + { + "epoch": 1.8111615833874106, + "grad_norm": 0.6173483804357253, + "learning_rate": 1.715770306585755e-06, + "loss": 0.5049, + "step": 11164 + }, + { + "epoch": 1.8113238157040883, + "grad_norm": 0.6182483332572749, + "learning_rate": 1.7153648400712599e-06, + "loss": 0.5155, + "step": 11165 + }, + { + "epoch": 1.8114860480207657, + "grad_norm": 0.5905810303506106, + "learning_rate": 1.7149593964501285e-06, + "loss": 0.5185, + "step": 11166 + }, + { + "epoch": 1.8116482803374432, + "grad_norm": 0.6273187519273199, + "learning_rate": 1.714553975734189e-06, + "loss": 0.5362, + "step": 11167 + }, + { + "epoch": 1.8118105126541209, + "grad_norm": 0.6003627747118141, + "learning_rate": 1.7141485779352715e-06, + "loss": 0.5256, + "step": 11168 + }, + { + "epoch": 1.8119727449707983, + "grad_norm": 0.6247792172645841, + "learning_rate": 1.7137432030652045e-06, + "loss": 0.4999, + "step": 11169 + }, + { + "epoch": 1.8121349772874757, + "grad_norm": 0.6007151847290756, + "learning_rate": 1.7133378511358145e-06, + "loss": 0.5299, + "step": 11170 + }, + { + "epoch": 1.8122972096041532, + "grad_norm": 0.6165807402307732, + "learning_rate": 1.7129325221589288e-06, + "loss": 0.5179, + "step": 11171 + }, + { + "epoch": 1.8124594419208306, + "grad_norm": 0.5886070119037786, + "learning_rate": 1.712527216146374e-06, + "loss": 0.5186, + "step": 11172 + }, + { + "epoch": 1.812621674237508, + "grad_norm": 0.6046586501413009, + "learning_rate": 1.7121219331099755e-06, + "loss": 0.5477, + "step": 11173 + }, + { + "epoch": 1.8127839065541855, + "grad_norm": 0.5786364296118927, + "learning_rate": 1.7117166730615597e-06, + "loss": 0.5131, + "step": 11174 + }, + { + "epoch": 1.812946138870863, + "grad_norm": 0.6182578119993868, + "learning_rate": 1.711311436012949e-06, + "loss": 0.5052, + "step": 11175 + }, + { + "epoch": 1.8131083711875404, + "grad_norm": 0.6379195985513609, + "learning_rate": 1.710906221975967e-06, + "loss": 0.4828, + "step": 11176 + }, + { + "epoch": 1.813270603504218, + "grad_norm": 0.5820582866353865, + "learning_rate": 1.7105010309624381e-06, + "loss": 0.5075, + "step": 11177 + }, + { + "epoch": 1.8134328358208955, + "grad_norm": 0.6158553863063465, + "learning_rate": 1.710095862984183e-06, + "loss": 0.5041, + "step": 11178 + }, + { + "epoch": 1.813595068137573, + "grad_norm": 0.5991358585314229, + "learning_rate": 1.7096907180530259e-06, + "loss": 0.5362, + "step": 11179 + }, + { + "epoch": 1.8137573004542504, + "grad_norm": 0.6038047065044063, + "learning_rate": 1.7092855961807847e-06, + "loss": 0.5294, + "step": 11180 + }, + { + "epoch": 1.813919532770928, + "grad_norm": 0.594270682120592, + "learning_rate": 1.7088804973792816e-06, + "loss": 0.5051, + "step": 11181 + }, + { + "epoch": 1.8140817650876055, + "grad_norm": 0.5827961987778375, + "learning_rate": 1.7084754216603347e-06, + "loss": 0.4811, + "step": 11182 + }, + { + "epoch": 1.814243997404283, + "grad_norm": 0.6287934156973071, + "learning_rate": 1.7080703690357645e-06, + "loss": 0.5034, + "step": 11183 + }, + { + "epoch": 1.8144062297209604, + "grad_norm": 0.5722094195227877, + "learning_rate": 1.7076653395173892e-06, + "loss": 0.4961, + "step": 11184 + }, + { + "epoch": 1.8145684620376379, + "grad_norm": 0.6039465970179346, + "learning_rate": 1.7072603331170245e-06, + "loss": 0.5112, + "step": 11185 + }, + { + "epoch": 1.8147306943543153, + "grad_norm": 0.6324733144472033, + "learning_rate": 1.7068553498464894e-06, + "loss": 0.5152, + "step": 11186 + }, + { + "epoch": 1.8148929266709928, + "grad_norm": 0.599648499322222, + "learning_rate": 1.7064503897175985e-06, + "loss": 0.5207, + "step": 11187 + }, + { + "epoch": 1.8150551589876702, + "grad_norm": 0.6108017545871612, + "learning_rate": 1.7060454527421688e-06, + "loss": 0.5381, + "step": 11188 + }, + { + "epoch": 1.8152173913043477, + "grad_norm": 0.6372074610185627, + "learning_rate": 1.7056405389320152e-06, + "loss": 0.5422, + "step": 11189 + }, + { + "epoch": 1.8153796236210253, + "grad_norm": 0.6178381078258051, + "learning_rate": 1.7052356482989498e-06, + "loss": 0.5072, + "step": 11190 + }, + { + "epoch": 1.8155418559377028, + "grad_norm": 0.5814019867386174, + "learning_rate": 1.7048307808547884e-06, + "loss": 0.5248, + "step": 11191 + }, + { + "epoch": 1.8157040882543802, + "grad_norm": 0.6228834908394796, + "learning_rate": 1.704425936611343e-06, + "loss": 0.5002, + "step": 11192 + }, + { + "epoch": 1.815866320571058, + "grad_norm": 0.5752332294940247, + "learning_rate": 1.704021115580425e-06, + "loss": 0.5528, + "step": 11193 + }, + { + "epoch": 1.8160285528877353, + "grad_norm": 0.6143529764448323, + "learning_rate": 1.703616317773848e-06, + "loss": 0.5386, + "step": 11194 + }, + { + "epoch": 1.8161907852044128, + "grad_norm": 0.6020890614869799, + "learning_rate": 1.7032115432034208e-06, + "loss": 0.5058, + "step": 11195 + }, + { + "epoch": 1.8163530175210902, + "grad_norm": 0.6561040355214687, + "learning_rate": 1.7028067918809539e-06, + "loss": 0.527, + "step": 11196 + }, + { + "epoch": 1.8165152498377677, + "grad_norm": 0.6067300066635614, + "learning_rate": 1.7024020638182575e-06, + "loss": 0.5024, + "step": 11197 + }, + { + "epoch": 1.8166774821544451, + "grad_norm": 0.6326441668680506, + "learning_rate": 1.7019973590271394e-06, + "loss": 0.5238, + "step": 11198 + }, + { + "epoch": 1.8168397144711226, + "grad_norm": 0.6027793404712257, + "learning_rate": 1.7015926775194088e-06, + "loss": 0.4814, + "step": 11199 + }, + { + "epoch": 1.8170019467878, + "grad_norm": 0.5980444358452348, + "learning_rate": 1.7011880193068732e-06, + "loss": 0.5154, + "step": 11200 + }, + { + "epoch": 1.8171641791044775, + "grad_norm": 0.6143475702916038, + "learning_rate": 1.7007833844013377e-06, + "loss": 0.487, + "step": 11201 + }, + { + "epoch": 1.8173264114211551, + "grad_norm": 0.5839468609300719, + "learning_rate": 1.70037877281461e-06, + "loss": 0.5146, + "step": 11202 + }, + { + "epoch": 1.8174886437378326, + "grad_norm": 0.6129068972969337, + "learning_rate": 1.699974184558495e-06, + "loss": 0.5171, + "step": 11203 + }, + { + "epoch": 1.81765087605451, + "grad_norm": 0.6325967821348035, + "learning_rate": 1.6995696196447964e-06, + "loss": 0.4941, + "step": 11204 + }, + { + "epoch": 1.8178131083711877, + "grad_norm": 0.6201779752667944, + "learning_rate": 1.6991650780853205e-06, + "loss": 0.4883, + "step": 11205 + }, + { + "epoch": 1.8179753406878651, + "grad_norm": 0.5811247978512775, + "learning_rate": 1.6987605598918689e-06, + "loss": 0.4696, + "step": 11206 + }, + { + "epoch": 1.8181375730045426, + "grad_norm": 0.6091579814709643, + "learning_rate": 1.6983560650762436e-06, + "loss": 0.5293, + "step": 11207 + }, + { + "epoch": 1.81829980532122, + "grad_norm": 0.5886306588047069, + "learning_rate": 1.6979515936502483e-06, + "loss": 0.5145, + "step": 11208 + }, + { + "epoch": 1.8184620376378975, + "grad_norm": 0.5880967669187539, + "learning_rate": 1.6975471456256832e-06, + "loss": 0.5237, + "step": 11209 + }, + { + "epoch": 1.818624269954575, + "grad_norm": 0.596849787338753, + "learning_rate": 1.6971427210143503e-06, + "loss": 0.4909, + "step": 11210 + }, + { + "epoch": 1.8187865022712524, + "grad_norm": 0.6394697063161675, + "learning_rate": 1.6967383198280482e-06, + "loss": 0.5138, + "step": 11211 + }, + { + "epoch": 1.8189487345879298, + "grad_norm": 0.5848543177978348, + "learning_rate": 1.6963339420785757e-06, + "loss": 0.499, + "step": 11212 + }, + { + "epoch": 1.8191109669046073, + "grad_norm": 0.6301124950119027, + "learning_rate": 1.695929587777733e-06, + "loss": 0.531, + "step": 11213 + }, + { + "epoch": 1.8192731992212847, + "grad_norm": 0.6329894889550538, + "learning_rate": 1.6955252569373165e-06, + "loss": 0.533, + "step": 11214 + }, + { + "epoch": 1.8194354315379624, + "grad_norm": 0.5925665351284833, + "learning_rate": 1.6951209495691254e-06, + "loss": 0.4987, + "step": 11215 + }, + { + "epoch": 1.8195976638546398, + "grad_norm": 0.5914734111760256, + "learning_rate": 1.6947166656849535e-06, + "loss": 0.5151, + "step": 11216 + }, + { + "epoch": 1.8197598961713173, + "grad_norm": 0.6184352102202159, + "learning_rate": 1.6943124052965981e-06, + "loss": 0.5334, + "step": 11217 + }, + { + "epoch": 1.819922128487995, + "grad_norm": 0.5874680679389976, + "learning_rate": 1.6939081684158542e-06, + "loss": 0.5259, + "step": 11218 + }, + { + "epoch": 1.8200843608046724, + "grad_norm": 0.584223765627751, + "learning_rate": 1.6935039550545163e-06, + "loss": 0.5424, + "step": 11219 + }, + { + "epoch": 1.8202465931213498, + "grad_norm": 0.6348194271075974, + "learning_rate": 1.6930997652243789e-06, + "loss": 0.5383, + "step": 11220 + }, + { + "epoch": 1.8204088254380273, + "grad_norm": 0.7009623123534612, + "learning_rate": 1.692695598937233e-06, + "loss": 0.5304, + "step": 11221 + }, + { + "epoch": 1.8205710577547047, + "grad_norm": 0.6235255970514149, + "learning_rate": 1.6922914562048731e-06, + "loss": 0.5027, + "step": 11222 + }, + { + "epoch": 1.8207332900713822, + "grad_norm": 0.6271836332315756, + "learning_rate": 1.6918873370390895e-06, + "loss": 0.4792, + "step": 11223 + }, + { + "epoch": 1.8208955223880596, + "grad_norm": 0.590128033670766, + "learning_rate": 1.691483241451674e-06, + "loss": 0.5266, + "step": 11224 + }, + { + "epoch": 1.821057754704737, + "grad_norm": 0.6378068242258157, + "learning_rate": 1.6910791694544176e-06, + "loss": 0.5326, + "step": 11225 + }, + { + "epoch": 1.8212199870214145, + "grad_norm": 0.6269612306563326, + "learning_rate": 1.6906751210591087e-06, + "loss": 0.5241, + "step": 11226 + }, + { + "epoch": 1.8213822193380922, + "grad_norm": 0.577811451688887, + "learning_rate": 1.690271096277536e-06, + "loss": 0.5407, + "step": 11227 + }, + { + "epoch": 1.8215444516547696, + "grad_norm": 0.6112081403540507, + "learning_rate": 1.6898670951214887e-06, + "loss": 0.5196, + "step": 11228 + }, + { + "epoch": 1.821706683971447, + "grad_norm": 0.6398471372873125, + "learning_rate": 1.6894631176027537e-06, + "loss": 0.5317, + "step": 11229 + }, + { + "epoch": 1.8218689162881248, + "grad_norm": 0.6287176596650155, + "learning_rate": 1.6890591637331193e-06, + "loss": 0.5178, + "step": 11230 + }, + { + "epoch": 1.8220311486048022, + "grad_norm": 0.6282888153969862, + "learning_rate": 1.68865523352437e-06, + "loss": 0.5061, + "step": 11231 + }, + { + "epoch": 1.8221933809214796, + "grad_norm": 0.6414667588026167, + "learning_rate": 1.6882513269882916e-06, + "loss": 0.5255, + "step": 11232 + }, + { + "epoch": 1.822355613238157, + "grad_norm": 0.6214096096759112, + "learning_rate": 1.6878474441366695e-06, + "loss": 0.5212, + "step": 11233 + }, + { + "epoch": 1.8225178455548345, + "grad_norm": 0.6000005918545789, + "learning_rate": 1.6874435849812873e-06, + "loss": 0.5024, + "step": 11234 + }, + { + "epoch": 1.822680077871512, + "grad_norm": 0.6506662544256927, + "learning_rate": 1.6870397495339297e-06, + "loss": 0.4731, + "step": 11235 + }, + { + "epoch": 1.8228423101881894, + "grad_norm": 0.5720371639385984, + "learning_rate": 1.686635937806378e-06, + "loss": 0.4811, + "step": 11236 + }, + { + "epoch": 1.8230045425048669, + "grad_norm": 0.6043251076990566, + "learning_rate": 1.6862321498104145e-06, + "loss": 0.526, + "step": 11237 + }, + { + "epoch": 1.8231667748215443, + "grad_norm": 0.5964074599681, + "learning_rate": 1.6858283855578205e-06, + "loss": 0.5292, + "step": 11238 + }, + { + "epoch": 1.8233290071382218, + "grad_norm": 0.59478160282316, + "learning_rate": 1.6854246450603773e-06, + "loss": 0.5098, + "step": 11239 + }, + { + "epoch": 1.8234912394548994, + "grad_norm": 0.5819279826240698, + "learning_rate": 1.685020928329865e-06, + "loss": 0.4974, + "step": 11240 + }, + { + "epoch": 1.8236534717715769, + "grad_norm": 0.5724776400403949, + "learning_rate": 1.6846172353780612e-06, + "loss": 0.5337, + "step": 11241 + }, + { + "epoch": 1.8238157040882543, + "grad_norm": 0.638974866337263, + "learning_rate": 1.6842135662167464e-06, + "loss": 0.5293, + "step": 11242 + }, + { + "epoch": 1.823977936404932, + "grad_norm": 0.5919336070106929, + "learning_rate": 1.6838099208576972e-06, + "loss": 0.4807, + "step": 11243 + }, + { + "epoch": 1.8241401687216094, + "grad_norm": 0.6064283733331916, + "learning_rate": 1.6834062993126915e-06, + "loss": 0.5153, + "step": 11244 + }, + { + "epoch": 1.824302401038287, + "grad_norm": 0.641268919959645, + "learning_rate": 1.6830027015935056e-06, + "loss": 0.4916, + "step": 11245 + }, + { + "epoch": 1.8244646333549643, + "grad_norm": 0.6213548551595098, + "learning_rate": 1.682599127711916e-06, + "loss": 0.5087, + "step": 11246 + }, + { + "epoch": 1.8246268656716418, + "grad_norm": 0.5885397947026858, + "learning_rate": 1.6821955776796966e-06, + "loss": 0.5172, + "step": 11247 + }, + { + "epoch": 1.8247890979883192, + "grad_norm": 0.604559375063169, + "learning_rate": 1.6817920515086223e-06, + "loss": 0.5097, + "step": 11248 + }, + { + "epoch": 1.8249513303049967, + "grad_norm": 0.6423856984271891, + "learning_rate": 1.681388549210467e-06, + "loss": 0.5087, + "step": 11249 + }, + { + "epoch": 1.8251135626216741, + "grad_norm": 0.5882073387562293, + "learning_rate": 1.6809850707970035e-06, + "loss": 0.5414, + "step": 11250 + }, + { + "epoch": 1.8252757949383516, + "grad_norm": 0.6110128492882266, + "learning_rate": 1.6805816162800048e-06, + "loss": 0.4985, + "step": 11251 + }, + { + "epoch": 1.8254380272550292, + "grad_norm": 0.5904983218559146, + "learning_rate": 1.680178185671241e-06, + "loss": 0.5151, + "step": 11252 + }, + { + "epoch": 1.8256002595717067, + "grad_norm": 0.5923146936289747, + "learning_rate": 1.6797747789824845e-06, + "loss": 0.5062, + "step": 11253 + }, + { + "epoch": 1.8257624918883841, + "grad_norm": 0.60644155330367, + "learning_rate": 1.6793713962255043e-06, + "loss": 0.5067, + "step": 11254 + }, + { + "epoch": 1.8259247242050618, + "grad_norm": 0.6040808539524439, + "learning_rate": 1.6789680374120713e-06, + "loss": 0.4939, + "step": 11255 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.5696123643066898, + "learning_rate": 1.678564702553954e-06, + "loss": 0.5291, + "step": 11256 + }, + { + "epoch": 1.8262491888384167, + "grad_norm": 0.6158779547815713, + "learning_rate": 1.6781613916629192e-06, + "loss": 0.5498, + "step": 11257 + }, + { + "epoch": 1.8264114211550941, + "grad_norm": 0.6143027435638343, + "learning_rate": 1.6777581047507358e-06, + "loss": 0.5221, + "step": 11258 + }, + { + "epoch": 1.8265736534717716, + "grad_norm": 0.6308413077165704, + "learning_rate": 1.6773548418291697e-06, + "loss": 0.5322, + "step": 11259 + }, + { + "epoch": 1.826735885788449, + "grad_norm": 0.5932442666370051, + "learning_rate": 1.6769516029099876e-06, + "loss": 0.4998, + "step": 11260 + }, + { + "epoch": 1.8268981181051265, + "grad_norm": 0.6157431986673955, + "learning_rate": 1.6765483880049549e-06, + "loss": 0.5081, + "step": 11261 + }, + { + "epoch": 1.827060350421804, + "grad_norm": 0.6101723483973174, + "learning_rate": 1.6761451971258357e-06, + "loss": 0.4902, + "step": 11262 + }, + { + "epoch": 1.8272225827384814, + "grad_norm": 0.6040538115372844, + "learning_rate": 1.6757420302843935e-06, + "loss": 0.4864, + "step": 11263 + }, + { + "epoch": 1.827384815055159, + "grad_norm": 0.6159178460053076, + "learning_rate": 1.6753388874923923e-06, + "loss": 0.5398, + "step": 11264 + }, + { + "epoch": 1.8275470473718365, + "grad_norm": 0.6033146251431845, + "learning_rate": 1.6749357687615942e-06, + "loss": 0.5129, + "step": 11265 + }, + { + "epoch": 1.827709279688514, + "grad_norm": 0.5727351078631784, + "learning_rate": 1.6745326741037621e-06, + "loss": 0.5235, + "step": 11266 + }, + { + "epoch": 1.8278715120051914, + "grad_norm": 0.6297468825882968, + "learning_rate": 1.674129603530656e-06, + "loss": 0.514, + "step": 11267 + }, + { + "epoch": 1.828033744321869, + "grad_norm": 0.6094812794235974, + "learning_rate": 1.673726557054036e-06, + "loss": 0.5004, + "step": 11268 + }, + { + "epoch": 1.8281959766385465, + "grad_norm": 0.6043769568963699, + "learning_rate": 1.673323534685663e-06, + "loss": 0.5083, + "step": 11269 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.5986895911559653, + "learning_rate": 1.6729205364372952e-06, + "loss": 0.5093, + "step": 11270 + }, + { + "epoch": 1.8285204412719014, + "grad_norm": 0.6256917797540648, + "learning_rate": 1.6725175623206924e-06, + "loss": 0.526, + "step": 11271 + }, + { + "epoch": 1.8286826735885788, + "grad_norm": 0.61136881205275, + "learning_rate": 1.6721146123476095e-06, + "loss": 0.514, + "step": 11272 + }, + { + "epoch": 1.8288449059052563, + "grad_norm": 0.5957426560705532, + "learning_rate": 1.6717116865298052e-06, + "loss": 0.4833, + "step": 11273 + }, + { + "epoch": 1.8290071382219337, + "grad_norm": 0.6633513638496601, + "learning_rate": 1.671308784879035e-06, + "loss": 0.53, + "step": 11274 + }, + { + "epoch": 1.8291693705386112, + "grad_norm": 0.6137823586312611, + "learning_rate": 1.6709059074070556e-06, + "loss": 0.5253, + "step": 11275 + }, + { + "epoch": 1.8293316028552886, + "grad_norm": 0.5874833755769294, + "learning_rate": 1.6705030541256211e-06, + "loss": 0.5149, + "step": 11276 + }, + { + "epoch": 1.8294938351719663, + "grad_norm": 0.5962253522630075, + "learning_rate": 1.6701002250464843e-06, + "loss": 0.5484, + "step": 11277 + }, + { + "epoch": 1.8296560674886437, + "grad_norm": 0.5856098812626092, + "learning_rate": 1.6696974201814004e-06, + "loss": 0.5304, + "step": 11278 + }, + { + "epoch": 1.8298182998053212, + "grad_norm": 0.6601546887677431, + "learning_rate": 1.6692946395421212e-06, + "loss": 0.5107, + "step": 11279 + }, + { + "epoch": 1.8299805321219988, + "grad_norm": 0.5998516404390436, + "learning_rate": 1.6688918831403988e-06, + "loss": 0.5098, + "step": 11280 + }, + { + "epoch": 1.8301427644386763, + "grad_norm": 0.5915618431724236, + "learning_rate": 1.6684891509879852e-06, + "loss": 0.4892, + "step": 11281 + }, + { + "epoch": 1.8303049967553537, + "grad_norm": 0.5984927758808908, + "learning_rate": 1.6680864430966295e-06, + "loss": 0.5152, + "step": 11282 + }, + { + "epoch": 1.8304672290720312, + "grad_norm": 0.6174144835021272, + "learning_rate": 1.6676837594780825e-06, + "loss": 0.516, + "step": 11283 + }, + { + "epoch": 1.8306294613887086, + "grad_norm": 0.589073330135597, + "learning_rate": 1.6672811001440934e-06, + "loss": 0.4969, + "step": 11284 + }, + { + "epoch": 1.830791693705386, + "grad_norm": 0.613671761791028, + "learning_rate": 1.6668784651064096e-06, + "loss": 0.5355, + "step": 11285 + }, + { + "epoch": 1.8309539260220635, + "grad_norm": 0.5539574872697589, + "learning_rate": 1.6664758543767807e-06, + "loss": 0.4901, + "step": 11286 + }, + { + "epoch": 1.831116158338741, + "grad_norm": 0.6220871178522948, + "learning_rate": 1.6660732679669522e-06, + "loss": 0.5189, + "step": 11287 + }, + { + "epoch": 1.8312783906554184, + "grad_norm": 0.5741249910381176, + "learning_rate": 1.66567070588867e-06, + "loss": 0.4967, + "step": 11288 + }, + { + "epoch": 1.831440622972096, + "grad_norm": 0.6008345779680323, + "learning_rate": 1.665268168153681e-06, + "loss": 0.5326, + "step": 11289 + }, + { + "epoch": 1.8316028552887735, + "grad_norm": 0.629875561418149, + "learning_rate": 1.664865654773729e-06, + "loss": 0.5279, + "step": 11290 + }, + { + "epoch": 1.831765087605451, + "grad_norm": 0.6352267219453958, + "learning_rate": 1.6644631657605592e-06, + "loss": 0.5088, + "step": 11291 + }, + { + "epoch": 1.8319273199221286, + "grad_norm": 0.5977600741027512, + "learning_rate": 1.6640607011259152e-06, + "loss": 0.4846, + "step": 11292 + }, + { + "epoch": 1.832089552238806, + "grad_norm": 0.5921433826146008, + "learning_rate": 1.6636582608815382e-06, + "loss": 0.5282, + "step": 11293 + }, + { + "epoch": 1.8322517845554835, + "grad_norm": 0.5848019944788724, + "learning_rate": 1.663255845039171e-06, + "loss": 0.5149, + "step": 11294 + }, + { + "epoch": 1.832414016872161, + "grad_norm": 0.5549353782270887, + "learning_rate": 1.6628534536105551e-06, + "loss": 0.5589, + "step": 11295 + }, + { + "epoch": 1.8325762491888384, + "grad_norm": 0.6496793689512635, + "learning_rate": 1.6624510866074317e-06, + "loss": 0.4955, + "step": 11296 + }, + { + "epoch": 1.8327384815055159, + "grad_norm": 0.635853185170639, + "learning_rate": 1.6620487440415401e-06, + "loss": 0.5142, + "step": 11297 + }, + { + "epoch": 1.8329007138221933, + "grad_norm": 0.6001033864234249, + "learning_rate": 1.661646425924619e-06, + "loss": 0.5174, + "step": 11298 + }, + { + "epoch": 1.8330629461388708, + "grad_norm": 0.6185025366232212, + "learning_rate": 1.661244132268407e-06, + "loss": 0.5162, + "step": 11299 + }, + { + "epoch": 1.8332251784555482, + "grad_norm": 0.6207152977101222, + "learning_rate": 1.6608418630846427e-06, + "loss": 0.5777, + "step": 11300 + }, + { + "epoch": 1.8333874107722257, + "grad_norm": 0.6740161552153288, + "learning_rate": 1.6604396183850618e-06, + "loss": 0.4854, + "step": 11301 + }, + { + "epoch": 1.8335496430889033, + "grad_norm": 0.5882645114057499, + "learning_rate": 1.6600373981814024e-06, + "loss": 0.5492, + "step": 11302 + }, + { + "epoch": 1.8337118754055808, + "grad_norm": 0.5888091912726381, + "learning_rate": 1.659635202485399e-06, + "loss": 0.5247, + "step": 11303 + }, + { + "epoch": 1.8338741077222582, + "grad_norm": 0.590368470708951, + "learning_rate": 1.6592330313087856e-06, + "loss": 0.4785, + "step": 11304 + }, + { + "epoch": 1.834036340038936, + "grad_norm": 0.603131757568183, + "learning_rate": 1.6588308846632983e-06, + "loss": 0.509, + "step": 11305 + }, + { + "epoch": 1.8341985723556133, + "grad_norm": 0.6106056515297468, + "learning_rate": 1.6584287625606691e-06, + "loss": 0.5163, + "step": 11306 + }, + { + "epoch": 1.8343608046722908, + "grad_norm": 0.5971867030955954, + "learning_rate": 1.6580266650126325e-06, + "loss": 0.5088, + "step": 11307 + }, + { + "epoch": 1.8345230369889682, + "grad_norm": 0.5827241351955372, + "learning_rate": 1.657624592030918e-06, + "loss": 0.5214, + "step": 11308 + }, + { + "epoch": 1.8346852693056457, + "grad_norm": 0.5952469523276628, + "learning_rate": 1.657222543627259e-06, + "loss": 0.4939, + "step": 11309 + }, + { + "epoch": 1.8348475016223231, + "grad_norm": 0.6285789670145985, + "learning_rate": 1.6568205198133847e-06, + "loss": 0.4854, + "step": 11310 + }, + { + "epoch": 1.8350097339390006, + "grad_norm": 0.6119457602827201, + "learning_rate": 1.6564185206010264e-06, + "loss": 0.5013, + "step": 11311 + }, + { + "epoch": 1.835171966255678, + "grad_norm": 0.5746140434231195, + "learning_rate": 1.6560165460019126e-06, + "loss": 0.503, + "step": 11312 + }, + { + "epoch": 1.8353341985723555, + "grad_norm": 0.619864764319721, + "learning_rate": 1.655614596027771e-06, + "loss": 0.501, + "step": 11313 + }, + { + "epoch": 1.8354964308890331, + "grad_norm": 0.6093456057943635, + "learning_rate": 1.6552126706903302e-06, + "loss": 0.4767, + "step": 11314 + }, + { + "epoch": 1.8356586632057106, + "grad_norm": 0.610664560351583, + "learning_rate": 1.6548107700013166e-06, + "loss": 0.5298, + "step": 11315 + }, + { + "epoch": 1.835820895522388, + "grad_norm": 0.6329338571770287, + "learning_rate": 1.6544088939724573e-06, + "loss": 0.5415, + "step": 11316 + }, + { + "epoch": 1.8359831278390657, + "grad_norm": 0.592135736538375, + "learning_rate": 1.6540070426154782e-06, + "loss": 0.5032, + "step": 11317 + }, + { + "epoch": 1.8361453601557431, + "grad_norm": 0.5637463359699562, + "learning_rate": 1.6536052159421023e-06, + "loss": 0.4938, + "step": 11318 + }, + { + "epoch": 1.8363075924724206, + "grad_norm": 0.6043910563123838, + "learning_rate": 1.6532034139640547e-06, + "loss": 0.5227, + "step": 11319 + }, + { + "epoch": 1.836469824789098, + "grad_norm": 0.6056040784116692, + "learning_rate": 1.6528016366930594e-06, + "loss": 0.4977, + "step": 11320 + }, + { + "epoch": 1.8366320571057755, + "grad_norm": 0.6180933694051653, + "learning_rate": 1.6523998841408381e-06, + "loss": 0.5263, + "step": 11321 + }, + { + "epoch": 1.836794289422453, + "grad_norm": 0.6132401037807419, + "learning_rate": 1.6519981563191145e-06, + "loss": 0.5054, + "step": 11322 + }, + { + "epoch": 1.8369565217391304, + "grad_norm": 0.6186016811448497, + "learning_rate": 1.6515964532396078e-06, + "loss": 0.5462, + "step": 11323 + }, + { + "epoch": 1.8371187540558078, + "grad_norm": 0.5908584866882342, + "learning_rate": 1.6511947749140388e-06, + "loss": 0.5126, + "step": 11324 + }, + { + "epoch": 1.8372809863724853, + "grad_norm": 0.614391723902748, + "learning_rate": 1.6507931213541287e-06, + "loss": 0.507, + "step": 11325 + }, + { + "epoch": 1.8374432186891627, + "grad_norm": 0.6122210769958186, + "learning_rate": 1.6503914925715952e-06, + "loss": 0.5218, + "step": 11326 + }, + { + "epoch": 1.8376054510058404, + "grad_norm": 0.6116033468580371, + "learning_rate": 1.6499898885781578e-06, + "loss": 0.4984, + "step": 11327 + }, + { + "epoch": 1.8377676833225178, + "grad_norm": 0.6023556615401545, + "learning_rate": 1.6495883093855335e-06, + "loss": 0.5302, + "step": 11328 + }, + { + "epoch": 1.8379299156391953, + "grad_norm": 0.5745762806257444, + "learning_rate": 1.6491867550054385e-06, + "loss": 0.4962, + "step": 11329 + }, + { + "epoch": 1.838092147955873, + "grad_norm": 0.6307843486582926, + "learning_rate": 1.6487852254495901e-06, + "loss": 0.5137, + "step": 11330 + }, + { + "epoch": 1.8382543802725504, + "grad_norm": 0.5952973402782633, + "learning_rate": 1.6483837207297034e-06, + "loss": 0.488, + "step": 11331 + }, + { + "epoch": 1.8384166125892278, + "grad_norm": 0.641981893040471, + "learning_rate": 1.6479822408574937e-06, + "loss": 0.5364, + "step": 11332 + }, + { + "epoch": 1.8385788449059053, + "grad_norm": 0.6209256953386822, + "learning_rate": 1.6475807858446735e-06, + "loss": 0.5362, + "step": 11333 + }, + { + "epoch": 1.8387410772225827, + "grad_norm": 0.6239310658134253, + "learning_rate": 1.6471793557029576e-06, + "loss": 0.5346, + "step": 11334 + }, + { + "epoch": 1.8389033095392602, + "grad_norm": 0.6329880465437714, + "learning_rate": 1.6467779504440567e-06, + "loss": 0.5052, + "step": 11335 + }, + { + "epoch": 1.8390655418559376, + "grad_norm": 0.6333159964614673, + "learning_rate": 1.6463765700796852e-06, + "loss": 0.5381, + "step": 11336 + }, + { + "epoch": 1.839227774172615, + "grad_norm": 0.6133731808770928, + "learning_rate": 1.6459752146215522e-06, + "loss": 0.5307, + "step": 11337 + }, + { + "epoch": 1.8393900064892925, + "grad_norm": 0.5962711142294262, + "learning_rate": 1.6455738840813695e-06, + "loss": 0.51, + "step": 11338 + }, + { + "epoch": 1.8395522388059702, + "grad_norm": 0.5803619336210093, + "learning_rate": 1.645172578470846e-06, + "loss": 0.5298, + "step": 11339 + }, + { + "epoch": 1.8397144711226476, + "grad_norm": 0.602795287124666, + "learning_rate": 1.6447712978016897e-06, + "loss": 0.508, + "step": 11340 + }, + { + "epoch": 1.839876703439325, + "grad_norm": 0.6342541701999485, + "learning_rate": 1.6443700420856107e-06, + "loss": 0.5236, + "step": 11341 + }, + { + "epoch": 1.8400389357560027, + "grad_norm": 0.6427626589752229, + "learning_rate": 1.643968811334315e-06, + "loss": 0.4979, + "step": 11342 + }, + { + "epoch": 1.8402011680726802, + "grad_norm": 0.684425937893072, + "learning_rate": 1.6435676055595107e-06, + "loss": 0.5227, + "step": 11343 + }, + { + "epoch": 1.8403634003893576, + "grad_norm": 0.6132464618589816, + "learning_rate": 1.6431664247729019e-06, + "loss": 0.5126, + "step": 11344 + }, + { + "epoch": 1.840525632706035, + "grad_norm": 0.6316135349301903, + "learning_rate": 1.6427652689861956e-06, + "loss": 0.5204, + "step": 11345 + }, + { + "epoch": 1.8406878650227125, + "grad_norm": 0.618321512444294, + "learning_rate": 1.6423641382110949e-06, + "loss": 0.5227, + "step": 11346 + }, + { + "epoch": 1.84085009733939, + "grad_norm": 0.616120004551371, + "learning_rate": 1.6419630324593056e-06, + "loss": 0.5153, + "step": 11347 + }, + { + "epoch": 1.8410123296560674, + "grad_norm": 0.5528738370660747, + "learning_rate": 1.6415619517425296e-06, + "loss": 0.5081, + "step": 11348 + }, + { + "epoch": 1.8411745619727449, + "grad_norm": 0.63293250880237, + "learning_rate": 1.6411608960724684e-06, + "loss": 0.5427, + "step": 11349 + }, + { + "epoch": 1.8413367942894223, + "grad_norm": 0.5925869896632381, + "learning_rate": 1.6407598654608247e-06, + "loss": 0.5149, + "step": 11350 + }, + { + "epoch": 1.8414990266061, + "grad_norm": 0.5896984475717564, + "learning_rate": 1.6403588599192994e-06, + "loss": 0.5224, + "step": 11351 + }, + { + "epoch": 1.8416612589227774, + "grad_norm": 0.5990042112875102, + "learning_rate": 1.6399578794595926e-06, + "loss": 0.524, + "step": 11352 + }, + { + "epoch": 1.8418234912394549, + "grad_norm": 0.6511912240737859, + "learning_rate": 1.6395569240934042e-06, + "loss": 0.527, + "step": 11353 + }, + { + "epoch": 1.8419857235561323, + "grad_norm": 0.5731921623739225, + "learning_rate": 1.639155993832432e-06, + "loss": 0.5035, + "step": 11354 + }, + { + "epoch": 1.84214795587281, + "grad_norm": 0.5706998763763589, + "learning_rate": 1.6387550886883735e-06, + "loss": 0.4998, + "step": 11355 + }, + { + "epoch": 1.8423101881894874, + "grad_norm": 0.6050795256607016, + "learning_rate": 1.6383542086729276e-06, + "loss": 0.5136, + "step": 11356 + }, + { + "epoch": 1.8424724205061649, + "grad_norm": 0.5797513556904279, + "learning_rate": 1.6379533537977893e-06, + "loss": 0.5168, + "step": 11357 + }, + { + "epoch": 1.8426346528228423, + "grad_norm": 0.5851060713174001, + "learning_rate": 1.6375525240746566e-06, + "loss": 0.502, + "step": 11358 + }, + { + "epoch": 1.8427968851395198, + "grad_norm": 0.6040073284505928, + "learning_rate": 1.6371517195152218e-06, + "loss": 0.5027, + "step": 11359 + }, + { + "epoch": 1.8429591174561972, + "grad_norm": 0.5816223419369558, + "learning_rate": 1.63675094013118e-06, + "loss": 0.5123, + "step": 11360 + }, + { + "epoch": 1.8431213497728747, + "grad_norm": 0.6145057598005815, + "learning_rate": 1.636350185934226e-06, + "loss": 0.4931, + "step": 11361 + }, + { + "epoch": 1.8432835820895521, + "grad_norm": 0.6297445780218794, + "learning_rate": 1.635949456936051e-06, + "loss": 0.5099, + "step": 11362 + }, + { + "epoch": 1.8434458144062296, + "grad_norm": 0.600299352427936, + "learning_rate": 1.6355487531483491e-06, + "loss": 0.5237, + "step": 11363 + }, + { + "epoch": 1.8436080467229072, + "grad_norm": 0.640402373489075, + "learning_rate": 1.6351480745828098e-06, + "loss": 0.5301, + "step": 11364 + }, + { + "epoch": 1.8437702790395847, + "grad_norm": 0.6285017455745247, + "learning_rate": 1.6347474212511247e-06, + "loss": 0.5139, + "step": 11365 + }, + { + "epoch": 1.8439325113562621, + "grad_norm": 0.618125601119079, + "learning_rate": 1.6343467931649825e-06, + "loss": 0.5098, + "step": 11366 + }, + { + "epoch": 1.8440947436729398, + "grad_norm": 0.5965437401897965, + "learning_rate": 1.6339461903360743e-06, + "loss": 0.5221, + "step": 11367 + }, + { + "epoch": 1.8442569759896172, + "grad_norm": 0.5932734544273628, + "learning_rate": 1.6335456127760874e-06, + "loss": 0.511, + "step": 11368 + }, + { + "epoch": 1.8444192083062947, + "grad_norm": 0.594076666150454, + "learning_rate": 1.633145060496709e-06, + "loss": 0.54, + "step": 11369 + }, + { + "epoch": 1.8445814406229721, + "grad_norm": 0.5875443752348118, + "learning_rate": 1.632744533509627e-06, + "loss": 0.4947, + "step": 11370 + }, + { + "epoch": 1.8447436729396496, + "grad_norm": 0.5903113835963814, + "learning_rate": 1.6323440318265265e-06, + "loss": 0.4948, + "step": 11371 + }, + { + "epoch": 1.844905905256327, + "grad_norm": 0.6059044552551989, + "learning_rate": 1.6319435554590945e-06, + "loss": 0.5082, + "step": 11372 + }, + { + "epoch": 1.8450681375730045, + "grad_norm": 0.6311508929932151, + "learning_rate": 1.6315431044190156e-06, + "loss": 0.5072, + "step": 11373 + }, + { + "epoch": 1.845230369889682, + "grad_norm": 0.5918031023855396, + "learning_rate": 1.6311426787179715e-06, + "loss": 0.5198, + "step": 11374 + }, + { + "epoch": 1.8453926022063594, + "grad_norm": 0.6529489192007232, + "learning_rate": 1.630742278367648e-06, + "loss": 0.5036, + "step": 11375 + }, + { + "epoch": 1.845554834523037, + "grad_norm": 0.6147105144747577, + "learning_rate": 1.6303419033797263e-06, + "loss": 0.5025, + "step": 11376 + }, + { + "epoch": 1.8457170668397145, + "grad_norm": 0.5894378322247891, + "learning_rate": 1.6299415537658886e-06, + "loss": 0.518, + "step": 11377 + }, + { + "epoch": 1.845879299156392, + "grad_norm": 0.6329103711400792, + "learning_rate": 1.6295412295378166e-06, + "loss": 0.5297, + "step": 11378 + }, + { + "epoch": 1.8460415314730696, + "grad_norm": 0.6040113646935602, + "learning_rate": 1.6291409307071893e-06, + "loss": 0.5002, + "step": 11379 + }, + { + "epoch": 1.846203763789747, + "grad_norm": 0.6208137963146484, + "learning_rate": 1.6287406572856864e-06, + "loss": 0.5275, + "step": 11380 + }, + { + "epoch": 1.8463659961064245, + "grad_norm": 0.5972597844914561, + "learning_rate": 1.6283404092849875e-06, + "loss": 0.5399, + "step": 11381 + }, + { + "epoch": 1.846528228423102, + "grad_norm": 0.6072705288960087, + "learning_rate": 1.6279401867167703e-06, + "loss": 0.5293, + "step": 11382 + }, + { + "epoch": 1.8466904607397794, + "grad_norm": 0.6363004226333553, + "learning_rate": 1.6275399895927128e-06, + "loss": 0.4653, + "step": 11383 + }, + { + "epoch": 1.8468526930564568, + "grad_norm": 0.6526074294294396, + "learning_rate": 1.6271398179244903e-06, + "loss": 0.5031, + "step": 11384 + }, + { + "epoch": 1.8470149253731343, + "grad_norm": 0.6079451445123556, + "learning_rate": 1.6267396717237787e-06, + "loss": 0.5212, + "step": 11385 + }, + { + "epoch": 1.8471771576898117, + "grad_norm": 0.6767507317346092, + "learning_rate": 1.6263395510022546e-06, + "loss": 0.5205, + "step": 11386 + }, + { + "epoch": 1.8473393900064892, + "grad_norm": 0.6166691593917433, + "learning_rate": 1.6259394557715905e-06, + "loss": 0.529, + "step": 11387 + }, + { + "epoch": 1.8475016223231666, + "grad_norm": 0.5743330783047581, + "learning_rate": 1.6255393860434616e-06, + "loss": 0.4996, + "step": 11388 + }, + { + "epoch": 1.8476638546398443, + "grad_norm": 0.6045016076870806, + "learning_rate": 1.6251393418295402e-06, + "loss": 0.4964, + "step": 11389 + }, + { + "epoch": 1.8478260869565217, + "grad_norm": 0.5929134434704089, + "learning_rate": 1.6247393231414985e-06, + "loss": 0.5405, + "step": 11390 + }, + { + "epoch": 1.8479883192731992, + "grad_norm": 0.6156288456554317, + "learning_rate": 1.6243393299910064e-06, + "loss": 0.5199, + "step": 11391 + }, + { + "epoch": 1.8481505515898768, + "grad_norm": 0.5743841019092086, + "learning_rate": 1.623939362389737e-06, + "loss": 0.4979, + "step": 11392 + }, + { + "epoch": 1.8483127839065543, + "grad_norm": 0.5633551963761465, + "learning_rate": 1.6235394203493582e-06, + "loss": 0.502, + "step": 11393 + }, + { + "epoch": 1.8484750162232317, + "grad_norm": 0.6292369743798207, + "learning_rate": 1.623139503881541e-06, + "loss": 0.4887, + "step": 11394 + }, + { + "epoch": 1.8486372485399092, + "grad_norm": 0.6039035800588524, + "learning_rate": 1.6227396129979523e-06, + "loss": 0.5382, + "step": 11395 + }, + { + "epoch": 1.8487994808565866, + "grad_norm": 0.5978483087605496, + "learning_rate": 1.6223397477102598e-06, + "loss": 0.5083, + "step": 11396 + }, + { + "epoch": 1.848961713173264, + "grad_norm": 0.6417166731188353, + "learning_rate": 1.6219399080301313e-06, + "loss": 0.5173, + "step": 11397 + }, + { + "epoch": 1.8491239454899415, + "grad_norm": 0.5968428954950178, + "learning_rate": 1.6215400939692316e-06, + "loss": 0.4869, + "step": 11398 + }, + { + "epoch": 1.849286177806619, + "grad_norm": 0.6020090495490487, + "learning_rate": 1.6211403055392286e-06, + "loss": 0.5094, + "step": 11399 + }, + { + "epoch": 1.8494484101232964, + "grad_norm": 0.5919596381074537, + "learning_rate": 1.6207405427517836e-06, + "loss": 0.5342, + "step": 11400 + }, + { + "epoch": 1.849610642439974, + "grad_norm": 0.5693232653802938, + "learning_rate": 1.6203408056185633e-06, + "loss": 0.5146, + "step": 11401 + }, + { + "epoch": 1.8497728747566515, + "grad_norm": 0.6034050115962235, + "learning_rate": 1.619941094151229e-06, + "loss": 0.5024, + "step": 11402 + }, + { + "epoch": 1.849935107073329, + "grad_norm": 0.61782374135716, + "learning_rate": 1.6195414083614442e-06, + "loss": 0.5412, + "step": 11403 + }, + { + "epoch": 1.8500973393900066, + "grad_norm": 0.6591860384445537, + "learning_rate": 1.6191417482608706e-06, + "loss": 0.5098, + "step": 11404 + }, + { + "epoch": 1.850259571706684, + "grad_norm": 0.6101218029189853, + "learning_rate": 1.618742113861168e-06, + "loss": 0.5479, + "step": 11405 + }, + { + "epoch": 1.8504218040233615, + "grad_norm": 0.5888728417255762, + "learning_rate": 1.618342505173998e-06, + "loss": 0.5135, + "step": 11406 + }, + { + "epoch": 1.850584036340039, + "grad_norm": 0.5926033699455724, + "learning_rate": 1.6179429222110188e-06, + "loss": 0.5568, + "step": 11407 + }, + { + "epoch": 1.8507462686567164, + "grad_norm": 0.6170949518849013, + "learning_rate": 1.6175433649838901e-06, + "loss": 0.508, + "step": 11408 + }, + { + "epoch": 1.8509085009733939, + "grad_norm": 0.5991652085878812, + "learning_rate": 1.6171438335042699e-06, + "loss": 0.5233, + "step": 11409 + }, + { + "epoch": 1.8510707332900713, + "grad_norm": 0.6306984769825046, + "learning_rate": 1.6167443277838143e-06, + "loss": 0.4887, + "step": 11410 + }, + { + "epoch": 1.8512329656067488, + "grad_norm": 0.5981525626880191, + "learning_rate": 1.6163448478341797e-06, + "loss": 0.5435, + "step": 11411 + }, + { + "epoch": 1.8513951979234262, + "grad_norm": 0.5776344176494613, + "learning_rate": 1.6159453936670226e-06, + "loss": 0.5162, + "step": 11412 + }, + { + "epoch": 1.8515574302401037, + "grad_norm": 0.6155320061711329, + "learning_rate": 1.6155459652939969e-06, + "loss": 0.5053, + "step": 11413 + }, + { + "epoch": 1.8517196625567813, + "grad_norm": 0.6069723633869538, + "learning_rate": 1.615146562726759e-06, + "loss": 0.4705, + "step": 11414 + }, + { + "epoch": 1.8518818948734588, + "grad_norm": 0.6238935953845992, + "learning_rate": 1.6147471859769597e-06, + "loss": 0.4987, + "step": 11415 + }, + { + "epoch": 1.8520441271901362, + "grad_norm": 0.6300727895461623, + "learning_rate": 1.6143478350562524e-06, + "loss": 0.5058, + "step": 11416 + }, + { + "epoch": 1.852206359506814, + "grad_norm": 0.5845453247393905, + "learning_rate": 1.61394850997629e-06, + "loss": 0.4455, + "step": 11417 + }, + { + "epoch": 1.8523685918234913, + "grad_norm": 0.6107832899728677, + "learning_rate": 1.6135492107487222e-06, + "loss": 0.5408, + "step": 11418 + }, + { + "epoch": 1.8525308241401688, + "grad_norm": 0.5965557582392086, + "learning_rate": 1.613149937385201e-06, + "loss": 0.5303, + "step": 11419 + }, + { + "epoch": 1.8526930564568462, + "grad_norm": 0.6004982305767111, + "learning_rate": 1.6127506898973747e-06, + "loss": 0.4977, + "step": 11420 + }, + { + "epoch": 1.8528552887735237, + "grad_norm": 0.6360209289842567, + "learning_rate": 1.6123514682968922e-06, + "loss": 0.5199, + "step": 11421 + }, + { + "epoch": 1.8530175210902011, + "grad_norm": 0.6005279094415135, + "learning_rate": 1.6119522725954024e-06, + "loss": 0.4989, + "step": 11422 + }, + { + "epoch": 1.8531797534068786, + "grad_norm": 0.6191428484360941, + "learning_rate": 1.6115531028045522e-06, + "loss": 0.534, + "step": 11423 + }, + { + "epoch": 1.853341985723556, + "grad_norm": 0.5907193833859173, + "learning_rate": 1.6111539589359888e-06, + "loss": 0.5292, + "step": 11424 + }, + { + "epoch": 1.8535042180402335, + "grad_norm": 0.6567010775321774, + "learning_rate": 1.6107548410013566e-06, + "loss": 0.5208, + "step": 11425 + }, + { + "epoch": 1.8536664503569111, + "grad_norm": 0.6268434788377448, + "learning_rate": 1.6103557490123023e-06, + "loss": 0.5459, + "step": 11426 + }, + { + "epoch": 1.8538286826735886, + "grad_norm": 0.5923549531268324, + "learning_rate": 1.6099566829804686e-06, + "loss": 0.5259, + "step": 11427 + }, + { + "epoch": 1.853990914990266, + "grad_norm": 0.6289375163594312, + "learning_rate": 1.6095576429175009e-06, + "loss": 0.5315, + "step": 11428 + }, + { + "epoch": 1.8541531473069437, + "grad_norm": 0.6072355326966645, + "learning_rate": 1.609158628835042e-06, + "loss": 0.5103, + "step": 11429 + }, + { + "epoch": 1.8543153796236211, + "grad_norm": 0.5843258939595787, + "learning_rate": 1.6087596407447314e-06, + "loss": 0.5476, + "step": 11430 + }, + { + "epoch": 1.8544776119402986, + "grad_norm": 0.600350824253897, + "learning_rate": 1.6083606786582134e-06, + "loss": 0.4942, + "step": 11431 + }, + { + "epoch": 1.854639844256976, + "grad_norm": 0.613303555486341, + "learning_rate": 1.6079617425871264e-06, + "loss": 0.5236, + "step": 11432 + }, + { + "epoch": 1.8548020765736535, + "grad_norm": 0.5977126246745934, + "learning_rate": 1.607562832543112e-06, + "loss": 0.493, + "step": 11433 + }, + { + "epoch": 1.854964308890331, + "grad_norm": 0.5902091982167524, + "learning_rate": 1.6071639485378072e-06, + "loss": 0.5053, + "step": 11434 + }, + { + "epoch": 1.8551265412070084, + "grad_norm": 0.6281373132332266, + "learning_rate": 1.6067650905828533e-06, + "loss": 0.4813, + "step": 11435 + }, + { + "epoch": 1.8552887735236858, + "grad_norm": 0.59694667609632, + "learning_rate": 1.6063662586898842e-06, + "loss": 0.5118, + "step": 11436 + }, + { + "epoch": 1.8554510058403633, + "grad_norm": 0.6046557283772522, + "learning_rate": 1.6059674528705394e-06, + "loss": 0.5573, + "step": 11437 + }, + { + "epoch": 1.855613238157041, + "grad_norm": 0.6168763255107768, + "learning_rate": 1.6055686731364529e-06, + "loss": 0.5255, + "step": 11438 + }, + { + "epoch": 1.8557754704737184, + "grad_norm": 0.6056474436114876, + "learning_rate": 1.6051699194992613e-06, + "loss": 0.5281, + "step": 11439 + }, + { + "epoch": 1.8559377027903958, + "grad_norm": 0.6555630608854532, + "learning_rate": 1.6047711919705995e-06, + "loss": 0.5216, + "step": 11440 + }, + { + "epoch": 1.8560999351070735, + "grad_norm": 0.5985197827863193, + "learning_rate": 1.6043724905620989e-06, + "loss": 0.5172, + "step": 11441 + }, + { + "epoch": 1.856262167423751, + "grad_norm": 0.6160670436178648, + "learning_rate": 1.6039738152853946e-06, + "loss": 0.5109, + "step": 11442 + }, + { + "epoch": 1.8564243997404284, + "grad_norm": 0.6457235156089383, + "learning_rate": 1.6035751661521177e-06, + "loss": 0.5029, + "step": 11443 + }, + { + "epoch": 1.8565866320571058, + "grad_norm": 0.5723494596179763, + "learning_rate": 1.6031765431739002e-06, + "loss": 0.5097, + "step": 11444 + }, + { + "epoch": 1.8567488643737833, + "grad_norm": 0.6049130293647704, + "learning_rate": 1.6027779463623734e-06, + "loss": 0.5219, + "step": 11445 + }, + { + "epoch": 1.8569110966904607, + "grad_norm": 0.6180954208402831, + "learning_rate": 1.6023793757291655e-06, + "loss": 0.5027, + "step": 11446 + }, + { + "epoch": 1.8570733290071382, + "grad_norm": 0.5933324897557121, + "learning_rate": 1.6019808312859061e-06, + "loss": 0.5169, + "step": 11447 + }, + { + "epoch": 1.8572355613238156, + "grad_norm": 0.6123311602662765, + "learning_rate": 1.6015823130442244e-06, + "loss": 0.5186, + "step": 11448 + }, + { + "epoch": 1.857397793640493, + "grad_norm": 0.6082559526497733, + "learning_rate": 1.601183821015747e-06, + "loss": 0.5134, + "step": 11449 + }, + { + "epoch": 1.8575600259571705, + "grad_norm": 0.6027178097408057, + "learning_rate": 1.6007853552121025e-06, + "loss": 0.5228, + "step": 11450 + }, + { + "epoch": 1.8577222582738482, + "grad_norm": 0.6038337446443436, + "learning_rate": 1.600386915644915e-06, + "loss": 0.4993, + "step": 11451 + }, + { + "epoch": 1.8578844905905256, + "grad_norm": 0.6251331090964533, + "learning_rate": 1.5999885023258099e-06, + "loss": 0.5399, + "step": 11452 + }, + { + "epoch": 1.858046722907203, + "grad_norm": 0.6120026316715464, + "learning_rate": 1.5995901152664134e-06, + "loss": 0.5306, + "step": 11453 + }, + { + "epoch": 1.8582089552238807, + "grad_norm": 0.5882962293974447, + "learning_rate": 1.5991917544783473e-06, + "loss": 0.4969, + "step": 11454 + }, + { + "epoch": 1.8583711875405582, + "grad_norm": 0.6075033260012223, + "learning_rate": 1.5987934199732364e-06, + "loss": 0.5073, + "step": 11455 + }, + { + "epoch": 1.8585334198572356, + "grad_norm": 0.6035381189460441, + "learning_rate": 1.5983951117627019e-06, + "loss": 0.5072, + "step": 11456 + }, + { + "epoch": 1.858695652173913, + "grad_norm": 0.5800991224782038, + "learning_rate": 1.5979968298583654e-06, + "loss": 0.4962, + "step": 11457 + }, + { + "epoch": 1.8588578844905905, + "grad_norm": 0.6188498390985595, + "learning_rate": 1.597598574271847e-06, + "loss": 0.4918, + "step": 11458 + }, + { + "epoch": 1.859020116807268, + "grad_norm": 0.6291109394674358, + "learning_rate": 1.597200345014768e-06, + "loss": 0.5141, + "step": 11459 + }, + { + "epoch": 1.8591823491239454, + "grad_norm": 0.6300289054260102, + "learning_rate": 1.5968021420987474e-06, + "loss": 0.4891, + "step": 11460 + }, + { + "epoch": 1.8593445814406229, + "grad_norm": 0.619938375529375, + "learning_rate": 1.5964039655354019e-06, + "loss": 0.5133, + "step": 11461 + }, + { + "epoch": 1.8595068137573003, + "grad_norm": 0.5940988732266774, + "learning_rate": 1.5960058153363505e-06, + "loss": 0.4769, + "step": 11462 + }, + { + "epoch": 1.859669046073978, + "grad_norm": 0.5919340493419352, + "learning_rate": 1.59560769151321e-06, + "loss": 0.5038, + "step": 11463 + }, + { + "epoch": 1.8598312783906554, + "grad_norm": 0.5893379578574252, + "learning_rate": 1.5952095940775963e-06, + "loss": 0.4847, + "step": 11464 + }, + { + "epoch": 1.8599935107073329, + "grad_norm": 0.5690154420519328, + "learning_rate": 1.5948115230411255e-06, + "loss": 0.5117, + "step": 11465 + }, + { + "epoch": 1.8601557430240105, + "grad_norm": 0.5975697586835735, + "learning_rate": 1.5944134784154103e-06, + "loss": 0.5019, + "step": 11466 + }, + { + "epoch": 1.860317975340688, + "grad_norm": 0.6216300237363876, + "learning_rate": 1.594015460212066e-06, + "loss": 0.5085, + "step": 11467 + }, + { + "epoch": 1.8604802076573654, + "grad_norm": 0.6028884783802777, + "learning_rate": 1.5936174684427053e-06, + "loss": 0.5003, + "step": 11468 + }, + { + "epoch": 1.8606424399740429, + "grad_norm": 0.5910859707955108, + "learning_rate": 1.5932195031189401e-06, + "loss": 0.4991, + "step": 11469 + }, + { + "epoch": 1.8608046722907203, + "grad_norm": 0.596307295057696, + "learning_rate": 1.592821564252383e-06, + "loss": 0.5115, + "step": 11470 + }, + { + "epoch": 1.8609669046073978, + "grad_norm": 0.5926770826259208, + "learning_rate": 1.5924236518546432e-06, + "loss": 0.4978, + "step": 11471 + }, + { + "epoch": 1.8611291369240752, + "grad_norm": 0.6062880239058022, + "learning_rate": 1.5920257659373306e-06, + "loss": 0.526, + "step": 11472 + }, + { + "epoch": 1.8612913692407527, + "grad_norm": 0.6147586851706983, + "learning_rate": 1.5916279065120555e-06, + "loss": 0.4983, + "step": 11473 + }, + { + "epoch": 1.8614536015574301, + "grad_norm": 0.6015500228429278, + "learning_rate": 1.5912300735904252e-06, + "loss": 0.5273, + "step": 11474 + }, + { + "epoch": 1.8616158338741076, + "grad_norm": 0.6059691057228654, + "learning_rate": 1.5908322671840492e-06, + "loss": 0.5157, + "step": 11475 + }, + { + "epoch": 1.8617780661907852, + "grad_norm": 0.5767849858328081, + "learning_rate": 1.590434487304532e-06, + "loss": 0.5073, + "step": 11476 + }, + { + "epoch": 1.8619402985074627, + "grad_norm": 0.596900211083578, + "learning_rate": 1.5900367339634803e-06, + "loss": 0.5001, + "step": 11477 + }, + { + "epoch": 1.8621025308241401, + "grad_norm": 0.6045985285041556, + "learning_rate": 1.5896390071725005e-06, + "loss": 0.5387, + "step": 11478 + }, + { + "epoch": 1.8622647631408178, + "grad_norm": 0.6211614201337146, + "learning_rate": 1.5892413069431956e-06, + "loss": 0.5018, + "step": 11479 + }, + { + "epoch": 1.8624269954574952, + "grad_norm": 0.6245795021346281, + "learning_rate": 1.5888436332871703e-06, + "loss": 0.5217, + "step": 11480 + }, + { + "epoch": 1.8625892277741727, + "grad_norm": 0.5854994592102237, + "learning_rate": 1.5884459862160278e-06, + "loss": 0.5067, + "step": 11481 + }, + { + "epoch": 1.8627514600908501, + "grad_norm": 0.6173163092342516, + "learning_rate": 1.5880483657413697e-06, + "loss": 0.5267, + "step": 11482 + }, + { + "epoch": 1.8629136924075276, + "grad_norm": 0.6540009303197051, + "learning_rate": 1.5876507718747969e-06, + "loss": 0.5643, + "step": 11483 + }, + { + "epoch": 1.863075924724205, + "grad_norm": 0.6146293616044065, + "learning_rate": 1.587253204627911e-06, + "loss": 0.4824, + "step": 11484 + }, + { + "epoch": 1.8632381570408825, + "grad_norm": 0.5697339935208188, + "learning_rate": 1.5868556640123109e-06, + "loss": 0.4937, + "step": 11485 + }, + { + "epoch": 1.86340038935756, + "grad_norm": 0.6114388195854447, + "learning_rate": 1.5864581500395973e-06, + "loss": 0.5365, + "step": 11486 + }, + { + "epoch": 1.8635626216742374, + "grad_norm": 0.5759570945435359, + "learning_rate": 1.5860606627213671e-06, + "loss": 0.5127, + "step": 11487 + }, + { + "epoch": 1.863724853990915, + "grad_norm": 0.5833283175809382, + "learning_rate": 1.5856632020692175e-06, + "loss": 0.4976, + "step": 11488 + }, + { + "epoch": 1.8638870863075925, + "grad_norm": 0.5970694530085895, + "learning_rate": 1.5852657680947464e-06, + "loss": 0.4626, + "step": 11489 + }, + { + "epoch": 1.86404931862427, + "grad_norm": 0.6011324809922306, + "learning_rate": 1.5848683608095488e-06, + "loss": 0.5189, + "step": 11490 + }, + { + "epoch": 1.8642115509409476, + "grad_norm": 0.5982828925044337, + "learning_rate": 1.5844709802252216e-06, + "loss": 0.517, + "step": 11491 + }, + { + "epoch": 1.864373783257625, + "grad_norm": 0.6213628527060149, + "learning_rate": 1.5840736263533568e-06, + "loss": 0.5184, + "step": 11492 + }, + { + "epoch": 1.8645360155743025, + "grad_norm": 0.6062337695163417, + "learning_rate": 1.5836762992055496e-06, + "loss": 0.5075, + "step": 11493 + }, + { + "epoch": 1.86469824789098, + "grad_norm": 0.6283091547949318, + "learning_rate": 1.5832789987933918e-06, + "loss": 0.5222, + "step": 11494 + }, + { + "epoch": 1.8648604802076574, + "grad_norm": 0.6068015179355024, + "learning_rate": 1.5828817251284767e-06, + "loss": 0.5234, + "step": 11495 + }, + { + "epoch": 1.8650227125243348, + "grad_norm": 0.6164064202030451, + "learning_rate": 1.5824844782223956e-06, + "loss": 0.5212, + "step": 11496 + }, + { + "epoch": 1.8651849448410123, + "grad_norm": 0.6063381460200687, + "learning_rate": 1.5820872580867372e-06, + "loss": 0.5178, + "step": 11497 + }, + { + "epoch": 1.8653471771576897, + "grad_norm": 0.5818946290569923, + "learning_rate": 1.581690064733093e-06, + "loss": 0.4843, + "step": 11498 + }, + { + "epoch": 1.8655094094743672, + "grad_norm": 0.6254044199451284, + "learning_rate": 1.5812928981730508e-06, + "loss": 0.5036, + "step": 11499 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.6053764432352179, + "learning_rate": 1.5808957584181997e-06, + "loss": 0.4866, + "step": 11500 + }, + { + "epoch": 1.8658338741077223, + "grad_norm": 0.6072488229367655, + "learning_rate": 1.5804986454801275e-06, + "loss": 0.5116, + "step": 11501 + }, + { + "epoch": 1.8659961064243997, + "grad_norm": 0.5920542333039592, + "learning_rate": 1.5801015593704187e-06, + "loss": 0.5423, + "step": 11502 + }, + { + "epoch": 1.8661583387410772, + "grad_norm": 0.599891426072584, + "learning_rate": 1.5797045001006609e-06, + "loss": 0.5245, + "step": 11503 + }, + { + "epoch": 1.8663205710577548, + "grad_norm": 0.6151459983025012, + "learning_rate": 1.5793074676824387e-06, + "loss": 0.5014, + "step": 11504 + }, + { + "epoch": 1.8664828033744323, + "grad_norm": 0.6164032562427728, + "learning_rate": 1.5789104621273358e-06, + "loss": 0.5586, + "step": 11505 + }, + { + "epoch": 1.8666450356911097, + "grad_norm": 0.5799003790999868, + "learning_rate": 1.578513483446937e-06, + "loss": 0.5122, + "step": 11506 + }, + { + "epoch": 1.8668072680077872, + "grad_norm": 0.6266179448518993, + "learning_rate": 1.5781165316528239e-06, + "loss": 0.5071, + "step": 11507 + }, + { + "epoch": 1.8669695003244646, + "grad_norm": 0.5975466329600413, + "learning_rate": 1.5777196067565778e-06, + "loss": 0.5031, + "step": 11508 + }, + { + "epoch": 1.867131732641142, + "grad_norm": 0.589581054509312, + "learning_rate": 1.5773227087697811e-06, + "loss": 0.5212, + "step": 11509 + }, + { + "epoch": 1.8672939649578195, + "grad_norm": 0.6169413487996085, + "learning_rate": 1.5769258377040135e-06, + "loss": 0.5126, + "step": 11510 + }, + { + "epoch": 1.867456197274497, + "grad_norm": 0.5849679425099638, + "learning_rate": 1.5765289935708556e-06, + "loss": 0.5042, + "step": 11511 + }, + { + "epoch": 1.8676184295911744, + "grad_norm": 0.5904975166794424, + "learning_rate": 1.5761321763818842e-06, + "loss": 0.5133, + "step": 11512 + }, + { + "epoch": 1.867780661907852, + "grad_norm": 0.5738424565054787, + "learning_rate": 1.5757353861486785e-06, + "loss": 0.5133, + "step": 11513 + }, + { + "epoch": 1.8679428942245295, + "grad_norm": 0.5708949498083916, + "learning_rate": 1.5753386228828155e-06, + "loss": 0.4835, + "step": 11514 + }, + { + "epoch": 1.868105126541207, + "grad_norm": 0.6242487604935638, + "learning_rate": 1.5749418865958719e-06, + "loss": 0.5274, + "step": 11515 + }, + { + "epoch": 1.8682673588578846, + "grad_norm": 0.607005650810144, + "learning_rate": 1.5745451772994234e-06, + "loss": 0.5366, + "step": 11516 + }, + { + "epoch": 1.868429591174562, + "grad_norm": 0.5920626164718452, + "learning_rate": 1.5741484950050432e-06, + "loss": 0.4989, + "step": 11517 + }, + { + "epoch": 1.8685918234912395, + "grad_norm": 0.6395482503741964, + "learning_rate": 1.5737518397243074e-06, + "loss": 0.5203, + "step": 11518 + }, + { + "epoch": 1.868754055807917, + "grad_norm": 0.6293086327119772, + "learning_rate": 1.5733552114687873e-06, + "loss": 0.5082, + "step": 11519 + }, + { + "epoch": 1.8689162881245944, + "grad_norm": 0.606926955311699, + "learning_rate": 1.5729586102500575e-06, + "loss": 0.4774, + "step": 11520 + }, + { + "epoch": 1.8690785204412719, + "grad_norm": 0.5750572833676181, + "learning_rate": 1.572562036079689e-06, + "loss": 0.5013, + "step": 11521 + }, + { + "epoch": 1.8692407527579493, + "grad_norm": 0.667860560408849, + "learning_rate": 1.5721654889692508e-06, + "loss": 0.4973, + "step": 11522 + }, + { + "epoch": 1.8694029850746268, + "grad_norm": 0.6131529003679014, + "learning_rate": 1.5717689689303154e-06, + "loss": 0.5435, + "step": 11523 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.6108259478294106, + "learning_rate": 1.5713724759744503e-06, + "loss": 0.5319, + "step": 11524 + }, + { + "epoch": 1.8697274497079819, + "grad_norm": 0.6062643809750914, + "learning_rate": 1.570976010113226e-06, + "loss": 0.4843, + "step": 11525 + }, + { + "epoch": 1.8698896820246593, + "grad_norm": 0.6915809104965829, + "learning_rate": 1.5705795713582079e-06, + "loss": 0.4999, + "step": 11526 + }, + { + "epoch": 1.8700519143413368, + "grad_norm": 0.654770979541673, + "learning_rate": 1.570183159720966e-06, + "loss": 0.5593, + "step": 11527 + }, + { + "epoch": 1.8702141466580144, + "grad_norm": 0.6421095164050282, + "learning_rate": 1.569786775213063e-06, + "loss": 0.5004, + "step": 11528 + }, + { + "epoch": 1.8703763789746919, + "grad_norm": 0.588057359089971, + "learning_rate": 1.569390417846066e-06, + "loss": 0.4932, + "step": 11529 + }, + { + "epoch": 1.8705386112913693, + "grad_norm": 0.5876373101102285, + "learning_rate": 1.568994087631539e-06, + "loss": 0.4834, + "step": 11530 + }, + { + "epoch": 1.8707008436080468, + "grad_norm": 0.6147271485024041, + "learning_rate": 1.568597784581047e-06, + "loss": 0.5182, + "step": 11531 + }, + { + "epoch": 1.8708630759247242, + "grad_norm": 0.6021793084968708, + "learning_rate": 1.568201508706152e-06, + "loss": 0.5231, + "step": 11532 + }, + { + "epoch": 1.8710253082414017, + "grad_norm": 0.6467943487636882, + "learning_rate": 1.5678052600184152e-06, + "loss": 0.5225, + "step": 11533 + }, + { + "epoch": 1.8711875405580791, + "grad_norm": 0.619497123162423, + "learning_rate": 1.5674090385294e-06, + "loss": 0.5337, + "step": 11534 + }, + { + "epoch": 1.8713497728747566, + "grad_norm": 0.6419830050277444, + "learning_rate": 1.567012844250665e-06, + "loss": 0.4748, + "step": 11535 + }, + { + "epoch": 1.871512005191434, + "grad_norm": 0.5919532551003505, + "learning_rate": 1.566616677193772e-06, + "loss": 0.5039, + "step": 11536 + }, + { + "epoch": 1.8716742375081115, + "grad_norm": 0.5635799434792303, + "learning_rate": 1.5662205373702788e-06, + "loss": 0.4753, + "step": 11537 + }, + { + "epoch": 1.8718364698247891, + "grad_norm": 0.6099614845018879, + "learning_rate": 1.5658244247917437e-06, + "loss": 0.5319, + "step": 11538 + }, + { + "epoch": 1.8719987021414666, + "grad_norm": 0.6309478216797457, + "learning_rate": 1.5654283394697239e-06, + "loss": 0.4999, + "step": 11539 + }, + { + "epoch": 1.872160934458144, + "grad_norm": 0.639143905840933, + "learning_rate": 1.5650322814157764e-06, + "loss": 0.5419, + "step": 11540 + }, + { + "epoch": 1.8723231667748217, + "grad_norm": 0.6172709891201319, + "learning_rate": 1.5646362506414564e-06, + "loss": 0.511, + "step": 11541 + }, + { + "epoch": 1.8724853990914991, + "grad_norm": 0.6051162733631662, + "learning_rate": 1.5642402471583206e-06, + "loss": 0.467, + "step": 11542 + }, + { + "epoch": 1.8726476314081766, + "grad_norm": 0.6377248057466437, + "learning_rate": 1.5638442709779216e-06, + "loss": 0.5124, + "step": 11543 + }, + { + "epoch": 1.872809863724854, + "grad_norm": 0.5871840129018374, + "learning_rate": 1.5634483221118129e-06, + "loss": 0.4799, + "step": 11544 + }, + { + "epoch": 1.8729720960415315, + "grad_norm": 0.6170198261455608, + "learning_rate": 1.563052400571548e-06, + "loss": 0.5375, + "step": 11545 + }, + { + "epoch": 1.873134328358209, + "grad_norm": 0.5867251160671647, + "learning_rate": 1.5626565063686777e-06, + "loss": 0.5149, + "step": 11546 + }, + { + "epoch": 1.8732965606748864, + "grad_norm": 0.6059200649550578, + "learning_rate": 1.562260639514755e-06, + "loss": 0.4774, + "step": 11547 + }, + { + "epoch": 1.8734587929915638, + "grad_norm": 0.6527178384246655, + "learning_rate": 1.5618648000213277e-06, + "loss": 0.4641, + "step": 11548 + }, + { + "epoch": 1.8736210253082413, + "grad_norm": 0.6138355646317143, + "learning_rate": 1.561468987899946e-06, + "loss": 0.5461, + "step": 11549 + }, + { + "epoch": 1.873783257624919, + "grad_norm": 0.6264373745470153, + "learning_rate": 1.561073203162159e-06, + "loss": 0.5069, + "step": 11550 + }, + { + "epoch": 1.8739454899415964, + "grad_norm": 0.6420538209717119, + "learning_rate": 1.5606774458195145e-06, + "loss": 0.5247, + "step": 11551 + }, + { + "epoch": 1.8741077222582738, + "grad_norm": 0.5864448841810632, + "learning_rate": 1.5602817158835604e-06, + "loss": 0.5176, + "step": 11552 + }, + { + "epoch": 1.8742699545749515, + "grad_norm": 0.6220788221255638, + "learning_rate": 1.5598860133658406e-06, + "loss": 0.4872, + "step": 11553 + }, + { + "epoch": 1.874432186891629, + "grad_norm": 0.6113997664192421, + "learning_rate": 1.5594903382779021e-06, + "loss": 0.4963, + "step": 11554 + }, + { + "epoch": 1.8745944192083064, + "grad_norm": 0.5859439931558679, + "learning_rate": 1.5590946906312892e-06, + "loss": 0.4865, + "step": 11555 + }, + { + "epoch": 1.8747566515249838, + "grad_norm": 0.5892784130322859, + "learning_rate": 1.5586990704375465e-06, + "loss": 0.4749, + "step": 11556 + }, + { + "epoch": 1.8749188838416613, + "grad_norm": 0.6099838036325692, + "learning_rate": 1.5583034777082167e-06, + "loss": 0.5165, + "step": 11557 + }, + { + "epoch": 1.8750811161583387, + "grad_norm": 0.6059138153091809, + "learning_rate": 1.5579079124548407e-06, + "loss": 0.5203, + "step": 11558 + }, + { + "epoch": 1.8752433484750162, + "grad_norm": 0.5961016550738496, + "learning_rate": 1.5575123746889619e-06, + "loss": 0.5501, + "step": 11559 + }, + { + "epoch": 1.8754055807916936, + "grad_norm": 0.6128704312654987, + "learning_rate": 1.5571168644221191e-06, + "loss": 0.522, + "step": 11560 + }, + { + "epoch": 1.875567813108371, + "grad_norm": 0.6257745917557785, + "learning_rate": 1.5567213816658539e-06, + "loss": 0.5266, + "step": 11561 + }, + { + "epoch": 1.8757300454250485, + "grad_norm": 0.624687866401175, + "learning_rate": 1.5563259264317048e-06, + "loss": 0.4723, + "step": 11562 + }, + { + "epoch": 1.8758922777417262, + "grad_norm": 0.6017175788992835, + "learning_rate": 1.5559304987312095e-06, + "loss": 0.5221, + "step": 11563 + }, + { + "epoch": 1.8760545100584036, + "grad_norm": 0.7289246033788195, + "learning_rate": 1.555535098575905e-06, + "loss": 0.4881, + "step": 11564 + }, + { + "epoch": 1.876216742375081, + "grad_norm": 0.6174518975520109, + "learning_rate": 1.555139725977329e-06, + "loss": 0.5122, + "step": 11565 + }, + { + "epoch": 1.8763789746917587, + "grad_norm": 0.5870243397179102, + "learning_rate": 1.5547443809470165e-06, + "loss": 0.4818, + "step": 11566 + }, + { + "epoch": 1.8765412070084362, + "grad_norm": 0.589767904681272, + "learning_rate": 1.5543490634965041e-06, + "loss": 0.5162, + "step": 11567 + }, + { + "epoch": 1.8767034393251136, + "grad_norm": 0.6450101057530386, + "learning_rate": 1.5539537736373239e-06, + "loss": 0.5191, + "step": 11568 + }, + { + "epoch": 1.876865671641791, + "grad_norm": 0.6331515706172369, + "learning_rate": 1.55355851138101e-06, + "loss": 0.5237, + "step": 11569 + }, + { + "epoch": 1.8770279039584685, + "grad_norm": 0.6082242924092942, + "learning_rate": 1.5531632767390957e-06, + "loss": 0.4982, + "step": 11570 + }, + { + "epoch": 1.877190136275146, + "grad_norm": 0.6002433569792351, + "learning_rate": 1.5527680697231117e-06, + "loss": 0.5313, + "step": 11571 + }, + { + "epoch": 1.8773523685918234, + "grad_norm": 0.6366586401362649, + "learning_rate": 1.5523728903445907e-06, + "loss": 0.5138, + "step": 11572 + }, + { + "epoch": 1.8775146009085009, + "grad_norm": 0.6044500657576436, + "learning_rate": 1.5519777386150616e-06, + "loss": 0.5039, + "step": 11573 + }, + { + "epoch": 1.8776768332251783, + "grad_norm": 0.5940264874052014, + "learning_rate": 1.551582614546054e-06, + "loss": 0.5142, + "step": 11574 + }, + { + "epoch": 1.877839065541856, + "grad_norm": 0.6226244488818242, + "learning_rate": 1.5511875181490955e-06, + "loss": 0.53, + "step": 11575 + }, + { + "epoch": 1.8780012978585334, + "grad_norm": 0.6322906856995792, + "learning_rate": 1.5507924494357155e-06, + "loss": 0.5031, + "step": 11576 + }, + { + "epoch": 1.8781635301752109, + "grad_norm": 0.638522674604262, + "learning_rate": 1.5503974084174394e-06, + "loss": 0.5133, + "step": 11577 + }, + { + "epoch": 1.8783257624918885, + "grad_norm": 0.6859202885685325, + "learning_rate": 1.5500023951057958e-06, + "loss": 0.509, + "step": 11578 + }, + { + "epoch": 1.878487994808566, + "grad_norm": 0.6251738888832351, + "learning_rate": 1.5496074095123076e-06, + "loss": 0.5323, + "step": 11579 + }, + { + "epoch": 1.8786502271252434, + "grad_norm": 0.6081975971557009, + "learning_rate": 1.5492124516484997e-06, + "loss": 0.4923, + "step": 11580 + }, + { + "epoch": 1.8788124594419209, + "grad_norm": 0.5907363574703178, + "learning_rate": 1.5488175215258965e-06, + "loss": 0.5332, + "step": 11581 + }, + { + "epoch": 1.8789746917585983, + "grad_norm": 0.6086133793567741, + "learning_rate": 1.5484226191560204e-06, + "loss": 0.4954, + "step": 11582 + }, + { + "epoch": 1.8791369240752758, + "grad_norm": 0.6140409443691676, + "learning_rate": 1.5480277445503955e-06, + "loss": 0.5188, + "step": 11583 + }, + { + "epoch": 1.8792991563919532, + "grad_norm": 0.6109202565382693, + "learning_rate": 1.5476328977205396e-06, + "loss": 0.5151, + "step": 11584 + }, + { + "epoch": 1.8794613887086307, + "grad_norm": 0.6537882966737466, + "learning_rate": 1.5472380786779754e-06, + "loss": 0.5162, + "step": 11585 + }, + { + "epoch": 1.8796236210253081, + "grad_norm": 0.5861056193980185, + "learning_rate": 1.5468432874342214e-06, + "loss": 0.5175, + "step": 11586 + }, + { + "epoch": 1.8797858533419856, + "grad_norm": 0.6058641301174528, + "learning_rate": 1.546448524000798e-06, + "loss": 0.5133, + "step": 11587 + }, + { + "epoch": 1.8799480856586632, + "grad_norm": 0.5883560719644209, + "learning_rate": 1.5460537883892228e-06, + "loss": 0.4668, + "step": 11588 + }, + { + "epoch": 1.8801103179753407, + "grad_norm": 0.5734480864860648, + "learning_rate": 1.5456590806110118e-06, + "loss": 0.5088, + "step": 11589 + }, + { + "epoch": 1.8802725502920181, + "grad_norm": 0.5783084587328756, + "learning_rate": 1.5452644006776826e-06, + "loss": 0.5196, + "step": 11590 + }, + { + "epoch": 1.8804347826086958, + "grad_norm": 0.6322425834242251, + "learning_rate": 1.54486974860075e-06, + "loss": 0.5294, + "step": 11591 + }, + { + "epoch": 1.8805970149253732, + "grad_norm": 0.5959785892243602, + "learning_rate": 1.5444751243917296e-06, + "loss": 0.5313, + "step": 11592 + }, + { + "epoch": 1.8807592472420507, + "grad_norm": 0.6259685248241473, + "learning_rate": 1.5440805280621356e-06, + "loss": 0.5407, + "step": 11593 + }, + { + "epoch": 1.8809214795587281, + "grad_norm": 0.6144333072091026, + "learning_rate": 1.54368595962348e-06, + "loss": 0.5275, + "step": 11594 + }, + { + "epoch": 1.8810837118754056, + "grad_norm": 0.6016251493529707, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.5084, + "step": 11595 + }, + { + "epoch": 1.881245944192083, + "grad_norm": 0.5969030024695237, + "learning_rate": 1.5428969064650347e-06, + "loss": 0.5433, + "step": 11596 + }, + { + "epoch": 1.8814081765087605, + "grad_norm": 0.6271400285575607, + "learning_rate": 1.5425024217682666e-06, + "loss": 0.5364, + "step": 11597 + }, + { + "epoch": 1.881570408825438, + "grad_norm": 0.6450358448413924, + "learning_rate": 1.5421079650084833e-06, + "loss": 0.5559, + "step": 11598 + }, + { + "epoch": 1.8817326411421154, + "grad_norm": 0.5986048067675878, + "learning_rate": 1.5417135361971918e-06, + "loss": 0.5021, + "step": 11599 + }, + { + "epoch": 1.881894873458793, + "grad_norm": 0.5948195467691196, + "learning_rate": 1.5413191353459012e-06, + "loss": 0.5284, + "step": 11600 + }, + { + "epoch": 1.8820571057754705, + "grad_norm": 0.6173428751280117, + "learning_rate": 1.5409247624661192e-06, + "loss": 0.5028, + "step": 11601 + }, + { + "epoch": 1.882219338092148, + "grad_norm": 0.6101803945443477, + "learning_rate": 1.5405304175693516e-06, + "loss": 0.4794, + "step": 11602 + }, + { + "epoch": 1.8823815704088256, + "grad_norm": 0.5992043735940652, + "learning_rate": 1.5401361006671062e-06, + "loss": 0.5159, + "step": 11603 + }, + { + "epoch": 1.882543802725503, + "grad_norm": 0.6081066743564684, + "learning_rate": 1.539741811770886e-06, + "loss": 0.5248, + "step": 11604 + }, + { + "epoch": 1.8827060350421805, + "grad_norm": 0.6410819478835598, + "learning_rate": 1.5393475508921952e-06, + "loss": 0.5077, + "step": 11605 + }, + { + "epoch": 1.882868267358858, + "grad_norm": 0.6108556673073314, + "learning_rate": 1.5389533180425387e-06, + "loss": 0.5139, + "step": 11606 + }, + { + "epoch": 1.8830304996755354, + "grad_norm": 0.5996080135780417, + "learning_rate": 1.5385591132334177e-06, + "loss": 0.5385, + "step": 11607 + }, + { + "epoch": 1.8831927319922128, + "grad_norm": 0.6108331502801086, + "learning_rate": 1.5381649364763358e-06, + "loss": 0.5032, + "step": 11608 + }, + { + "epoch": 1.8833549643088903, + "grad_norm": 0.6131305840297279, + "learning_rate": 1.5377707877827911e-06, + "loss": 0.5045, + "step": 11609 + }, + { + "epoch": 1.8835171966255677, + "grad_norm": 0.6234250528838647, + "learning_rate": 1.537376667164286e-06, + "loss": 0.4652, + "step": 11610 + }, + { + "epoch": 1.8836794289422452, + "grad_norm": 0.6083205005087209, + "learning_rate": 1.536982574632318e-06, + "loss": 0.545, + "step": 11611 + }, + { + "epoch": 1.8838416612589228, + "grad_norm": 0.6485304616205758, + "learning_rate": 1.5365885101983875e-06, + "loss": 0.5032, + "step": 11612 + }, + { + "epoch": 1.8840038935756003, + "grad_norm": 0.592957675052901, + "learning_rate": 1.5361944738739914e-06, + "loss": 0.5414, + "step": 11613 + }, + { + "epoch": 1.8841661258922777, + "grad_norm": 0.6538671749950026, + "learning_rate": 1.5358004656706255e-06, + "loss": 0.5017, + "step": 11614 + }, + { + "epoch": 1.8843283582089554, + "grad_norm": 0.6202767621233963, + "learning_rate": 1.535406485599787e-06, + "loss": 0.5078, + "step": 11615 + }, + { + "epoch": 1.8844905905256328, + "grad_norm": 0.5922792694192649, + "learning_rate": 1.5350125336729704e-06, + "loss": 0.5119, + "step": 11616 + }, + { + "epoch": 1.8846528228423103, + "grad_norm": 0.6067056861323525, + "learning_rate": 1.534618609901671e-06, + "loss": 0.4895, + "step": 11617 + }, + { + "epoch": 1.8848150551589877, + "grad_norm": 0.6209445629035113, + "learning_rate": 1.534224714297382e-06, + "loss": 0.4958, + "step": 11618 + }, + { + "epoch": 1.8849772874756652, + "grad_norm": 0.5783339964500406, + "learning_rate": 1.5338308468715958e-06, + "loss": 0.5207, + "step": 11619 + }, + { + "epoch": 1.8851395197923426, + "grad_norm": 0.5801750741836428, + "learning_rate": 1.5334370076358036e-06, + "loss": 0.5127, + "step": 11620 + }, + { + "epoch": 1.88530175210902, + "grad_norm": 0.6010067469899459, + "learning_rate": 1.5330431966014978e-06, + "loss": 0.5274, + "step": 11621 + }, + { + "epoch": 1.8854639844256975, + "grad_norm": 0.6001738550043657, + "learning_rate": 1.5326494137801675e-06, + "loss": 0.5255, + "step": 11622 + }, + { + "epoch": 1.885626216742375, + "grad_norm": 0.5935667229468623, + "learning_rate": 1.5322556591833031e-06, + "loss": 0.5076, + "step": 11623 + }, + { + "epoch": 1.8857884490590524, + "grad_norm": 0.626183188840731, + "learning_rate": 1.531861932822394e-06, + "loss": 0.5136, + "step": 11624 + }, + { + "epoch": 1.88595068137573, + "grad_norm": 0.6216207468130596, + "learning_rate": 1.5314682347089255e-06, + "loss": 0.4656, + "step": 11625 + }, + { + "epoch": 1.8861129136924075, + "grad_norm": 0.612652009472653, + "learning_rate": 1.5310745648543861e-06, + "loss": 0.5149, + "step": 11626 + }, + { + "epoch": 1.886275146009085, + "grad_norm": 0.6224827172911727, + "learning_rate": 1.5306809232702615e-06, + "loss": 0.5357, + "step": 11627 + }, + { + "epoch": 1.8864373783257626, + "grad_norm": 0.602449911097313, + "learning_rate": 1.5302873099680378e-06, + "loss": 0.4763, + "step": 11628 + }, + { + "epoch": 1.88659961064244, + "grad_norm": 0.5739440790041626, + "learning_rate": 1.5298937249591994e-06, + "loss": 0.5287, + "step": 11629 + }, + { + "epoch": 1.8867618429591175, + "grad_norm": 0.5790675110041966, + "learning_rate": 1.5295001682552291e-06, + "loss": 0.5174, + "step": 11630 + }, + { + "epoch": 1.886924075275795, + "grad_norm": 0.6730525059745887, + "learning_rate": 1.5291066398676097e-06, + "loss": 0.5119, + "step": 11631 + }, + { + "epoch": 1.8870863075924724, + "grad_norm": 0.613124493483122, + "learning_rate": 1.5287131398078236e-06, + "loss": 0.5146, + "step": 11632 + }, + { + "epoch": 1.8872485399091499, + "grad_norm": 0.6109208290160167, + "learning_rate": 1.5283196680873518e-06, + "loss": 0.5135, + "step": 11633 + }, + { + "epoch": 1.8874107722258273, + "grad_norm": 0.5884916283264456, + "learning_rate": 1.5279262247176763e-06, + "loss": 0.4851, + "step": 11634 + }, + { + "epoch": 1.8875730045425048, + "grad_norm": 0.5941428725940231, + "learning_rate": 1.5275328097102743e-06, + "loss": 0.5085, + "step": 11635 + }, + { + "epoch": 1.8877352368591822, + "grad_norm": 0.6567199677153351, + "learning_rate": 1.527139423076625e-06, + "loss": 0.5305, + "step": 11636 + }, + { + "epoch": 1.8878974691758599, + "grad_norm": 0.5970826024070528, + "learning_rate": 1.5267460648282074e-06, + "loss": 0.5317, + "step": 11637 + }, + { + "epoch": 1.8880597014925373, + "grad_norm": 0.6241015196705084, + "learning_rate": 1.526352734976497e-06, + "loss": 0.5233, + "step": 11638 + }, + { + "epoch": 1.8882219338092148, + "grad_norm": 0.5949235323722728, + "learning_rate": 1.525959433532972e-06, + "loss": 0.5033, + "step": 11639 + }, + { + "epoch": 1.8883841661258924, + "grad_norm": 0.6584241858411274, + "learning_rate": 1.5255661605091063e-06, + "loss": 0.5172, + "step": 11640 + }, + { + "epoch": 1.8885463984425699, + "grad_norm": 0.5885417627447224, + "learning_rate": 1.5251729159163742e-06, + "loss": 0.5119, + "step": 11641 + }, + { + "epoch": 1.8887086307592473, + "grad_norm": 0.5930974308260792, + "learning_rate": 1.5247796997662503e-06, + "loss": 0.4958, + "step": 11642 + }, + { + "epoch": 1.8888708630759248, + "grad_norm": 0.6078361676373853, + "learning_rate": 1.5243865120702073e-06, + "loss": 0.5163, + "step": 11643 + }, + { + "epoch": 1.8890330953926022, + "grad_norm": 0.5708291046219611, + "learning_rate": 1.5239933528397177e-06, + "loss": 0.5072, + "step": 11644 + }, + { + "epoch": 1.8891953277092797, + "grad_norm": 0.6684744144301197, + "learning_rate": 1.5236002220862517e-06, + "loss": 0.5105, + "step": 11645 + }, + { + "epoch": 1.8893575600259571, + "grad_norm": 0.5995425742410567, + "learning_rate": 1.52320711982128e-06, + "loss": 0.5017, + "step": 11646 + }, + { + "epoch": 1.8895197923426346, + "grad_norm": 0.593620659154426, + "learning_rate": 1.5228140460562724e-06, + "loss": 0.5202, + "step": 11647 + }, + { + "epoch": 1.889682024659312, + "grad_norm": 0.5879033958080966, + "learning_rate": 1.522421000802698e-06, + "loss": 0.5123, + "step": 11648 + }, + { + "epoch": 1.8898442569759895, + "grad_norm": 0.5888385192645964, + "learning_rate": 1.5220279840720248e-06, + "loss": 0.5399, + "step": 11649 + }, + { + "epoch": 1.8900064892926671, + "grad_norm": 0.6005034766517333, + "learning_rate": 1.5216349958757187e-06, + "loss": 0.5273, + "step": 11650 + }, + { + "epoch": 1.8901687216093446, + "grad_norm": 0.6005979192060394, + "learning_rate": 1.5212420362252472e-06, + "loss": 0.5348, + "step": 11651 + }, + { + "epoch": 1.890330953926022, + "grad_norm": 0.5876390039510317, + "learning_rate": 1.5208491051320745e-06, + "loss": 0.4687, + "step": 11652 + }, + { + "epoch": 1.8904931862426997, + "grad_norm": 0.6236656153035405, + "learning_rate": 1.5204562026076663e-06, + "loss": 0.5074, + "step": 11653 + }, + { + "epoch": 1.8906554185593771, + "grad_norm": 0.6045847594601436, + "learning_rate": 1.5200633286634869e-06, + "loss": 0.5194, + "step": 11654 + }, + { + "epoch": 1.8908176508760546, + "grad_norm": 0.6266754310241363, + "learning_rate": 1.5196704833109975e-06, + "loss": 0.4932, + "step": 11655 + }, + { + "epoch": 1.890979883192732, + "grad_norm": 0.5757777454946547, + "learning_rate": 1.5192776665616604e-06, + "loss": 0.5045, + "step": 11656 + }, + { + "epoch": 1.8911421155094095, + "grad_norm": 0.6125393469246482, + "learning_rate": 1.5188848784269378e-06, + "loss": 0.5301, + "step": 11657 + }, + { + "epoch": 1.891304347826087, + "grad_norm": 0.5892138290402501, + "learning_rate": 1.5184921189182893e-06, + "loss": 0.5105, + "step": 11658 + }, + { + "epoch": 1.8914665801427644, + "grad_norm": 0.6538777558151809, + "learning_rate": 1.518099388047176e-06, + "loss": 0.5385, + "step": 11659 + }, + { + "epoch": 1.8916288124594418, + "grad_norm": 0.610865230057948, + "learning_rate": 1.517706685825055e-06, + "loss": 0.516, + "step": 11660 + }, + { + "epoch": 1.8917910447761193, + "grad_norm": 0.5939230835141209, + "learning_rate": 1.517314012263384e-06, + "loss": 0.5236, + "step": 11661 + }, + { + "epoch": 1.891953277092797, + "grad_norm": 0.6135064651333704, + "learning_rate": 1.5169213673736213e-06, + "loss": 0.4928, + "step": 11662 + }, + { + "epoch": 1.8921155094094744, + "grad_norm": 0.6048675738786781, + "learning_rate": 1.516528751167222e-06, + "loss": 0.5083, + "step": 11663 + }, + { + "epoch": 1.8922777417261518, + "grad_norm": 0.6120337918581954, + "learning_rate": 1.5161361636556443e-06, + "loss": 0.5124, + "step": 11664 + }, + { + "epoch": 1.8924399740428295, + "grad_norm": 0.6222973402758889, + "learning_rate": 1.5157436048503384e-06, + "loss": 0.4745, + "step": 11665 + }, + { + "epoch": 1.892602206359507, + "grad_norm": 0.5966286238307733, + "learning_rate": 1.515351074762761e-06, + "loss": 0.4827, + "step": 11666 + }, + { + "epoch": 1.8927644386761844, + "grad_norm": 0.6229010509187453, + "learning_rate": 1.5149585734043636e-06, + "loss": 0.5203, + "step": 11667 + }, + { + "epoch": 1.8929266709928618, + "grad_norm": 0.6058406091215937, + "learning_rate": 1.5145661007865992e-06, + "loss": 0.4866, + "step": 11668 + }, + { + "epoch": 1.8930889033095393, + "grad_norm": 0.6447695370705223, + "learning_rate": 1.5141736569209181e-06, + "loss": 0.5445, + "step": 11669 + }, + { + "epoch": 1.8932511356262167, + "grad_norm": 0.6160361407486756, + "learning_rate": 1.5137812418187726e-06, + "loss": 0.487, + "step": 11670 + }, + { + "epoch": 1.8934133679428942, + "grad_norm": 0.6057318468925992, + "learning_rate": 1.5133888554916098e-06, + "loss": 0.5382, + "step": 11671 + }, + { + "epoch": 1.8935756002595716, + "grad_norm": 0.5936614192870667, + "learning_rate": 1.5129964979508792e-06, + "loss": 0.4877, + "step": 11672 + }, + { + "epoch": 1.893737832576249, + "grad_norm": 0.5846155887896552, + "learning_rate": 1.5126041692080293e-06, + "loss": 0.4954, + "step": 11673 + }, + { + "epoch": 1.8939000648929265, + "grad_norm": 0.6012631108853475, + "learning_rate": 1.512211869274506e-06, + "loss": 0.4958, + "step": 11674 + }, + { + "epoch": 1.8940622972096042, + "grad_norm": 0.6404784489896556, + "learning_rate": 1.5118195981617573e-06, + "loss": 0.5309, + "step": 11675 + }, + { + "epoch": 1.8942245295262816, + "grad_norm": 0.6050399134445188, + "learning_rate": 1.511427355881227e-06, + "loss": 0.5323, + "step": 11676 + }, + { + "epoch": 1.894386761842959, + "grad_norm": 0.6091584031413178, + "learning_rate": 1.5110351424443599e-06, + "loss": 0.5517, + "step": 11677 + }, + { + "epoch": 1.8945489941596367, + "grad_norm": 0.6102289068289577, + "learning_rate": 1.5106429578625987e-06, + "loss": 0.5048, + "step": 11678 + }, + { + "epoch": 1.8947112264763142, + "grad_norm": 0.5813439112973054, + "learning_rate": 1.510250802147388e-06, + "loss": 0.5026, + "step": 11679 + }, + { + "epoch": 1.8948734587929916, + "grad_norm": 0.5755562544372836, + "learning_rate": 1.5098586753101693e-06, + "loss": 0.5025, + "step": 11680 + }, + { + "epoch": 1.895035691109669, + "grad_norm": 0.6113787557984925, + "learning_rate": 1.5094665773623824e-06, + "loss": 0.4875, + "step": 11681 + }, + { + "epoch": 1.8951979234263465, + "grad_norm": 0.6086642870861342, + "learning_rate": 1.5090745083154693e-06, + "loss": 0.5544, + "step": 11682 + }, + { + "epoch": 1.895360155743024, + "grad_norm": 0.6198078807014412, + "learning_rate": 1.508682468180868e-06, + "loss": 0.5204, + "step": 11683 + }, + { + "epoch": 1.8955223880597014, + "grad_norm": 0.5837113331078755, + "learning_rate": 1.508290456970018e-06, + "loss": 0.5136, + "step": 11684 + }, + { + "epoch": 1.8956846203763789, + "grad_norm": 0.6538126406927184, + "learning_rate": 1.5078984746943576e-06, + "loss": 0.5215, + "step": 11685 + }, + { + "epoch": 1.8958468526930563, + "grad_norm": 0.6445662928735664, + "learning_rate": 1.5075065213653217e-06, + "loss": 0.5204, + "step": 11686 + }, + { + "epoch": 1.896009085009734, + "grad_norm": 0.5908074797184949, + "learning_rate": 1.507114596994348e-06, + "loss": 0.4887, + "step": 11687 + }, + { + "epoch": 1.8961713173264114, + "grad_norm": 0.6112676358788592, + "learning_rate": 1.5067227015928714e-06, + "loss": 0.514, + "step": 11688 + }, + { + "epoch": 1.8963335496430889, + "grad_norm": 0.6266394825061989, + "learning_rate": 1.5063308351723255e-06, + "loss": 0.5096, + "step": 11689 + }, + { + "epoch": 1.8964957819597665, + "grad_norm": 0.5989672738446407, + "learning_rate": 1.5059389977441455e-06, + "loss": 0.5094, + "step": 11690 + }, + { + "epoch": 1.896658014276444, + "grad_norm": 0.6020973208860897, + "learning_rate": 1.5055471893197626e-06, + "loss": 0.4946, + "step": 11691 + }, + { + "epoch": 1.8968202465931214, + "grad_norm": 0.5908325032498195, + "learning_rate": 1.505155409910608e-06, + "loss": 0.5242, + "step": 11692 + }, + { + "epoch": 1.8969824789097989, + "grad_norm": 0.6172936937472435, + "learning_rate": 1.5047636595281148e-06, + "loss": 0.5238, + "step": 11693 + }, + { + "epoch": 1.8971447112264763, + "grad_norm": 0.6038582647421267, + "learning_rate": 1.5043719381837113e-06, + "loss": 0.4919, + "step": 11694 + }, + { + "epoch": 1.8973069435431538, + "grad_norm": 0.627186432646179, + "learning_rate": 1.5039802458888287e-06, + "loss": 0.4867, + "step": 11695 + }, + { + "epoch": 1.8974691758598312, + "grad_norm": 0.600785700536105, + "learning_rate": 1.5035885826548934e-06, + "loss": 0.5427, + "step": 11696 + }, + { + "epoch": 1.8976314081765087, + "grad_norm": 0.6347339898386589, + "learning_rate": 1.5031969484933334e-06, + "loss": 0.497, + "step": 11697 + }, + { + "epoch": 1.897793640493186, + "grad_norm": 0.6440390235095524, + "learning_rate": 1.5028053434155769e-06, + "loss": 0.5122, + "step": 11698 + }, + { + "epoch": 1.8979558728098638, + "grad_norm": 0.6351747806249505, + "learning_rate": 1.5024137674330478e-06, + "loss": 0.4968, + "step": 11699 + }, + { + "epoch": 1.8981181051265412, + "grad_norm": 0.6152633962835958, + "learning_rate": 1.502022220557174e-06, + "loss": 0.515, + "step": 11700 + }, + { + "epoch": 1.8982803374432187, + "grad_norm": 0.581330551989143, + "learning_rate": 1.5016307027993756e-06, + "loss": 0.5159, + "step": 11701 + }, + { + "epoch": 1.8984425697598963, + "grad_norm": 0.588369488676862, + "learning_rate": 1.5012392141710791e-06, + "loss": 0.5224, + "step": 11702 + }, + { + "epoch": 1.8986048020765738, + "grad_norm": 0.6113826064522907, + "learning_rate": 1.5008477546837058e-06, + "loss": 0.5102, + "step": 11703 + }, + { + "epoch": 1.8987670343932512, + "grad_norm": 0.6180607240834148, + "learning_rate": 1.5004563243486778e-06, + "loss": 0.509, + "step": 11704 + }, + { + "epoch": 1.8989292667099287, + "grad_norm": 0.6061324037477922, + "learning_rate": 1.500064923177416e-06, + "loss": 0.4944, + "step": 11705 + }, + { + "epoch": 1.8990914990266061, + "grad_norm": 0.6076961710244917, + "learning_rate": 1.4996735511813395e-06, + "loss": 0.4905, + "step": 11706 + }, + { + "epoch": 1.8992537313432836, + "grad_norm": 0.6461048843553079, + "learning_rate": 1.4992822083718682e-06, + "loss": 0.4916, + "step": 11707 + }, + { + "epoch": 1.899415963659961, + "grad_norm": 0.6658561898756478, + "learning_rate": 1.4988908947604198e-06, + "loss": 0.5269, + "step": 11708 + }, + { + "epoch": 1.8995781959766385, + "grad_norm": 0.5972659716417926, + "learning_rate": 1.498499610358412e-06, + "loss": 0.4684, + "step": 11709 + }, + { + "epoch": 1.899740428293316, + "grad_norm": 0.641784767970355, + "learning_rate": 1.4981083551772624e-06, + "loss": 0.4957, + "step": 11710 + }, + { + "epoch": 1.8999026606099934, + "grad_norm": 0.6103320849269014, + "learning_rate": 1.497717129228385e-06, + "loss": 0.5694, + "step": 11711 + }, + { + "epoch": 1.900064892926671, + "grad_norm": 0.6162274529185374, + "learning_rate": 1.4973259325231943e-06, + "loss": 0.5178, + "step": 11712 + }, + { + "epoch": 1.9002271252433485, + "grad_norm": 0.5937336227834614, + "learning_rate": 1.496934765073106e-06, + "loss": 0.526, + "step": 11713 + }, + { + "epoch": 1.900389357560026, + "grad_norm": 0.598024444888522, + "learning_rate": 1.4965436268895316e-06, + "loss": 0.5354, + "step": 11714 + }, + { + "epoch": 1.9005515898767036, + "grad_norm": 0.600850774568674, + "learning_rate": 1.4961525179838849e-06, + "loss": 0.5367, + "step": 11715 + }, + { + "epoch": 1.900713822193381, + "grad_norm": 0.6170289525926401, + "learning_rate": 1.495761438367577e-06, + "loss": 0.5321, + "step": 11716 + }, + { + "epoch": 1.9008760545100585, + "grad_norm": 0.6397391695044529, + "learning_rate": 1.495370388052017e-06, + "loss": 0.5518, + "step": 11717 + }, + { + "epoch": 1.901038286826736, + "grad_norm": 0.6181920583852628, + "learning_rate": 1.4949793670486166e-06, + "loss": 0.4966, + "step": 11718 + }, + { + "epoch": 1.9012005191434134, + "grad_norm": 0.5796939282815049, + "learning_rate": 1.4945883753687828e-06, + "loss": 0.5274, + "step": 11719 + }, + { + "epoch": 1.9013627514600908, + "grad_norm": 0.600914752166036, + "learning_rate": 1.4941974130239251e-06, + "loss": 0.5409, + "step": 11720 + }, + { + "epoch": 1.9015249837767683, + "grad_norm": 0.614025158872316, + "learning_rate": 1.4938064800254506e-06, + "loss": 0.5071, + "step": 11721 + }, + { + "epoch": 1.9016872160934457, + "grad_norm": 0.6113601068865769, + "learning_rate": 1.4934155763847646e-06, + "loss": 0.4889, + "step": 11722 + }, + { + "epoch": 1.9018494484101232, + "grad_norm": 0.6294488073373535, + "learning_rate": 1.4930247021132724e-06, + "loss": 0.5165, + "step": 11723 + }, + { + "epoch": 1.9020116807268008, + "grad_norm": 0.611262051462962, + "learning_rate": 1.4926338572223798e-06, + "loss": 0.4746, + "step": 11724 + }, + { + "epoch": 1.9021739130434783, + "grad_norm": 0.6224663605581295, + "learning_rate": 1.492243041723489e-06, + "loss": 0.5223, + "step": 11725 + }, + { + "epoch": 1.9023361453601557, + "grad_norm": 0.6033744357387376, + "learning_rate": 1.4918522556280048e-06, + "loss": 0.493, + "step": 11726 + }, + { + "epoch": 1.9024983776768334, + "grad_norm": 0.6015025251029655, + "learning_rate": 1.4914614989473277e-06, + "loss": 0.5428, + "step": 11727 + }, + { + "epoch": 1.9026606099935108, + "grad_norm": 0.5902086635965201, + "learning_rate": 1.4910707716928587e-06, + "loss": 0.4708, + "step": 11728 + }, + { + "epoch": 1.9028228423101883, + "grad_norm": 0.5902747621761865, + "learning_rate": 1.4906800738759991e-06, + "loss": 0.5253, + "step": 11729 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.6425926655642036, + "learning_rate": 1.4902894055081477e-06, + "loss": 0.4919, + "step": 11730 + }, + { + "epoch": 1.9031473069435432, + "grad_norm": 0.6114542452801517, + "learning_rate": 1.4898987666007036e-06, + "loss": 0.5089, + "step": 11731 + }, + { + "epoch": 1.9033095392602206, + "grad_norm": 0.5908697303724435, + "learning_rate": 1.489508157165064e-06, + "loss": 0.5106, + "step": 11732 + }, + { + "epoch": 1.903471771576898, + "grad_norm": 0.5722264737356798, + "learning_rate": 1.4891175772126254e-06, + "loss": 0.5075, + "step": 11733 + }, + { + "epoch": 1.9036340038935755, + "grad_norm": 0.6145336336049979, + "learning_rate": 1.4887270267547845e-06, + "loss": 0.5095, + "step": 11734 + }, + { + "epoch": 1.903796236210253, + "grad_norm": 0.6228106048282331, + "learning_rate": 1.4883365058029364e-06, + "loss": 0.5246, + "step": 11735 + }, + { + "epoch": 1.9039584685269304, + "grad_norm": 0.610736951466802, + "learning_rate": 1.4879460143684755e-06, + "loss": 0.519, + "step": 11736 + }, + { + "epoch": 1.904120700843608, + "grad_norm": 0.6128670491615474, + "learning_rate": 1.4875555524627939e-06, + "loss": 0.4956, + "step": 11737 + }, + { + "epoch": 1.9042829331602855, + "grad_norm": 0.6299203480543686, + "learning_rate": 1.4871651200972854e-06, + "loss": 0.487, + "step": 11738 + }, + { + "epoch": 1.904445165476963, + "grad_norm": 0.6207415716619137, + "learning_rate": 1.4867747172833408e-06, + "loss": 0.5197, + "step": 11739 + }, + { + "epoch": 1.9046073977936406, + "grad_norm": 0.6354795459061966, + "learning_rate": 1.4863843440323516e-06, + "loss": 0.4786, + "step": 11740 + }, + { + "epoch": 1.904769630110318, + "grad_norm": 0.6042407095659971, + "learning_rate": 1.4859940003557088e-06, + "loss": 0.4955, + "step": 11741 + }, + { + "epoch": 1.9049318624269955, + "grad_norm": 0.6197973845839205, + "learning_rate": 1.4856036862647988e-06, + "loss": 0.5256, + "step": 11742 + }, + { + "epoch": 1.905094094743673, + "grad_norm": 0.5938156589533021, + "learning_rate": 1.4852134017710122e-06, + "loss": 0.5286, + "step": 11743 + }, + { + "epoch": 1.9052563270603504, + "grad_norm": 0.6163842582290276, + "learning_rate": 1.4848231468857346e-06, + "loss": 0.5107, + "step": 11744 + }, + { + "epoch": 1.9054185593770279, + "grad_norm": 0.6142719296296374, + "learning_rate": 1.484432921620354e-06, + "loss": 0.5433, + "step": 11745 + }, + { + "epoch": 1.9055807916937053, + "grad_norm": 0.6045924593690345, + "learning_rate": 1.4840427259862561e-06, + "loss": 0.52, + "step": 11746 + }, + { + "epoch": 1.9057430240103828, + "grad_norm": 0.6317496646000424, + "learning_rate": 1.4836525599948244e-06, + "loss": 0.5107, + "step": 11747 + }, + { + "epoch": 1.9059052563270602, + "grad_norm": 0.6034809566219589, + "learning_rate": 1.4832624236574424e-06, + "loss": 0.5132, + "step": 11748 + }, + { + "epoch": 1.9060674886437379, + "grad_norm": 0.5614268143720756, + "learning_rate": 1.4828723169854953e-06, + "loss": 0.4809, + "step": 11749 + }, + { + "epoch": 1.9062297209604153, + "grad_norm": 0.626835301878611, + "learning_rate": 1.4824822399903633e-06, + "loss": 0.5044, + "step": 11750 + }, + { + "epoch": 1.9063919532770928, + "grad_norm": 0.5931847471005489, + "learning_rate": 1.4820921926834292e-06, + "loss": 0.4831, + "step": 11751 + }, + { + "epoch": 1.9065541855937704, + "grad_norm": 0.5980733979548136, + "learning_rate": 1.4817021750760728e-06, + "loss": 0.511, + "step": 11752 + }, + { + "epoch": 1.9067164179104479, + "grad_norm": 0.6307221111995562, + "learning_rate": 1.4813121871796726e-06, + "loss": 0.5537, + "step": 11753 + }, + { + "epoch": 1.9068786502271253, + "grad_norm": 0.5757712362890877, + "learning_rate": 1.4809222290056092e-06, + "loss": 0.5544, + "step": 11754 + }, + { + "epoch": 1.9070408825438028, + "grad_norm": 0.6621853610685219, + "learning_rate": 1.4805323005652589e-06, + "loss": 0.5065, + "step": 11755 + }, + { + "epoch": 1.9072031148604802, + "grad_norm": 0.5966912383808881, + "learning_rate": 1.4801424018700007e-06, + "loss": 0.5347, + "step": 11756 + }, + { + "epoch": 1.9073653471771577, + "grad_norm": 0.610444663106797, + "learning_rate": 1.4797525329312079e-06, + "loss": 0.5268, + "step": 11757 + }, + { + "epoch": 1.9075275794938351, + "grad_norm": 0.610095468439281, + "learning_rate": 1.4793626937602579e-06, + "loss": 0.5296, + "step": 11758 + }, + { + "epoch": 1.9076898118105126, + "grad_norm": 0.6260075113597992, + "learning_rate": 1.4789728843685235e-06, + "loss": 0.4859, + "step": 11759 + }, + { + "epoch": 1.90785204412719, + "grad_norm": 0.5687103857110601, + "learning_rate": 1.47858310476738e-06, + "loss": 0.5138, + "step": 11760 + }, + { + "epoch": 1.9080142764438677, + "grad_norm": 0.5925328530695837, + "learning_rate": 1.4781933549681984e-06, + "loss": 0.5081, + "step": 11761 + }, + { + "epoch": 1.9081765087605451, + "grad_norm": 0.6099793582727999, + "learning_rate": 1.477803634982352e-06, + "loss": 0.5066, + "step": 11762 + }, + { + "epoch": 1.9083387410772226, + "grad_norm": 0.5932038359793779, + "learning_rate": 1.4774139448212105e-06, + "loss": 0.5306, + "step": 11763 + }, + { + "epoch": 1.9085009733939, + "grad_norm": 0.5965805394376007, + "learning_rate": 1.4770242844961436e-06, + "loss": 0.5215, + "step": 11764 + }, + { + "epoch": 1.9086632057105777, + "grad_norm": 0.6136093117210236, + "learning_rate": 1.4766346540185218e-06, + "loss": 0.5129, + "step": 11765 + }, + { + "epoch": 1.9088254380272551, + "grad_norm": 0.5808959667055436, + "learning_rate": 1.4762450533997119e-06, + "loss": 0.5032, + "step": 11766 + }, + { + "epoch": 1.9089876703439326, + "grad_norm": 0.5774730139020697, + "learning_rate": 1.4758554826510835e-06, + "loss": 0.5107, + "step": 11767 + }, + { + "epoch": 1.90914990266061, + "grad_norm": 0.6241191754851421, + "learning_rate": 1.4754659417840009e-06, + "loss": 0.4638, + "step": 11768 + }, + { + "epoch": 1.9093121349772875, + "grad_norm": 0.6146754921092906, + "learning_rate": 1.4750764308098305e-06, + "loss": 0.519, + "step": 11769 + }, + { + "epoch": 1.909474367293965, + "grad_norm": 0.645803030694446, + "learning_rate": 1.4746869497399369e-06, + "loss": 0.5192, + "step": 11770 + }, + { + "epoch": 1.9096365996106424, + "grad_norm": 0.6045373272137097, + "learning_rate": 1.4742974985856848e-06, + "loss": 0.5123, + "step": 11771 + }, + { + "epoch": 1.9097988319273198, + "grad_norm": 0.6303788464899881, + "learning_rate": 1.4739080773584372e-06, + "loss": 0.4889, + "step": 11772 + }, + { + "epoch": 1.9099610642439973, + "grad_norm": 0.5947890441743611, + "learning_rate": 1.473518686069555e-06, + "loss": 0.5113, + "step": 11773 + }, + { + "epoch": 1.910123296560675, + "grad_norm": 0.649253656501753, + "learning_rate": 1.4731293247304005e-06, + "loss": 0.55, + "step": 11774 + }, + { + "epoch": 1.9102855288773524, + "grad_norm": 0.5768395355523553, + "learning_rate": 1.4727399933523333e-06, + "loss": 0.5273, + "step": 11775 + }, + { + "epoch": 1.9104477611940298, + "grad_norm": 0.6039764282534185, + "learning_rate": 1.4723506919467146e-06, + "loss": 0.5166, + "step": 11776 + }, + { + "epoch": 1.9106099935107075, + "grad_norm": 0.5973392907265811, + "learning_rate": 1.4719614205249026e-06, + "loss": 0.486, + "step": 11777 + }, + { + "epoch": 1.910772225827385, + "grad_norm": 0.6026573950752243, + "learning_rate": 1.4715721790982535e-06, + "loss": 0.5105, + "step": 11778 + }, + { + "epoch": 1.9109344581440624, + "grad_norm": 0.6359781985639353, + "learning_rate": 1.4711829676781256e-06, + "loss": 0.5191, + "step": 11779 + }, + { + "epoch": 1.9110966904607398, + "grad_norm": 0.5981950784631037, + "learning_rate": 1.4707937862758743e-06, + "loss": 0.4652, + "step": 11780 + }, + { + "epoch": 1.9112589227774173, + "grad_norm": 0.6132734657224336, + "learning_rate": 1.4704046349028556e-06, + "loss": 0.4952, + "step": 11781 + }, + { + "epoch": 1.9114211550940947, + "grad_norm": 0.6045256493643485, + "learning_rate": 1.470015513570424e-06, + "loss": 0.5081, + "step": 11782 + }, + { + "epoch": 1.9115833874107722, + "grad_norm": 0.630998174961501, + "learning_rate": 1.4696264222899315e-06, + "loss": 0.542, + "step": 11783 + }, + { + "epoch": 1.9117456197274496, + "grad_norm": 0.6251555823018435, + "learning_rate": 1.4692373610727313e-06, + "loss": 0.4993, + "step": 11784 + }, + { + "epoch": 1.911907852044127, + "grad_norm": 0.617530232283957, + "learning_rate": 1.4688483299301753e-06, + "loss": 0.4973, + "step": 11785 + }, + { + "epoch": 1.9120700843608047, + "grad_norm": 0.5924178641733757, + "learning_rate": 1.4684593288736137e-06, + "loss": 0.5291, + "step": 11786 + }, + { + "epoch": 1.9122323166774822, + "grad_norm": 0.5832959501872418, + "learning_rate": 1.4680703579143982e-06, + "loss": 0.5403, + "step": 11787 + }, + { + "epoch": 1.9123945489941596, + "grad_norm": 0.6399978834154324, + "learning_rate": 1.467681417063876e-06, + "loss": 0.4962, + "step": 11788 + }, + { + "epoch": 1.9125567813108373, + "grad_norm": 0.6192230752967436, + "learning_rate": 1.4672925063333952e-06, + "loss": 0.4832, + "step": 11789 + }, + { + "epoch": 1.9127190136275147, + "grad_norm": 0.593878830109014, + "learning_rate": 1.4669036257343039e-06, + "loss": 0.4964, + "step": 11790 + }, + { + "epoch": 1.9128812459441922, + "grad_norm": 0.6156979939686593, + "learning_rate": 1.4665147752779481e-06, + "loss": 0.5219, + "step": 11791 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 0.5745672684035723, + "learning_rate": 1.4661259549756752e-06, + "loss": 0.51, + "step": 11792 + }, + { + "epoch": 1.913205710577547, + "grad_norm": 0.5903125199920987, + "learning_rate": 1.4657371648388264e-06, + "loss": 0.5153, + "step": 11793 + }, + { + "epoch": 1.9133679428942245, + "grad_norm": 0.6013317219235409, + "learning_rate": 1.4653484048787476e-06, + "loss": 0.4757, + "step": 11794 + }, + { + "epoch": 1.913530175210902, + "grad_norm": 0.5893313546628167, + "learning_rate": 1.4649596751067807e-06, + "loss": 0.5007, + "step": 11795 + }, + { + "epoch": 1.9136924075275794, + "grad_norm": 0.6268219406434632, + "learning_rate": 1.464570975534269e-06, + "loss": 0.5413, + "step": 11796 + }, + { + "epoch": 1.9138546398442569, + "grad_norm": 0.6125630004588077, + "learning_rate": 1.4641823061725532e-06, + "loss": 0.5261, + "step": 11797 + }, + { + "epoch": 1.9140168721609343, + "grad_norm": 0.6056985391042636, + "learning_rate": 1.4637936670329723e-06, + "loss": 0.5022, + "step": 11798 + }, + { + "epoch": 1.914179104477612, + "grad_norm": 0.6259618914119052, + "learning_rate": 1.463405058126867e-06, + "loss": 0.5176, + "step": 11799 + }, + { + "epoch": 1.9143413367942894, + "grad_norm": 0.6340324646973786, + "learning_rate": 1.4630164794655752e-06, + "loss": 0.5271, + "step": 11800 + }, + { + "epoch": 1.9145035691109669, + "grad_norm": 0.5865001777002473, + "learning_rate": 1.462627931060435e-06, + "loss": 0.5135, + "step": 11801 + }, + { + "epoch": 1.9146658014276445, + "grad_norm": 0.5963253745667237, + "learning_rate": 1.4622394129227829e-06, + "loss": 0.5381, + "step": 11802 + }, + { + "epoch": 1.914828033744322, + "grad_norm": 0.600211577431486, + "learning_rate": 1.4618509250639545e-06, + "loss": 0.5232, + "step": 11803 + }, + { + "epoch": 1.9149902660609994, + "grad_norm": 0.5921623582410968, + "learning_rate": 1.4614624674952843e-06, + "loss": 0.4883, + "step": 11804 + }, + { + "epoch": 1.9151524983776769, + "grad_norm": 0.5852967838288137, + "learning_rate": 1.4610740402281074e-06, + "loss": 0.5124, + "step": 11805 + }, + { + "epoch": 1.9153147306943543, + "grad_norm": 0.635000603646406, + "learning_rate": 1.4606856432737557e-06, + "loss": 0.5493, + "step": 11806 + }, + { + "epoch": 1.9154769630110318, + "grad_norm": 0.5905570973000311, + "learning_rate": 1.460297276643563e-06, + "loss": 0.5007, + "step": 11807 + }, + { + "epoch": 1.9156391953277092, + "grad_norm": 0.6210388187464257, + "learning_rate": 1.4599089403488603e-06, + "loss": 0.5455, + "step": 11808 + }, + { + "epoch": 1.9158014276443867, + "grad_norm": 0.5991725398928077, + "learning_rate": 1.4595206344009772e-06, + "loss": 0.5079, + "step": 11809 + }, + { + "epoch": 1.915963659961064, + "grad_norm": 0.6010012787948934, + "learning_rate": 1.4591323588112435e-06, + "loss": 0.5147, + "step": 11810 + }, + { + "epoch": 1.9161258922777418, + "grad_norm": 0.6251245019237768, + "learning_rate": 1.4587441135909886e-06, + "loss": 0.5112, + "step": 11811 + }, + { + "epoch": 1.9162881245944192, + "grad_norm": 0.604499829208464, + "learning_rate": 1.4583558987515397e-06, + "loss": 0.5059, + "step": 11812 + }, + { + "epoch": 1.9164503569110967, + "grad_norm": 0.6045837875008331, + "learning_rate": 1.4579677143042253e-06, + "loss": 0.5473, + "step": 11813 + }, + { + "epoch": 1.9166125892277743, + "grad_norm": 0.584244023014345, + "learning_rate": 1.4575795602603703e-06, + "loss": 0.5247, + "step": 11814 + }, + { + "epoch": 1.9167748215444518, + "grad_norm": 0.6633325952570572, + "learning_rate": 1.4571914366312994e-06, + "loss": 0.5046, + "step": 11815 + }, + { + "epoch": 1.9169370538611292, + "grad_norm": 0.6368834093567359, + "learning_rate": 1.4568033434283375e-06, + "loss": 0.5669, + "step": 11816 + }, + { + "epoch": 1.9170992861778067, + "grad_norm": 0.6324654615726767, + "learning_rate": 1.4564152806628085e-06, + "loss": 0.5207, + "step": 11817 + }, + { + "epoch": 1.9172615184944841, + "grad_norm": 0.627960110234508, + "learning_rate": 1.4560272483460346e-06, + "loss": 0.5398, + "step": 11818 + }, + { + "epoch": 1.9174237508111616, + "grad_norm": 0.5964984006057026, + "learning_rate": 1.4556392464893366e-06, + "loss": 0.5223, + "step": 11819 + }, + { + "epoch": 1.917585983127839, + "grad_norm": 0.579015651952097, + "learning_rate": 1.4552512751040358e-06, + "loss": 0.4775, + "step": 11820 + }, + { + "epoch": 1.9177482154445165, + "grad_norm": 0.5985871493985885, + "learning_rate": 1.454863334201452e-06, + "loss": 0.5036, + "step": 11821 + }, + { + "epoch": 1.917910447761194, + "grad_norm": 0.605194673631495, + "learning_rate": 1.4544754237929057e-06, + "loss": 0.5095, + "step": 11822 + }, + { + "epoch": 1.9180726800778714, + "grad_norm": 0.6033342580868131, + "learning_rate": 1.4540875438897137e-06, + "loss": 0.5104, + "step": 11823 + }, + { + "epoch": 1.918234912394549, + "grad_norm": 0.6566832608947893, + "learning_rate": 1.4536996945031923e-06, + "loss": 0.4997, + "step": 11824 + }, + { + "epoch": 1.9183971447112265, + "grad_norm": 0.6290732805873274, + "learning_rate": 1.4533118756446585e-06, + "loss": 0.4982, + "step": 11825 + }, + { + "epoch": 1.918559377027904, + "grad_norm": 0.5982838046326182, + "learning_rate": 1.452924087325428e-06, + "loss": 0.5342, + "step": 11826 + }, + { + "epoch": 1.9187216093445816, + "grad_norm": 0.6281883744206792, + "learning_rate": 1.4525363295568162e-06, + "loss": 0.508, + "step": 11827 + }, + { + "epoch": 1.918883841661259, + "grad_norm": 0.6298478611248602, + "learning_rate": 1.4521486023501363e-06, + "loss": 0.5216, + "step": 11828 + }, + { + "epoch": 1.9190460739779365, + "grad_norm": 0.5932046104665087, + "learning_rate": 1.4517609057166993e-06, + "loss": 0.4811, + "step": 11829 + }, + { + "epoch": 1.919208306294614, + "grad_norm": 0.636606106983657, + "learning_rate": 1.4513732396678181e-06, + "loss": 0.5122, + "step": 11830 + }, + { + "epoch": 1.9193705386112914, + "grad_norm": 0.556887030910688, + "learning_rate": 1.4509856042148053e-06, + "loss": 0.5201, + "step": 11831 + }, + { + "epoch": 1.9195327709279688, + "grad_norm": 0.5821280099231273, + "learning_rate": 1.4505979993689682e-06, + "loss": 0.5079, + "step": 11832 + }, + { + "epoch": 1.9196950032446463, + "grad_norm": 0.5907079809162161, + "learning_rate": 1.4502104251416183e-06, + "loss": 0.4753, + "step": 11833 + }, + { + "epoch": 1.9198572355613237, + "grad_norm": 0.5775916385854132, + "learning_rate": 1.4498228815440624e-06, + "loss": 0.5497, + "step": 11834 + }, + { + "epoch": 1.9200194678780012, + "grad_norm": 0.5888761161055995, + "learning_rate": 1.4494353685876077e-06, + "loss": 0.492, + "step": 11835 + }, + { + "epoch": 1.9201817001946788, + "grad_norm": 0.6063107353310435, + "learning_rate": 1.449047886283563e-06, + "loss": 0.5098, + "step": 11836 + }, + { + "epoch": 1.9203439325113563, + "grad_norm": 0.6331747050668056, + "learning_rate": 1.4486604346432311e-06, + "loss": 0.4827, + "step": 11837 + }, + { + "epoch": 1.9205061648280337, + "grad_norm": 0.5928408988233705, + "learning_rate": 1.4482730136779188e-06, + "loss": 0.5153, + "step": 11838 + }, + { + "epoch": 1.9206683971447114, + "grad_norm": 0.6413678186753341, + "learning_rate": 1.4478856233989286e-06, + "loss": 0.4787, + "step": 11839 + }, + { + "epoch": 1.9208306294613888, + "grad_norm": 0.6201096791005736, + "learning_rate": 1.4474982638175645e-06, + "loss": 0.5031, + "step": 11840 + }, + { + "epoch": 1.9209928617780663, + "grad_norm": 0.616366609972126, + "learning_rate": 1.4471109349451267e-06, + "loss": 0.5114, + "step": 11841 + }, + { + "epoch": 1.9211550940947437, + "grad_norm": 0.6050308888403797, + "learning_rate": 1.4467236367929175e-06, + "loss": 0.481, + "step": 11842 + }, + { + "epoch": 1.9213173264114212, + "grad_norm": 0.6136374653969398, + "learning_rate": 1.4463363693722383e-06, + "loss": 0.5024, + "step": 11843 + }, + { + "epoch": 1.9214795587280986, + "grad_norm": 0.6367687879804862, + "learning_rate": 1.445949132694386e-06, + "loss": 0.5349, + "step": 11844 + }, + { + "epoch": 1.921641791044776, + "grad_norm": 0.5891442500628232, + "learning_rate": 1.4455619267706616e-06, + "loss": 0.5018, + "step": 11845 + }, + { + "epoch": 1.9218040233614535, + "grad_norm": 0.5958615130111118, + "learning_rate": 1.4451747516123605e-06, + "loss": 0.5026, + "step": 11846 + }, + { + "epoch": 1.921966255678131, + "grad_norm": 0.6223118762013151, + "learning_rate": 1.4447876072307798e-06, + "loss": 0.52, + "step": 11847 + }, + { + "epoch": 1.9221284879948086, + "grad_norm": 0.6419598650427336, + "learning_rate": 1.4444004936372166e-06, + "loss": 0.5082, + "step": 11848 + }, + { + "epoch": 1.922290720311486, + "grad_norm": 0.5729326256848334, + "learning_rate": 1.4440134108429649e-06, + "loss": 0.5084, + "step": 11849 + }, + { + "epoch": 1.9224529526281635, + "grad_norm": 0.5825598180556802, + "learning_rate": 1.4436263588593174e-06, + "loss": 0.5462, + "step": 11850 + }, + { + "epoch": 1.922615184944841, + "grad_norm": 0.5970653903968055, + "learning_rate": 1.4432393376975683e-06, + "loss": 0.5255, + "step": 11851 + }, + { + "epoch": 1.9227774172615186, + "grad_norm": 0.6350240117638993, + "learning_rate": 1.4428523473690097e-06, + "loss": 0.523, + "step": 11852 + }, + { + "epoch": 1.922939649578196, + "grad_norm": 0.6317354607841417, + "learning_rate": 1.4424653878849344e-06, + "loss": 0.4983, + "step": 11853 + }, + { + "epoch": 1.9231018818948735, + "grad_norm": 0.5863779404024672, + "learning_rate": 1.4420784592566307e-06, + "loss": 0.5166, + "step": 11854 + }, + { + "epoch": 1.923264114211551, + "grad_norm": 0.6137641888970604, + "learning_rate": 1.441691561495387e-06, + "loss": 0.5043, + "step": 11855 + }, + { + "epoch": 1.9234263465282284, + "grad_norm": 0.585293300154557, + "learning_rate": 1.4413046946124941e-06, + "loss": 0.5145, + "step": 11856 + }, + { + "epoch": 1.9235885788449059, + "grad_norm": 0.614951876259602, + "learning_rate": 1.4409178586192391e-06, + "loss": 0.5416, + "step": 11857 + }, + { + "epoch": 1.9237508111615833, + "grad_norm": 0.6107496929677444, + "learning_rate": 1.440531053526909e-06, + "loss": 0.4929, + "step": 11858 + }, + { + "epoch": 1.9239130434782608, + "grad_norm": 0.6126634373242246, + "learning_rate": 1.4401442793467897e-06, + "loss": 0.5316, + "step": 11859 + }, + { + "epoch": 1.9240752757949382, + "grad_norm": 0.6027290331711431, + "learning_rate": 1.4397575360901642e-06, + "loss": 0.5102, + "step": 11860 + }, + { + "epoch": 1.9242375081116159, + "grad_norm": 0.59340492452923, + "learning_rate": 1.4393708237683183e-06, + "loss": 0.5173, + "step": 11861 + }, + { + "epoch": 1.9243997404282933, + "grad_norm": 0.6237327775850474, + "learning_rate": 1.4389841423925358e-06, + "loss": 0.5173, + "step": 11862 + }, + { + "epoch": 1.9245619727449708, + "grad_norm": 0.606784153032608, + "learning_rate": 1.438597491974097e-06, + "loss": 0.5004, + "step": 11863 + }, + { + "epoch": 1.9247242050616484, + "grad_norm": 0.592769867380721, + "learning_rate": 1.438210872524285e-06, + "loss": 0.4896, + "step": 11864 + }, + { + "epoch": 1.9248864373783259, + "grad_norm": 0.5904970462008442, + "learning_rate": 1.4378242840543787e-06, + "loss": 0.5139, + "step": 11865 + }, + { + "epoch": 1.9250486696950033, + "grad_norm": 0.5851586225896737, + "learning_rate": 1.4374377265756585e-06, + "loss": 0.5082, + "step": 11866 + }, + { + "epoch": 1.9252109020116808, + "grad_norm": 0.6033060064773457, + "learning_rate": 1.4370512000994035e-06, + "loss": 0.5433, + "step": 11867 + }, + { + "epoch": 1.9253731343283582, + "grad_norm": 0.6158771825550327, + "learning_rate": 1.4366647046368898e-06, + "loss": 0.4929, + "step": 11868 + }, + { + "epoch": 1.9255353666450357, + "grad_norm": 0.6400366305742763, + "learning_rate": 1.4362782401993964e-06, + "loss": 0.4897, + "step": 11869 + }, + { + "epoch": 1.925697598961713, + "grad_norm": 0.632488184154473, + "learning_rate": 1.4358918067981969e-06, + "loss": 0.5101, + "step": 11870 + }, + { + "epoch": 1.9258598312783906, + "grad_norm": 0.591480512099718, + "learning_rate": 1.4355054044445673e-06, + "loss": 0.5324, + "step": 11871 + }, + { + "epoch": 1.926022063595068, + "grad_norm": 0.5947266613100678, + "learning_rate": 1.435119033149783e-06, + "loss": 0.4956, + "step": 11872 + }, + { + "epoch": 1.9261842959117457, + "grad_norm": 0.6555181184616589, + "learning_rate": 1.4347326929251148e-06, + "loss": 0.4849, + "step": 11873 + }, + { + "epoch": 1.9263465282284231, + "grad_norm": 0.635573894487541, + "learning_rate": 1.4343463837818372e-06, + "loss": 0.5168, + "step": 11874 + }, + { + "epoch": 1.9265087605451006, + "grad_norm": 0.603346308065693, + "learning_rate": 1.4339601057312196e-06, + "loss": 0.522, + "step": 11875 + }, + { + "epoch": 1.9266709928617782, + "grad_norm": 0.6157727822442373, + "learning_rate": 1.4335738587845344e-06, + "loss": 0.5165, + "step": 11876 + }, + { + "epoch": 1.9268332251784557, + "grad_norm": 0.6342295165257159, + "learning_rate": 1.4331876429530495e-06, + "loss": 0.5299, + "step": 11877 + }, + { + "epoch": 1.9269954574951331, + "grad_norm": 0.6029244887687096, + "learning_rate": 1.4328014582480338e-06, + "loss": 0.5235, + "step": 11878 + }, + { + "epoch": 1.9271576898118106, + "grad_norm": 0.6033492075833307, + "learning_rate": 1.4324153046807562e-06, + "loss": 0.5218, + "step": 11879 + }, + { + "epoch": 1.927319922128488, + "grad_norm": 0.6065935947736503, + "learning_rate": 1.4320291822624822e-06, + "loss": 0.5088, + "step": 11880 + }, + { + "epoch": 1.9274821544451655, + "grad_norm": 0.6307231255526607, + "learning_rate": 1.4316430910044792e-06, + "loss": 0.5101, + "step": 11881 + }, + { + "epoch": 1.927644386761843, + "grad_norm": 0.5871263361014263, + "learning_rate": 1.4312570309180102e-06, + "loss": 0.4915, + "step": 11882 + }, + { + "epoch": 1.9278066190785204, + "grad_norm": 0.5989591266471779, + "learning_rate": 1.4308710020143407e-06, + "loss": 0.504, + "step": 11883 + }, + { + "epoch": 1.9279688513951978, + "grad_norm": 0.5943313113783948, + "learning_rate": 1.4304850043047342e-06, + "loss": 0.5257, + "step": 11884 + }, + { + "epoch": 1.9281310837118752, + "grad_norm": 0.5895304512845428, + "learning_rate": 1.4300990378004526e-06, + "loss": 0.5025, + "step": 11885 + }, + { + "epoch": 1.928293316028553, + "grad_norm": 0.6441574468853705, + "learning_rate": 1.4297131025127564e-06, + "loss": 0.5301, + "step": 11886 + }, + { + "epoch": 1.9284555483452304, + "grad_norm": 0.624784565673084, + "learning_rate": 1.4293271984529062e-06, + "loss": 0.5193, + "step": 11887 + }, + { + "epoch": 1.9286177806619078, + "grad_norm": 0.6020287965072119, + "learning_rate": 1.4289413256321624e-06, + "loss": 0.5056, + "step": 11888 + }, + { + "epoch": 1.9287800129785855, + "grad_norm": 0.6020933690083973, + "learning_rate": 1.4285554840617843e-06, + "loss": 0.5259, + "step": 11889 + }, + { + "epoch": 1.928942245295263, + "grad_norm": 0.6073670560976526, + "learning_rate": 1.4281696737530287e-06, + "loss": 0.487, + "step": 11890 + }, + { + "epoch": 1.9291044776119404, + "grad_norm": 0.5895815032606462, + "learning_rate": 1.4277838947171514e-06, + "loss": 0.5155, + "step": 11891 + }, + { + "epoch": 1.9292667099286178, + "grad_norm": 0.6657205551942228, + "learning_rate": 1.4273981469654093e-06, + "loss": 0.5466, + "step": 11892 + }, + { + "epoch": 1.9294289422452953, + "grad_norm": 0.6120201739553304, + "learning_rate": 1.4270124305090573e-06, + "loss": 0.5302, + "step": 11893 + }, + { + "epoch": 1.9295911745619727, + "grad_norm": 0.5999613252821603, + "learning_rate": 1.4266267453593507e-06, + "loss": 0.5151, + "step": 11894 + }, + { + "epoch": 1.9297534068786502, + "grad_norm": 0.6480688883641401, + "learning_rate": 1.4262410915275415e-06, + "loss": 0.5103, + "step": 11895 + }, + { + "epoch": 1.9299156391953276, + "grad_norm": 0.6517841239667728, + "learning_rate": 1.425855469024881e-06, + "loss": 0.4753, + "step": 11896 + }, + { + "epoch": 1.930077871512005, + "grad_norm": 0.5797686370352041, + "learning_rate": 1.4254698778626216e-06, + "loss": 0.4958, + "step": 11897 + }, + { + "epoch": 1.9302401038286827, + "grad_norm": 0.6204850987999254, + "learning_rate": 1.4250843180520146e-06, + "loss": 0.5034, + "step": 11898 + }, + { + "epoch": 1.9304023361453602, + "grad_norm": 0.6160428637532169, + "learning_rate": 1.4246987896043085e-06, + "loss": 0.5127, + "step": 11899 + }, + { + "epoch": 1.9305645684620376, + "grad_norm": 0.5994519528794982, + "learning_rate": 1.424313292530751e-06, + "loss": 0.5019, + "step": 11900 + }, + { + "epoch": 1.9307268007787153, + "grad_norm": 0.6466485494955896, + "learning_rate": 1.4239278268425907e-06, + "loss": 0.5048, + "step": 11901 + }, + { + "epoch": 1.9308890330953927, + "grad_norm": 0.6421562087301154, + "learning_rate": 1.4235423925510744e-06, + "loss": 0.5154, + "step": 11902 + }, + { + "epoch": 1.9310512654120702, + "grad_norm": 0.5911941797560177, + "learning_rate": 1.4231569896674492e-06, + "loss": 0.5441, + "step": 11903 + }, + { + "epoch": 1.9312134977287476, + "grad_norm": 0.6363513944856035, + "learning_rate": 1.4227716182029575e-06, + "loss": 0.5138, + "step": 11904 + }, + { + "epoch": 1.931375730045425, + "grad_norm": 0.5960847524300021, + "learning_rate": 1.4223862781688456e-06, + "loss": 0.5241, + "step": 11905 + }, + { + "epoch": 1.9315379623621025, + "grad_norm": 0.6282995033630643, + "learning_rate": 1.4220009695763548e-06, + "loss": 0.4865, + "step": 11906 + }, + { + "epoch": 1.93170019467878, + "grad_norm": 0.6539580563750494, + "learning_rate": 1.4216156924367277e-06, + "loss": 0.5173, + "step": 11907 + }, + { + "epoch": 1.9318624269954574, + "grad_norm": 0.6169159544733366, + "learning_rate": 1.421230446761207e-06, + "loss": 0.509, + "step": 11908 + }, + { + "epoch": 1.9320246593121349, + "grad_norm": 0.5942192863172353, + "learning_rate": 1.4208452325610312e-06, + "loss": 0.4796, + "step": 11909 + }, + { + "epoch": 1.9321868916288123, + "grad_norm": 0.6094044838396363, + "learning_rate": 1.4204600498474414e-06, + "loss": 0.5186, + "step": 11910 + }, + { + "epoch": 1.93234912394549, + "grad_norm": 0.5857026319092561, + "learning_rate": 1.4200748986316741e-06, + "loss": 0.5087, + "step": 11911 + }, + { + "epoch": 1.9325113562621674, + "grad_norm": 0.5857741853726978, + "learning_rate": 1.4196897789249692e-06, + "loss": 0.5262, + "step": 11912 + }, + { + "epoch": 1.9326735885788449, + "grad_norm": 0.6103815057378372, + "learning_rate": 1.419304690738561e-06, + "loss": 0.5118, + "step": 11913 + }, + { + "epoch": 1.9328358208955225, + "grad_norm": 0.6596070469085252, + "learning_rate": 1.4189196340836866e-06, + "loss": 0.4876, + "step": 11914 + }, + { + "epoch": 1.9329980532122, + "grad_norm": 0.5865966975018951, + "learning_rate": 1.4185346089715818e-06, + "loss": 0.5015, + "step": 11915 + }, + { + "epoch": 1.9331602855288774, + "grad_norm": 0.6239133593657274, + "learning_rate": 1.418149615413478e-06, + "loss": 0.547, + "step": 11916 + }, + { + "epoch": 1.9333225178455549, + "grad_norm": 0.6230099904382375, + "learning_rate": 1.4177646534206107e-06, + "loss": 0.4976, + "step": 11917 + }, + { + "epoch": 1.9334847501622323, + "grad_norm": 0.6286017179637323, + "learning_rate": 1.4173797230042098e-06, + "loss": 0.516, + "step": 11918 + }, + { + "epoch": 1.9336469824789098, + "grad_norm": 0.5905555448936154, + "learning_rate": 1.4169948241755071e-06, + "loss": 0.5078, + "step": 11919 + }, + { + "epoch": 1.9338092147955872, + "grad_norm": 0.6332756151811566, + "learning_rate": 1.4166099569457348e-06, + "loss": 0.5113, + "step": 11920 + }, + { + "epoch": 1.9339714471122647, + "grad_norm": 0.6411117880334228, + "learning_rate": 1.41622512132612e-06, + "loss": 0.5076, + "step": 11921 + }, + { + "epoch": 1.934133679428942, + "grad_norm": 0.6019873841345061, + "learning_rate": 1.415840317327891e-06, + "loss": 0.5155, + "step": 11922 + }, + { + "epoch": 1.9342959117456198, + "grad_norm": 0.6052109853358081, + "learning_rate": 1.415455544962276e-06, + "loss": 0.5272, + "step": 11923 + }, + { + "epoch": 1.9344581440622972, + "grad_norm": 0.628306133711932, + "learning_rate": 1.415070804240501e-06, + "loss": 0.4815, + "step": 11924 + }, + { + "epoch": 1.9346203763789747, + "grad_norm": 0.6275853833885787, + "learning_rate": 1.4146860951737934e-06, + "loss": 0.525, + "step": 11925 + }, + { + "epoch": 1.9347826086956523, + "grad_norm": 0.5809047025668785, + "learning_rate": 1.4143014177733763e-06, + "loss": 0.5314, + "step": 11926 + }, + { + "epoch": 1.9349448410123298, + "grad_norm": 0.6173692375744585, + "learning_rate": 1.413916772050473e-06, + "loss": 0.4994, + "step": 11927 + }, + { + "epoch": 1.9351070733290072, + "grad_norm": 0.5987639525236633, + "learning_rate": 1.413532158016307e-06, + "loss": 0.482, + "step": 11928 + }, + { + "epoch": 1.9352693056456847, + "grad_norm": 0.5932510619508062, + "learning_rate": 1.4131475756821001e-06, + "loss": 0.5298, + "step": 11929 + }, + { + "epoch": 1.9354315379623621, + "grad_norm": 0.6376132917667597, + "learning_rate": 1.4127630250590753e-06, + "loss": 0.5511, + "step": 11930 + }, + { + "epoch": 1.9355937702790396, + "grad_norm": 0.5916098971691743, + "learning_rate": 1.4123785061584494e-06, + "loss": 0.5057, + "step": 11931 + }, + { + "epoch": 1.935756002595717, + "grad_norm": 0.5949899204644147, + "learning_rate": 1.4119940189914424e-06, + "loss": 0.5031, + "step": 11932 + }, + { + "epoch": 1.9359182349123945, + "grad_norm": 0.593477129172988, + "learning_rate": 1.4116095635692733e-06, + "loss": 0.5295, + "step": 11933 + }, + { + "epoch": 1.936080467229072, + "grad_norm": 0.6091554124421023, + "learning_rate": 1.41122513990316e-06, + "loss": 0.489, + "step": 11934 + }, + { + "epoch": 1.9362426995457496, + "grad_norm": 0.6445345926431639, + "learning_rate": 1.4108407480043183e-06, + "loss": 0.5148, + "step": 11935 + }, + { + "epoch": 1.936404931862427, + "grad_norm": 0.606612242208678, + "learning_rate": 1.4104563878839623e-06, + "loss": 0.4989, + "step": 11936 + }, + { + "epoch": 1.9365671641791045, + "grad_norm": 0.5974058863639868, + "learning_rate": 1.4100720595533074e-06, + "loss": 0.5504, + "step": 11937 + }, + { + "epoch": 1.936729396495782, + "grad_norm": 0.6202999059920888, + "learning_rate": 1.4096877630235672e-06, + "loss": 0.4911, + "step": 11938 + }, + { + "epoch": 1.9368916288124596, + "grad_norm": 0.6307837018988796, + "learning_rate": 1.4093034983059555e-06, + "loss": 0.5071, + "step": 11939 + }, + { + "epoch": 1.937053861129137, + "grad_norm": 0.6075674777297714, + "learning_rate": 1.4089192654116835e-06, + "loss": 0.5517, + "step": 11940 + }, + { + "epoch": 1.9372160934458145, + "grad_norm": 0.615243868249621, + "learning_rate": 1.4085350643519603e-06, + "loss": 0.5127, + "step": 11941 + }, + { + "epoch": 1.937378325762492, + "grad_norm": 0.5980410407916493, + "learning_rate": 1.4081508951379965e-06, + "loss": 0.5225, + "step": 11942 + }, + { + "epoch": 1.9375405580791694, + "grad_norm": 0.6241985917665699, + "learning_rate": 1.4077667577810028e-06, + "loss": 0.5216, + "step": 11943 + }, + { + "epoch": 1.9377027903958468, + "grad_norm": 0.5842721557998946, + "learning_rate": 1.407382652292185e-06, + "loss": 0.5174, + "step": 11944 + }, + { + "epoch": 1.9378650227125243, + "grad_norm": 0.5548357737214477, + "learning_rate": 1.4069985786827524e-06, + "loss": 0.4919, + "step": 11945 + }, + { + "epoch": 1.9380272550292017, + "grad_norm": 0.6117380395654128, + "learning_rate": 1.4066145369639085e-06, + "loss": 0.5119, + "step": 11946 + }, + { + "epoch": 1.9381894873458791, + "grad_norm": 0.6148329622510535, + "learning_rate": 1.4062305271468595e-06, + "loss": 0.4708, + "step": 11947 + }, + { + "epoch": 1.9383517196625568, + "grad_norm": 0.6139036336505855, + "learning_rate": 1.4058465492428115e-06, + "loss": 0.5336, + "step": 11948 + }, + { + "epoch": 1.9385139519792343, + "grad_norm": 0.61316695603567, + "learning_rate": 1.405462603262965e-06, + "loss": 0.5329, + "step": 11949 + }, + { + "epoch": 1.9386761842959117, + "grad_norm": 0.5981904203947289, + "learning_rate": 1.4050786892185241e-06, + "loss": 0.5252, + "step": 11950 + }, + { + "epoch": 1.9388384166125894, + "grad_norm": 0.6039356480527304, + "learning_rate": 1.404694807120691e-06, + "loss": 0.5039, + "step": 11951 + }, + { + "epoch": 1.9390006489292668, + "grad_norm": 0.5999040433545206, + "learning_rate": 1.404310956980664e-06, + "loss": 0.5113, + "step": 11952 + }, + { + "epoch": 1.9391628812459443, + "grad_norm": 0.6341539121543879, + "learning_rate": 1.4039271388096453e-06, + "loss": 0.5064, + "step": 11953 + }, + { + "epoch": 1.9393251135626217, + "grad_norm": 0.6222634009958574, + "learning_rate": 1.4035433526188312e-06, + "loss": 0.4968, + "step": 11954 + }, + { + "epoch": 1.9394873458792992, + "grad_norm": 0.5913164693457973, + "learning_rate": 1.4031595984194204e-06, + "loss": 0.5088, + "step": 11955 + }, + { + "epoch": 1.9396495781959766, + "grad_norm": 0.6131185013084091, + "learning_rate": 1.402775876222611e-06, + "loss": 0.5105, + "step": 11956 + }, + { + "epoch": 1.939811810512654, + "grad_norm": 0.5734281727145354, + "learning_rate": 1.4023921860395979e-06, + "loss": 0.5384, + "step": 11957 + }, + { + "epoch": 1.9399740428293315, + "grad_norm": 0.6165464376200855, + "learning_rate": 1.4020085278815745e-06, + "loss": 0.5131, + "step": 11958 + }, + { + "epoch": 1.940136275146009, + "grad_norm": 0.6316419356077417, + "learning_rate": 1.4016249017597367e-06, + "loss": 0.5253, + "step": 11959 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.6073252553157865, + "learning_rate": 1.4012413076852766e-06, + "loss": 0.5371, + "step": 11960 + }, + { + "epoch": 1.940460739779364, + "grad_norm": 0.5917994096727724, + "learning_rate": 1.4008577456693885e-06, + "loss": 0.516, + "step": 11961 + }, + { + "epoch": 1.9406229720960415, + "grad_norm": 0.5983489920616609, + "learning_rate": 1.4004742157232615e-06, + "loss": 0.522, + "step": 11962 + }, + { + "epoch": 1.9407852044127192, + "grad_norm": 0.607158114672435, + "learning_rate": 1.4000907178580855e-06, + "loss": 0.5113, + "step": 11963 + }, + { + "epoch": 1.9409474367293966, + "grad_norm": 0.6171818288298292, + "learning_rate": 1.3997072520850508e-06, + "loss": 0.5059, + "step": 11964 + }, + { + "epoch": 1.941109669046074, + "grad_norm": 0.6409621135810024, + "learning_rate": 1.3993238184153457e-06, + "loss": 0.4891, + "step": 11965 + }, + { + "epoch": 1.9412719013627515, + "grad_norm": 0.6077358264754571, + "learning_rate": 1.39894041686016e-06, + "loss": 0.5092, + "step": 11966 + }, + { + "epoch": 1.941434133679429, + "grad_norm": 0.6224903430120503, + "learning_rate": 1.3985570474306758e-06, + "loss": 0.5205, + "step": 11967 + }, + { + "epoch": 1.9415963659961064, + "grad_norm": 0.5733928487330532, + "learning_rate": 1.3981737101380807e-06, + "loss": 0.5063, + "step": 11968 + }, + { + "epoch": 1.9417585983127839, + "grad_norm": 0.6053586334766546, + "learning_rate": 1.3977904049935598e-06, + "loss": 0.5141, + "step": 11969 + }, + { + "epoch": 1.9419208306294613, + "grad_norm": 0.5796859659871317, + "learning_rate": 1.397407132008297e-06, + "loss": 0.5043, + "step": 11970 + }, + { + "epoch": 1.9420830629461387, + "grad_norm": 0.6439057066002373, + "learning_rate": 1.3970238911934753e-06, + "loss": 0.5012, + "step": 11971 + }, + { + "epoch": 1.9422452952628162, + "grad_norm": 0.6464774266494816, + "learning_rate": 1.3966406825602747e-06, + "loss": 0.5279, + "step": 11972 + }, + { + "epoch": 1.9424075275794939, + "grad_norm": 0.6231742583420284, + "learning_rate": 1.3962575061198775e-06, + "loss": 0.4999, + "step": 11973 + }, + { + "epoch": 1.9425697598961713, + "grad_norm": 0.6474316827184777, + "learning_rate": 1.395874361883463e-06, + "loss": 0.5138, + "step": 11974 + }, + { + "epoch": 1.9427319922128488, + "grad_norm": 0.6197616036509828, + "learning_rate": 1.395491249862212e-06, + "loss": 0.5019, + "step": 11975 + }, + { + "epoch": 1.9428942245295264, + "grad_norm": 0.60221326209349, + "learning_rate": 1.3951081700673008e-06, + "loss": 0.5167, + "step": 11976 + }, + { + "epoch": 1.9430564568462039, + "grad_norm": 0.612622021343991, + "learning_rate": 1.3947251225099067e-06, + "loss": 0.5354, + "step": 11977 + }, + { + "epoch": 1.9432186891628813, + "grad_norm": 0.652467853043348, + "learning_rate": 1.3943421072012058e-06, + "loss": 0.5148, + "step": 11978 + }, + { + "epoch": 1.9433809214795588, + "grad_norm": 0.6514626289207684, + "learning_rate": 1.393959124152375e-06, + "loss": 0.5114, + "step": 11979 + }, + { + "epoch": 1.9435431537962362, + "grad_norm": 0.5730311568637017, + "learning_rate": 1.3935761733745865e-06, + "loss": 0.4983, + "step": 11980 + }, + { + "epoch": 1.9437053861129137, + "grad_norm": 0.6380593821424743, + "learning_rate": 1.3931932548790156e-06, + "loss": 0.4848, + "step": 11981 + }, + { + "epoch": 1.943867618429591, + "grad_norm": 0.5934567434530446, + "learning_rate": 1.3928103686768324e-06, + "loss": 0.5042, + "step": 11982 + }, + { + "epoch": 1.9440298507462686, + "grad_norm": 0.6091530666339953, + "learning_rate": 1.3924275147792102e-06, + "loss": 0.5013, + "step": 11983 + }, + { + "epoch": 1.944192083062946, + "grad_norm": 0.5811271574727538, + "learning_rate": 1.39204469319732e-06, + "loss": 0.4708, + "step": 11984 + }, + { + "epoch": 1.9443543153796237, + "grad_norm": 0.63840499093939, + "learning_rate": 1.3916619039423296e-06, + "loss": 0.4912, + "step": 11985 + }, + { + "epoch": 1.9445165476963011, + "grad_norm": 0.5964968308559115, + "learning_rate": 1.3912791470254095e-06, + "loss": 0.5202, + "step": 11986 + }, + { + "epoch": 1.9446787800129786, + "grad_norm": 0.6376880134010083, + "learning_rate": 1.3908964224577256e-06, + "loss": 0.5145, + "step": 11987 + }, + { + "epoch": 1.9448410123296562, + "grad_norm": 0.6202054704361541, + "learning_rate": 1.3905137302504468e-06, + "loss": 0.5278, + "step": 11988 + }, + { + "epoch": 1.9450032446463337, + "grad_norm": 0.5908826291748029, + "learning_rate": 1.3901310704147364e-06, + "loss": 0.4945, + "step": 11989 + }, + { + "epoch": 1.9451654769630111, + "grad_norm": 0.6123048082382215, + "learning_rate": 1.389748442961761e-06, + "loss": 0.5212, + "step": 11990 + }, + { + "epoch": 1.9453277092796886, + "grad_norm": 0.5779514337587991, + "learning_rate": 1.389365847902685e-06, + "loss": 0.4828, + "step": 11991 + }, + { + "epoch": 1.945489941596366, + "grad_norm": 0.594085615776113, + "learning_rate": 1.3889832852486698e-06, + "loss": 0.5371, + "step": 11992 + }, + { + "epoch": 1.9456521739130435, + "grad_norm": 0.6091138964384841, + "learning_rate": 1.3886007550108793e-06, + "loss": 0.5296, + "step": 11993 + }, + { + "epoch": 1.945814406229721, + "grad_norm": 0.6104788626711258, + "learning_rate": 1.3882182572004727e-06, + "loss": 0.4942, + "step": 11994 + }, + { + "epoch": 1.9459766385463984, + "grad_norm": 0.5921915296077461, + "learning_rate": 1.3878357918286106e-06, + "loss": 0.4988, + "step": 11995 + }, + { + "epoch": 1.9461388708630758, + "grad_norm": 0.5915503841786072, + "learning_rate": 1.3874533589064532e-06, + "loss": 0.5343, + "step": 11996 + }, + { + "epoch": 1.9463011031797532, + "grad_norm": 0.5990302348749854, + "learning_rate": 1.3870709584451591e-06, + "loss": 0.5034, + "step": 11997 + }, + { + "epoch": 1.946463335496431, + "grad_norm": 0.5964304881050784, + "learning_rate": 1.3866885904558847e-06, + "loss": 0.5315, + "step": 11998 + }, + { + "epoch": 1.9466255678131084, + "grad_norm": 0.5952549072202145, + "learning_rate": 1.3863062549497859e-06, + "loss": 0.5034, + "step": 11999 + }, + { + "epoch": 1.9467878001297858, + "grad_norm": 0.6183442455434475, + "learning_rate": 1.385923951938018e-06, + "loss": 0.5127, + "step": 12000 + }, + { + "epoch": 1.9469500324464635, + "grad_norm": 0.5986592869575903, + "learning_rate": 1.3855416814317376e-06, + "loss": 0.5186, + "step": 12001 + }, + { + "epoch": 1.947112264763141, + "grad_norm": 0.6161224673871066, + "learning_rate": 1.3851594434420968e-06, + "loss": 0.4817, + "step": 12002 + }, + { + "epoch": 1.9472744970798184, + "grad_norm": 0.6046932469396681, + "learning_rate": 1.3847772379802475e-06, + "loss": 0.4876, + "step": 12003 + }, + { + "epoch": 1.9474367293964958, + "grad_norm": 0.5790663024216115, + "learning_rate": 1.3843950650573418e-06, + "loss": 0.5225, + "step": 12004 + }, + { + "epoch": 1.9475989617131733, + "grad_norm": 0.6165628777979114, + "learning_rate": 1.3840129246845308e-06, + "loss": 0.4928, + "step": 12005 + }, + { + "epoch": 1.9477611940298507, + "grad_norm": 0.6317366889966425, + "learning_rate": 1.3836308168729648e-06, + "loss": 0.4977, + "step": 12006 + }, + { + "epoch": 1.9479234263465282, + "grad_norm": 0.6244322519610886, + "learning_rate": 1.383248741633792e-06, + "loss": 0.4882, + "step": 12007 + }, + { + "epoch": 1.9480856586632056, + "grad_norm": 0.5965590481517609, + "learning_rate": 1.3828666989781592e-06, + "loss": 0.4889, + "step": 12008 + }, + { + "epoch": 1.948247890979883, + "grad_norm": 0.5849707731651014, + "learning_rate": 1.3824846889172145e-06, + "loss": 0.5134, + "step": 12009 + }, + { + "epoch": 1.9484101232965607, + "grad_norm": 0.6365884142341559, + "learning_rate": 1.382102711462103e-06, + "loss": 0.5524, + "step": 12010 + }, + { + "epoch": 1.9485723556132382, + "grad_norm": 0.6104635918623248, + "learning_rate": 1.3817207666239715e-06, + "loss": 0.4824, + "step": 12011 + }, + { + "epoch": 1.9487345879299156, + "grad_norm": 0.6341931473854238, + "learning_rate": 1.3813388544139627e-06, + "loss": 0.5107, + "step": 12012 + }, + { + "epoch": 1.9488968202465933, + "grad_norm": 0.6210934495389808, + "learning_rate": 1.380956974843219e-06, + "loss": 0.5016, + "step": 12013 + }, + { + "epoch": 1.9490590525632707, + "grad_norm": 0.5867657309020259, + "learning_rate": 1.3805751279228833e-06, + "loss": 0.5084, + "step": 12014 + }, + { + "epoch": 1.9492212848799482, + "grad_norm": 0.5872617964721397, + "learning_rate": 1.3801933136640975e-06, + "loss": 0.5084, + "step": 12015 + }, + { + "epoch": 1.9493835171966256, + "grad_norm": 0.6342661458402731, + "learning_rate": 1.3798115320780003e-06, + "loss": 0.5335, + "step": 12016 + }, + { + "epoch": 1.949545749513303, + "grad_norm": 0.6261742135963835, + "learning_rate": 1.3794297831757325e-06, + "loss": 0.4893, + "step": 12017 + }, + { + "epoch": 1.9497079818299805, + "grad_norm": 0.6115865527770767, + "learning_rate": 1.379048066968431e-06, + "loss": 0.5333, + "step": 12018 + }, + { + "epoch": 1.949870214146658, + "grad_norm": 0.5867420779745279, + "learning_rate": 1.3786663834672337e-06, + "loss": 0.5126, + "step": 12019 + }, + { + "epoch": 1.9500324464633354, + "grad_norm": 0.6069481339092134, + "learning_rate": 1.3782847326832783e-06, + "loss": 0.5385, + "step": 12020 + }, + { + "epoch": 1.9501946787800128, + "grad_norm": 0.6420276102841369, + "learning_rate": 1.3779031146276978e-06, + "loss": 0.521, + "step": 12021 + }, + { + "epoch": 1.9503569110966905, + "grad_norm": 0.5927446288204165, + "learning_rate": 1.3775215293116291e-06, + "loss": 0.5053, + "step": 12022 + }, + { + "epoch": 1.950519143413368, + "grad_norm": 0.6110352528478645, + "learning_rate": 1.3771399767462041e-06, + "loss": 0.5265, + "step": 12023 + }, + { + "epoch": 1.9506813757300454, + "grad_norm": 0.6205041233763767, + "learning_rate": 1.3767584569425562e-06, + "loss": 0.5403, + "step": 12024 + }, + { + "epoch": 1.9508436080467229, + "grad_norm": 0.6152692551250191, + "learning_rate": 1.3763769699118162e-06, + "loss": 0.534, + "step": 12025 + }, + { + "epoch": 1.9510058403634005, + "grad_norm": 0.5962340229043727, + "learning_rate": 1.3759955156651157e-06, + "loss": 0.5093, + "step": 12026 + }, + { + "epoch": 1.951168072680078, + "grad_norm": 0.5696089610012307, + "learning_rate": 1.3756140942135848e-06, + "loss": 0.5106, + "step": 12027 + }, + { + "epoch": 1.9513303049967554, + "grad_norm": 0.6566520913204641, + "learning_rate": 1.3752327055683504e-06, + "loss": 0.4936, + "step": 12028 + }, + { + "epoch": 1.9514925373134329, + "grad_norm": 0.6302391068293979, + "learning_rate": 1.3748513497405425e-06, + "loss": 0.4804, + "step": 12029 + }, + { + "epoch": 1.9516547696301103, + "grad_norm": 0.5927305994424926, + "learning_rate": 1.3744700267412858e-06, + "loss": 0.5166, + "step": 12030 + }, + { + "epoch": 1.9518170019467878, + "grad_norm": 0.5918573408246592, + "learning_rate": 1.3740887365817076e-06, + "loss": 0.5192, + "step": 12031 + }, + { + "epoch": 1.9519792342634652, + "grad_norm": 0.618302987874387, + "learning_rate": 1.3737074792729333e-06, + "loss": 0.5715, + "step": 12032 + }, + { + "epoch": 1.9521414665801426, + "grad_norm": 0.6156064844635754, + "learning_rate": 1.3733262548260854e-06, + "loss": 0.5413, + "step": 12033 + }, + { + "epoch": 1.95230369889682, + "grad_norm": 0.5763359345101252, + "learning_rate": 1.3729450632522885e-06, + "loss": 0.4841, + "step": 12034 + }, + { + "epoch": 1.9524659312134978, + "grad_norm": 0.5820293397674374, + "learning_rate": 1.3725639045626627e-06, + "loss": 0.5171, + "step": 12035 + }, + { + "epoch": 1.9526281635301752, + "grad_norm": 0.5939172231338602, + "learning_rate": 1.37218277876833e-06, + "loss": 0.4787, + "step": 12036 + }, + { + "epoch": 1.9527903958468527, + "grad_norm": 0.5874039961670874, + "learning_rate": 1.3718016858804118e-06, + "loss": 0.4886, + "step": 12037 + }, + { + "epoch": 1.9529526281635303, + "grad_norm": 0.6080600399950064, + "learning_rate": 1.3714206259100262e-06, + "loss": 0.527, + "step": 12038 + }, + { + "epoch": 1.9531148604802078, + "grad_norm": 0.6405549391244736, + "learning_rate": 1.3710395988682907e-06, + "loss": 0.504, + "step": 12039 + }, + { + "epoch": 1.9532770927968852, + "grad_norm": 0.6244030900901654, + "learning_rate": 1.3706586047663228e-06, + "loss": 0.5032, + "step": 12040 + }, + { + "epoch": 1.9534393251135627, + "grad_norm": 0.6227589026962409, + "learning_rate": 1.3702776436152398e-06, + "loss": 0.514, + "step": 12041 + }, + { + "epoch": 1.95360155743024, + "grad_norm": 0.6420838062696668, + "learning_rate": 1.3698967154261572e-06, + "loss": 0.5127, + "step": 12042 + }, + { + "epoch": 1.9537637897469176, + "grad_norm": 0.6428558014542899, + "learning_rate": 1.3695158202101884e-06, + "loss": 0.5317, + "step": 12043 + }, + { + "epoch": 1.953926022063595, + "grad_norm": 0.5962603553154036, + "learning_rate": 1.3691349579784463e-06, + "loss": 0.5149, + "step": 12044 + }, + { + "epoch": 1.9540882543802724, + "grad_norm": 0.6005553374396304, + "learning_rate": 1.3687541287420442e-06, + "loss": 0.4619, + "step": 12045 + }, + { + "epoch": 1.95425048669695, + "grad_norm": 0.6042123967923583, + "learning_rate": 1.3683733325120934e-06, + "loss": 0.5053, + "step": 12046 + }, + { + "epoch": 1.9544127190136276, + "grad_norm": 0.5902667437172076, + "learning_rate": 1.367992569299706e-06, + "loss": 0.4902, + "step": 12047 + }, + { + "epoch": 1.954574951330305, + "grad_norm": 0.615369297754223, + "learning_rate": 1.3676118391159896e-06, + "loss": 0.493, + "step": 12048 + }, + { + "epoch": 1.9547371836469825, + "grad_norm": 0.5869297945273765, + "learning_rate": 1.3672311419720525e-06, + "loss": 0.5229, + "step": 12049 + }, + { + "epoch": 1.9548994159636601, + "grad_norm": 0.6138227374081959, + "learning_rate": 1.366850477879003e-06, + "loss": 0.5097, + "step": 12050 + }, + { + "epoch": 1.9550616482803376, + "grad_norm": 0.6213973620775558, + "learning_rate": 1.3664698468479486e-06, + "loss": 0.5398, + "step": 12051 + }, + { + "epoch": 1.955223880597015, + "grad_norm": 0.6029320854704127, + "learning_rate": 1.3660892488899935e-06, + "loss": 0.5046, + "step": 12052 + }, + { + "epoch": 1.9553861129136925, + "grad_norm": 0.6357299140585683, + "learning_rate": 1.3657086840162445e-06, + "loss": 0.4961, + "step": 12053 + }, + { + "epoch": 1.95554834523037, + "grad_norm": 0.6098877233430332, + "learning_rate": 1.365328152237803e-06, + "loss": 0.4955, + "step": 12054 + }, + { + "epoch": 1.9557105775470474, + "grad_norm": 0.639282814020243, + "learning_rate": 1.3649476535657727e-06, + "loss": 0.5144, + "step": 12055 + }, + { + "epoch": 1.9558728098637248, + "grad_norm": 0.6168465383870742, + "learning_rate": 1.3645671880112564e-06, + "loss": 0.4959, + "step": 12056 + }, + { + "epoch": 1.9560350421804023, + "grad_norm": 0.6392747347531946, + "learning_rate": 1.3641867555853539e-06, + "loss": 0.5043, + "step": 12057 + }, + { + "epoch": 1.9561972744970797, + "grad_norm": 0.5934431119911334, + "learning_rate": 1.363806356299166e-06, + "loss": 0.5466, + "step": 12058 + }, + { + "epoch": 1.9563595068137571, + "grad_norm": 0.6224467534482377, + "learning_rate": 1.3634259901637903e-06, + "loss": 0.4808, + "step": 12059 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.6152252931527717, + "learning_rate": 1.363045657190326e-06, + "loss": 0.545, + "step": 12060 + }, + { + "epoch": 1.9566839714471123, + "grad_norm": 0.622398948969054, + "learning_rate": 1.3626653573898693e-06, + "loss": 0.5223, + "step": 12061 + }, + { + "epoch": 1.9568462037637897, + "grad_norm": 0.5915498578443276, + "learning_rate": 1.3622850907735164e-06, + "loss": 0.5144, + "step": 12062 + }, + { + "epoch": 1.9570084360804674, + "grad_norm": 0.5824005425164174, + "learning_rate": 1.3619048573523634e-06, + "loss": 0.5187, + "step": 12063 + }, + { + "epoch": 1.9571706683971448, + "grad_norm": 0.6153041804081125, + "learning_rate": 1.3615246571375024e-06, + "loss": 0.5307, + "step": 12064 + }, + { + "epoch": 1.9573329007138223, + "grad_norm": 0.6164052173098045, + "learning_rate": 1.361144490140029e-06, + "loss": 0.518, + "step": 12065 + }, + { + "epoch": 1.9574951330304997, + "grad_norm": 0.6006523793780305, + "learning_rate": 1.3607643563710329e-06, + "loss": 0.5128, + "step": 12066 + }, + { + "epoch": 1.9576573653471772, + "grad_norm": 0.6535799186076993, + "learning_rate": 1.3603842558416063e-06, + "loss": 0.5173, + "step": 12067 + }, + { + "epoch": 1.9578195976638546, + "grad_norm": 0.6055633586571135, + "learning_rate": 1.360004188562841e-06, + "loss": 0.5186, + "step": 12068 + }, + { + "epoch": 1.957981829980532, + "grad_norm": 0.6015048789974382, + "learning_rate": 1.3596241545458245e-06, + "loss": 0.4849, + "step": 12069 + }, + { + "epoch": 1.9581440622972095, + "grad_norm": 0.5762144715414105, + "learning_rate": 1.3592441538016443e-06, + "loss": 0.5142, + "step": 12070 + }, + { + "epoch": 1.958306294613887, + "grad_norm": 0.5987999338805489, + "learning_rate": 1.358864186341389e-06, + "loss": 0.5078, + "step": 12071 + }, + { + "epoch": 1.9584685269305646, + "grad_norm": 0.5888900613795554, + "learning_rate": 1.3584842521761448e-06, + "loss": 0.4697, + "step": 12072 + }, + { + "epoch": 1.958630759247242, + "grad_norm": 0.6135304444969578, + "learning_rate": 1.3581043513169978e-06, + "loss": 0.4922, + "step": 12073 + }, + { + "epoch": 1.9587929915639195, + "grad_norm": 0.5813656956836876, + "learning_rate": 1.3577244837750319e-06, + "loss": 0.5062, + "step": 12074 + }, + { + "epoch": 1.9589552238805972, + "grad_norm": 0.6020622515835233, + "learning_rate": 1.3573446495613291e-06, + "loss": 0.4882, + "step": 12075 + }, + { + "epoch": 1.9591174561972746, + "grad_norm": 0.623145641005078, + "learning_rate": 1.356964848686973e-06, + "loss": 0.5248, + "step": 12076 + }, + { + "epoch": 1.959279688513952, + "grad_norm": 0.6169862645428166, + "learning_rate": 1.3565850811630448e-06, + "loss": 0.5053, + "step": 12077 + }, + { + "epoch": 1.9594419208306295, + "grad_norm": 0.5989038902747109, + "learning_rate": 1.3562053470006266e-06, + "loss": 0.5099, + "step": 12078 + }, + { + "epoch": 1.959604153147307, + "grad_norm": 0.5820687206069199, + "learning_rate": 1.3558256462107965e-06, + "loss": 0.4925, + "step": 12079 + }, + { + "epoch": 1.9597663854639844, + "grad_norm": 0.6070519424765408, + "learning_rate": 1.3554459788046325e-06, + "loss": 0.5134, + "step": 12080 + }, + { + "epoch": 1.9599286177806619, + "grad_norm": 0.592234980675383, + "learning_rate": 1.3550663447932125e-06, + "loss": 0.5176, + "step": 12081 + }, + { + "epoch": 1.9600908500973393, + "grad_norm": 0.6000682819480829, + "learning_rate": 1.3546867441876147e-06, + "loss": 0.4778, + "step": 12082 + }, + { + "epoch": 1.9602530824140167, + "grad_norm": 0.6027133923783846, + "learning_rate": 1.3543071769989132e-06, + "loss": 0.5037, + "step": 12083 + }, + { + "epoch": 1.9604153147306942, + "grad_norm": 0.6215975303446847, + "learning_rate": 1.3539276432381824e-06, + "loss": 0.4937, + "step": 12084 + }, + { + "epoch": 1.9605775470473719, + "grad_norm": 0.6178802887282097, + "learning_rate": 1.3535481429164962e-06, + "loss": 0.5101, + "step": 12085 + }, + { + "epoch": 1.9607397793640493, + "grad_norm": 0.5812350428548338, + "learning_rate": 1.353168676044928e-06, + "loss": 0.4908, + "step": 12086 + }, + { + "epoch": 1.9609020116807268, + "grad_norm": 0.6014986498493446, + "learning_rate": 1.3527892426345498e-06, + "loss": 0.5143, + "step": 12087 + }, + { + "epoch": 1.9610642439974044, + "grad_norm": 0.5980069889911779, + "learning_rate": 1.352409842696431e-06, + "loss": 0.4854, + "step": 12088 + }, + { + "epoch": 1.9612264763140819, + "grad_norm": 0.5760200614273397, + "learning_rate": 1.3520304762416434e-06, + "loss": 0.4925, + "step": 12089 + }, + { + "epoch": 1.9613887086307593, + "grad_norm": 0.6424793642015326, + "learning_rate": 1.351651143281253e-06, + "loss": 0.4967, + "step": 12090 + }, + { + "epoch": 1.9615509409474368, + "grad_norm": 0.6440452632848316, + "learning_rate": 1.3512718438263297e-06, + "loss": 0.5296, + "step": 12091 + }, + { + "epoch": 1.9617131732641142, + "grad_norm": 0.5873241350178297, + "learning_rate": 1.3508925778879405e-06, + "loss": 0.4931, + "step": 12092 + }, + { + "epoch": 1.9618754055807917, + "grad_norm": 0.6178010897311211, + "learning_rate": 1.3505133454771498e-06, + "loss": 0.5225, + "step": 12093 + }, + { + "epoch": 1.962037637897469, + "grad_norm": 0.5945677883469096, + "learning_rate": 1.3501341466050244e-06, + "loss": 0.5014, + "step": 12094 + }, + { + "epoch": 1.9621998702141465, + "grad_norm": 0.6127083072002798, + "learning_rate": 1.3497549812826261e-06, + "loss": 0.5202, + "step": 12095 + }, + { + "epoch": 1.962362102530824, + "grad_norm": 0.6118820812344905, + "learning_rate": 1.3493758495210196e-06, + "loss": 0.5321, + "step": 12096 + }, + { + "epoch": 1.9625243348475017, + "grad_norm": 0.6319399248786011, + "learning_rate": 1.3489967513312652e-06, + "loss": 0.5203, + "step": 12097 + }, + { + "epoch": 1.962686567164179, + "grad_norm": 0.6316882038415929, + "learning_rate": 1.348617686724425e-06, + "loss": 0.5008, + "step": 12098 + }, + { + "epoch": 1.9628487994808566, + "grad_norm": 0.614288719034263, + "learning_rate": 1.3482386557115596e-06, + "loss": 0.5143, + "step": 12099 + }, + { + "epoch": 1.9630110317975342, + "grad_norm": 0.6339617715155201, + "learning_rate": 1.3478596583037261e-06, + "loss": 0.5009, + "step": 12100 + }, + { + "epoch": 1.9631732641142117, + "grad_norm": 0.6384395563675817, + "learning_rate": 1.3474806945119851e-06, + "loss": 0.5096, + "step": 12101 + }, + { + "epoch": 1.9633354964308891, + "grad_norm": 0.6007472955559451, + "learning_rate": 1.3471017643473907e-06, + "loss": 0.5117, + "step": 12102 + }, + { + "epoch": 1.9634977287475666, + "grad_norm": 0.6264735792424488, + "learning_rate": 1.3467228678210009e-06, + "loss": 0.4967, + "step": 12103 + }, + { + "epoch": 1.963659961064244, + "grad_norm": 0.5915642014123726, + "learning_rate": 1.3463440049438711e-06, + "loss": 0.51, + "step": 12104 + }, + { + "epoch": 1.9638221933809215, + "grad_norm": 0.6183259033855703, + "learning_rate": 1.3459651757270549e-06, + "loss": 0.4937, + "step": 12105 + }, + { + "epoch": 1.963984425697599, + "grad_norm": 0.6124299846999811, + "learning_rate": 1.3455863801816039e-06, + "loss": 0.4971, + "step": 12106 + }, + { + "epoch": 1.9641466580142763, + "grad_norm": 0.6338733672369582, + "learning_rate": 1.3452076183185716e-06, + "loss": 0.5211, + "step": 12107 + }, + { + "epoch": 1.9643088903309538, + "grad_norm": 0.5872237214226912, + "learning_rate": 1.3448288901490094e-06, + "loss": 0.4957, + "step": 12108 + }, + { + "epoch": 1.9644711226476315, + "grad_norm": 0.5866770916126758, + "learning_rate": 1.3444501956839682e-06, + "loss": 0.4615, + "step": 12109 + }, + { + "epoch": 1.964633354964309, + "grad_norm": 0.63290171567871, + "learning_rate": 1.344071534934496e-06, + "loss": 0.5041, + "step": 12110 + }, + { + "epoch": 1.9647955872809864, + "grad_norm": 0.6120686014812796, + "learning_rate": 1.3436929079116401e-06, + "loss": 0.5445, + "step": 12111 + }, + { + "epoch": 1.9649578195976638, + "grad_norm": 0.5961655253350508, + "learning_rate": 1.3433143146264494e-06, + "loss": 0.5231, + "step": 12112 + }, + { + "epoch": 1.9651200519143415, + "grad_norm": 0.5928905242194169, + "learning_rate": 1.3429357550899692e-06, + "loss": 0.5003, + "step": 12113 + }, + { + "epoch": 1.965282284231019, + "grad_norm": 0.7142441174984105, + "learning_rate": 1.3425572293132462e-06, + "loss": 0.4902, + "step": 12114 + }, + { + "epoch": 1.9654445165476964, + "grad_norm": 0.64901132483896, + "learning_rate": 1.3421787373073236e-06, + "loss": 0.5234, + "step": 12115 + }, + { + "epoch": 1.9656067488643738, + "grad_norm": 0.635481128274442, + "learning_rate": 1.3418002790832441e-06, + "loss": 0.532, + "step": 12116 + }, + { + "epoch": 1.9657689811810513, + "grad_norm": 0.6506500765281854, + "learning_rate": 1.3414218546520503e-06, + "loss": 0.5514, + "step": 12117 + }, + { + "epoch": 1.9659312134977287, + "grad_norm": 0.616335514920201, + "learning_rate": 1.3410434640247849e-06, + "loss": 0.5117, + "step": 12118 + }, + { + "epoch": 1.9660934458144061, + "grad_norm": 0.6188889892114008, + "learning_rate": 1.3406651072124872e-06, + "loss": 0.5058, + "step": 12119 + }, + { + "epoch": 1.9662556781310836, + "grad_norm": 0.6058413112471465, + "learning_rate": 1.3402867842261955e-06, + "loss": 0.5132, + "step": 12120 + }, + { + "epoch": 1.966417910447761, + "grad_norm": 0.5929703735301354, + "learning_rate": 1.3399084950769492e-06, + "loss": 0.5219, + "step": 12121 + }, + { + "epoch": 1.9665801427644387, + "grad_norm": 0.6441730695693558, + "learning_rate": 1.339530239775786e-06, + "loss": 0.4951, + "step": 12122 + }, + { + "epoch": 1.9667423750811162, + "grad_norm": 0.5987145486344037, + "learning_rate": 1.3391520183337425e-06, + "loss": 0.5182, + "step": 12123 + }, + { + "epoch": 1.9669046073977936, + "grad_norm": 0.6134579651658405, + "learning_rate": 1.3387738307618536e-06, + "loss": 0.5118, + "step": 12124 + }, + { + "epoch": 1.9670668397144713, + "grad_norm": 0.5917410271724237, + "learning_rate": 1.3383956770711532e-06, + "loss": 0.4951, + "step": 12125 + }, + { + "epoch": 1.9672290720311487, + "grad_norm": 0.6233279974778576, + "learning_rate": 1.3380175572726745e-06, + "loss": 0.4959, + "step": 12126 + }, + { + "epoch": 1.9673913043478262, + "grad_norm": 0.6067345272589537, + "learning_rate": 1.3376394713774521e-06, + "loss": 0.5226, + "step": 12127 + }, + { + "epoch": 1.9675535366645036, + "grad_norm": 0.5936105545464512, + "learning_rate": 1.3372614193965148e-06, + "loss": 0.5479, + "step": 12128 + }, + { + "epoch": 1.967715768981181, + "grad_norm": 0.6044042344879271, + "learning_rate": 1.336883401340895e-06, + "loss": 0.5201, + "step": 12129 + }, + { + "epoch": 1.9678780012978585, + "grad_norm": 0.5975740035959907, + "learning_rate": 1.3365054172216208e-06, + "loss": 0.5233, + "step": 12130 + }, + { + "epoch": 1.968040233614536, + "grad_norm": 0.6313430488126583, + "learning_rate": 1.336127467049721e-06, + "loss": 0.5321, + "step": 12131 + }, + { + "epoch": 1.9682024659312134, + "grad_norm": 0.5736354367850062, + "learning_rate": 1.3357495508362245e-06, + "loss": 0.5168, + "step": 12132 + }, + { + "epoch": 1.9683646982478908, + "grad_norm": 0.5881988501785208, + "learning_rate": 1.3353716685921553e-06, + "loss": 0.5262, + "step": 12133 + }, + { + "epoch": 1.9685269305645685, + "grad_norm": 0.5686568576417298, + "learning_rate": 1.3349938203285412e-06, + "loss": 0.4989, + "step": 12134 + }, + { + "epoch": 1.968689162881246, + "grad_norm": 0.6251144242220122, + "learning_rate": 1.3346160060564051e-06, + "loss": 0.5192, + "step": 12135 + }, + { + "epoch": 1.9688513951979234, + "grad_norm": 0.6027426357891591, + "learning_rate": 1.334238225786771e-06, + "loss": 0.5209, + "step": 12136 + }, + { + "epoch": 1.969013627514601, + "grad_norm": 0.5991821127583554, + "learning_rate": 1.3338604795306625e-06, + "loss": 0.5308, + "step": 12137 + }, + { + "epoch": 1.9691758598312785, + "grad_norm": 0.5985942180699007, + "learning_rate": 1.3334827672990993e-06, + "loss": 0.5103, + "step": 12138 + }, + { + "epoch": 1.969338092147956, + "grad_norm": 0.6180550496947828, + "learning_rate": 1.3331050891031027e-06, + "loss": 0.5031, + "step": 12139 + }, + { + "epoch": 1.9695003244646334, + "grad_norm": 0.579679316953213, + "learning_rate": 1.3327274449536937e-06, + "loss": 0.4855, + "step": 12140 + }, + { + "epoch": 1.9696625567813109, + "grad_norm": 0.5792143435342268, + "learning_rate": 1.3323498348618893e-06, + "loss": 0.4995, + "step": 12141 + }, + { + "epoch": 1.9698247890979883, + "grad_norm": 0.5804627324036123, + "learning_rate": 1.3319722588387063e-06, + "loss": 0.4777, + "step": 12142 + }, + { + "epoch": 1.9699870214146658, + "grad_norm": 0.596933852612102, + "learning_rate": 1.3315947168951626e-06, + "loss": 0.5527, + "step": 12143 + }, + { + "epoch": 1.9701492537313432, + "grad_norm": 0.5886496340492365, + "learning_rate": 1.3312172090422732e-06, + "loss": 0.5001, + "step": 12144 + }, + { + "epoch": 1.9703114860480206, + "grad_norm": 0.5727979116428096, + "learning_rate": 1.330839735291054e-06, + "loss": 0.4963, + "step": 12145 + }, + { + "epoch": 1.970473718364698, + "grad_norm": 0.6250414806566238, + "learning_rate": 1.3304622956525173e-06, + "loss": 0.4911, + "step": 12146 + }, + { + "epoch": 1.9706359506813758, + "grad_norm": 0.5773178472598036, + "learning_rate": 1.330084890137675e-06, + "loss": 0.4948, + "step": 12147 + }, + { + "epoch": 1.9707981829980532, + "grad_norm": 0.6182243641929408, + "learning_rate": 1.3297075187575398e-06, + "loss": 0.4939, + "step": 12148 + }, + { + "epoch": 1.9709604153147307, + "grad_norm": 0.6104025680717883, + "learning_rate": 1.3293301815231219e-06, + "loss": 0.5151, + "step": 12149 + }, + { + "epoch": 1.9711226476314083, + "grad_norm": 0.6219861109656484, + "learning_rate": 1.3289528784454329e-06, + "loss": 0.4841, + "step": 12150 + }, + { + "epoch": 1.9712848799480858, + "grad_norm": 0.5946437273105561, + "learning_rate": 1.3285756095354777e-06, + "loss": 0.5377, + "step": 12151 + }, + { + "epoch": 1.9714471122647632, + "grad_norm": 0.6253237022207815, + "learning_rate": 1.328198374804266e-06, + "loss": 0.5025, + "step": 12152 + }, + { + "epoch": 1.9716093445814407, + "grad_norm": 0.6294947068645143, + "learning_rate": 1.3278211742628038e-06, + "loss": 0.5426, + "step": 12153 + }, + { + "epoch": 1.971771576898118, + "grad_norm": 0.616335437394996, + "learning_rate": 1.3274440079220986e-06, + "loss": 0.5058, + "step": 12154 + }, + { + "epoch": 1.9719338092147956, + "grad_norm": 0.5978576445730671, + "learning_rate": 1.327066875793153e-06, + "loss": 0.5132, + "step": 12155 + }, + { + "epoch": 1.972096041531473, + "grad_norm": 0.6105786369874854, + "learning_rate": 1.3266897778869704e-06, + "loss": 0.5376, + "step": 12156 + }, + { + "epoch": 1.9722582738481504, + "grad_norm": 0.584946180531381, + "learning_rate": 1.326312714214554e-06, + "loss": 0.5206, + "step": 12157 + }, + { + "epoch": 1.972420506164828, + "grad_norm": 0.5995543623981947, + "learning_rate": 1.3259356847869056e-06, + "loss": 0.5223, + "step": 12158 + }, + { + "epoch": 1.9725827384815056, + "grad_norm": 0.5974765555825071, + "learning_rate": 1.3255586896150268e-06, + "loss": 0.507, + "step": 12159 + }, + { + "epoch": 1.972744970798183, + "grad_norm": 0.6028826465374245, + "learning_rate": 1.3251817287099163e-06, + "loss": 0.5219, + "step": 12160 + }, + { + "epoch": 1.9729072031148605, + "grad_norm": 0.635411118878514, + "learning_rate": 1.3248048020825716e-06, + "loss": 0.5538, + "step": 12161 + }, + { + "epoch": 1.9730694354315381, + "grad_norm": 0.7558842983043179, + "learning_rate": 1.3244279097439913e-06, + "loss": 0.5148, + "step": 12162 + }, + { + "epoch": 1.9732316677482156, + "grad_norm": 0.634469708437216, + "learning_rate": 1.324051051705173e-06, + "loss": 0.5073, + "step": 12163 + }, + { + "epoch": 1.973393900064893, + "grad_norm": 0.5931569348114969, + "learning_rate": 1.3236742279771102e-06, + "loss": 0.5295, + "step": 12164 + }, + { + "epoch": 1.9735561323815705, + "grad_norm": 0.6473042447337264, + "learning_rate": 1.3232974385708004e-06, + "loss": 0.487, + "step": 12165 + }, + { + "epoch": 1.973718364698248, + "grad_norm": 0.6175955021610003, + "learning_rate": 1.3229206834972338e-06, + "loss": 0.4917, + "step": 12166 + }, + { + "epoch": 1.9738805970149254, + "grad_norm": 0.6075230007590766, + "learning_rate": 1.322543962767405e-06, + "loss": 0.5133, + "step": 12167 + }, + { + "epoch": 1.9740428293316028, + "grad_norm": 0.6199013641390613, + "learning_rate": 1.322167276392306e-06, + "loss": 0.5377, + "step": 12168 + }, + { + "epoch": 1.9742050616482802, + "grad_norm": 0.6105611043769871, + "learning_rate": 1.3217906243829259e-06, + "loss": 0.5115, + "step": 12169 + }, + { + "epoch": 1.9743672939649577, + "grad_norm": 0.6214416604628714, + "learning_rate": 1.3214140067502564e-06, + "loss": 0.4953, + "step": 12170 + }, + { + "epoch": 1.9745295262816351, + "grad_norm": 0.6099539409102227, + "learning_rate": 1.3210374235052836e-06, + "loss": 0.5304, + "step": 12171 + }, + { + "epoch": 1.9746917585983128, + "grad_norm": 0.5822673013533756, + "learning_rate": 1.3206608746589966e-06, + "loss": 0.4975, + "step": 12172 + }, + { + "epoch": 1.9748539909149903, + "grad_norm": 0.6073621564158437, + "learning_rate": 1.3202843602223825e-06, + "loss": 0.5208, + "step": 12173 + }, + { + "epoch": 1.9750162232316677, + "grad_norm": 0.5950158310417519, + "learning_rate": 1.3199078802064252e-06, + "loss": 0.5234, + "step": 12174 + }, + { + "epoch": 1.9751784555483454, + "grad_norm": 0.6176495394249949, + "learning_rate": 1.319531434622111e-06, + "loss": 0.5495, + "step": 12175 + }, + { + "epoch": 1.9753406878650228, + "grad_norm": 0.5782669641643977, + "learning_rate": 1.3191550234804219e-06, + "loss": 0.5242, + "step": 12176 + }, + { + "epoch": 1.9755029201817003, + "grad_norm": 0.6016982157854168, + "learning_rate": 1.3187786467923424e-06, + "loss": 0.5101, + "step": 12177 + }, + { + "epoch": 1.9756651524983777, + "grad_norm": 0.6034771726644691, + "learning_rate": 1.3184023045688515e-06, + "loss": 0.4956, + "step": 12178 + }, + { + "epoch": 1.9758273848150552, + "grad_norm": 0.5990714911079378, + "learning_rate": 1.3180259968209318e-06, + "loss": 0.4977, + "step": 12179 + }, + { + "epoch": 1.9759896171317326, + "grad_norm": 0.6235012889497447, + "learning_rate": 1.317649723559563e-06, + "loss": 0.4936, + "step": 12180 + }, + { + "epoch": 1.97615184944841, + "grad_norm": 0.5767547564144513, + "learning_rate": 1.3172734847957218e-06, + "loss": 0.5223, + "step": 12181 + }, + { + "epoch": 1.9763140817650875, + "grad_norm": 0.6082881114819295, + "learning_rate": 1.316897280540388e-06, + "loss": 0.5101, + "step": 12182 + }, + { + "epoch": 1.976476314081765, + "grad_norm": 0.5992130608194958, + "learning_rate": 1.3165211108045362e-06, + "loss": 0.5497, + "step": 12183 + }, + { + "epoch": 1.9766385463984426, + "grad_norm": 0.5754224248765719, + "learning_rate": 1.3161449755991426e-06, + "loss": 0.5027, + "step": 12184 + }, + { + "epoch": 1.97680077871512, + "grad_norm": 0.6009822484261511, + "learning_rate": 1.3157688749351827e-06, + "loss": 0.4933, + "step": 12185 + }, + { + "epoch": 1.9769630110317975, + "grad_norm": 0.6122354010544374, + "learning_rate": 1.3153928088236295e-06, + "loss": 0.5295, + "step": 12186 + }, + { + "epoch": 1.9771252433484752, + "grad_norm": 0.6048954063985096, + "learning_rate": 1.3150167772754541e-06, + "loss": 0.5469, + "step": 12187 + }, + { + "epoch": 1.9772874756651526, + "grad_norm": 0.6226697335617668, + "learning_rate": 1.3146407803016293e-06, + "loss": 0.5406, + "step": 12188 + }, + { + "epoch": 1.97744970798183, + "grad_norm": 0.5819961940211597, + "learning_rate": 1.314264817913125e-06, + "loss": 0.5242, + "step": 12189 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.6057472153710577, + "learning_rate": 1.3138888901209124e-06, + "loss": 0.4903, + "step": 12190 + }, + { + "epoch": 1.977774172615185, + "grad_norm": 0.6189706126183729, + "learning_rate": 1.3135129969359588e-06, + "loss": 0.5021, + "step": 12191 + }, + { + "epoch": 1.9779364049318624, + "grad_norm": 0.5863340240802305, + "learning_rate": 1.3131371383692304e-06, + "loss": 0.4992, + "step": 12192 + }, + { + "epoch": 1.9780986372485398, + "grad_norm": 0.6154045150817458, + "learning_rate": 1.3127613144316948e-06, + "loss": 0.5146, + "step": 12193 + }, + { + "epoch": 1.9782608695652173, + "grad_norm": 0.6374467036390338, + "learning_rate": 1.312385525134317e-06, + "loss": 0.5382, + "step": 12194 + }, + { + "epoch": 1.9784231018818947, + "grad_norm": 0.694836881102857, + "learning_rate": 1.3120097704880635e-06, + "loss": 0.5377, + "step": 12195 + }, + { + "epoch": 1.9785853341985724, + "grad_norm": 0.6226415500517994, + "learning_rate": 1.3116340505038958e-06, + "loss": 0.5103, + "step": 12196 + }, + { + "epoch": 1.9787475665152499, + "grad_norm": 0.5865262657983824, + "learning_rate": 1.3112583651927757e-06, + "loss": 0.5343, + "step": 12197 + }, + { + "epoch": 1.9789097988319273, + "grad_norm": 0.6138989552363329, + "learning_rate": 1.3108827145656655e-06, + "loss": 0.5235, + "step": 12198 + }, + { + "epoch": 1.9790720311486047, + "grad_norm": 0.6079300264926941, + "learning_rate": 1.3105070986335265e-06, + "loss": 0.4673, + "step": 12199 + }, + { + "epoch": 1.9792342634652824, + "grad_norm": 0.6488206123007644, + "learning_rate": 1.3101315174073162e-06, + "loss": 0.5283, + "step": 12200 + }, + { + "epoch": 1.9793964957819599, + "grad_norm": 0.6151344481734756, + "learning_rate": 1.3097559708979952e-06, + "loss": 0.522, + "step": 12201 + }, + { + "epoch": 1.9795587280986373, + "grad_norm": 0.5741445601667431, + "learning_rate": 1.3093804591165181e-06, + "loss": 0.521, + "step": 12202 + }, + { + "epoch": 1.9797209604153148, + "grad_norm": 0.6280624473958174, + "learning_rate": 1.3090049820738432e-06, + "loss": 0.5328, + "step": 12203 + }, + { + "epoch": 1.9798831927319922, + "grad_norm": 0.6168143833559202, + "learning_rate": 1.3086295397809262e-06, + "loss": 0.5038, + "step": 12204 + }, + { + "epoch": 1.9800454250486696, + "grad_norm": 0.6199137520516166, + "learning_rate": 1.3082541322487195e-06, + "loss": 0.4875, + "step": 12205 + }, + { + "epoch": 1.980207657365347, + "grad_norm": 0.6683727248029039, + "learning_rate": 1.307878759488178e-06, + "loss": 0.5088, + "step": 12206 + }, + { + "epoch": 1.9803698896820245, + "grad_norm": 0.6032346005870962, + "learning_rate": 1.3075034215102533e-06, + "loss": 0.5207, + "step": 12207 + }, + { + "epoch": 1.980532121998702, + "grad_norm": 0.5901038790685159, + "learning_rate": 1.307128118325897e-06, + "loss": 0.5117, + "step": 12208 + }, + { + "epoch": 1.9806943543153797, + "grad_norm": 0.6901575884878223, + "learning_rate": 1.3067528499460588e-06, + "loss": 0.5662, + "step": 12209 + }, + { + "epoch": 1.980856586632057, + "grad_norm": 0.5905999276193675, + "learning_rate": 1.3063776163816883e-06, + "loss": 0.476, + "step": 12210 + }, + { + "epoch": 1.9810188189487346, + "grad_norm": 0.6114962350261973, + "learning_rate": 1.306002417643734e-06, + "loss": 0.4673, + "step": 12211 + }, + { + "epoch": 1.9811810512654122, + "grad_norm": 0.6124888982895352, + "learning_rate": 1.3056272537431426e-06, + "loss": 0.5052, + "step": 12212 + }, + { + "epoch": 1.9813432835820897, + "grad_norm": 0.6130861908687024, + "learning_rate": 1.3052521246908612e-06, + "loss": 0.525, + "step": 12213 + }, + { + "epoch": 1.9815055158987671, + "grad_norm": 0.6081031807276759, + "learning_rate": 1.3048770304978331e-06, + "loss": 0.4893, + "step": 12214 + }, + { + "epoch": 1.9816677482154446, + "grad_norm": 0.631873624243388, + "learning_rate": 1.3045019711750038e-06, + "loss": 0.4966, + "step": 12215 + }, + { + "epoch": 1.981829980532122, + "grad_norm": 0.6046580847028071, + "learning_rate": 1.3041269467333173e-06, + "loss": 0.5029, + "step": 12216 + }, + { + "epoch": 1.9819922128487995, + "grad_norm": 0.6130502966952927, + "learning_rate": 1.3037519571837134e-06, + "loss": 0.5427, + "step": 12217 + }, + { + "epoch": 1.982154445165477, + "grad_norm": 0.6248819306114319, + "learning_rate": 1.3033770025371357e-06, + "loss": 0.4759, + "step": 12218 + }, + { + "epoch": 1.9823166774821543, + "grad_norm": 0.6108998898503031, + "learning_rate": 1.303002082804522e-06, + "loss": 0.4991, + "step": 12219 + }, + { + "epoch": 1.9824789097988318, + "grad_norm": 0.5844145643000398, + "learning_rate": 1.3026271979968124e-06, + "loss": 0.4992, + "step": 12220 + }, + { + "epoch": 1.9826411421155095, + "grad_norm": 0.6169573849210455, + "learning_rate": 1.302252348124946e-06, + "loss": 0.4829, + "step": 12221 + }, + { + "epoch": 1.982803374432187, + "grad_norm": 0.5906455426322598, + "learning_rate": 1.301877533199859e-06, + "loss": 0.4893, + "step": 12222 + }, + { + "epoch": 1.9829656067488644, + "grad_norm": 0.5847533169042317, + "learning_rate": 1.3015027532324859e-06, + "loss": 0.5054, + "step": 12223 + }, + { + "epoch": 1.983127839065542, + "grad_norm": 0.6526511552073955, + "learning_rate": 1.301128008233763e-06, + "loss": 0.538, + "step": 12224 + }, + { + "epoch": 1.9832900713822195, + "grad_norm": 0.6141725393982647, + "learning_rate": 1.3007532982146243e-06, + "loss": 0.4737, + "step": 12225 + }, + { + "epoch": 1.983452303698897, + "grad_norm": 0.6022877563703855, + "learning_rate": 1.300378623186004e-06, + "loss": 0.4909, + "step": 12226 + }, + { + "epoch": 1.9836145360155744, + "grad_norm": 0.5975667246534868, + "learning_rate": 1.300003983158832e-06, + "loss": 0.5093, + "step": 12227 + }, + { + "epoch": 1.9837767683322518, + "grad_norm": 0.5993139712049519, + "learning_rate": 1.2996293781440395e-06, + "loss": 0.5136, + "step": 12228 + }, + { + "epoch": 1.9839390006489293, + "grad_norm": 0.6344712920692828, + "learning_rate": 1.2992548081525565e-06, + "loss": 0.5317, + "step": 12229 + }, + { + "epoch": 1.9841012329656067, + "grad_norm": 0.6119406245157731, + "learning_rate": 1.2988802731953125e-06, + "loss": 0.5523, + "step": 12230 + }, + { + "epoch": 1.9842634652822841, + "grad_norm": 0.623000252354505, + "learning_rate": 1.2985057732832358e-06, + "loss": 0.5262, + "step": 12231 + }, + { + "epoch": 1.9844256975989616, + "grad_norm": 0.5804332127659715, + "learning_rate": 1.2981313084272523e-06, + "loss": 0.5132, + "step": 12232 + }, + { + "epoch": 1.984587929915639, + "grad_norm": 0.61149511711407, + "learning_rate": 1.2977568786382871e-06, + "loss": 0.5271, + "step": 12233 + }, + { + "epoch": 1.9847501622323167, + "grad_norm": 0.6150079229230191, + "learning_rate": 1.2973824839272655e-06, + "loss": 0.5122, + "step": 12234 + }, + { + "epoch": 1.9849123945489942, + "grad_norm": 0.605670326774413, + "learning_rate": 1.2970081243051126e-06, + "loss": 0.481, + "step": 12235 + }, + { + "epoch": 1.9850746268656716, + "grad_norm": 0.6259575393916648, + "learning_rate": 1.2966337997827492e-06, + "loss": 0.5504, + "step": 12236 + }, + { + "epoch": 1.9852368591823493, + "grad_norm": 0.60576219209904, + "learning_rate": 1.2962595103710988e-06, + "loss": 0.4645, + "step": 12237 + }, + { + "epoch": 1.9853990914990267, + "grad_norm": 0.6130868454366494, + "learning_rate": 1.2958852560810797e-06, + "loss": 0.5201, + "step": 12238 + }, + { + "epoch": 1.9855613238157042, + "grad_norm": 0.6148825338275723, + "learning_rate": 1.2955110369236136e-06, + "loss": 0.4817, + "step": 12239 + }, + { + "epoch": 1.9857235561323816, + "grad_norm": 0.598183446410718, + "learning_rate": 1.2951368529096189e-06, + "loss": 0.5305, + "step": 12240 + }, + { + "epoch": 1.985885788449059, + "grad_norm": 0.5987141860424098, + "learning_rate": 1.294762704050012e-06, + "loss": 0.5092, + "step": 12241 + }, + { + "epoch": 1.9860480207657365, + "grad_norm": 0.5871117463116323, + "learning_rate": 1.2943885903557112e-06, + "loss": 0.5254, + "step": 12242 + }, + { + "epoch": 1.986210253082414, + "grad_norm": 0.5675146934297887, + "learning_rate": 1.29401451183763e-06, + "loss": 0.5193, + "step": 12243 + }, + { + "epoch": 1.9863724853990914, + "grad_norm": 0.6093491620844598, + "learning_rate": 1.2936404685066852e-06, + "loss": 0.5073, + "step": 12244 + }, + { + "epoch": 1.9865347177157688, + "grad_norm": 0.6089278347690265, + "learning_rate": 1.2932664603737883e-06, + "loss": 0.5611, + "step": 12245 + }, + { + "epoch": 1.9866969500324465, + "grad_norm": 0.5982829477178977, + "learning_rate": 1.292892487449852e-06, + "loss": 0.5021, + "step": 12246 + }, + { + "epoch": 1.986859182349124, + "grad_norm": 0.5962783372066786, + "learning_rate": 1.2925185497457896e-06, + "loss": 0.5369, + "step": 12247 + }, + { + "epoch": 1.9870214146658014, + "grad_norm": 0.6213036774586806, + "learning_rate": 1.292144647272509e-06, + "loss": 0.5329, + "step": 12248 + }, + { + "epoch": 1.987183646982479, + "grad_norm": 0.6004799897488784, + "learning_rate": 1.291770780040922e-06, + "loss": 0.5284, + "step": 12249 + }, + { + "epoch": 1.9873458792991565, + "grad_norm": 0.6141409398607101, + "learning_rate": 1.2913969480619348e-06, + "loss": 0.4859, + "step": 12250 + }, + { + "epoch": 1.987508111615834, + "grad_norm": 0.592078214202894, + "learning_rate": 1.2910231513464556e-06, + "loss": 0.5096, + "step": 12251 + }, + { + "epoch": 1.9876703439325114, + "grad_norm": 0.6374162085443484, + "learning_rate": 1.2906493899053917e-06, + "loss": 0.5066, + "step": 12252 + }, + { + "epoch": 1.9878325762491889, + "grad_norm": 0.6004102041438738, + "learning_rate": 1.2902756637496463e-06, + "loss": 0.5423, + "step": 12253 + }, + { + "epoch": 1.9879948085658663, + "grad_norm": 0.6459557722153162, + "learning_rate": 1.289901972890126e-06, + "loss": 0.5254, + "step": 12254 + }, + { + "epoch": 1.9881570408825437, + "grad_norm": 0.6064057933350612, + "learning_rate": 1.2895283173377315e-06, + "loss": 0.51, + "step": 12255 + }, + { + "epoch": 1.9883192731992212, + "grad_norm": 0.6229264132984506, + "learning_rate": 1.2891546971033666e-06, + "loss": 0.4862, + "step": 12256 + }, + { + "epoch": 1.9884815055158986, + "grad_norm": 0.6296258227160793, + "learning_rate": 1.288781112197933e-06, + "loss": 0.4864, + "step": 12257 + }, + { + "epoch": 1.988643737832576, + "grad_norm": 0.6024295307467474, + "learning_rate": 1.28840756263233e-06, + "loss": 0.4923, + "step": 12258 + }, + { + "epoch": 1.9888059701492538, + "grad_norm": 0.5910495025054069, + "learning_rate": 1.2880340484174554e-06, + "loss": 0.5226, + "step": 12259 + }, + { + "epoch": 1.9889682024659312, + "grad_norm": 0.5866031210443646, + "learning_rate": 1.2876605695642086e-06, + "loss": 0.5121, + "step": 12260 + }, + { + "epoch": 1.9891304347826086, + "grad_norm": 0.6495127852228512, + "learning_rate": 1.2872871260834864e-06, + "loss": 0.4951, + "step": 12261 + }, + { + "epoch": 1.9892926670992863, + "grad_norm": 0.6053251594596486, + "learning_rate": 1.2869137179861856e-06, + "loss": 0.5163, + "step": 12262 + }, + { + "epoch": 1.9894548994159638, + "grad_norm": 0.5939476005157325, + "learning_rate": 1.286540345283201e-06, + "loss": 0.5335, + "step": 12263 + }, + { + "epoch": 1.9896171317326412, + "grad_norm": 0.5859156855144444, + "learning_rate": 1.2861670079854247e-06, + "loss": 0.5195, + "step": 12264 + }, + { + "epoch": 1.9897793640493187, + "grad_norm": 0.5981114828298322, + "learning_rate": 1.2857937061037506e-06, + "loss": 0.5231, + "step": 12265 + }, + { + "epoch": 1.989941596365996, + "grad_norm": 0.6040967035666657, + "learning_rate": 1.2854204396490722e-06, + "loss": 0.5262, + "step": 12266 + }, + { + "epoch": 1.9901038286826735, + "grad_norm": 0.6098949506619438, + "learning_rate": 1.2850472086322789e-06, + "loss": 0.5435, + "step": 12267 + }, + { + "epoch": 1.990266060999351, + "grad_norm": 0.6006816196197762, + "learning_rate": 1.2846740130642593e-06, + "loss": 0.5349, + "step": 12268 + }, + { + "epoch": 1.9904282933160284, + "grad_norm": 0.6160286993496936, + "learning_rate": 1.2843008529559034e-06, + "loss": 0.5113, + "step": 12269 + }, + { + "epoch": 1.9905905256327059, + "grad_norm": 0.620493617895582, + "learning_rate": 1.2839277283180989e-06, + "loss": 0.5043, + "step": 12270 + }, + { + "epoch": 1.9907527579493836, + "grad_norm": 0.6002464808736906, + "learning_rate": 1.2835546391617337e-06, + "loss": 0.506, + "step": 12271 + }, + { + "epoch": 1.990914990266061, + "grad_norm": 0.605955495153298, + "learning_rate": 1.283181585497692e-06, + "loss": 0.5288, + "step": 12272 + }, + { + "epoch": 1.9910772225827384, + "grad_norm": 0.6140610615997814, + "learning_rate": 1.2828085673368579e-06, + "loss": 0.5008, + "step": 12273 + }, + { + "epoch": 1.9912394548994161, + "grad_norm": 0.6348376908615125, + "learning_rate": 1.2824355846901158e-06, + "loss": 0.5059, + "step": 12274 + }, + { + "epoch": 1.9914016872160936, + "grad_norm": 0.5936123317350286, + "learning_rate": 1.282062637568348e-06, + "loss": 0.4925, + "step": 12275 + }, + { + "epoch": 1.991563919532771, + "grad_norm": 0.5987286764769923, + "learning_rate": 1.2816897259824371e-06, + "loss": 0.5081, + "step": 12276 + }, + { + "epoch": 1.9917261518494485, + "grad_norm": 0.5989222906135596, + "learning_rate": 1.2813168499432615e-06, + "loss": 0.5203, + "step": 12277 + }, + { + "epoch": 1.991888384166126, + "grad_norm": 0.6034843199881275, + "learning_rate": 1.2809440094617032e-06, + "loss": 0.5341, + "step": 12278 + }, + { + "epoch": 1.9920506164828033, + "grad_norm": 0.5905434452414595, + "learning_rate": 1.2805712045486379e-06, + "loss": 0.4941, + "step": 12279 + }, + { + "epoch": 1.9922128487994808, + "grad_norm": 0.6189775499006008, + "learning_rate": 1.2801984352149454e-06, + "loss": 0.5202, + "step": 12280 + }, + { + "epoch": 1.9923750811161582, + "grad_norm": 0.5995136032974391, + "learning_rate": 1.2798257014715002e-06, + "loss": 0.5175, + "step": 12281 + }, + { + "epoch": 1.9925373134328357, + "grad_norm": 0.6134502794364929, + "learning_rate": 1.279453003329178e-06, + "loss": 0.4928, + "step": 12282 + }, + { + "epoch": 1.9926995457495134, + "grad_norm": 0.5778965000959955, + "learning_rate": 1.279080340798854e-06, + "loss": 0.5043, + "step": 12283 + }, + { + "epoch": 1.9928617780661908, + "grad_norm": 0.6331597150697459, + "learning_rate": 1.2787077138914e-06, + "loss": 0.5196, + "step": 12284 + }, + { + "epoch": 1.9930240103828682, + "grad_norm": 0.623897404088266, + "learning_rate": 1.27833512261769e-06, + "loss": 0.5153, + "step": 12285 + }, + { + "epoch": 1.9931862426995457, + "grad_norm": 0.6380264513221583, + "learning_rate": 1.2779625669885925e-06, + "loss": 0.5275, + "step": 12286 + }, + { + "epoch": 1.9933484750162234, + "grad_norm": 0.5945959040088494, + "learning_rate": 1.2775900470149794e-06, + "loss": 0.4869, + "step": 12287 + }, + { + "epoch": 1.9935107073329008, + "grad_norm": 0.5997119808573458, + "learning_rate": 1.2772175627077204e-06, + "loss": 0.5083, + "step": 12288 + }, + { + "epoch": 1.9936729396495783, + "grad_norm": 0.6031160770783232, + "learning_rate": 1.2768451140776828e-06, + "loss": 0.4885, + "step": 12289 + }, + { + "epoch": 1.9938351719662557, + "grad_norm": 0.6029850942391554, + "learning_rate": 1.2764727011357319e-06, + "loss": 0.4657, + "step": 12290 + }, + { + "epoch": 1.9939974042829332, + "grad_norm": 0.5893136715367611, + "learning_rate": 1.276100323892735e-06, + "loss": 0.5031, + "step": 12291 + }, + { + "epoch": 1.9941596365996106, + "grad_norm": 0.6326015002012588, + "learning_rate": 1.2757279823595574e-06, + "loss": 0.485, + "step": 12292 + }, + { + "epoch": 1.994321868916288, + "grad_norm": 0.5900404225309828, + "learning_rate": 1.2753556765470633e-06, + "loss": 0.5072, + "step": 12293 + }, + { + "epoch": 1.9944841012329655, + "grad_norm": 0.5949772177111218, + "learning_rate": 1.2749834064661146e-06, + "loss": 0.5063, + "step": 12294 + }, + { + "epoch": 1.994646333549643, + "grad_norm": 0.6067598157159863, + "learning_rate": 1.2746111721275722e-06, + "loss": 0.5207, + "step": 12295 + }, + { + "epoch": 1.9948085658663206, + "grad_norm": 0.6267158166252739, + "learning_rate": 1.274238973542298e-06, + "loss": 0.5431, + "step": 12296 + }, + { + "epoch": 1.994970798182998, + "grad_norm": 0.5983408970656406, + "learning_rate": 1.2738668107211517e-06, + "loss": 0.511, + "step": 12297 + }, + { + "epoch": 1.9951330304996755, + "grad_norm": 0.5863222771626091, + "learning_rate": 1.273494683674992e-06, + "loss": 0.4988, + "step": 12298 + }, + { + "epoch": 1.9952952628163532, + "grad_norm": 0.635510143442739, + "learning_rate": 1.2731225924146768e-06, + "loss": 0.4905, + "step": 12299 + }, + { + "epoch": 1.9954574951330306, + "grad_norm": 0.5990290524901901, + "learning_rate": 1.2727505369510607e-06, + "loss": 0.5215, + "step": 12300 + }, + { + "epoch": 1.995619727449708, + "grad_norm": 0.5999498450430955, + "learning_rate": 1.2723785172950004e-06, + "loss": 0.5098, + "step": 12301 + }, + { + "epoch": 1.9957819597663855, + "grad_norm": 0.6482441677305656, + "learning_rate": 1.2720065334573518e-06, + "loss": 0.5242, + "step": 12302 + }, + { + "epoch": 1.995944192083063, + "grad_norm": 0.6037356036902171, + "learning_rate": 1.2716345854489664e-06, + "loss": 0.4983, + "step": 12303 + }, + { + "epoch": 1.9961064243997404, + "grad_norm": 0.6174782079954559, + "learning_rate": 1.2712626732806966e-06, + "loss": 0.5012, + "step": 12304 + }, + { + "epoch": 1.9962686567164178, + "grad_norm": 0.6163676386988722, + "learning_rate": 1.2708907969633937e-06, + "loss": 0.521, + "step": 12305 + }, + { + "epoch": 1.9964308890330953, + "grad_norm": 0.603900743872294, + "learning_rate": 1.270518956507909e-06, + "loss": 0.54, + "step": 12306 + }, + { + "epoch": 1.9965931213497727, + "grad_norm": 0.7466599703060305, + "learning_rate": 1.270147151925092e-06, + "loss": 0.5094, + "step": 12307 + }, + { + "epoch": 1.9967553536664504, + "grad_norm": 0.6232786395188505, + "learning_rate": 1.2697753832257903e-06, + "loss": 0.4639, + "step": 12308 + }, + { + "epoch": 1.9969175859831279, + "grad_norm": 0.5963753593137434, + "learning_rate": 1.2694036504208496e-06, + "loss": 0.4715, + "step": 12309 + }, + { + "epoch": 1.9970798182998053, + "grad_norm": 0.6415921538215029, + "learning_rate": 1.2690319535211171e-06, + "loss": 0.4895, + "step": 12310 + }, + { + "epoch": 1.997242050616483, + "grad_norm": 0.6094774822946936, + "learning_rate": 1.2686602925374381e-06, + "loss": 0.5228, + "step": 12311 + }, + { + "epoch": 1.9974042829331604, + "grad_norm": 0.6206474154855286, + "learning_rate": 1.2682886674806572e-06, + "loss": 0.5463, + "step": 12312 + }, + { + "epoch": 1.9975665152498379, + "grad_norm": 0.6099241221784393, + "learning_rate": 1.2679170783616162e-06, + "loss": 0.5139, + "step": 12313 + }, + { + "epoch": 1.9977287475665153, + "grad_norm": 0.5905560331099425, + "learning_rate": 1.267545525191157e-06, + "loss": 0.488, + "step": 12314 + }, + { + "epoch": 1.9978909798831928, + "grad_norm": 0.6425787465549403, + "learning_rate": 1.2671740079801204e-06, + "loss": 0.5168, + "step": 12315 + }, + { + "epoch": 1.9980532121998702, + "grad_norm": 0.5853081017090458, + "learning_rate": 1.2668025267393475e-06, + "loss": 0.5459, + "step": 12316 + }, + { + "epoch": 1.9982154445165476, + "grad_norm": 0.6142458717149804, + "learning_rate": 1.2664310814796754e-06, + "loss": 0.4969, + "step": 12317 + }, + { + "epoch": 1.998377676833225, + "grad_norm": 0.6061098504578213, + "learning_rate": 1.2660596722119428e-06, + "loss": 0.4993, + "step": 12318 + }, + { + "epoch": 1.9985399091499025, + "grad_norm": 0.6172084776750505, + "learning_rate": 1.2656882989469855e-06, + "loss": 0.5296, + "step": 12319 + }, + { + "epoch": 1.99870214146658, + "grad_norm": 0.5795375257996334, + "learning_rate": 1.2653169616956396e-06, + "loss": 0.5098, + "step": 12320 + }, + { + "epoch": 1.9988643737832577, + "grad_norm": 0.62967638605501, + "learning_rate": 1.2649456604687404e-06, + "loss": 0.5121, + "step": 12321 + }, + { + "epoch": 1.999026606099935, + "grad_norm": 0.6303679813348354, + "learning_rate": 1.2645743952771196e-06, + "loss": 0.5185, + "step": 12322 + }, + { + "epoch": 1.9991888384166125, + "grad_norm": 0.6621071202550411, + "learning_rate": 1.2642031661316109e-06, + "loss": 0.5108, + "step": 12323 + }, + { + "epoch": 1.9993510707332902, + "grad_norm": 0.5812209692778365, + "learning_rate": 1.2638319730430461e-06, + "loss": 0.5149, + "step": 12324 + }, + { + "epoch": 1.9995133030499677, + "grad_norm": 0.585303350559678, + "learning_rate": 1.263460816022255e-06, + "loss": 0.511, + "step": 12325 + }, + { + "epoch": 1.999675535366645, + "grad_norm": 0.5916944885777443, + "learning_rate": 1.2630896950800658e-06, + "loss": 0.506, + "step": 12326 + }, + { + "epoch": 1.9998377676833226, + "grad_norm": 0.6532068957199884, + "learning_rate": 1.2627186102273076e-06, + "loss": 0.5474, + "step": 12327 + }, + { + "epoch": 2.0, + "grad_norm": 0.6164232609805875, + "learning_rate": 1.2623475614748077e-06, + "loss": 0.5362, + "step": 12328 + } + ], + "logging_steps": 1, + "max_steps": 18492, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 6164, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1905623952261120.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}