{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997677659080353, "eval_steps": 500, "global_step": 3228, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009289363678588017, "grad_norm": 54.7475656620091, "learning_rate": 0.0, "loss": 11.0245, "step": 1 }, { "epoch": 0.0018578727357176034, "grad_norm": 56.51890067533405, "learning_rate": 1.5479876160990715e-07, "loss": 10.9563, "step": 2 }, { "epoch": 0.002786809103576405, "grad_norm": 52.84327216336746, "learning_rate": 3.095975232198143e-07, "loss": 11.1232, "step": 3 }, { "epoch": 0.0037157454714352067, "grad_norm": 54.47240324631095, "learning_rate": 4.6439628482972136e-07, "loss": 11.0395, "step": 4 }, { "epoch": 0.004644681839294009, "grad_norm": 56.06603126343985, "learning_rate": 6.191950464396286e-07, "loss": 10.9521, "step": 5 }, { "epoch": 0.00557361820715281, "grad_norm": 56.653051495890196, "learning_rate": 7.739938080495357e-07, "loss": 10.9283, "step": 6 }, { "epoch": 0.006502554575011612, "grad_norm": 57.19065708163452, "learning_rate": 9.287925696594427e-07, "loss": 10.9048, "step": 7 }, { "epoch": 0.0074314909428704135, "grad_norm": 57.13309141244447, "learning_rate": 1.08359133126935e-06, "loss": 10.8692, "step": 8 }, { "epoch": 0.008360427310729215, "grad_norm": 61.70021371119148, "learning_rate": 1.2383900928792572e-06, "loss": 10.7026, "step": 9 }, { "epoch": 0.009289363678588018, "grad_norm": 58.64471121402323, "learning_rate": 1.3931888544891641e-06, "loss": 10.7762, "step": 10 }, { "epoch": 0.010218300046446818, "grad_norm": 61.95985964008096, "learning_rate": 1.5479876160990713e-06, "loss": 10.6442, "step": 11 }, { "epoch": 0.01114723641430562, "grad_norm": 81.74822353424084, "learning_rate": 1.7027863777089783e-06, "loss": 9.3569, "step": 12 }, { "epoch": 0.012076172782164421, "grad_norm": 87.15614377730954, "learning_rate": 1.8575851393188855e-06, "loss": 9.3139, "step": 13 }, { "epoch": 0.013005109150023224, "grad_norm": 90.69218149138536, "learning_rate": 2.012383900928793e-06, "loss": 8.9405, "step": 14 }, { "epoch": 0.013934045517882025, "grad_norm": 96.84729927584988, "learning_rate": 2.1671826625387e-06, "loss": 8.6999, "step": 15 }, { "epoch": 0.014862981885740827, "grad_norm": 65.07865556439408, "learning_rate": 2.321981424148607e-06, "loss": 3.7051, "step": 16 }, { "epoch": 0.01579191825359963, "grad_norm": 62.52820085696381, "learning_rate": 2.4767801857585144e-06, "loss": 3.6981, "step": 17 }, { "epoch": 0.01672085462145843, "grad_norm": 53.649988186848155, "learning_rate": 2.631578947368421e-06, "loss": 3.2385, "step": 18 }, { "epoch": 0.01764979098931723, "grad_norm": 39.157669241392924, "learning_rate": 2.7863777089783283e-06, "loss": 2.6875, "step": 19 }, { "epoch": 0.018578727357176035, "grad_norm": 34.53162931711357, "learning_rate": 2.9411764705882355e-06, "loss": 2.4641, "step": 20 }, { "epoch": 0.019507663725034836, "grad_norm": 14.044732265304198, "learning_rate": 3.0959752321981426e-06, "loss": 1.7491, "step": 21 }, { "epoch": 0.020436600092893636, "grad_norm": 6.143568036385673, "learning_rate": 3.25077399380805e-06, "loss": 1.2988, "step": 22 }, { "epoch": 0.021365536460752437, "grad_norm": 5.2826561116659745, "learning_rate": 3.4055727554179566e-06, "loss": 1.3204, "step": 23 }, { "epoch": 0.02229447282861124, "grad_norm": 4.030771783214909, "learning_rate": 3.560371517027864e-06, "loss": 1.2404, "step": 24 }, { "epoch": 0.023223409196470042, "grad_norm": 3.426990839601739, "learning_rate": 3.715170278637771e-06, "loss": 1.1722, "step": 25 }, { "epoch": 0.024152345564328843, "grad_norm": 2.663561421971141, "learning_rate": 3.869969040247678e-06, "loss": 1.132, "step": 26 }, { "epoch": 0.025081281932187643, "grad_norm": 2.2095185737240914, "learning_rate": 4.024767801857586e-06, "loss": 1.0879, "step": 27 }, { "epoch": 0.026010218300046448, "grad_norm": 1.8901365750210222, "learning_rate": 4.1795665634674924e-06, "loss": 1.0258, "step": 28 }, { "epoch": 0.02693915466790525, "grad_norm": 1.3861813570839094, "learning_rate": 4.3343653250774e-06, "loss": 0.9735, "step": 29 }, { "epoch": 0.02786809103576405, "grad_norm": 1.5525651218677432, "learning_rate": 4.489164086687307e-06, "loss": 0.9165, "step": 30 }, { "epoch": 0.028797027403622853, "grad_norm": 4.777669808853715, "learning_rate": 4.643962848297214e-06, "loss": 0.8589, "step": 31 }, { "epoch": 0.029725963771481654, "grad_norm": 1.9514011751823228, "learning_rate": 4.798761609907121e-06, "loss": 0.8463, "step": 32 }, { "epoch": 0.030654900139340455, "grad_norm": 1.0398033405311462, "learning_rate": 4.953560371517029e-06, "loss": 0.8531, "step": 33 }, { "epoch": 0.03158383650719926, "grad_norm": 0.9239326446187678, "learning_rate": 5.1083591331269355e-06, "loss": 0.8295, "step": 34 }, { "epoch": 0.032512772875058056, "grad_norm": 0.8239281374239108, "learning_rate": 5.263157894736842e-06, "loss": 0.7953, "step": 35 }, { "epoch": 0.03344170924291686, "grad_norm": 0.7456489091955973, "learning_rate": 5.41795665634675e-06, "loss": 0.8011, "step": 36 }, { "epoch": 0.034370645610775664, "grad_norm": 0.7322576669859537, "learning_rate": 5.5727554179566566e-06, "loss": 0.7743, "step": 37 }, { "epoch": 0.03529958197863446, "grad_norm": 0.6841211213339884, "learning_rate": 5.727554179566564e-06, "loss": 0.7502, "step": 38 }, { "epoch": 0.036228518346493266, "grad_norm": 0.6392360133178303, "learning_rate": 5.882352941176471e-06, "loss": 0.7588, "step": 39 }, { "epoch": 0.03715745471435207, "grad_norm": 0.5816712224571956, "learning_rate": 6.0371517027863785e-06, "loss": 0.763, "step": 40 }, { "epoch": 0.03808639108221087, "grad_norm": 0.5551930154686257, "learning_rate": 6.191950464396285e-06, "loss": 0.7203, "step": 41 }, { "epoch": 0.03901532745006967, "grad_norm": 0.627393609542681, "learning_rate": 6.346749226006192e-06, "loss": 0.7372, "step": 42 }, { "epoch": 0.03994426381792847, "grad_norm": 0.5209393663178439, "learning_rate": 6.5015479876161e-06, "loss": 0.7142, "step": 43 }, { "epoch": 0.04087320018578727, "grad_norm": 0.4311011906767075, "learning_rate": 6.656346749226007e-06, "loss": 0.6497, "step": 44 }, { "epoch": 0.04180213655364608, "grad_norm": 0.44948642321517024, "learning_rate": 6.811145510835913e-06, "loss": 0.694, "step": 45 }, { "epoch": 0.042731072921504874, "grad_norm": 0.4845127976887666, "learning_rate": 6.965944272445821e-06, "loss": 0.7111, "step": 46 }, { "epoch": 0.04366000928936368, "grad_norm": 0.4680884482351343, "learning_rate": 7.120743034055728e-06, "loss": 0.6841, "step": 47 }, { "epoch": 0.04458894565722248, "grad_norm": 0.4268577390410502, "learning_rate": 7.275541795665634e-06, "loss": 0.6861, "step": 48 }, { "epoch": 0.04551788202508128, "grad_norm": 0.4221226832933112, "learning_rate": 7.430340557275542e-06, "loss": 0.6741, "step": 49 }, { "epoch": 0.046446818392940084, "grad_norm": 0.5220141311400619, "learning_rate": 7.585139318885449e-06, "loss": 0.669, "step": 50 }, { "epoch": 0.04737575476079889, "grad_norm": 0.42881334524580667, "learning_rate": 7.739938080495356e-06, "loss": 0.676, "step": 51 }, { "epoch": 0.048304691128657685, "grad_norm": 0.3524885307145976, "learning_rate": 7.894736842105263e-06, "loss": 0.6381, "step": 52 }, { "epoch": 0.04923362749651649, "grad_norm": 0.4114675079627211, "learning_rate": 8.049535603715171e-06, "loss": 0.6049, "step": 53 }, { "epoch": 0.05016256386437529, "grad_norm": 0.40656103184868714, "learning_rate": 8.204334365325078e-06, "loss": 0.6202, "step": 54 }, { "epoch": 0.05109150023223409, "grad_norm": 0.3732755562975793, "learning_rate": 8.359133126934985e-06, "loss": 0.6231, "step": 55 }, { "epoch": 0.052020436600092895, "grad_norm": 0.3799659025653529, "learning_rate": 8.513931888544892e-06, "loss": 0.6167, "step": 56 }, { "epoch": 0.05294937296795169, "grad_norm": 0.3548794527966943, "learning_rate": 8.6687306501548e-06, "loss": 0.6235, "step": 57 }, { "epoch": 0.0538783093358105, "grad_norm": 0.3366392962881869, "learning_rate": 8.823529411764707e-06, "loss": 0.6037, "step": 58 }, { "epoch": 0.0548072457036693, "grad_norm": 0.35622745848432813, "learning_rate": 8.978328173374614e-06, "loss": 0.6505, "step": 59 }, { "epoch": 0.0557361820715281, "grad_norm": 0.33139910543786816, "learning_rate": 9.13312693498452e-06, "loss": 0.5907, "step": 60 }, { "epoch": 0.0566651184393869, "grad_norm": 0.26984628140930006, "learning_rate": 9.287925696594429e-06, "loss": 0.6075, "step": 61 }, { "epoch": 0.057594054807245706, "grad_norm": 0.30168457074265415, "learning_rate": 9.442724458204334e-06, "loss": 0.5929, "step": 62 }, { "epoch": 0.058522991175104504, "grad_norm": 0.3514265855140124, "learning_rate": 9.597523219814242e-06, "loss": 0.5962, "step": 63 }, { "epoch": 0.05945192754296331, "grad_norm": 0.3551734532081626, "learning_rate": 9.752321981424149e-06, "loss": 0.6197, "step": 64 }, { "epoch": 0.06038086391082211, "grad_norm": 0.29947361957296, "learning_rate": 9.907120743034057e-06, "loss": 0.5796, "step": 65 }, { "epoch": 0.06130980027868091, "grad_norm": 0.31337803543321063, "learning_rate": 1.0061919504643963e-05, "loss": 0.5948, "step": 66 }, { "epoch": 0.062238736646539713, "grad_norm": 0.3215034540150923, "learning_rate": 1.0216718266253871e-05, "loss": 0.5481, "step": 67 }, { "epoch": 0.06316767301439852, "grad_norm": 0.3102955136307414, "learning_rate": 1.0371517027863778e-05, "loss": 0.5789, "step": 68 }, { "epoch": 0.06409660938225732, "grad_norm": 0.2573332075607664, "learning_rate": 1.0526315789473684e-05, "loss": 0.5656, "step": 69 }, { "epoch": 0.06502554575011611, "grad_norm": 0.2537807753587459, "learning_rate": 1.0681114551083591e-05, "loss": 0.5678, "step": 70 }, { "epoch": 0.06595448211797492, "grad_norm": 0.30023463727934896, "learning_rate": 1.08359133126935e-05, "loss": 0.572, "step": 71 }, { "epoch": 0.06688341848583372, "grad_norm": 0.2705492110612055, "learning_rate": 1.0990712074303406e-05, "loss": 0.5956, "step": 72 }, { "epoch": 0.06781235485369252, "grad_norm": 0.27304258159582684, "learning_rate": 1.1145510835913313e-05, "loss": 0.5943, "step": 73 }, { "epoch": 0.06874129122155133, "grad_norm": 0.27460235847783515, "learning_rate": 1.130030959752322e-05, "loss": 0.5738, "step": 74 }, { "epoch": 0.06967022758941012, "grad_norm": 0.24083051338698006, "learning_rate": 1.1455108359133128e-05, "loss": 0.5543, "step": 75 }, { "epoch": 0.07059916395726892, "grad_norm": 0.2862680829656088, "learning_rate": 1.1609907120743033e-05, "loss": 0.5574, "step": 76 }, { "epoch": 0.07152810032512773, "grad_norm": 0.2522441867822406, "learning_rate": 1.1764705882352942e-05, "loss": 0.5561, "step": 77 }, { "epoch": 0.07245703669298653, "grad_norm": 0.2731655705611119, "learning_rate": 1.1919504643962849e-05, "loss": 0.5736, "step": 78 }, { "epoch": 0.07338597306084534, "grad_norm": 0.2520420481239673, "learning_rate": 1.2074303405572757e-05, "loss": 0.5418, "step": 79 }, { "epoch": 0.07431490942870414, "grad_norm": 0.22930537041421725, "learning_rate": 1.2229102167182662e-05, "loss": 0.5171, "step": 80 }, { "epoch": 0.07524384579656293, "grad_norm": 0.273072858512794, "learning_rate": 1.238390092879257e-05, "loss": 0.5433, "step": 81 }, { "epoch": 0.07617278216442173, "grad_norm": 0.2891845927667174, "learning_rate": 1.2538699690402477e-05, "loss": 0.5779, "step": 82 }, { "epoch": 0.07710171853228054, "grad_norm": 0.2523018814544343, "learning_rate": 1.2693498452012384e-05, "loss": 0.5461, "step": 83 }, { "epoch": 0.07803065490013934, "grad_norm": 0.3166725101519938, "learning_rate": 1.2848297213622292e-05, "loss": 0.5643, "step": 84 }, { "epoch": 0.07895959126799815, "grad_norm": 0.2631824444127893, "learning_rate": 1.30030959752322e-05, "loss": 0.5138, "step": 85 }, { "epoch": 0.07988852763585694, "grad_norm": 0.3004458758805224, "learning_rate": 1.3157894736842106e-05, "loss": 0.551, "step": 86 }, { "epoch": 0.08081746400371574, "grad_norm": 0.27824957348028967, "learning_rate": 1.3312693498452014e-05, "loss": 0.5774, "step": 87 }, { "epoch": 0.08174640037157455, "grad_norm": 0.30038729524349966, "learning_rate": 1.346749226006192e-05, "loss": 0.5255, "step": 88 }, { "epoch": 0.08267533673943335, "grad_norm": 0.2942525266347019, "learning_rate": 1.3622291021671826e-05, "loss": 0.5569, "step": 89 }, { "epoch": 0.08360427310729215, "grad_norm": 0.29120858799373767, "learning_rate": 1.3777089783281735e-05, "loss": 0.5224, "step": 90 }, { "epoch": 0.08453320947515096, "grad_norm": 0.3596968247905661, "learning_rate": 1.3931888544891641e-05, "loss": 0.5199, "step": 91 }, { "epoch": 0.08546214584300975, "grad_norm": 0.2812166695733365, "learning_rate": 1.4086687306501548e-05, "loss": 0.5363, "step": 92 }, { "epoch": 0.08639108221086855, "grad_norm": 0.2992496470672875, "learning_rate": 1.4241486068111457e-05, "loss": 0.5634, "step": 93 }, { "epoch": 0.08732001857872736, "grad_norm": 0.29257796169998596, "learning_rate": 1.4396284829721363e-05, "loss": 0.5527, "step": 94 }, { "epoch": 0.08824895494658616, "grad_norm": 0.2732189071664008, "learning_rate": 1.4551083591331268e-05, "loss": 0.5026, "step": 95 }, { "epoch": 0.08917789131444497, "grad_norm": 0.27378573341841284, "learning_rate": 1.4705882352941177e-05, "loss": 0.575, "step": 96 }, { "epoch": 0.09010682768230376, "grad_norm": 0.2917134046408098, "learning_rate": 1.4860681114551084e-05, "loss": 0.5561, "step": 97 }, { "epoch": 0.09103576405016256, "grad_norm": 0.26829458430398617, "learning_rate": 1.5015479876160992e-05, "loss": 0.5082, "step": 98 }, { "epoch": 0.09196470041802136, "grad_norm": 0.34740553302312405, "learning_rate": 1.5170278637770899e-05, "loss": 0.5794, "step": 99 }, { "epoch": 0.09289363678588017, "grad_norm": 0.2936917575237726, "learning_rate": 1.5325077399380806e-05, "loss": 0.548, "step": 100 }, { "epoch": 0.09382257315373897, "grad_norm": 0.3117382936574086, "learning_rate": 1.5479876160990712e-05, "loss": 0.5368, "step": 101 }, { "epoch": 0.09475150952159778, "grad_norm": 0.22828777606096362, "learning_rate": 1.563467492260062e-05, "loss": 0.5314, "step": 102 }, { "epoch": 0.09568044588945657, "grad_norm": 0.2771765662389807, "learning_rate": 1.5789473684210526e-05, "loss": 0.5482, "step": 103 }, { "epoch": 0.09660938225731537, "grad_norm": 0.2789355900210165, "learning_rate": 1.5944272445820436e-05, "loss": 0.5214, "step": 104 }, { "epoch": 0.09753831862517418, "grad_norm": 0.29645303240245474, "learning_rate": 1.6099071207430343e-05, "loss": 0.5403, "step": 105 }, { "epoch": 0.09846725499303298, "grad_norm": 0.2501658792361872, "learning_rate": 1.6253869969040246e-05, "loss": 0.5125, "step": 106 }, { "epoch": 0.09939619136089178, "grad_norm": 0.27343053126969535, "learning_rate": 1.6408668730650156e-05, "loss": 0.4957, "step": 107 }, { "epoch": 0.10032512772875057, "grad_norm": 0.2578817321890831, "learning_rate": 1.6563467492260063e-05, "loss": 0.5118, "step": 108 }, { "epoch": 0.10125406409660938, "grad_norm": 0.26242875789808284, "learning_rate": 1.671826625386997e-05, "loss": 0.5009, "step": 109 }, { "epoch": 0.10218300046446818, "grad_norm": 0.27788258841659896, "learning_rate": 1.6873065015479876e-05, "loss": 0.5152, "step": 110 }, { "epoch": 0.10311193683232699, "grad_norm": 0.32503303320367943, "learning_rate": 1.7027863777089783e-05, "loss": 0.5675, "step": 111 }, { "epoch": 0.10404087320018579, "grad_norm": 0.25485198676459764, "learning_rate": 1.7182662538699693e-05, "loss": 0.4957, "step": 112 }, { "epoch": 0.1049698095680446, "grad_norm": 0.33134645178022537, "learning_rate": 1.73374613003096e-05, "loss": 0.5523, "step": 113 }, { "epoch": 0.10589874593590339, "grad_norm": 0.29136775834871204, "learning_rate": 1.7492260061919503e-05, "loss": 0.5432, "step": 114 }, { "epoch": 0.10682768230376219, "grad_norm": 0.29657828249266954, "learning_rate": 1.7647058823529414e-05, "loss": 0.5074, "step": 115 }, { "epoch": 0.107756618671621, "grad_norm": 0.2694020146187824, "learning_rate": 1.780185758513932e-05, "loss": 0.5159, "step": 116 }, { "epoch": 0.1086855550394798, "grad_norm": 0.32008332295676645, "learning_rate": 1.7956656346749227e-05, "loss": 0.5078, "step": 117 }, { "epoch": 0.1096144914073386, "grad_norm": 0.30370374026876107, "learning_rate": 1.8111455108359134e-05, "loss": 0.5251, "step": 118 }, { "epoch": 0.11054342777519739, "grad_norm": 0.2540384388820426, "learning_rate": 1.826625386996904e-05, "loss": 0.4889, "step": 119 }, { "epoch": 0.1114723641430562, "grad_norm": 0.2817516173937527, "learning_rate": 1.8421052631578947e-05, "loss": 0.5096, "step": 120 }, { "epoch": 0.112401300510915, "grad_norm": 0.28087162622672707, "learning_rate": 1.8575851393188857e-05, "loss": 0.5184, "step": 121 }, { "epoch": 0.1133302368787738, "grad_norm": 0.2638926817448589, "learning_rate": 1.873065015479876e-05, "loss": 0.5073, "step": 122 }, { "epoch": 0.11425917324663261, "grad_norm": 0.27924441224774316, "learning_rate": 1.8885448916408668e-05, "loss": 0.5045, "step": 123 }, { "epoch": 0.11518810961449141, "grad_norm": 0.3038546535105753, "learning_rate": 1.9040247678018578e-05, "loss": 0.5647, "step": 124 }, { "epoch": 0.1161170459823502, "grad_norm": 0.28072761267618945, "learning_rate": 1.9195046439628485e-05, "loss": 0.5041, "step": 125 }, { "epoch": 0.11704598235020901, "grad_norm": 0.30934280983091644, "learning_rate": 1.934984520123839e-05, "loss": 0.5189, "step": 126 }, { "epoch": 0.11797491871806781, "grad_norm": 0.3017588490881342, "learning_rate": 1.9504643962848298e-05, "loss": 0.5113, "step": 127 }, { "epoch": 0.11890385508592662, "grad_norm": 0.2595245492733277, "learning_rate": 1.9659442724458205e-05, "loss": 0.5396, "step": 128 }, { "epoch": 0.11983279145378542, "grad_norm": 0.3188023407278314, "learning_rate": 1.9814241486068115e-05, "loss": 0.4933, "step": 129 }, { "epoch": 0.12076172782164422, "grad_norm": 0.2709603354646054, "learning_rate": 1.9969040247678018e-05, "loss": 0.5159, "step": 130 }, { "epoch": 0.12169066418950301, "grad_norm": 0.28324770317715753, "learning_rate": 2.0123839009287925e-05, "loss": 0.4932, "step": 131 }, { "epoch": 0.12261960055736182, "grad_norm": 0.27619503735171647, "learning_rate": 2.0278637770897835e-05, "loss": 0.5303, "step": 132 }, { "epoch": 0.12354853692522062, "grad_norm": 0.26401711089649377, "learning_rate": 2.0433436532507742e-05, "loss": 0.4868, "step": 133 }, { "epoch": 0.12447747329307943, "grad_norm": 0.2593772391982529, "learning_rate": 2.058823529411765e-05, "loss": 0.4929, "step": 134 }, { "epoch": 0.12540640966093822, "grad_norm": 0.28933660519338583, "learning_rate": 2.0743034055727555e-05, "loss": 0.476, "step": 135 }, { "epoch": 0.12633534602879704, "grad_norm": 0.23316118822530046, "learning_rate": 2.0897832817337462e-05, "loss": 0.4568, "step": 136 }, { "epoch": 0.12726428239665583, "grad_norm": 0.31249950051428027, "learning_rate": 2.105263157894737e-05, "loss": 0.5187, "step": 137 }, { "epoch": 0.12819321876451464, "grad_norm": 0.29165292379136437, "learning_rate": 2.1207430340557276e-05, "loss": 0.5117, "step": 138 }, { "epoch": 0.12912215513237343, "grad_norm": 0.26649297211443057, "learning_rate": 2.1362229102167182e-05, "loss": 0.479, "step": 139 }, { "epoch": 0.13005109150023222, "grad_norm": 0.3142784146098132, "learning_rate": 2.151702786377709e-05, "loss": 0.5216, "step": 140 }, { "epoch": 0.13098002786809104, "grad_norm": 0.29423375787358585, "learning_rate": 2.1671826625387e-05, "loss": 0.4994, "step": 141 }, { "epoch": 0.13190896423594983, "grad_norm": 0.28110784066712485, "learning_rate": 2.1826625386996906e-05, "loss": 0.4734, "step": 142 }, { "epoch": 0.13283790060380865, "grad_norm": 0.29325368926120227, "learning_rate": 2.1981424148606813e-05, "loss": 0.4963, "step": 143 }, { "epoch": 0.13376683697166744, "grad_norm": 0.29510748124918773, "learning_rate": 2.213622291021672e-05, "loss": 0.4982, "step": 144 }, { "epoch": 0.13469577333952623, "grad_norm": 0.26461232115849287, "learning_rate": 2.2291021671826626e-05, "loss": 0.5166, "step": 145 }, { "epoch": 0.13562470970738505, "grad_norm": 0.31236587334144944, "learning_rate": 2.2445820433436533e-05, "loss": 0.5312, "step": 146 }, { "epoch": 0.13655364607524384, "grad_norm": 0.300081395956833, "learning_rate": 2.260061919504644e-05, "loss": 0.5129, "step": 147 }, { "epoch": 0.13748258244310266, "grad_norm": 0.31610036230407396, "learning_rate": 2.2755417956656347e-05, "loss": 0.4776, "step": 148 }, { "epoch": 0.13841151881096145, "grad_norm": 0.30145949638272806, "learning_rate": 2.2910216718266257e-05, "loss": 0.4902, "step": 149 }, { "epoch": 0.13934045517882024, "grad_norm": 0.27339946306277607, "learning_rate": 2.3065015479876163e-05, "loss": 0.464, "step": 150 }, { "epoch": 0.14026939154667906, "grad_norm": 0.3156892182016014, "learning_rate": 2.3219814241486067e-05, "loss": 0.495, "step": 151 }, { "epoch": 0.14119832791453785, "grad_norm": 0.3687764130938456, "learning_rate": 2.3374613003095977e-05, "loss": 0.4868, "step": 152 }, { "epoch": 0.14212726428239666, "grad_norm": 0.2800596635623465, "learning_rate": 2.3529411764705884e-05, "loss": 0.4739, "step": 153 }, { "epoch": 0.14305620065025546, "grad_norm": 0.3992825928546232, "learning_rate": 2.368421052631579e-05, "loss": 0.4913, "step": 154 }, { "epoch": 0.14398513701811425, "grad_norm": 0.3239223108616911, "learning_rate": 2.3839009287925697e-05, "loss": 0.4734, "step": 155 }, { "epoch": 0.14491407338597306, "grad_norm": 0.3963740287758293, "learning_rate": 2.3993808049535604e-05, "loss": 0.5013, "step": 156 }, { "epoch": 0.14584300975383185, "grad_norm": 0.36982949622797573, "learning_rate": 2.4148606811145514e-05, "loss": 0.5329, "step": 157 }, { "epoch": 0.14677194612169067, "grad_norm": 0.3446342173260943, "learning_rate": 2.430340557275542e-05, "loss": 0.4617, "step": 158 }, { "epoch": 0.14770088248954946, "grad_norm": 0.3726891488879446, "learning_rate": 2.4458204334365324e-05, "loss": 0.5158, "step": 159 }, { "epoch": 0.14862981885740828, "grad_norm": 0.34306897903348194, "learning_rate": 2.4613003095975234e-05, "loss": 0.5011, "step": 160 }, { "epoch": 0.14955875522526707, "grad_norm": 0.3509672474944589, "learning_rate": 2.476780185758514e-05, "loss": 0.5179, "step": 161 }, { "epoch": 0.15048769159312586, "grad_norm": 0.36888921482695486, "learning_rate": 2.4922600619195048e-05, "loss": 0.488, "step": 162 }, { "epoch": 0.15141662796098468, "grad_norm": 0.2879360129205803, "learning_rate": 2.5077399380804955e-05, "loss": 0.4818, "step": 163 }, { "epoch": 0.15234556432884347, "grad_norm": 0.3541612368911695, "learning_rate": 2.5232198142414865e-05, "loss": 0.4645, "step": 164 }, { "epoch": 0.1532745006967023, "grad_norm": 0.35885056781188374, "learning_rate": 2.5386996904024768e-05, "loss": 0.5074, "step": 165 }, { "epoch": 0.15420343706456108, "grad_norm": 0.34753901489218775, "learning_rate": 2.5541795665634678e-05, "loss": 0.5209, "step": 166 }, { "epoch": 0.15513237343241987, "grad_norm": 0.3353853516137484, "learning_rate": 2.5696594427244585e-05, "loss": 0.5021, "step": 167 }, { "epoch": 0.15606130980027869, "grad_norm": 0.3298140566724545, "learning_rate": 2.585139318885449e-05, "loss": 0.5159, "step": 168 }, { "epoch": 0.15699024616813748, "grad_norm": 0.3391997674833228, "learning_rate": 2.60061919504644e-05, "loss": 0.4754, "step": 169 }, { "epoch": 0.1579191825359963, "grad_norm": 0.28704771216792646, "learning_rate": 2.616099071207431e-05, "loss": 0.4703, "step": 170 }, { "epoch": 0.15884811890385508, "grad_norm": 0.37267201290204965, "learning_rate": 2.6315789473684212e-05, "loss": 0.5214, "step": 171 }, { "epoch": 0.15977705527171387, "grad_norm": 0.27902165277762125, "learning_rate": 2.647058823529412e-05, "loss": 0.4699, "step": 172 }, { "epoch": 0.1607059916395727, "grad_norm": 0.3398458423351264, "learning_rate": 2.662538699690403e-05, "loss": 0.5197, "step": 173 }, { "epoch": 0.16163492800743148, "grad_norm": 0.3435324805081321, "learning_rate": 2.6780185758513932e-05, "loss": 0.4789, "step": 174 }, { "epoch": 0.1625638643752903, "grad_norm": 0.4351648566726122, "learning_rate": 2.693498452012384e-05, "loss": 0.4987, "step": 175 }, { "epoch": 0.1634928007431491, "grad_norm": 0.3198149348956976, "learning_rate": 2.708978328173375e-05, "loss": 0.475, "step": 176 }, { "epoch": 0.1644217371110079, "grad_norm": 0.3950400431692381, "learning_rate": 2.7244582043343652e-05, "loss": 0.4744, "step": 177 }, { "epoch": 0.1653506734788667, "grad_norm": 0.3760190425753378, "learning_rate": 2.7399380804953563e-05, "loss": 0.4938, "step": 178 }, { "epoch": 0.1662796098467255, "grad_norm": 0.417176002219102, "learning_rate": 2.755417956656347e-05, "loss": 0.513, "step": 179 }, { "epoch": 0.1672085462145843, "grad_norm": 0.3474000705492875, "learning_rate": 2.7708978328173373e-05, "loss": 0.464, "step": 180 }, { "epoch": 0.1681374825824431, "grad_norm": 0.35375534821068394, "learning_rate": 2.7863777089783283e-05, "loss": 0.5123, "step": 181 }, { "epoch": 0.16906641895030192, "grad_norm": 0.4153734241752173, "learning_rate": 2.8018575851393193e-05, "loss": 0.4786, "step": 182 }, { "epoch": 0.1699953553181607, "grad_norm": 0.3633545185465792, "learning_rate": 2.8173374613003096e-05, "loss": 0.4881, "step": 183 }, { "epoch": 0.1709242916860195, "grad_norm": 0.4177673269252791, "learning_rate": 2.8328173374613003e-05, "loss": 0.4851, "step": 184 }, { "epoch": 0.17185322805387832, "grad_norm": 0.34872663223370376, "learning_rate": 2.8482972136222913e-05, "loss": 0.4562, "step": 185 }, { "epoch": 0.1727821644217371, "grad_norm": 0.3826892295449655, "learning_rate": 2.8637770897832817e-05, "loss": 0.4704, "step": 186 }, { "epoch": 0.17371110078959592, "grad_norm": 0.2911667819931114, "learning_rate": 2.8792569659442727e-05, "loss": 0.4647, "step": 187 }, { "epoch": 0.17464003715745471, "grad_norm": 0.3577750615715225, "learning_rate": 2.8947368421052634e-05, "loss": 0.4758, "step": 188 }, { "epoch": 0.1755689735253135, "grad_norm": 0.28796027674445784, "learning_rate": 2.9102167182662537e-05, "loss": 0.4934, "step": 189 }, { "epoch": 0.17649790989317232, "grad_norm": 0.328868526631627, "learning_rate": 2.9256965944272447e-05, "loss": 0.4989, "step": 190 }, { "epoch": 0.1774268462610311, "grad_norm": 0.30926168846748986, "learning_rate": 2.9411764705882354e-05, "loss": 0.4641, "step": 191 }, { "epoch": 0.17835578262888993, "grad_norm": 0.3990926605295306, "learning_rate": 2.9566563467492264e-05, "loss": 0.5012, "step": 192 }, { "epoch": 0.17928471899674872, "grad_norm": 0.2967167603656067, "learning_rate": 2.9721362229102167e-05, "loss": 0.4804, "step": 193 }, { "epoch": 0.1802136553646075, "grad_norm": 0.334297436056706, "learning_rate": 2.9876160990712077e-05, "loss": 0.4589, "step": 194 }, { "epoch": 0.18114259173246633, "grad_norm": 0.39015351822564853, "learning_rate": 3.0030959752321984e-05, "loss": 0.5029, "step": 195 }, { "epoch": 0.18207152810032512, "grad_norm": 0.3166981562612517, "learning_rate": 3.0185758513931888e-05, "loss": 0.4826, "step": 196 }, { "epoch": 0.18300046446818394, "grad_norm": 0.3697612896094445, "learning_rate": 3.0340557275541798e-05, "loss": 0.4834, "step": 197 }, { "epoch": 0.18392940083604273, "grad_norm": 0.3931362669456308, "learning_rate": 3.0495356037151708e-05, "loss": 0.4827, "step": 198 }, { "epoch": 0.18485833720390155, "grad_norm": 0.37542137369737677, "learning_rate": 3.065015479876161e-05, "loss": 0.5137, "step": 199 }, { "epoch": 0.18578727357176034, "grad_norm": 0.40365155511717965, "learning_rate": 3.080495356037152e-05, "loss": 0.4616, "step": 200 }, { "epoch": 0.18671620993961913, "grad_norm": 0.3462177469197346, "learning_rate": 3.0959752321981425e-05, "loss": 0.5022, "step": 201 }, { "epoch": 0.18764514630747794, "grad_norm": 0.4146922057088302, "learning_rate": 3.111455108359133e-05, "loss": 0.463, "step": 202 }, { "epoch": 0.18857408267533673, "grad_norm": 0.34021579524785434, "learning_rate": 3.126934984520124e-05, "loss": 0.4588, "step": 203 }, { "epoch": 0.18950301904319555, "grad_norm": 0.37786067260822703, "learning_rate": 3.142414860681115e-05, "loss": 0.4759, "step": 204 }, { "epoch": 0.19043195541105434, "grad_norm": 0.3871068334106478, "learning_rate": 3.157894736842105e-05, "loss": 0.4702, "step": 205 }, { "epoch": 0.19136089177891313, "grad_norm": 0.30844377564643344, "learning_rate": 3.173374613003096e-05, "loss": 0.4506, "step": 206 }, { "epoch": 0.19228982814677195, "grad_norm": 0.387790224919225, "learning_rate": 3.188854489164087e-05, "loss": 0.4554, "step": 207 }, { "epoch": 0.19321876451463074, "grad_norm": 0.3486750036808893, "learning_rate": 3.204334365325077e-05, "loss": 0.4735, "step": 208 }, { "epoch": 0.19414770088248956, "grad_norm": 0.36108244308720244, "learning_rate": 3.2198142414860685e-05, "loss": 0.4516, "step": 209 }, { "epoch": 0.19507663725034835, "grad_norm": 0.34168388969435387, "learning_rate": 3.235294117647059e-05, "loss": 0.4716, "step": 210 }, { "epoch": 0.19600557361820714, "grad_norm": 0.3336301263066555, "learning_rate": 3.250773993808049e-05, "loss": 0.462, "step": 211 }, { "epoch": 0.19693450998606596, "grad_norm": 0.30575256557626257, "learning_rate": 3.2662538699690406e-05, "loss": 0.433, "step": 212 }, { "epoch": 0.19786344635392475, "grad_norm": 0.31175854650330226, "learning_rate": 3.281733746130031e-05, "loss": 0.4831, "step": 213 }, { "epoch": 0.19879238272178357, "grad_norm": 0.3195577431584941, "learning_rate": 3.297213622291022e-05, "loss": 0.4939, "step": 214 }, { "epoch": 0.19972131908964236, "grad_norm": 0.324334575666152, "learning_rate": 3.3126934984520126e-05, "loss": 0.4674, "step": 215 }, { "epoch": 0.20065025545750115, "grad_norm": 0.30930275701374427, "learning_rate": 3.328173374613003e-05, "loss": 0.4861, "step": 216 }, { "epoch": 0.20157919182535997, "grad_norm": 0.3196175745480417, "learning_rate": 3.343653250773994e-05, "loss": 0.4785, "step": 217 }, { "epoch": 0.20250812819321876, "grad_norm": 0.3460990192307274, "learning_rate": 3.3591331269349846e-05, "loss": 0.4843, "step": 218 }, { "epoch": 0.20343706456107757, "grad_norm": 0.3189777288334882, "learning_rate": 3.374613003095975e-05, "loss": 0.5066, "step": 219 }, { "epoch": 0.20436600092893636, "grad_norm": 0.36563447762228074, "learning_rate": 3.390092879256966e-05, "loss": 0.4544, "step": 220 }, { "epoch": 0.20529493729679518, "grad_norm": 0.3437365831635904, "learning_rate": 3.4055727554179566e-05, "loss": 0.4644, "step": 221 }, { "epoch": 0.20622387366465397, "grad_norm": 0.3568648135029391, "learning_rate": 3.421052631578947e-05, "loss": 0.4934, "step": 222 }, { "epoch": 0.20715281003251276, "grad_norm": 0.40598865357747305, "learning_rate": 3.436532507739939e-05, "loss": 0.4913, "step": 223 }, { "epoch": 0.20808174640037158, "grad_norm": 0.4241132879182254, "learning_rate": 3.452012383900929e-05, "loss": 0.4535, "step": 224 }, { "epoch": 0.20901068276823037, "grad_norm": 0.35305194926225575, "learning_rate": 3.46749226006192e-05, "loss": 0.4682, "step": 225 }, { "epoch": 0.2099396191360892, "grad_norm": 0.618623679289102, "learning_rate": 3.482972136222911e-05, "loss": 0.4818, "step": 226 }, { "epoch": 0.21086855550394798, "grad_norm": 0.5920146651422737, "learning_rate": 3.498452012383901e-05, "loss": 0.4841, "step": 227 }, { "epoch": 0.21179749187180677, "grad_norm": 0.527405882028656, "learning_rate": 3.513931888544892e-05, "loss": 0.4834, "step": 228 }, { "epoch": 0.2127264282396656, "grad_norm": 0.8374512884655507, "learning_rate": 3.529411764705883e-05, "loss": 0.4914, "step": 229 }, { "epoch": 0.21365536460752438, "grad_norm": 0.7597460219900155, "learning_rate": 3.5448916408668734e-05, "loss": 0.4609, "step": 230 }, { "epoch": 0.2145843009753832, "grad_norm": 0.5139866484545614, "learning_rate": 3.560371517027864e-05, "loss": 0.457, "step": 231 }, { "epoch": 0.215513237343242, "grad_norm": 1.235134692869283, "learning_rate": 3.575851393188855e-05, "loss": 0.5001, "step": 232 }, { "epoch": 0.21644217371110078, "grad_norm": 0.41145239593782507, "learning_rate": 3.5913312693498454e-05, "loss": 0.4375, "step": 233 }, { "epoch": 0.2173711100789596, "grad_norm": 0.6816112033947701, "learning_rate": 3.606811145510836e-05, "loss": 0.4771, "step": 234 }, { "epoch": 0.21830004644681839, "grad_norm": 0.4343968108259159, "learning_rate": 3.622291021671827e-05, "loss": 0.4565, "step": 235 }, { "epoch": 0.2192289828146772, "grad_norm": 0.6469045830088757, "learning_rate": 3.6377708978328174e-05, "loss": 0.4705, "step": 236 }, { "epoch": 0.220157919182536, "grad_norm": 0.43542141035499127, "learning_rate": 3.653250773993808e-05, "loss": 0.4497, "step": 237 }, { "epoch": 0.22108685555039478, "grad_norm": 0.5546407234766495, "learning_rate": 3.668730650154799e-05, "loss": 0.4678, "step": 238 }, { "epoch": 0.2220157919182536, "grad_norm": 0.4764192191948784, "learning_rate": 3.6842105263157895e-05, "loss": 0.475, "step": 239 }, { "epoch": 0.2229447282861124, "grad_norm": 0.43770272881976247, "learning_rate": 3.69969040247678e-05, "loss": 0.4427, "step": 240 }, { "epoch": 0.2238736646539712, "grad_norm": 0.4550187245086204, "learning_rate": 3.7151702786377715e-05, "loss": 0.4414, "step": 241 }, { "epoch": 0.22480260102183, "grad_norm": 0.5330169483676697, "learning_rate": 3.7306501547987615e-05, "loss": 0.4819, "step": 242 }, { "epoch": 0.22573153738968882, "grad_norm": 0.4355552273324838, "learning_rate": 3.746130030959752e-05, "loss": 0.473, "step": 243 }, { "epoch": 0.2266604737575476, "grad_norm": 0.529146441551152, "learning_rate": 3.7616099071207435e-05, "loss": 0.5233, "step": 244 }, { "epoch": 0.2275894101254064, "grad_norm": 0.42768230047362027, "learning_rate": 3.7770897832817335e-05, "loss": 0.4652, "step": 245 }, { "epoch": 0.22851834649326522, "grad_norm": 0.498811282728108, "learning_rate": 3.792569659442725e-05, "loss": 0.4983, "step": 246 }, { "epoch": 0.229447282861124, "grad_norm": 0.4438510879340567, "learning_rate": 3.8080495356037155e-05, "loss": 0.4366, "step": 247 }, { "epoch": 0.23037621922898283, "grad_norm": 0.455953819278577, "learning_rate": 3.8235294117647055e-05, "loss": 0.4506, "step": 248 }, { "epoch": 0.23130515559684162, "grad_norm": 0.40125931844226015, "learning_rate": 3.839009287925697e-05, "loss": 0.4626, "step": 249 }, { "epoch": 0.2322340919647004, "grad_norm": 0.3539141975061322, "learning_rate": 3.8544891640866876e-05, "loss": 0.4499, "step": 250 }, { "epoch": 0.23316302833255922, "grad_norm": 0.5042429216492247, "learning_rate": 3.869969040247678e-05, "loss": 0.4647, "step": 251 }, { "epoch": 0.23409196470041801, "grad_norm": 0.3535166606707465, "learning_rate": 3.885448916408669e-05, "loss": 0.4746, "step": 252 }, { "epoch": 0.23502090106827683, "grad_norm": 0.5056187976580123, "learning_rate": 3.9009287925696596e-05, "loss": 0.4405, "step": 253 }, { "epoch": 0.23594983743613562, "grad_norm": 0.33101182681558045, "learning_rate": 3.91640866873065e-05, "loss": 0.4589, "step": 254 }, { "epoch": 0.2368787738039944, "grad_norm": 0.4759625175194635, "learning_rate": 3.931888544891641e-05, "loss": 0.4532, "step": 255 }, { "epoch": 0.23780771017185323, "grad_norm": 0.3870086650346993, "learning_rate": 3.9473684210526316e-05, "loss": 0.4468, "step": 256 }, { "epoch": 0.23873664653971202, "grad_norm": 0.408444973925042, "learning_rate": 3.962848297213623e-05, "loss": 0.4646, "step": 257 }, { "epoch": 0.23966558290757084, "grad_norm": 0.44744012598105387, "learning_rate": 3.978328173374613e-05, "loss": 0.455, "step": 258 }, { "epoch": 0.24059451927542963, "grad_norm": 0.37785914884066873, "learning_rate": 3.9938080495356037e-05, "loss": 0.4874, "step": 259 }, { "epoch": 0.24152345564328845, "grad_norm": 0.476511379545392, "learning_rate": 4.009287925696595e-05, "loss": 0.4546, "step": 260 }, { "epoch": 0.24245239201114724, "grad_norm": 0.31290275808884926, "learning_rate": 4.024767801857585e-05, "loss": 0.4567, "step": 261 }, { "epoch": 0.24338132837900603, "grad_norm": 0.46969828490127746, "learning_rate": 4.0402476780185764e-05, "loss": 0.4921, "step": 262 }, { "epoch": 0.24431026474686485, "grad_norm": 0.29314739835009784, "learning_rate": 4.055727554179567e-05, "loss": 0.4263, "step": 263 }, { "epoch": 0.24523920111472364, "grad_norm": 0.4260667269541755, "learning_rate": 4.071207430340557e-05, "loss": 0.4473, "step": 264 }, { "epoch": 0.24616813748258246, "grad_norm": 0.29731013199401535, "learning_rate": 4.0866873065015484e-05, "loss": 0.4657, "step": 265 }, { "epoch": 0.24709707385044125, "grad_norm": 0.38120279832509496, "learning_rate": 4.102167182662539e-05, "loss": 0.4368, "step": 266 }, { "epoch": 0.24802601021830004, "grad_norm": 0.3167100904137455, "learning_rate": 4.11764705882353e-05, "loss": 0.4562, "step": 267 }, { "epoch": 0.24895494658615885, "grad_norm": 0.4814669900513275, "learning_rate": 4.1331269349845204e-05, "loss": 0.4461, "step": 268 }, { "epoch": 0.24988388295401764, "grad_norm": 0.32074498184201283, "learning_rate": 4.148606811145511e-05, "loss": 0.4613, "step": 269 }, { "epoch": 0.25081281932187643, "grad_norm": 0.4084599440557474, "learning_rate": 4.164086687306502e-05, "loss": 0.47, "step": 270 }, { "epoch": 0.2517417556897353, "grad_norm": 0.32165189918387554, "learning_rate": 4.1795665634674924e-05, "loss": 0.4814, "step": 271 }, { "epoch": 0.25267069205759407, "grad_norm": 0.32816460839997996, "learning_rate": 4.195046439628483e-05, "loss": 0.4256, "step": 272 }, { "epoch": 0.25359962842545286, "grad_norm": 0.3008939493300022, "learning_rate": 4.210526315789474e-05, "loss": 0.473, "step": 273 }, { "epoch": 0.25452856479331165, "grad_norm": 0.3529450610360616, "learning_rate": 4.2260061919504645e-05, "loss": 0.4547, "step": 274 }, { "epoch": 0.25545750116117044, "grad_norm": 0.2972895680132811, "learning_rate": 4.241486068111455e-05, "loss": 0.4539, "step": 275 }, { "epoch": 0.2563864375290293, "grad_norm": 0.30425905871603104, "learning_rate": 4.256965944272446e-05, "loss": 0.4468, "step": 276 }, { "epoch": 0.2573153738968881, "grad_norm": 0.314584382041487, "learning_rate": 4.2724458204334365e-05, "loss": 0.4952, "step": 277 }, { "epoch": 0.25824431026474687, "grad_norm": 0.3160101237954399, "learning_rate": 4.287925696594428e-05, "loss": 0.4654, "step": 278 }, { "epoch": 0.25917324663260566, "grad_norm": 0.31709523022756986, "learning_rate": 4.303405572755418e-05, "loss": 0.4673, "step": 279 }, { "epoch": 0.26010218300046445, "grad_norm": 0.2980363449021666, "learning_rate": 4.3188854489164085e-05, "loss": 0.4187, "step": 280 }, { "epoch": 0.2610311193683233, "grad_norm": 0.33880341972476546, "learning_rate": 4.3343653250774e-05, "loss": 0.477, "step": 281 }, { "epoch": 0.2619600557361821, "grad_norm": 0.35521193406469326, "learning_rate": 4.3498452012383905e-05, "loss": 0.4762, "step": 282 }, { "epoch": 0.2628889921040409, "grad_norm": 0.39850142060231275, "learning_rate": 4.365325077399381e-05, "loss": 0.4634, "step": 283 }, { "epoch": 0.26381792847189967, "grad_norm": 0.36023726265847905, "learning_rate": 4.380804953560372e-05, "loss": 0.4757, "step": 284 }, { "epoch": 0.26474686483975846, "grad_norm": 0.32295320874219785, "learning_rate": 4.3962848297213626e-05, "loss": 0.4523, "step": 285 }, { "epoch": 0.2656758012076173, "grad_norm": 0.42060056160022957, "learning_rate": 4.411764705882353e-05, "loss": 0.4928, "step": 286 }, { "epoch": 0.2666047375754761, "grad_norm": 0.38366257834055983, "learning_rate": 4.427244582043344e-05, "loss": 0.4773, "step": 287 }, { "epoch": 0.2675336739433349, "grad_norm": 0.34821572318085275, "learning_rate": 4.4427244582043346e-05, "loss": 0.4495, "step": 288 }, { "epoch": 0.2684626103111937, "grad_norm": 0.35080988887460157, "learning_rate": 4.458204334365325e-05, "loss": 0.4619, "step": 289 }, { "epoch": 0.26939154667905246, "grad_norm": 0.3120875797937547, "learning_rate": 4.473684210526316e-05, "loss": 0.4354, "step": 290 }, { "epoch": 0.2703204830469113, "grad_norm": 0.36916544806752755, "learning_rate": 4.4891640866873066e-05, "loss": 0.4428, "step": 291 }, { "epoch": 0.2712494194147701, "grad_norm": 0.3120938357971595, "learning_rate": 4.504643962848297e-05, "loss": 0.479, "step": 292 }, { "epoch": 0.2721783557826289, "grad_norm": 0.33009772967549367, "learning_rate": 4.520123839009288e-05, "loss": 0.44, "step": 293 }, { "epoch": 0.2731072921504877, "grad_norm": 0.3147604795938479, "learning_rate": 4.535603715170279e-05, "loss": 0.4873, "step": 294 }, { "epoch": 0.27403622851834647, "grad_norm": 0.3024028343733452, "learning_rate": 4.551083591331269e-05, "loss": 0.4684, "step": 295 }, { "epoch": 0.2749651648862053, "grad_norm": 0.31188391724375253, "learning_rate": 4.56656346749226e-05, "loss": 0.447, "step": 296 }, { "epoch": 0.2758941012540641, "grad_norm": 0.28892910625279733, "learning_rate": 4.582043343653251e-05, "loss": 0.4665, "step": 297 }, { "epoch": 0.2768230376219229, "grad_norm": 0.3736874839548374, "learning_rate": 4.597523219814241e-05, "loss": 0.4526, "step": 298 }, { "epoch": 0.2777519739897817, "grad_norm": 0.28993119120770083, "learning_rate": 4.613003095975233e-05, "loss": 0.4526, "step": 299 }, { "epoch": 0.2786809103576405, "grad_norm": 0.3489816006908061, "learning_rate": 4.6284829721362234e-05, "loss": 0.4389, "step": 300 }, { "epoch": 0.2796098467254993, "grad_norm": 0.32821341978908847, "learning_rate": 4.6439628482972134e-05, "loss": 0.47, "step": 301 }, { "epoch": 0.2805387830933581, "grad_norm": 0.3101425751832829, "learning_rate": 4.659442724458205e-05, "loss": 0.4602, "step": 302 }, { "epoch": 0.2814677194612169, "grad_norm": 0.3009900967986436, "learning_rate": 4.6749226006191954e-05, "loss": 0.4604, "step": 303 }, { "epoch": 0.2823966558290757, "grad_norm": 0.3103354270645873, "learning_rate": 4.690402476780186e-05, "loss": 0.4737, "step": 304 }, { "epoch": 0.2833255921969345, "grad_norm": 0.30614215441269343, "learning_rate": 4.705882352941177e-05, "loss": 0.4707, "step": 305 }, { "epoch": 0.28425452856479333, "grad_norm": 0.31908370093108424, "learning_rate": 4.7213622291021674e-05, "loss": 0.4473, "step": 306 }, { "epoch": 0.2851834649326521, "grad_norm": 0.2959072198444788, "learning_rate": 4.736842105263158e-05, "loss": 0.4579, "step": 307 }, { "epoch": 0.2861124013005109, "grad_norm": 0.3057457489708315, "learning_rate": 4.752321981424149e-05, "loss": 0.4199, "step": 308 }, { "epoch": 0.2870413376683697, "grad_norm": 0.30818993057548966, "learning_rate": 4.7678018575851394e-05, "loss": 0.4627, "step": 309 }, { "epoch": 0.2879702740362285, "grad_norm": 0.33780222748958855, "learning_rate": 4.783281733746131e-05, "loss": 0.4955, "step": 310 }, { "epoch": 0.28889921040408734, "grad_norm": 0.3538296738454953, "learning_rate": 4.798761609907121e-05, "loss": 0.4528, "step": 311 }, { "epoch": 0.2898281467719461, "grad_norm": 0.35013935319870626, "learning_rate": 4.8142414860681115e-05, "loss": 0.4896, "step": 312 }, { "epoch": 0.2907570831398049, "grad_norm": 0.3919444872765707, "learning_rate": 4.829721362229103e-05, "loss": 0.434, "step": 313 }, { "epoch": 0.2916860195076637, "grad_norm": 0.2873775450384677, "learning_rate": 4.845201238390093e-05, "loss": 0.43, "step": 314 }, { "epoch": 0.29261495587552255, "grad_norm": 0.43278899104997853, "learning_rate": 4.860681114551084e-05, "loss": 0.459, "step": 315 }, { "epoch": 0.29354389224338134, "grad_norm": 0.29762624151628303, "learning_rate": 4.876160990712075e-05, "loss": 0.4381, "step": 316 }, { "epoch": 0.29447282861124013, "grad_norm": 0.38552936527851245, "learning_rate": 4.891640866873065e-05, "loss": 0.4488, "step": 317 }, { "epoch": 0.2954017649790989, "grad_norm": 0.3262364914608491, "learning_rate": 4.907120743034056e-05, "loss": 0.4745, "step": 318 }, { "epoch": 0.2963307013469577, "grad_norm": 0.3368476064425339, "learning_rate": 4.922600619195047e-05, "loss": 0.4588, "step": 319 }, { "epoch": 0.29725963771481656, "grad_norm": 0.3312500891089113, "learning_rate": 4.9380804953560375e-05, "loss": 0.4794, "step": 320 }, { "epoch": 0.29818857408267535, "grad_norm": 0.3390444459850631, "learning_rate": 4.953560371517028e-05, "loss": 0.4604, "step": 321 }, { "epoch": 0.29911751045053414, "grad_norm": 0.3387119384494132, "learning_rate": 4.969040247678019e-05, "loss": 0.4602, "step": 322 }, { "epoch": 0.30004644681839293, "grad_norm": 0.37541797983485836, "learning_rate": 4.9845201238390096e-05, "loss": 0.4857, "step": 323 }, { "epoch": 0.3009753831862517, "grad_norm": 0.36204268391898264, "learning_rate": 5e-05, "loss": 0.4487, "step": 324 }, { "epoch": 0.30190431955411057, "grad_norm": 0.5069674260137537, "learning_rate": 4.998278829604131e-05, "loss": 0.4795, "step": 325 }, { "epoch": 0.30283325592196936, "grad_norm": 0.31607540585629734, "learning_rate": 4.996557659208262e-05, "loss": 0.459, "step": 326 }, { "epoch": 0.30376219228982815, "grad_norm": 0.5375774496764807, "learning_rate": 4.9948364888123924e-05, "loss": 0.465, "step": 327 }, { "epoch": 0.30469112865768694, "grad_norm": 0.34299271466845627, "learning_rate": 4.9931153184165236e-05, "loss": 0.4693, "step": 328 }, { "epoch": 0.30562006502554573, "grad_norm": 0.4143539281353358, "learning_rate": 4.991394148020654e-05, "loss": 0.4606, "step": 329 }, { "epoch": 0.3065490013934046, "grad_norm": 0.4855004584252198, "learning_rate": 4.989672977624785e-05, "loss": 0.4783, "step": 330 }, { "epoch": 0.30747793776126336, "grad_norm": 0.3423755850607668, "learning_rate": 4.987951807228916e-05, "loss": 0.4444, "step": 331 }, { "epoch": 0.30840687412912215, "grad_norm": 0.3583846261043207, "learning_rate": 4.986230636833047e-05, "loss": 0.473, "step": 332 }, { "epoch": 0.30933581049698095, "grad_norm": 0.3138663780309369, "learning_rate": 4.9845094664371775e-05, "loss": 0.4353, "step": 333 }, { "epoch": 0.31026474686483974, "grad_norm": 0.3232503486528393, "learning_rate": 4.9827882960413086e-05, "loss": 0.4662, "step": 334 }, { "epoch": 0.3111936832326986, "grad_norm": 0.33077321915284624, "learning_rate": 4.981067125645439e-05, "loss": 0.4389, "step": 335 }, { "epoch": 0.31212261960055737, "grad_norm": 0.32940453207532444, "learning_rate": 4.9793459552495696e-05, "loss": 0.4549, "step": 336 }, { "epoch": 0.31305155596841616, "grad_norm": 0.3489785623343884, "learning_rate": 4.977624784853701e-05, "loss": 0.4519, "step": 337 }, { "epoch": 0.31398049233627495, "grad_norm": 0.3380056249650716, "learning_rate": 4.975903614457831e-05, "loss": 0.4501, "step": 338 }, { "epoch": 0.31490942870413374, "grad_norm": 0.39621724081434734, "learning_rate": 4.9741824440619625e-05, "loss": 0.4886, "step": 339 }, { "epoch": 0.3158383650719926, "grad_norm": 0.34090797197414546, "learning_rate": 4.972461273666093e-05, "loss": 0.4738, "step": 340 }, { "epoch": 0.3167673014398514, "grad_norm": 0.35008163528496755, "learning_rate": 4.970740103270224e-05, "loss": 0.4533, "step": 341 }, { "epoch": 0.31769623780771017, "grad_norm": 0.35189206816774354, "learning_rate": 4.969018932874355e-05, "loss": 0.4286, "step": 342 }, { "epoch": 0.31862517417556896, "grad_norm": 0.3640074292494907, "learning_rate": 4.967297762478486e-05, "loss": 0.4412, "step": 343 }, { "epoch": 0.31955411054342775, "grad_norm": 0.3532873296836122, "learning_rate": 4.9655765920826164e-05, "loss": 0.4686, "step": 344 }, { "epoch": 0.3204830469112866, "grad_norm": 0.37025094985949847, "learning_rate": 4.9638554216867475e-05, "loss": 0.4338, "step": 345 }, { "epoch": 0.3214119832791454, "grad_norm": 0.33462244732386653, "learning_rate": 4.962134251290878e-05, "loss": 0.4392, "step": 346 }, { "epoch": 0.3223409196470042, "grad_norm": 0.3820555101933481, "learning_rate": 4.960413080895009e-05, "loss": 0.4898, "step": 347 }, { "epoch": 0.32326985601486297, "grad_norm": 0.35860399216197664, "learning_rate": 4.95869191049914e-05, "loss": 0.4876, "step": 348 }, { "epoch": 0.32419879238272176, "grad_norm": 0.3419630528737662, "learning_rate": 4.95697074010327e-05, "loss": 0.4698, "step": 349 }, { "epoch": 0.3251277287505806, "grad_norm": 0.35528988609754913, "learning_rate": 4.9552495697074014e-05, "loss": 0.4275, "step": 350 }, { "epoch": 0.3260566651184394, "grad_norm": 0.38379201396008805, "learning_rate": 4.953528399311532e-05, "loss": 0.4445, "step": 351 }, { "epoch": 0.3269856014862982, "grad_norm": 0.3398302347977464, "learning_rate": 4.951807228915663e-05, "loss": 0.4554, "step": 352 }, { "epoch": 0.327914537854157, "grad_norm": 0.3897934147080159, "learning_rate": 4.9500860585197936e-05, "loss": 0.4396, "step": 353 }, { "epoch": 0.3288434742220158, "grad_norm": 0.3704282311872543, "learning_rate": 4.948364888123925e-05, "loss": 0.4476, "step": 354 }, { "epoch": 0.3297724105898746, "grad_norm": 0.3792694514662743, "learning_rate": 4.946643717728055e-05, "loss": 0.4153, "step": 355 }, { "epoch": 0.3307013469577334, "grad_norm": 0.3900734100570611, "learning_rate": 4.944922547332186e-05, "loss": 0.4347, "step": 356 }, { "epoch": 0.3316302833255922, "grad_norm": 0.3409495630328528, "learning_rate": 4.943201376936317e-05, "loss": 0.4242, "step": 357 }, { "epoch": 0.332559219693451, "grad_norm": 0.3661404637247587, "learning_rate": 4.9414802065404474e-05, "loss": 0.4426, "step": 358 }, { "epoch": 0.3334881560613098, "grad_norm": 0.4272666657937548, "learning_rate": 4.9397590361445786e-05, "loss": 0.4468, "step": 359 }, { "epoch": 0.3344170924291686, "grad_norm": 0.2754985787418611, "learning_rate": 4.938037865748709e-05, "loss": 0.4187, "step": 360 }, { "epoch": 0.3353460287970274, "grad_norm": 0.38303057883878117, "learning_rate": 4.93631669535284e-05, "loss": 0.4594, "step": 361 }, { "epoch": 0.3362749651648862, "grad_norm": 0.3015558469101507, "learning_rate": 4.934595524956971e-05, "loss": 0.4337, "step": 362 }, { "epoch": 0.337203901532745, "grad_norm": 0.36850129972653306, "learning_rate": 4.932874354561102e-05, "loss": 0.4666, "step": 363 }, { "epoch": 0.33813283790060383, "grad_norm": 0.2884585481636936, "learning_rate": 4.9311531841652325e-05, "loss": 0.4437, "step": 364 }, { "epoch": 0.3390617742684626, "grad_norm": 0.3163751089768913, "learning_rate": 4.929432013769364e-05, "loss": 0.4591, "step": 365 }, { "epoch": 0.3399907106363214, "grad_norm": 0.28834325339704836, "learning_rate": 4.927710843373494e-05, "loss": 0.4339, "step": 366 }, { "epoch": 0.3409196470041802, "grad_norm": 0.3089186351649892, "learning_rate": 4.9259896729776253e-05, "loss": 0.4413, "step": 367 }, { "epoch": 0.341848583372039, "grad_norm": 0.35241634143696166, "learning_rate": 4.924268502581756e-05, "loss": 0.4571, "step": 368 }, { "epoch": 0.34277751973989784, "grad_norm": 0.3101976948447532, "learning_rate": 4.922547332185887e-05, "loss": 0.4558, "step": 369 }, { "epoch": 0.34370645610775663, "grad_norm": 0.35319592477754186, "learning_rate": 4.9208261617900175e-05, "loss": 0.4429, "step": 370 }, { "epoch": 0.3446353924756154, "grad_norm": 0.3322284047796149, "learning_rate": 4.919104991394149e-05, "loss": 0.4524, "step": 371 }, { "epoch": 0.3455643288434742, "grad_norm": 0.37262088448309033, "learning_rate": 4.917383820998279e-05, "loss": 0.4325, "step": 372 }, { "epoch": 0.346493265211333, "grad_norm": 0.336028619967502, "learning_rate": 4.9156626506024104e-05, "loss": 0.4488, "step": 373 }, { "epoch": 0.34742220157919185, "grad_norm": 0.37466782782498514, "learning_rate": 4.913941480206541e-05, "loss": 0.458, "step": 374 }, { "epoch": 0.34835113794705064, "grad_norm": 0.3083157732259951, "learning_rate": 4.9122203098106714e-05, "loss": 0.4522, "step": 375 }, { "epoch": 0.34928007431490943, "grad_norm": 0.3696409621615052, "learning_rate": 4.9104991394148026e-05, "loss": 0.4769, "step": 376 }, { "epoch": 0.3502090106827682, "grad_norm": 0.37589325200592977, "learning_rate": 4.908777969018933e-05, "loss": 0.4535, "step": 377 }, { "epoch": 0.351137947050627, "grad_norm": 0.290205612168557, "learning_rate": 4.9070567986230636e-05, "loss": 0.4669, "step": 378 }, { "epoch": 0.35206688341848585, "grad_norm": 0.37740647897617646, "learning_rate": 4.905335628227195e-05, "loss": 0.4384, "step": 379 }, { "epoch": 0.35299581978634464, "grad_norm": 0.28374091712444127, "learning_rate": 4.903614457831325e-05, "loss": 0.4437, "step": 380 }, { "epoch": 0.35392475615420343, "grad_norm": 0.3555291712519551, "learning_rate": 4.9018932874354564e-05, "loss": 0.4437, "step": 381 }, { "epoch": 0.3548536925220622, "grad_norm": 0.3021908819889071, "learning_rate": 4.900172117039587e-05, "loss": 0.4529, "step": 382 }, { "epoch": 0.355782628889921, "grad_norm": 0.28070022204204287, "learning_rate": 4.898450946643718e-05, "loss": 0.4445, "step": 383 }, { "epoch": 0.35671156525777986, "grad_norm": 0.34406702126509014, "learning_rate": 4.8967297762478486e-05, "loss": 0.4532, "step": 384 }, { "epoch": 0.35764050162563865, "grad_norm": 0.2888201625795591, "learning_rate": 4.89500860585198e-05, "loss": 0.4636, "step": 385 }, { "epoch": 0.35856943799349744, "grad_norm": 0.3598719298462701, "learning_rate": 4.89328743545611e-05, "loss": 0.4402, "step": 386 }, { "epoch": 0.35949837436135623, "grad_norm": 0.2895073326465502, "learning_rate": 4.8915662650602415e-05, "loss": 0.4625, "step": 387 }, { "epoch": 0.360427310729215, "grad_norm": 0.323679285493869, "learning_rate": 4.889845094664372e-05, "loss": 0.4253, "step": 388 }, { "epoch": 0.36135624709707387, "grad_norm": 0.32852184199735984, "learning_rate": 4.888123924268503e-05, "loss": 0.4404, "step": 389 }, { "epoch": 0.36228518346493266, "grad_norm": 0.3825310829770502, "learning_rate": 4.8864027538726336e-05, "loss": 0.4324, "step": 390 }, { "epoch": 0.36321411983279145, "grad_norm": 0.3147280199593259, "learning_rate": 4.884681583476765e-05, "loss": 0.4576, "step": 391 }, { "epoch": 0.36414305620065024, "grad_norm": 0.33189632998575647, "learning_rate": 4.882960413080895e-05, "loss": 0.4508, "step": 392 }, { "epoch": 0.36507199256850903, "grad_norm": 0.31161211897471663, "learning_rate": 4.8812392426850265e-05, "loss": 0.433, "step": 393 }, { "epoch": 0.3660009289363679, "grad_norm": 0.29323440533256345, "learning_rate": 4.879518072289157e-05, "loss": 0.4426, "step": 394 }, { "epoch": 0.36692986530422667, "grad_norm": 0.3044769147297849, "learning_rate": 4.877796901893288e-05, "loss": 0.4383, "step": 395 }, { "epoch": 0.36785880167208546, "grad_norm": 0.33105623962975367, "learning_rate": 4.876075731497419e-05, "loss": 0.4511, "step": 396 }, { "epoch": 0.36878773803994425, "grad_norm": 0.2926145124782726, "learning_rate": 4.87435456110155e-05, "loss": 0.4494, "step": 397 }, { "epoch": 0.3697166744078031, "grad_norm": 0.2785219519909291, "learning_rate": 4.8726333907056804e-05, "loss": 0.4193, "step": 398 }, { "epoch": 0.3706456107756619, "grad_norm": 0.3190152822347888, "learning_rate": 4.870912220309811e-05, "loss": 0.4348, "step": 399 }, { "epoch": 0.3715745471435207, "grad_norm": 0.2944642128721377, "learning_rate": 4.8691910499139414e-05, "loss": 0.4413, "step": 400 }, { "epoch": 0.37250348351137946, "grad_norm": 0.2874818565004639, "learning_rate": 4.8674698795180725e-05, "loss": 0.4379, "step": 401 }, { "epoch": 0.37343241987923825, "grad_norm": 0.27777508008099533, "learning_rate": 4.865748709122203e-05, "loss": 0.4241, "step": 402 }, { "epoch": 0.3743613562470971, "grad_norm": 0.32978399945466425, "learning_rate": 4.864027538726334e-05, "loss": 0.447, "step": 403 }, { "epoch": 0.3752902926149559, "grad_norm": 0.3094967262792066, "learning_rate": 4.862306368330465e-05, "loss": 0.4466, "step": 404 }, { "epoch": 0.3762192289828147, "grad_norm": 0.29574208650459116, "learning_rate": 4.860585197934596e-05, "loss": 0.436, "step": 405 }, { "epoch": 0.37714816535067347, "grad_norm": 0.288270087489997, "learning_rate": 4.8588640275387264e-05, "loss": 0.4441, "step": 406 }, { "epoch": 0.37807710171853226, "grad_norm": 0.30162172795880676, "learning_rate": 4.8571428571428576e-05, "loss": 0.4145, "step": 407 }, { "epoch": 0.3790060380863911, "grad_norm": 0.2618604647255097, "learning_rate": 4.855421686746988e-05, "loss": 0.4162, "step": 408 }, { "epoch": 0.3799349744542499, "grad_norm": 0.30851370446236615, "learning_rate": 4.853700516351119e-05, "loss": 0.4224, "step": 409 }, { "epoch": 0.3808639108221087, "grad_norm": 0.3006905343379606, "learning_rate": 4.85197934595525e-05, "loss": 0.4879, "step": 410 }, { "epoch": 0.3817928471899675, "grad_norm": 0.28307585696243226, "learning_rate": 4.850258175559381e-05, "loss": 0.4068, "step": 411 }, { "epoch": 0.38272178355782627, "grad_norm": 0.26102123895109014, "learning_rate": 4.8485370051635114e-05, "loss": 0.428, "step": 412 }, { "epoch": 0.3836507199256851, "grad_norm": 0.26532852818511954, "learning_rate": 4.8468158347676426e-05, "loss": 0.4356, "step": 413 }, { "epoch": 0.3845796562935439, "grad_norm": 0.27713412833380047, "learning_rate": 4.845094664371773e-05, "loss": 0.4166, "step": 414 }, { "epoch": 0.3855085926614027, "grad_norm": 0.2721011127848934, "learning_rate": 4.843373493975904e-05, "loss": 0.4588, "step": 415 }, { "epoch": 0.3864375290292615, "grad_norm": 0.270987247231762, "learning_rate": 4.841652323580035e-05, "loss": 0.4418, "step": 416 }, { "epoch": 0.3873664653971203, "grad_norm": 0.2815915571880338, "learning_rate": 4.839931153184166e-05, "loss": 0.438, "step": 417 }, { "epoch": 0.3882954017649791, "grad_norm": 0.2900554851874775, "learning_rate": 4.8382099827882965e-05, "loss": 0.4495, "step": 418 }, { "epoch": 0.3892243381328379, "grad_norm": 0.3334842549517787, "learning_rate": 4.836488812392428e-05, "loss": 0.4646, "step": 419 }, { "epoch": 0.3901532745006967, "grad_norm": 0.24007212579976095, "learning_rate": 4.8347676419965575e-05, "loss": 0.4238, "step": 420 }, { "epoch": 0.3910822108685555, "grad_norm": 0.3077325041506196, "learning_rate": 4.833046471600689e-05, "loss": 0.4447, "step": 421 }, { "epoch": 0.3920111472364143, "grad_norm": 0.2863122225447303, "learning_rate": 4.831325301204819e-05, "loss": 0.4365, "step": 422 }, { "epoch": 0.3929400836042731, "grad_norm": 0.30580235165952646, "learning_rate": 4.8296041308089504e-05, "loss": 0.4453, "step": 423 }, { "epoch": 0.3938690199721319, "grad_norm": 0.26472608629055344, "learning_rate": 4.827882960413081e-05, "loss": 0.445, "step": 424 }, { "epoch": 0.3947979563399907, "grad_norm": 0.30850387581287186, "learning_rate": 4.826161790017212e-05, "loss": 0.4459, "step": 425 }, { "epoch": 0.3957268927078495, "grad_norm": 0.32129564773476693, "learning_rate": 4.8244406196213425e-05, "loss": 0.4302, "step": 426 }, { "epoch": 0.3966558290757083, "grad_norm": 0.2856821353622809, "learning_rate": 4.822719449225474e-05, "loss": 0.4366, "step": 427 }, { "epoch": 0.39758476544356713, "grad_norm": 0.27875816280973154, "learning_rate": 4.820998278829604e-05, "loss": 0.4177, "step": 428 }, { "epoch": 0.3985137018114259, "grad_norm": 0.375391127760771, "learning_rate": 4.8192771084337354e-05, "loss": 0.4549, "step": 429 }, { "epoch": 0.3994426381792847, "grad_norm": 0.2919547832552509, "learning_rate": 4.817555938037866e-05, "loss": 0.4377, "step": 430 }, { "epoch": 0.4003715745471435, "grad_norm": 0.35769067629140655, "learning_rate": 4.815834767641997e-05, "loss": 0.4106, "step": 431 }, { "epoch": 0.4013005109150023, "grad_norm": 0.3169078733636773, "learning_rate": 4.8141135972461276e-05, "loss": 0.4344, "step": 432 }, { "epoch": 0.40222944728286114, "grad_norm": 0.38647441373213326, "learning_rate": 4.812392426850259e-05, "loss": 0.4662, "step": 433 }, { "epoch": 0.40315838365071993, "grad_norm": 0.3093808502182461, "learning_rate": 4.810671256454389e-05, "loss": 0.4268, "step": 434 }, { "epoch": 0.4040873200185787, "grad_norm": 0.3428360410015756, "learning_rate": 4.8089500860585204e-05, "loss": 0.4248, "step": 435 }, { "epoch": 0.4050162563864375, "grad_norm": 0.39226349195208254, "learning_rate": 4.807228915662651e-05, "loss": 0.4828, "step": 436 }, { "epoch": 0.40594519275429636, "grad_norm": 0.3124911712167451, "learning_rate": 4.805507745266782e-05, "loss": 0.42, "step": 437 }, { "epoch": 0.40687412912215515, "grad_norm": 0.3325357418649828, "learning_rate": 4.8037865748709126e-05, "loss": 0.4183, "step": 438 }, { "epoch": 0.40780306549001394, "grad_norm": 0.2967793995799997, "learning_rate": 4.802065404475044e-05, "loss": 0.4522, "step": 439 }, { "epoch": 0.40873200185787273, "grad_norm": 0.3542440897290239, "learning_rate": 4.800344234079174e-05, "loss": 0.4463, "step": 440 }, { "epoch": 0.4096609382257315, "grad_norm": 0.307305266388483, "learning_rate": 4.798623063683305e-05, "loss": 0.4444, "step": 441 }, { "epoch": 0.41058987459359036, "grad_norm": 0.3009613722666484, "learning_rate": 4.796901893287435e-05, "loss": 0.42, "step": 442 }, { "epoch": 0.41151881096144916, "grad_norm": 0.2942089986687925, "learning_rate": 4.7951807228915665e-05, "loss": 0.4408, "step": 443 }, { "epoch": 0.41244774732930795, "grad_norm": 0.25994860698808103, "learning_rate": 4.793459552495697e-05, "loss": 0.422, "step": 444 }, { "epoch": 0.41337668369716674, "grad_norm": 0.30165826806753704, "learning_rate": 4.791738382099828e-05, "loss": 0.4433, "step": 445 }, { "epoch": 0.4143056200650255, "grad_norm": 0.2591476595221208, "learning_rate": 4.7900172117039587e-05, "loss": 0.4035, "step": 446 }, { "epoch": 0.41523455643288437, "grad_norm": 0.31443538659749376, "learning_rate": 4.78829604130809e-05, "loss": 0.4451, "step": 447 }, { "epoch": 0.41616349280074316, "grad_norm": 0.2845675404201316, "learning_rate": 4.78657487091222e-05, "loss": 0.4499, "step": 448 }, { "epoch": 0.41709242916860195, "grad_norm": 0.3082724543796442, "learning_rate": 4.7848537005163515e-05, "loss": 0.4567, "step": 449 }, { "epoch": 0.41802136553646074, "grad_norm": 0.2886233884056253, "learning_rate": 4.783132530120482e-05, "loss": 0.4351, "step": 450 }, { "epoch": 0.41895030190431953, "grad_norm": 0.32824185907530123, "learning_rate": 4.781411359724613e-05, "loss": 0.4343, "step": 451 }, { "epoch": 0.4198792382721784, "grad_norm": 0.3045951917878721, "learning_rate": 4.779690189328744e-05, "loss": 0.4233, "step": 452 }, { "epoch": 0.42080817464003717, "grad_norm": 0.30065030364940504, "learning_rate": 4.777969018932875e-05, "loss": 0.4449, "step": 453 }, { "epoch": 0.42173711100789596, "grad_norm": 0.28194914295465456, "learning_rate": 4.7762478485370054e-05, "loss": 0.4216, "step": 454 }, { "epoch": 0.42266604737575475, "grad_norm": 0.358326292324705, "learning_rate": 4.7745266781411366e-05, "loss": 0.4545, "step": 455 }, { "epoch": 0.42359498374361354, "grad_norm": 0.32509662830836533, "learning_rate": 4.772805507745267e-05, "loss": 0.4173, "step": 456 }, { "epoch": 0.4245239201114724, "grad_norm": 0.33738335206220016, "learning_rate": 4.771084337349398e-05, "loss": 0.4433, "step": 457 }, { "epoch": 0.4254528564793312, "grad_norm": 0.3607433985711699, "learning_rate": 4.769363166953529e-05, "loss": 0.4317, "step": 458 }, { "epoch": 0.42638179284718997, "grad_norm": 0.32470986956497677, "learning_rate": 4.76764199655766e-05, "loss": 0.4278, "step": 459 }, { "epoch": 0.42731072921504876, "grad_norm": 0.31596561843145204, "learning_rate": 4.7659208261617904e-05, "loss": 0.4507, "step": 460 }, { "epoch": 0.42823966558290755, "grad_norm": 0.3145646642534761, "learning_rate": 4.7641996557659216e-05, "loss": 0.4353, "step": 461 }, { "epoch": 0.4291686019507664, "grad_norm": 0.28777196903769586, "learning_rate": 4.7624784853700514e-05, "loss": 0.4606, "step": 462 }, { "epoch": 0.4300975383186252, "grad_norm": 0.34643745463301656, "learning_rate": 4.7607573149741826e-05, "loss": 0.4264, "step": 463 }, { "epoch": 0.431026474686484, "grad_norm": 0.3022418092258536, "learning_rate": 4.759036144578313e-05, "loss": 0.4046, "step": 464 }, { "epoch": 0.43195541105434276, "grad_norm": 0.27983510155632235, "learning_rate": 4.757314974182444e-05, "loss": 0.4122, "step": 465 }, { "epoch": 0.43288434742220155, "grad_norm": 0.28454933014088335, "learning_rate": 4.755593803786575e-05, "loss": 0.4558, "step": 466 }, { "epoch": 0.4338132837900604, "grad_norm": 0.32254214458961294, "learning_rate": 4.753872633390706e-05, "loss": 0.4272, "step": 467 }, { "epoch": 0.4347422201579192, "grad_norm": 0.2613178418721715, "learning_rate": 4.7521514629948365e-05, "loss": 0.4418, "step": 468 }, { "epoch": 0.435671156525778, "grad_norm": 0.3056014331601247, "learning_rate": 4.7504302925989676e-05, "loss": 0.421, "step": 469 }, { "epoch": 0.43660009289363677, "grad_norm": 0.2695626668562775, "learning_rate": 4.748709122203098e-05, "loss": 0.4321, "step": 470 }, { "epoch": 0.43752902926149556, "grad_norm": 0.27446080453869376, "learning_rate": 4.746987951807229e-05, "loss": 0.4375, "step": 471 }, { "epoch": 0.4384579656293544, "grad_norm": 0.29987194356039903, "learning_rate": 4.74526678141136e-05, "loss": 0.4436, "step": 472 }, { "epoch": 0.4393869019972132, "grad_norm": 0.2973778889764395, "learning_rate": 4.743545611015491e-05, "loss": 0.4508, "step": 473 }, { "epoch": 0.440315838365072, "grad_norm": 0.28858718688747587, "learning_rate": 4.7418244406196215e-05, "loss": 0.4088, "step": 474 }, { "epoch": 0.4412447747329308, "grad_norm": 0.2776236267106209, "learning_rate": 4.740103270223753e-05, "loss": 0.4472, "step": 475 }, { "epoch": 0.44217371110078957, "grad_norm": 0.2673174324156048, "learning_rate": 4.738382099827883e-05, "loss": 0.4318, "step": 476 }, { "epoch": 0.4431026474686484, "grad_norm": 0.2789675691022897, "learning_rate": 4.7366609294320144e-05, "loss": 0.4256, "step": 477 }, { "epoch": 0.4440315838365072, "grad_norm": 0.3242561622863543, "learning_rate": 4.734939759036145e-05, "loss": 0.4314, "step": 478 }, { "epoch": 0.444960520204366, "grad_norm": 0.26245305270546254, "learning_rate": 4.733218588640276e-05, "loss": 0.432, "step": 479 }, { "epoch": 0.4458894565722248, "grad_norm": 0.2908962068774102, "learning_rate": 4.7314974182444065e-05, "loss": 0.4262, "step": 480 }, { "epoch": 0.44681839294008363, "grad_norm": 0.27171256702754587, "learning_rate": 4.729776247848538e-05, "loss": 0.4352, "step": 481 }, { "epoch": 0.4477473293079424, "grad_norm": 0.3640009107002527, "learning_rate": 4.728055077452668e-05, "loss": 0.4337, "step": 482 }, { "epoch": 0.4486762656758012, "grad_norm": 0.3035811404578588, "learning_rate": 4.7263339070567994e-05, "loss": 0.453, "step": 483 }, { "epoch": 0.44960520204366, "grad_norm": 0.3635508538266957, "learning_rate": 4.724612736660929e-05, "loss": 0.4296, "step": 484 }, { "epoch": 0.4505341384115188, "grad_norm": 0.3170039262392314, "learning_rate": 4.7228915662650604e-05, "loss": 0.4207, "step": 485 }, { "epoch": 0.45146307477937764, "grad_norm": 0.31774438891510837, "learning_rate": 4.721170395869191e-05, "loss": 0.4406, "step": 486 }, { "epoch": 0.45239201114723643, "grad_norm": 0.34019517987590486, "learning_rate": 4.719449225473322e-05, "loss": 0.4902, "step": 487 }, { "epoch": 0.4533209475150952, "grad_norm": 0.2793264697380186, "learning_rate": 4.7177280550774526e-05, "loss": 0.4454, "step": 488 }, { "epoch": 0.454249883882954, "grad_norm": 0.28144894598721126, "learning_rate": 4.716006884681584e-05, "loss": 0.4187, "step": 489 }, { "epoch": 0.4551788202508128, "grad_norm": 0.30680476206969465, "learning_rate": 4.714285714285714e-05, "loss": 0.4458, "step": 490 }, { "epoch": 0.45610775661867164, "grad_norm": 0.2761255587228729, "learning_rate": 4.7125645438898454e-05, "loss": 0.4141, "step": 491 }, { "epoch": 0.45703669298653044, "grad_norm": 0.2747933627980402, "learning_rate": 4.710843373493976e-05, "loss": 0.4558, "step": 492 }, { "epoch": 0.4579656293543892, "grad_norm": 0.2914395069490274, "learning_rate": 4.709122203098107e-05, "loss": 0.4353, "step": 493 }, { "epoch": 0.458894565722248, "grad_norm": 0.26839160122030176, "learning_rate": 4.7074010327022376e-05, "loss": 0.4131, "step": 494 }, { "epoch": 0.4598235020901068, "grad_norm": 0.2822993839047011, "learning_rate": 4.705679862306369e-05, "loss": 0.4265, "step": 495 }, { "epoch": 0.46075243845796565, "grad_norm": 0.2657294003991585, "learning_rate": 4.703958691910499e-05, "loss": 0.4408, "step": 496 }, { "epoch": 0.46168137482582444, "grad_norm": 0.29482740291064663, "learning_rate": 4.7022375215146305e-05, "loss": 0.4368, "step": 497 }, { "epoch": 0.46261031119368323, "grad_norm": 0.26269631652127934, "learning_rate": 4.700516351118761e-05, "loss": 0.4313, "step": 498 }, { "epoch": 0.463539247561542, "grad_norm": 0.33705615306936615, "learning_rate": 4.698795180722892e-05, "loss": 0.4529, "step": 499 }, { "epoch": 0.4644681839294008, "grad_norm": 0.24499796930149098, "learning_rate": 4.6970740103270227e-05, "loss": 0.4385, "step": 500 }, { "epoch": 0.46539712029725966, "grad_norm": 0.30948537181680397, "learning_rate": 4.695352839931154e-05, "loss": 0.4449, "step": 501 }, { "epoch": 0.46632605666511845, "grad_norm": 0.29180609911144206, "learning_rate": 4.6936316695352843e-05, "loss": 0.4352, "step": 502 }, { "epoch": 0.46725499303297724, "grad_norm": 0.2891708931616031, "learning_rate": 4.6919104991394155e-05, "loss": 0.439, "step": 503 }, { "epoch": 0.46818392940083603, "grad_norm": 0.2686671320742356, "learning_rate": 4.690189328743546e-05, "loss": 0.4306, "step": 504 }, { "epoch": 0.4691128657686948, "grad_norm": 0.30780838224951595, "learning_rate": 4.6884681583476765e-05, "loss": 0.431, "step": 505 }, { "epoch": 0.47004180213655367, "grad_norm": 0.25568604631287883, "learning_rate": 4.686746987951807e-05, "loss": 0.458, "step": 506 }, { "epoch": 0.47097073850441246, "grad_norm": 0.37480261887391536, "learning_rate": 4.685025817555938e-05, "loss": 0.4343, "step": 507 }, { "epoch": 0.47189967487227125, "grad_norm": 0.2552649038864197, "learning_rate": 4.683304647160069e-05, "loss": 0.3908, "step": 508 }, { "epoch": 0.47282861124013004, "grad_norm": 0.35376195689446877, "learning_rate": 4.6815834767642e-05, "loss": 0.4491, "step": 509 }, { "epoch": 0.4737575476079888, "grad_norm": 0.3016203905265542, "learning_rate": 4.6798623063683304e-05, "loss": 0.4291, "step": 510 }, { "epoch": 0.4746864839758477, "grad_norm": 0.29055972391783796, "learning_rate": 4.6781411359724616e-05, "loss": 0.4195, "step": 511 }, { "epoch": 0.47561542034370646, "grad_norm": 0.3190347010255864, "learning_rate": 4.676419965576592e-05, "loss": 0.4108, "step": 512 }, { "epoch": 0.47654435671156525, "grad_norm": 0.3294349955740729, "learning_rate": 4.674698795180723e-05, "loss": 0.4693, "step": 513 }, { "epoch": 0.47747329307942404, "grad_norm": 0.31781079968735304, "learning_rate": 4.672977624784854e-05, "loss": 0.4435, "step": 514 }, { "epoch": 0.47840222944728283, "grad_norm": 0.3226175038410287, "learning_rate": 4.671256454388985e-05, "loss": 0.4335, "step": 515 }, { "epoch": 0.4793311658151417, "grad_norm": 0.2637433153066965, "learning_rate": 4.6695352839931154e-05, "loss": 0.4415, "step": 516 }, { "epoch": 0.48026010218300047, "grad_norm": 0.3003332421321039, "learning_rate": 4.6678141135972466e-05, "loss": 0.4202, "step": 517 }, { "epoch": 0.48118903855085926, "grad_norm": 0.37384718084964086, "learning_rate": 4.666092943201377e-05, "loss": 0.4603, "step": 518 }, { "epoch": 0.48211797491871805, "grad_norm": 0.2715471956417477, "learning_rate": 4.664371772805508e-05, "loss": 0.4507, "step": 519 }, { "epoch": 0.4830469112865769, "grad_norm": 0.262436801625057, "learning_rate": 4.662650602409639e-05, "loss": 0.4209, "step": 520 }, { "epoch": 0.4839758476544357, "grad_norm": 0.3007149616543295, "learning_rate": 4.66092943201377e-05, "loss": 0.4493, "step": 521 }, { "epoch": 0.4849047840222945, "grad_norm": 0.2804336311294239, "learning_rate": 4.6592082616179005e-05, "loss": 0.4286, "step": 522 }, { "epoch": 0.48583372039015327, "grad_norm": 0.26330683566772206, "learning_rate": 4.6574870912220316e-05, "loss": 0.4202, "step": 523 }, { "epoch": 0.48676265675801206, "grad_norm": 0.286309727815638, "learning_rate": 4.655765920826162e-05, "loss": 0.4301, "step": 524 }, { "epoch": 0.4876915931258709, "grad_norm": 0.2529965819385294, "learning_rate": 4.654044750430293e-05, "loss": 0.4325, "step": 525 }, { "epoch": 0.4886205294937297, "grad_norm": 0.3253005644243001, "learning_rate": 4.652323580034423e-05, "loss": 0.42, "step": 526 }, { "epoch": 0.4895494658615885, "grad_norm": 0.2658924013835058, "learning_rate": 4.650602409638554e-05, "loss": 0.4193, "step": 527 }, { "epoch": 0.4904784022294473, "grad_norm": 0.27995881407155443, "learning_rate": 4.648881239242685e-05, "loss": 0.4216, "step": 528 }, { "epoch": 0.49140733859730606, "grad_norm": 0.3025444675822039, "learning_rate": 4.647160068846816e-05, "loss": 0.4621, "step": 529 }, { "epoch": 0.4923362749651649, "grad_norm": 0.2737570425581585, "learning_rate": 4.6454388984509465e-05, "loss": 0.4324, "step": 530 }, { "epoch": 0.4932652113330237, "grad_norm": 0.2582930487746826, "learning_rate": 4.643717728055078e-05, "loss": 0.4423, "step": 531 }, { "epoch": 0.4941941477008825, "grad_norm": 0.28559852630894345, "learning_rate": 4.641996557659208e-05, "loss": 0.4477, "step": 532 }, { "epoch": 0.4951230840687413, "grad_norm": 0.29483695154838563, "learning_rate": 4.6402753872633394e-05, "loss": 0.4156, "step": 533 }, { "epoch": 0.49605202043660007, "grad_norm": 0.2613379393803091, "learning_rate": 4.63855421686747e-05, "loss": 0.432, "step": 534 }, { "epoch": 0.4969809568044589, "grad_norm": 0.22622325752066913, "learning_rate": 4.636833046471601e-05, "loss": 0.4297, "step": 535 }, { "epoch": 0.4979098931723177, "grad_norm": 0.28353494710194255, "learning_rate": 4.6351118760757315e-05, "loss": 0.4533, "step": 536 }, { "epoch": 0.4988388295401765, "grad_norm": 0.31712551450474613, "learning_rate": 4.633390705679863e-05, "loss": 0.4667, "step": 537 }, { "epoch": 0.4997677659080353, "grad_norm": 0.25690414290083796, "learning_rate": 4.631669535283993e-05, "loss": 0.434, "step": 538 }, { "epoch": 0.5006967022758941, "grad_norm": 0.26005133208971615, "learning_rate": 4.6299483648881244e-05, "loss": 0.4385, "step": 539 }, { "epoch": 0.5016256386437529, "grad_norm": 0.34410172574131104, "learning_rate": 4.628227194492255e-05, "loss": 0.4508, "step": 540 }, { "epoch": 0.5025545750116117, "grad_norm": 0.25043115978998853, "learning_rate": 4.626506024096386e-05, "loss": 0.4098, "step": 541 }, { "epoch": 0.5034835113794706, "grad_norm": 0.3104609566884535, "learning_rate": 4.6247848537005166e-05, "loss": 0.4305, "step": 542 }, { "epoch": 0.5044124477473293, "grad_norm": 0.293951951581098, "learning_rate": 4.623063683304648e-05, "loss": 0.4461, "step": 543 }, { "epoch": 0.5053413841151881, "grad_norm": 0.2560108568907451, "learning_rate": 4.621342512908778e-05, "loss": 0.4355, "step": 544 }, { "epoch": 0.5062703204830469, "grad_norm": 0.358412433789302, "learning_rate": 4.6196213425129094e-05, "loss": 0.4425, "step": 545 }, { "epoch": 0.5071992568509057, "grad_norm": 0.3056935351314926, "learning_rate": 4.61790017211704e-05, "loss": 0.4428, "step": 546 }, { "epoch": 0.5081281932187646, "grad_norm": 0.24667559189967134, "learning_rate": 4.6161790017211704e-05, "loss": 0.3957, "step": 547 }, { "epoch": 0.5090571295866233, "grad_norm": 0.33367259063635524, "learning_rate": 4.614457831325301e-05, "loss": 0.4355, "step": 548 }, { "epoch": 0.5099860659544821, "grad_norm": 0.27853496333383976, "learning_rate": 4.612736660929432e-05, "loss": 0.4126, "step": 549 }, { "epoch": 0.5109150023223409, "grad_norm": 0.268125448502846, "learning_rate": 4.6110154905335626e-05, "loss": 0.4303, "step": 550 }, { "epoch": 0.5118439386901997, "grad_norm": 0.3441204665083962, "learning_rate": 4.609294320137694e-05, "loss": 0.449, "step": 551 }, { "epoch": 0.5127728750580586, "grad_norm": 0.24236240929630035, "learning_rate": 4.607573149741824e-05, "loss": 0.414, "step": 552 }, { "epoch": 0.5137018114259173, "grad_norm": 0.3015664010797303, "learning_rate": 4.6058519793459555e-05, "loss": 0.4204, "step": 553 }, { "epoch": 0.5146307477937762, "grad_norm": 0.25242151768392607, "learning_rate": 4.604130808950086e-05, "loss": 0.4273, "step": 554 }, { "epoch": 0.5155596841616349, "grad_norm": 0.30744068298559074, "learning_rate": 4.602409638554217e-05, "loss": 0.4508, "step": 555 }, { "epoch": 0.5164886205294937, "grad_norm": 0.30718731865640203, "learning_rate": 4.600688468158348e-05, "loss": 0.4474, "step": 556 }, { "epoch": 0.5174175568973526, "grad_norm": 0.29846939639927766, "learning_rate": 4.598967297762479e-05, "loss": 0.4302, "step": 557 }, { "epoch": 0.5183464932652113, "grad_norm": 0.29336388381608647, "learning_rate": 4.5972461273666093e-05, "loss": 0.4227, "step": 558 }, { "epoch": 0.5192754296330702, "grad_norm": 0.28098300576052854, "learning_rate": 4.5955249569707405e-05, "loss": 0.4225, "step": 559 }, { "epoch": 0.5202043660009289, "grad_norm": 0.27826523322374, "learning_rate": 4.593803786574871e-05, "loss": 0.4395, "step": 560 }, { "epoch": 0.5211333023687877, "grad_norm": 0.2857326733803591, "learning_rate": 4.592082616179002e-05, "loss": 0.4251, "step": 561 }, { "epoch": 0.5220622387366466, "grad_norm": 0.24460900653251463, "learning_rate": 4.590361445783133e-05, "loss": 0.3983, "step": 562 }, { "epoch": 0.5229911751045053, "grad_norm": 0.31606867032450164, "learning_rate": 4.588640275387264e-05, "loss": 0.3987, "step": 563 }, { "epoch": 0.5239201114723642, "grad_norm": 0.2720837508546668, "learning_rate": 4.5869191049913944e-05, "loss": 0.4187, "step": 564 }, { "epoch": 0.5248490478402229, "grad_norm": 0.3692339108368577, "learning_rate": 4.5851979345955256e-05, "loss": 0.4027, "step": 565 }, { "epoch": 0.5257779842080818, "grad_norm": 0.2621237616477022, "learning_rate": 4.583476764199656e-05, "loss": 0.4069, "step": 566 }, { "epoch": 0.5267069205759406, "grad_norm": 0.28576357238482947, "learning_rate": 4.581755593803787e-05, "loss": 0.4083, "step": 567 }, { "epoch": 0.5276358569437993, "grad_norm": 0.2876324843394776, "learning_rate": 4.580034423407917e-05, "loss": 0.4053, "step": 568 }, { "epoch": 0.5285647933116582, "grad_norm": 0.2952514491419477, "learning_rate": 4.578313253012048e-05, "loss": 0.4372, "step": 569 }, { "epoch": 0.5294937296795169, "grad_norm": 0.2413668924475377, "learning_rate": 4.576592082616179e-05, "loss": 0.3872, "step": 570 }, { "epoch": 0.5304226660473758, "grad_norm": 0.31322703800100615, "learning_rate": 4.57487091222031e-05, "loss": 0.415, "step": 571 }, { "epoch": 0.5313516024152346, "grad_norm": 0.29838477602444063, "learning_rate": 4.5731497418244404e-05, "loss": 0.4227, "step": 572 }, { "epoch": 0.5322805387830933, "grad_norm": 0.2743433087149783, "learning_rate": 4.5714285714285716e-05, "loss": 0.4069, "step": 573 }, { "epoch": 0.5332094751509522, "grad_norm": 0.2681736163650413, "learning_rate": 4.569707401032702e-05, "loss": 0.4124, "step": 574 }, { "epoch": 0.5341384115188109, "grad_norm": 0.27958710415501165, "learning_rate": 4.567986230636833e-05, "loss": 0.4479, "step": 575 }, { "epoch": 0.5350673478866698, "grad_norm": 0.3107133186979922, "learning_rate": 4.566265060240964e-05, "loss": 0.4309, "step": 576 }, { "epoch": 0.5359962842545286, "grad_norm": 0.3008304998300067, "learning_rate": 4.564543889845095e-05, "loss": 0.4215, "step": 577 }, { "epoch": 0.5369252206223873, "grad_norm": 0.3077480260868722, "learning_rate": 4.5628227194492255e-05, "loss": 0.4177, "step": 578 }, { "epoch": 0.5378541569902462, "grad_norm": 0.2856706754554965, "learning_rate": 4.5611015490533566e-05, "loss": 0.4298, "step": 579 }, { "epoch": 0.5387830933581049, "grad_norm": 0.3062434202958225, "learning_rate": 4.559380378657487e-05, "loss": 0.4489, "step": 580 }, { "epoch": 0.5397120297259638, "grad_norm": 0.26880576068838663, "learning_rate": 4.557659208261618e-05, "loss": 0.4378, "step": 581 }, { "epoch": 0.5406409660938226, "grad_norm": 0.2504523646966336, "learning_rate": 4.555938037865749e-05, "loss": 0.4308, "step": 582 }, { "epoch": 0.5415699024616814, "grad_norm": 0.26076967760757586, "learning_rate": 4.55421686746988e-05, "loss": 0.422, "step": 583 }, { "epoch": 0.5424988388295402, "grad_norm": 0.235898250335184, "learning_rate": 4.5524956970740105e-05, "loss": 0.4317, "step": 584 }, { "epoch": 0.5434277751973989, "grad_norm": 0.24591979617927925, "learning_rate": 4.550774526678142e-05, "loss": 0.4576, "step": 585 }, { "epoch": 0.5443567115652578, "grad_norm": 0.2336427626086619, "learning_rate": 4.549053356282272e-05, "loss": 0.4229, "step": 586 }, { "epoch": 0.5452856479331166, "grad_norm": 0.24258696081299802, "learning_rate": 4.5473321858864034e-05, "loss": 0.4314, "step": 587 }, { "epoch": 0.5462145843009754, "grad_norm": 0.2448812822768551, "learning_rate": 4.545611015490534e-05, "loss": 0.4148, "step": 588 }, { "epoch": 0.5471435206688342, "grad_norm": 0.23590842458952918, "learning_rate": 4.543889845094665e-05, "loss": 0.42, "step": 589 }, { "epoch": 0.5480724570366929, "grad_norm": 0.2522688595104704, "learning_rate": 4.542168674698795e-05, "loss": 0.4314, "step": 590 }, { "epoch": 0.5490013934045518, "grad_norm": 0.26737150464975795, "learning_rate": 4.540447504302926e-05, "loss": 0.4086, "step": 591 }, { "epoch": 0.5499303297724106, "grad_norm": 0.2559167030387441, "learning_rate": 4.5387263339070566e-05, "loss": 0.4365, "step": 592 }, { "epoch": 0.5508592661402694, "grad_norm": 0.2767220404229008, "learning_rate": 4.537005163511188e-05, "loss": 0.4148, "step": 593 }, { "epoch": 0.5517882025081282, "grad_norm": 0.27937423564051256, "learning_rate": 4.535283993115318e-05, "loss": 0.4253, "step": 594 }, { "epoch": 0.552717138875987, "grad_norm": 0.2900121711086677, "learning_rate": 4.5335628227194494e-05, "loss": 0.4325, "step": 595 }, { "epoch": 0.5536460752438458, "grad_norm": 0.27657569986368546, "learning_rate": 4.53184165232358e-05, "loss": 0.4505, "step": 596 }, { "epoch": 0.5545750116117046, "grad_norm": 0.3126165251767002, "learning_rate": 4.530120481927711e-05, "loss": 0.4358, "step": 597 }, { "epoch": 0.5555039479795634, "grad_norm": 0.27088886464887985, "learning_rate": 4.5283993115318416e-05, "loss": 0.4393, "step": 598 }, { "epoch": 0.5564328843474222, "grad_norm": 0.2504180754270926, "learning_rate": 4.526678141135973e-05, "loss": 0.4188, "step": 599 }, { "epoch": 0.557361820715281, "grad_norm": 0.30044985875880187, "learning_rate": 4.524956970740103e-05, "loss": 0.4272, "step": 600 }, { "epoch": 0.5582907570831398, "grad_norm": 0.26657267822610964, "learning_rate": 4.5232358003442345e-05, "loss": 0.4351, "step": 601 }, { "epoch": 0.5592196934509986, "grad_norm": 0.2534161771476428, "learning_rate": 4.521514629948365e-05, "loss": 0.4171, "step": 602 }, { "epoch": 0.5601486298188574, "grad_norm": 0.2856683409630207, "learning_rate": 4.519793459552496e-05, "loss": 0.4229, "step": 603 }, { "epoch": 0.5610775661867162, "grad_norm": 0.2232216928555761, "learning_rate": 4.5180722891566266e-05, "loss": 0.3916, "step": 604 }, { "epoch": 0.562006502554575, "grad_norm": 0.3018832675679175, "learning_rate": 4.516351118760758e-05, "loss": 0.4179, "step": 605 }, { "epoch": 0.5629354389224338, "grad_norm": 0.2569380913397442, "learning_rate": 4.514629948364888e-05, "loss": 0.4311, "step": 606 }, { "epoch": 0.5638643752902927, "grad_norm": 0.2722108299381111, "learning_rate": 4.5129087779690195e-05, "loss": 0.4227, "step": 607 }, { "epoch": 0.5647933116581514, "grad_norm": 0.24245246643994783, "learning_rate": 4.51118760757315e-05, "loss": 0.4058, "step": 608 }, { "epoch": 0.5657222480260102, "grad_norm": 0.2895449101358798, "learning_rate": 4.509466437177281e-05, "loss": 0.4239, "step": 609 }, { "epoch": 0.566651184393869, "grad_norm": 0.22872993936692906, "learning_rate": 4.507745266781412e-05, "loss": 0.4194, "step": 610 }, { "epoch": 0.5675801207617278, "grad_norm": 0.2951044959391205, "learning_rate": 4.506024096385542e-05, "loss": 0.4159, "step": 611 }, { "epoch": 0.5685090571295867, "grad_norm": 0.2509311292631651, "learning_rate": 4.504302925989673e-05, "loss": 0.4237, "step": 612 }, { "epoch": 0.5694379934974454, "grad_norm": 0.32512599427391997, "learning_rate": 4.502581755593804e-05, "loss": 0.416, "step": 613 }, { "epoch": 0.5703669298653042, "grad_norm": 0.2819827511805377, "learning_rate": 4.5008605851979344e-05, "loss": 0.3945, "step": 614 }, { "epoch": 0.571295866233163, "grad_norm": 0.26137765186363743, "learning_rate": 4.4991394148020655e-05, "loss": 0.4364, "step": 615 }, { "epoch": 0.5722248026010218, "grad_norm": 0.24986429511060695, "learning_rate": 4.497418244406196e-05, "loss": 0.4192, "step": 616 }, { "epoch": 0.5731537389688807, "grad_norm": 0.32510562396239795, "learning_rate": 4.495697074010327e-05, "loss": 0.4564, "step": 617 }, { "epoch": 0.5740826753367394, "grad_norm": 0.2437246846388565, "learning_rate": 4.493975903614458e-05, "loss": 0.4491, "step": 618 }, { "epoch": 0.5750116117045982, "grad_norm": 0.28270839310005524, "learning_rate": 4.492254733218589e-05, "loss": 0.4165, "step": 619 }, { "epoch": 0.575940548072457, "grad_norm": 0.2978639151525471, "learning_rate": 4.4905335628227194e-05, "loss": 0.4181, "step": 620 }, { "epoch": 0.5768694844403158, "grad_norm": 0.283277054225362, "learning_rate": 4.4888123924268506e-05, "loss": 0.4506, "step": 621 }, { "epoch": 0.5777984208081747, "grad_norm": 0.24187266585240402, "learning_rate": 4.487091222030981e-05, "loss": 0.401, "step": 622 }, { "epoch": 0.5787273571760334, "grad_norm": 0.2863505365930887, "learning_rate": 4.485370051635112e-05, "loss": 0.4117, "step": 623 }, { "epoch": 0.5796562935438923, "grad_norm": 0.2773113061348817, "learning_rate": 4.483648881239243e-05, "loss": 0.4244, "step": 624 }, { "epoch": 0.5805852299117511, "grad_norm": 0.23781370664986884, "learning_rate": 4.481927710843374e-05, "loss": 0.4172, "step": 625 }, { "epoch": 0.5815141662796098, "grad_norm": 0.2899327516074531, "learning_rate": 4.4802065404475044e-05, "loss": 0.414, "step": 626 }, { "epoch": 0.5824431026474687, "grad_norm": 0.21917991809985152, "learning_rate": 4.4784853700516356e-05, "loss": 0.3858, "step": 627 }, { "epoch": 0.5833720390153274, "grad_norm": 0.29585581964621416, "learning_rate": 4.476764199655766e-05, "loss": 0.4262, "step": 628 }, { "epoch": 0.5843009753831863, "grad_norm": 0.24103700840664655, "learning_rate": 4.475043029259897e-05, "loss": 0.4061, "step": 629 }, { "epoch": 0.5852299117510451, "grad_norm": 0.30538358771552415, "learning_rate": 4.473321858864028e-05, "loss": 0.4068, "step": 630 }, { "epoch": 0.5861588481189038, "grad_norm": 0.28929577954162256, "learning_rate": 4.471600688468159e-05, "loss": 0.4307, "step": 631 }, { "epoch": 0.5870877844867627, "grad_norm": 0.28414615427121404, "learning_rate": 4.469879518072289e-05, "loss": 0.4201, "step": 632 }, { "epoch": 0.5880167208546214, "grad_norm": 0.26330534375194914, "learning_rate": 4.46815834767642e-05, "loss": 0.4346, "step": 633 }, { "epoch": 0.5889456572224803, "grad_norm": 0.24452133530493197, "learning_rate": 4.4664371772805505e-05, "loss": 0.4283, "step": 634 }, { "epoch": 0.5898745935903391, "grad_norm": 0.275064839540402, "learning_rate": 4.4647160068846817e-05, "loss": 0.4103, "step": 635 }, { "epoch": 0.5908035299581978, "grad_norm": 0.23295221964562113, "learning_rate": 4.462994836488812e-05, "loss": 0.4265, "step": 636 }, { "epoch": 0.5917324663260567, "grad_norm": 0.29348555430997814, "learning_rate": 4.461273666092943e-05, "loss": 0.4498, "step": 637 }, { "epoch": 0.5926614026939154, "grad_norm": 0.2387454374843579, "learning_rate": 4.459552495697074e-05, "loss": 0.4267, "step": 638 }, { "epoch": 0.5935903390617743, "grad_norm": 0.24669696423956647, "learning_rate": 4.457831325301205e-05, "loss": 0.4109, "step": 639 }, { "epoch": 0.5945192754296331, "grad_norm": 0.2569187304772452, "learning_rate": 4.4561101549053355e-05, "loss": 0.4142, "step": 640 }, { "epoch": 0.5954482117974919, "grad_norm": 0.23582837954343758, "learning_rate": 4.454388984509467e-05, "loss": 0.4361, "step": 641 }, { "epoch": 0.5963771481653507, "grad_norm": 0.2577453345455404, "learning_rate": 4.452667814113597e-05, "loss": 0.4208, "step": 642 }, { "epoch": 0.5973060845332094, "grad_norm": 0.2575075232080959, "learning_rate": 4.4509466437177284e-05, "loss": 0.4175, "step": 643 }, { "epoch": 0.5982350209010683, "grad_norm": 0.2592496028490341, "learning_rate": 4.449225473321859e-05, "loss": 0.4243, "step": 644 }, { "epoch": 0.5991639572689271, "grad_norm": 0.2518897193108137, "learning_rate": 4.44750430292599e-05, "loss": 0.4238, "step": 645 }, { "epoch": 0.6000928936367859, "grad_norm": 0.27689384611411444, "learning_rate": 4.4457831325301206e-05, "loss": 0.4145, "step": 646 }, { "epoch": 0.6010218300046447, "grad_norm": 0.25485645505304355, "learning_rate": 4.444061962134252e-05, "loss": 0.4201, "step": 647 }, { "epoch": 0.6019507663725034, "grad_norm": 0.27128573378890863, "learning_rate": 4.442340791738382e-05, "loss": 0.4316, "step": 648 }, { "epoch": 0.6028797027403623, "grad_norm": 0.2578065387198813, "learning_rate": 4.4406196213425134e-05, "loss": 0.4193, "step": 649 }, { "epoch": 0.6038086391082211, "grad_norm": 0.2667247647777713, "learning_rate": 4.438898450946644e-05, "loss": 0.3986, "step": 650 }, { "epoch": 0.6047375754760799, "grad_norm": 0.26585574043938304, "learning_rate": 4.437177280550775e-05, "loss": 0.4433, "step": 651 }, { "epoch": 0.6056665118439387, "grad_norm": 0.29551030914401943, "learning_rate": 4.4354561101549056e-05, "loss": 0.4461, "step": 652 }, { "epoch": 0.6065954482117974, "grad_norm": 0.2841049021245725, "learning_rate": 4.433734939759036e-05, "loss": 0.4071, "step": 653 }, { "epoch": 0.6075243845796563, "grad_norm": 0.30886770292110666, "learning_rate": 4.4320137693631666e-05, "loss": 0.4544, "step": 654 }, { "epoch": 0.6084533209475151, "grad_norm": 0.2686093425730839, "learning_rate": 4.430292598967298e-05, "loss": 0.407, "step": 655 }, { "epoch": 0.6093822573153739, "grad_norm": 0.30412706468771006, "learning_rate": 4.428571428571428e-05, "loss": 0.3915, "step": 656 }, { "epoch": 0.6103111936832327, "grad_norm": 0.35364500741842214, "learning_rate": 4.4268502581755595e-05, "loss": 0.453, "step": 657 }, { "epoch": 0.6112401300510915, "grad_norm": 0.26460657385536474, "learning_rate": 4.42512908777969e-05, "loss": 0.3909, "step": 658 }, { "epoch": 0.6121690664189503, "grad_norm": 0.4428486110797581, "learning_rate": 4.423407917383821e-05, "loss": 0.4269, "step": 659 }, { "epoch": 0.6130980027868091, "grad_norm": 0.2866328626559702, "learning_rate": 4.4216867469879516e-05, "loss": 0.4204, "step": 660 }, { "epoch": 0.6140269391546679, "grad_norm": 0.36462567528091633, "learning_rate": 4.419965576592083e-05, "loss": 0.4353, "step": 661 }, { "epoch": 0.6149558755225267, "grad_norm": 0.3087527803770854, "learning_rate": 4.418244406196213e-05, "loss": 0.4204, "step": 662 }, { "epoch": 0.6158848118903855, "grad_norm": 0.3103535653539977, "learning_rate": 4.4165232358003445e-05, "loss": 0.4344, "step": 663 }, { "epoch": 0.6168137482582443, "grad_norm": 0.28523746975783004, "learning_rate": 4.414802065404475e-05, "loss": 0.4163, "step": 664 }, { "epoch": 0.6177426846261032, "grad_norm": 0.2842390326622571, "learning_rate": 4.413080895008606e-05, "loss": 0.408, "step": 665 }, { "epoch": 0.6186716209939619, "grad_norm": 0.23234413274784502, "learning_rate": 4.411359724612737e-05, "loss": 0.4194, "step": 666 }, { "epoch": 0.6196005573618207, "grad_norm": 0.275004251162571, "learning_rate": 4.409638554216868e-05, "loss": 0.4323, "step": 667 }, { "epoch": 0.6205294937296795, "grad_norm": 0.31847768258839976, "learning_rate": 4.4079173838209984e-05, "loss": 0.4268, "step": 668 }, { "epoch": 0.6214584300975383, "grad_norm": 0.23406432898908414, "learning_rate": 4.4061962134251295e-05, "loss": 0.4137, "step": 669 }, { "epoch": 0.6223873664653972, "grad_norm": 0.3093054020461385, "learning_rate": 4.40447504302926e-05, "loss": 0.3857, "step": 670 }, { "epoch": 0.6233163028332559, "grad_norm": 0.25338402995679804, "learning_rate": 4.402753872633391e-05, "loss": 0.4007, "step": 671 }, { "epoch": 0.6242452392011147, "grad_norm": 0.2935153845414147, "learning_rate": 4.401032702237522e-05, "loss": 0.4253, "step": 672 }, { "epoch": 0.6251741755689735, "grad_norm": 0.3257007027536486, "learning_rate": 4.399311531841653e-05, "loss": 0.4573, "step": 673 }, { "epoch": 0.6261031119368323, "grad_norm": 0.29349741747875213, "learning_rate": 4.3975903614457834e-05, "loss": 0.4016, "step": 674 }, { "epoch": 0.6270320483046912, "grad_norm": 0.3229849235540013, "learning_rate": 4.395869191049914e-05, "loss": 0.4272, "step": 675 }, { "epoch": 0.6279609846725499, "grad_norm": 0.36116533623931474, "learning_rate": 4.3941480206540444e-05, "loss": 0.4144, "step": 676 }, { "epoch": 0.6288899210404088, "grad_norm": 0.2563978464326882, "learning_rate": 4.3924268502581756e-05, "loss": 0.4194, "step": 677 }, { "epoch": 0.6298188574082675, "grad_norm": 0.3196063755723246, "learning_rate": 4.390705679862306e-05, "loss": 0.393, "step": 678 }, { "epoch": 0.6307477937761263, "grad_norm": 0.35436425630405965, "learning_rate": 4.388984509466437e-05, "loss": 0.4171, "step": 679 }, { "epoch": 0.6316767301439852, "grad_norm": 0.30135253821167374, "learning_rate": 4.387263339070568e-05, "loss": 0.4256, "step": 680 }, { "epoch": 0.6326056665118439, "grad_norm": 0.30674680549750183, "learning_rate": 4.385542168674699e-05, "loss": 0.3959, "step": 681 }, { "epoch": 0.6335346028797028, "grad_norm": 0.32187516875469857, "learning_rate": 4.3838209982788294e-05, "loss": 0.419, "step": 682 }, { "epoch": 0.6344635392475615, "grad_norm": 0.2896684222699531, "learning_rate": 4.3820998278829606e-05, "loss": 0.4324, "step": 683 }, { "epoch": 0.6353924756154203, "grad_norm": 0.2799876389868912, "learning_rate": 4.380378657487091e-05, "loss": 0.4238, "step": 684 }, { "epoch": 0.6363214119832792, "grad_norm": 0.26251094611425313, "learning_rate": 4.378657487091222e-05, "loss": 0.4537, "step": 685 }, { "epoch": 0.6372503483511379, "grad_norm": 0.24365180702477157, "learning_rate": 4.376936316695353e-05, "loss": 0.4201, "step": 686 }, { "epoch": 0.6381792847189968, "grad_norm": 0.34911025101281234, "learning_rate": 4.375215146299484e-05, "loss": 0.4591, "step": 687 }, { "epoch": 0.6391082210868555, "grad_norm": 0.25889134972689726, "learning_rate": 4.3734939759036145e-05, "loss": 0.4272, "step": 688 }, { "epoch": 0.6400371574547143, "grad_norm": 0.28475140108094354, "learning_rate": 4.371772805507746e-05, "loss": 0.4267, "step": 689 }, { "epoch": 0.6409660938225732, "grad_norm": 0.3161029094501537, "learning_rate": 4.370051635111876e-05, "loss": 0.4418, "step": 690 }, { "epoch": 0.6418950301904319, "grad_norm": 0.2396023845575197, "learning_rate": 4.3683304647160073e-05, "loss": 0.4421, "step": 691 }, { "epoch": 0.6428239665582908, "grad_norm": 0.2832054932462056, "learning_rate": 4.366609294320138e-05, "loss": 0.4091, "step": 692 }, { "epoch": 0.6437529029261495, "grad_norm": 0.28376936431369904, "learning_rate": 4.364888123924269e-05, "loss": 0.3974, "step": 693 }, { "epoch": 0.6446818392940084, "grad_norm": 0.24743406449667463, "learning_rate": 4.3631669535283995e-05, "loss": 0.411, "step": 694 }, { "epoch": 0.6456107756618672, "grad_norm": 0.2730246000748479, "learning_rate": 4.361445783132531e-05, "loss": 0.4277, "step": 695 }, { "epoch": 0.6465397120297259, "grad_norm": 0.2650695341327053, "learning_rate": 4.359724612736661e-05, "loss": 0.3944, "step": 696 }, { "epoch": 0.6474686483975848, "grad_norm": 0.321320466947324, "learning_rate": 4.358003442340792e-05, "loss": 0.4464, "step": 697 }, { "epoch": 0.6483975847654435, "grad_norm": 0.3178561485682785, "learning_rate": 4.356282271944923e-05, "loss": 0.4503, "step": 698 }, { "epoch": 0.6493265211333024, "grad_norm": 0.25967628951798444, "learning_rate": 4.3545611015490534e-05, "loss": 0.4225, "step": 699 }, { "epoch": 0.6502554575011612, "grad_norm": 0.2895967588454303, "learning_rate": 4.3528399311531846e-05, "loss": 0.4181, "step": 700 }, { "epoch": 0.6511843938690199, "grad_norm": 0.278239598528544, "learning_rate": 4.351118760757315e-05, "loss": 0.4196, "step": 701 }, { "epoch": 0.6521133302368788, "grad_norm": 0.24741676508319585, "learning_rate": 4.3493975903614456e-05, "loss": 0.4183, "step": 702 }, { "epoch": 0.6530422666047375, "grad_norm": 0.23960910227402535, "learning_rate": 4.347676419965577e-05, "loss": 0.4154, "step": 703 }, { "epoch": 0.6539712029725964, "grad_norm": 0.30614533128558064, "learning_rate": 4.345955249569707e-05, "loss": 0.4132, "step": 704 }, { "epoch": 0.6549001393404552, "grad_norm": 0.2690126725576619, "learning_rate": 4.3442340791738384e-05, "loss": 0.4252, "step": 705 }, { "epoch": 0.655829075708314, "grad_norm": 0.2849912129415421, "learning_rate": 4.342512908777969e-05, "loss": 0.4179, "step": 706 }, { "epoch": 0.6567580120761728, "grad_norm": 0.2480212482763167, "learning_rate": 4.3407917383821e-05, "loss": 0.429, "step": 707 }, { "epoch": 0.6576869484440316, "grad_norm": 0.2804131416188235, "learning_rate": 4.3390705679862306e-05, "loss": 0.4068, "step": 708 }, { "epoch": 0.6586158848118904, "grad_norm": 0.2768683400232619, "learning_rate": 4.337349397590362e-05, "loss": 0.423, "step": 709 }, { "epoch": 0.6595448211797492, "grad_norm": 0.24148977148375103, "learning_rate": 4.335628227194492e-05, "loss": 0.4322, "step": 710 }, { "epoch": 0.660473757547608, "grad_norm": 0.2405167807911801, "learning_rate": 4.3339070567986235e-05, "loss": 0.4046, "step": 711 }, { "epoch": 0.6614026939154668, "grad_norm": 0.22245312753439328, "learning_rate": 4.332185886402754e-05, "loss": 0.4126, "step": 712 }, { "epoch": 0.6623316302833256, "grad_norm": 0.2669023790429907, "learning_rate": 4.330464716006885e-05, "loss": 0.4338, "step": 713 }, { "epoch": 0.6632605666511844, "grad_norm": 0.2521812276262092, "learning_rate": 4.3287435456110156e-05, "loss": 0.4448, "step": 714 }, { "epoch": 0.6641895030190432, "grad_norm": 0.2625757212246767, "learning_rate": 4.327022375215147e-05, "loss": 0.414, "step": 715 }, { "epoch": 0.665118439386902, "grad_norm": 0.25033293810292023, "learning_rate": 4.325301204819277e-05, "loss": 0.423, "step": 716 }, { "epoch": 0.6660473757547608, "grad_norm": 0.22334251988732523, "learning_rate": 4.323580034423408e-05, "loss": 0.4237, "step": 717 }, { "epoch": 0.6669763121226197, "grad_norm": 0.2397245440218477, "learning_rate": 4.321858864027539e-05, "loss": 0.4137, "step": 718 }, { "epoch": 0.6679052484904784, "grad_norm": 0.30143292571229535, "learning_rate": 4.3201376936316695e-05, "loss": 0.3724, "step": 719 }, { "epoch": 0.6688341848583372, "grad_norm": 0.23079068591492524, "learning_rate": 4.318416523235801e-05, "loss": 0.4007, "step": 720 }, { "epoch": 0.669763121226196, "grad_norm": 0.30620216556806684, "learning_rate": 4.316695352839931e-05, "loss": 0.4384, "step": 721 }, { "epoch": 0.6706920575940548, "grad_norm": 0.29486394422042084, "learning_rate": 4.3149741824440624e-05, "loss": 0.4046, "step": 722 }, { "epoch": 0.6716209939619137, "grad_norm": 0.3305581175194426, "learning_rate": 4.313253012048193e-05, "loss": 0.4242, "step": 723 }, { "epoch": 0.6725499303297724, "grad_norm": 0.2679330466442912, "learning_rate": 4.311531841652324e-05, "loss": 0.4257, "step": 724 }, { "epoch": 0.6734788666976312, "grad_norm": 0.35811562362234445, "learning_rate": 4.3098106712564545e-05, "loss": 0.4384, "step": 725 }, { "epoch": 0.67440780306549, "grad_norm": 0.29334272663787064, "learning_rate": 4.308089500860585e-05, "loss": 0.4444, "step": 726 }, { "epoch": 0.6753367394333488, "grad_norm": 0.24401338629880093, "learning_rate": 4.306368330464716e-05, "loss": 0.4193, "step": 727 }, { "epoch": 0.6762656758012077, "grad_norm": 0.28052429245661187, "learning_rate": 4.304647160068847e-05, "loss": 0.4088, "step": 728 }, { "epoch": 0.6771946121690664, "grad_norm": 0.25925872113831155, "learning_rate": 4.302925989672978e-05, "loss": 0.4073, "step": 729 }, { "epoch": 0.6781235485369252, "grad_norm": 0.2736489940435994, "learning_rate": 4.3012048192771084e-05, "loss": 0.4237, "step": 730 }, { "epoch": 0.679052484904784, "grad_norm": 0.25905656215957673, "learning_rate": 4.2994836488812396e-05, "loss": 0.4162, "step": 731 }, { "epoch": 0.6799814212726428, "grad_norm": 0.24390493228872448, "learning_rate": 4.29776247848537e-05, "loss": 0.4307, "step": 732 }, { "epoch": 0.6809103576405017, "grad_norm": 0.24108332455669904, "learning_rate": 4.296041308089501e-05, "loss": 0.4408, "step": 733 }, { "epoch": 0.6818392940083604, "grad_norm": 0.23861930125640354, "learning_rate": 4.294320137693632e-05, "loss": 0.4097, "step": 734 }, { "epoch": 0.6827682303762193, "grad_norm": 0.2342264218537004, "learning_rate": 4.292598967297763e-05, "loss": 0.4006, "step": 735 }, { "epoch": 0.683697166744078, "grad_norm": 0.2534426133410622, "learning_rate": 4.2908777969018934e-05, "loss": 0.4127, "step": 736 }, { "epoch": 0.6846261031119368, "grad_norm": 0.2711574107583699, "learning_rate": 4.2891566265060246e-05, "loss": 0.4334, "step": 737 }, { "epoch": 0.6855550394797957, "grad_norm": 0.31772185693408067, "learning_rate": 4.287435456110155e-05, "loss": 0.4187, "step": 738 }, { "epoch": 0.6864839758476544, "grad_norm": 0.30536491271045085, "learning_rate": 4.2857142857142856e-05, "loss": 0.4215, "step": 739 }, { "epoch": 0.6874129122155133, "grad_norm": 0.33006635156656183, "learning_rate": 4.283993115318417e-05, "loss": 0.405, "step": 740 }, { "epoch": 0.688341848583372, "grad_norm": 0.2964099136976024, "learning_rate": 4.282271944922547e-05, "loss": 0.419, "step": 741 }, { "epoch": 0.6892707849512308, "grad_norm": 0.2931220669174559, "learning_rate": 4.2805507745266785e-05, "loss": 0.4422, "step": 742 }, { "epoch": 0.6901997213190897, "grad_norm": 0.2983381792229504, "learning_rate": 4.278829604130809e-05, "loss": 0.4411, "step": 743 }, { "epoch": 0.6911286576869484, "grad_norm": 0.2648675526907866, "learning_rate": 4.27710843373494e-05, "loss": 0.4429, "step": 744 }, { "epoch": 0.6920575940548073, "grad_norm": 0.2602456162921885, "learning_rate": 4.275387263339071e-05, "loss": 0.4198, "step": 745 }, { "epoch": 0.692986530422666, "grad_norm": 0.23353204937477282, "learning_rate": 4.273666092943202e-05, "loss": 0.4028, "step": 746 }, { "epoch": 0.6939154667905248, "grad_norm": 0.2991360217695013, "learning_rate": 4.2719449225473323e-05, "loss": 0.4143, "step": 747 }, { "epoch": 0.6948444031583837, "grad_norm": 0.25702159816645936, "learning_rate": 4.2702237521514635e-05, "loss": 0.4213, "step": 748 }, { "epoch": 0.6957733395262424, "grad_norm": 0.2838721908179337, "learning_rate": 4.268502581755594e-05, "loss": 0.4387, "step": 749 }, { "epoch": 0.6967022758941013, "grad_norm": 0.24149559875694418, "learning_rate": 4.2667814113597245e-05, "loss": 0.4184, "step": 750 }, { "epoch": 0.69763121226196, "grad_norm": 0.2952086958075859, "learning_rate": 4.265060240963856e-05, "loss": 0.4167, "step": 751 }, { "epoch": 0.6985601486298189, "grad_norm": 0.23972843797863336, "learning_rate": 4.263339070567986e-05, "loss": 0.4141, "step": 752 }, { "epoch": 0.6994890849976777, "grad_norm": 0.2746023756298842, "learning_rate": 4.2616179001721174e-05, "loss": 0.4364, "step": 753 }, { "epoch": 0.7004180213655364, "grad_norm": 0.29765586671383076, "learning_rate": 4.259896729776248e-05, "loss": 0.4089, "step": 754 }, { "epoch": 0.7013469577333953, "grad_norm": 0.21800684863628034, "learning_rate": 4.258175559380379e-05, "loss": 0.4143, "step": 755 }, { "epoch": 0.702275894101254, "grad_norm": 0.28610650003068566, "learning_rate": 4.2564543889845096e-05, "loss": 0.4138, "step": 756 }, { "epoch": 0.7032048304691129, "grad_norm": 0.2640665130452579, "learning_rate": 4.254733218588641e-05, "loss": 0.4549, "step": 757 }, { "epoch": 0.7041337668369717, "grad_norm": 0.25026626980377065, "learning_rate": 4.253012048192771e-05, "loss": 0.4038, "step": 758 }, { "epoch": 0.7050627032048304, "grad_norm": 0.275452109464895, "learning_rate": 4.251290877796902e-05, "loss": 0.4222, "step": 759 }, { "epoch": 0.7059916395726893, "grad_norm": 0.25082813720469543, "learning_rate": 4.249569707401033e-05, "loss": 0.414, "step": 760 }, { "epoch": 0.706920575940548, "grad_norm": 0.2607187023425719, "learning_rate": 4.2478485370051634e-05, "loss": 0.4186, "step": 761 }, { "epoch": 0.7078495123084069, "grad_norm": 0.2628577880876619, "learning_rate": 4.2461273666092946e-05, "loss": 0.4313, "step": 762 }, { "epoch": 0.7087784486762657, "grad_norm": 0.256368283425763, "learning_rate": 4.244406196213425e-05, "loss": 0.4371, "step": 763 }, { "epoch": 0.7097073850441245, "grad_norm": 0.27295950833164523, "learning_rate": 4.242685025817556e-05, "loss": 0.4224, "step": 764 }, { "epoch": 0.7106363214119833, "grad_norm": 0.2788051820721469, "learning_rate": 4.240963855421687e-05, "loss": 0.4261, "step": 765 }, { "epoch": 0.711565257779842, "grad_norm": 0.2966812172764707, "learning_rate": 4.239242685025818e-05, "loss": 0.4622, "step": 766 }, { "epoch": 0.7124941941477009, "grad_norm": 0.26841840271189216, "learning_rate": 4.2375215146299485e-05, "loss": 0.4065, "step": 767 }, { "epoch": 0.7134231305155597, "grad_norm": 0.262959590522867, "learning_rate": 4.2358003442340797e-05, "loss": 0.3935, "step": 768 }, { "epoch": 0.7143520668834185, "grad_norm": 0.24938352992988017, "learning_rate": 4.23407917383821e-05, "loss": 0.4231, "step": 769 }, { "epoch": 0.7152810032512773, "grad_norm": 0.29239937087989554, "learning_rate": 4.232358003442341e-05, "loss": 0.4064, "step": 770 }, { "epoch": 0.716209939619136, "grad_norm": 0.23536282870494213, "learning_rate": 4.230636833046472e-05, "loss": 0.402, "step": 771 }, { "epoch": 0.7171388759869949, "grad_norm": 0.2714220152937965, "learning_rate": 4.228915662650603e-05, "loss": 0.3936, "step": 772 }, { "epoch": 0.7180678123548537, "grad_norm": 0.2747257504638425, "learning_rate": 4.2271944922547335e-05, "loss": 0.4163, "step": 773 }, { "epoch": 0.7189967487227125, "grad_norm": 0.2723713156748802, "learning_rate": 4.225473321858865e-05, "loss": 0.4066, "step": 774 }, { "epoch": 0.7199256850905713, "grad_norm": 0.2557791781141218, "learning_rate": 4.223752151462995e-05, "loss": 0.4087, "step": 775 }, { "epoch": 0.72085462145843, "grad_norm": 0.23079182269430176, "learning_rate": 4.222030981067126e-05, "loss": 0.3846, "step": 776 }, { "epoch": 0.7217835578262889, "grad_norm": 0.28776961647603927, "learning_rate": 4.220309810671257e-05, "loss": 0.4035, "step": 777 }, { "epoch": 0.7227124941941477, "grad_norm": 0.25625135388254744, "learning_rate": 4.2185886402753874e-05, "loss": 0.4196, "step": 778 }, { "epoch": 0.7236414305620065, "grad_norm": 0.29595393487275606, "learning_rate": 4.2168674698795186e-05, "loss": 0.4448, "step": 779 }, { "epoch": 0.7245703669298653, "grad_norm": 0.24841926756950633, "learning_rate": 4.215146299483649e-05, "loss": 0.4153, "step": 780 }, { "epoch": 0.725499303297724, "grad_norm": 0.28459325293580745, "learning_rate": 4.2134251290877796e-05, "loss": 0.444, "step": 781 }, { "epoch": 0.7264282396655829, "grad_norm": 0.26193828352352555, "learning_rate": 4.211703958691911e-05, "loss": 0.4302, "step": 782 }, { "epoch": 0.7273571760334417, "grad_norm": 0.2479500695456553, "learning_rate": 4.209982788296041e-05, "loss": 0.4073, "step": 783 }, { "epoch": 0.7282861124013005, "grad_norm": 0.28792928032141457, "learning_rate": 4.2082616179001724e-05, "loss": 0.4006, "step": 784 }, { "epoch": 0.7292150487691593, "grad_norm": 0.22643696354988968, "learning_rate": 4.206540447504303e-05, "loss": 0.4186, "step": 785 }, { "epoch": 0.7301439851370181, "grad_norm": 0.30992055940652685, "learning_rate": 4.204819277108434e-05, "loss": 0.4013, "step": 786 }, { "epoch": 0.7310729215048769, "grad_norm": 0.22983459619899754, "learning_rate": 4.2030981067125646e-05, "loss": 0.4181, "step": 787 }, { "epoch": 0.7320018578727358, "grad_norm": 0.25635099005397793, "learning_rate": 4.201376936316696e-05, "loss": 0.4099, "step": 788 }, { "epoch": 0.7329307942405945, "grad_norm": 0.2562041424665725, "learning_rate": 4.199655765920826e-05, "loss": 0.4108, "step": 789 }, { "epoch": 0.7338597306084533, "grad_norm": 0.25150699905041096, "learning_rate": 4.1979345955249575e-05, "loss": 0.4085, "step": 790 }, { "epoch": 0.7347886669763122, "grad_norm": 0.2714063520253769, "learning_rate": 4.196213425129088e-05, "loss": 0.4214, "step": 791 }, { "epoch": 0.7357176033441709, "grad_norm": 0.2665920594786692, "learning_rate": 4.194492254733219e-05, "loss": 0.4199, "step": 792 }, { "epoch": 0.7366465397120298, "grad_norm": 0.2671300207460533, "learning_rate": 4.1927710843373496e-05, "loss": 0.4161, "step": 793 }, { "epoch": 0.7375754760798885, "grad_norm": 0.27034859657353083, "learning_rate": 4.191049913941481e-05, "loss": 0.4271, "step": 794 }, { "epoch": 0.7385044124477473, "grad_norm": 0.26690699071818325, "learning_rate": 4.189328743545611e-05, "loss": 0.4192, "step": 795 }, { "epoch": 0.7394333488156062, "grad_norm": 0.3051440328850412, "learning_rate": 4.1876075731497425e-05, "loss": 0.4132, "step": 796 }, { "epoch": 0.7403622851834649, "grad_norm": 0.21974143916170535, "learning_rate": 4.185886402753873e-05, "loss": 0.3979, "step": 797 }, { "epoch": 0.7412912215513238, "grad_norm": 0.24983461807683474, "learning_rate": 4.184165232358004e-05, "loss": 0.4079, "step": 798 }, { "epoch": 0.7422201579191825, "grad_norm": 0.24224059591231967, "learning_rate": 4.182444061962135e-05, "loss": 0.4032, "step": 799 }, { "epoch": 0.7431490942870413, "grad_norm": 0.2502956003038763, "learning_rate": 4.180722891566265e-05, "loss": 0.409, "step": 800 }, { "epoch": 0.7440780306549002, "grad_norm": 0.24589196452215084, "learning_rate": 4.1790017211703964e-05, "loss": 0.4111, "step": 801 }, { "epoch": 0.7450069670227589, "grad_norm": 0.22442063047390312, "learning_rate": 4.177280550774527e-05, "loss": 0.4212, "step": 802 }, { "epoch": 0.7459359033906178, "grad_norm": 0.2360026228130957, "learning_rate": 4.1755593803786574e-05, "loss": 0.3978, "step": 803 }, { "epoch": 0.7468648397584765, "grad_norm": 0.26711637554464907, "learning_rate": 4.1738382099827885e-05, "loss": 0.4028, "step": 804 }, { "epoch": 0.7477937761263354, "grad_norm": 0.2956104037100123, "learning_rate": 4.172117039586919e-05, "loss": 0.417, "step": 805 }, { "epoch": 0.7487227124941942, "grad_norm": 0.2764502588713264, "learning_rate": 4.17039586919105e-05, "loss": 0.4075, "step": 806 }, { "epoch": 0.7496516488620529, "grad_norm": 0.26416939396871475, "learning_rate": 4.168674698795181e-05, "loss": 0.4306, "step": 807 }, { "epoch": 0.7505805852299118, "grad_norm": 0.29657741806318, "learning_rate": 4.166953528399312e-05, "loss": 0.4213, "step": 808 }, { "epoch": 0.7515095215977705, "grad_norm": 0.262465514495409, "learning_rate": 4.1652323580034424e-05, "loss": 0.4211, "step": 809 }, { "epoch": 0.7524384579656294, "grad_norm": 0.29476951737672735, "learning_rate": 4.1635111876075736e-05, "loss": 0.4163, "step": 810 }, { "epoch": 0.7533673943334882, "grad_norm": 0.25271095485048567, "learning_rate": 4.161790017211704e-05, "loss": 0.4064, "step": 811 }, { "epoch": 0.7542963307013469, "grad_norm": 0.30865411128822723, "learning_rate": 4.160068846815835e-05, "loss": 0.4151, "step": 812 }, { "epoch": 0.7552252670692058, "grad_norm": 0.24802250876527565, "learning_rate": 4.158347676419966e-05, "loss": 0.4445, "step": 813 }, { "epoch": 0.7561542034370645, "grad_norm": 0.3701424649307489, "learning_rate": 4.156626506024097e-05, "loss": 0.4183, "step": 814 }, { "epoch": 0.7570831398049234, "grad_norm": 0.2337360342099123, "learning_rate": 4.1549053356282274e-05, "loss": 0.3858, "step": 815 }, { "epoch": 0.7580120761727822, "grad_norm": 0.2910522135380579, "learning_rate": 4.1531841652323586e-05, "loss": 0.4122, "step": 816 }, { "epoch": 0.758941012540641, "grad_norm": 0.236795985808718, "learning_rate": 4.151462994836489e-05, "loss": 0.4322, "step": 817 }, { "epoch": 0.7598699489084998, "grad_norm": 0.28675347251155037, "learning_rate": 4.14974182444062e-05, "loss": 0.411, "step": 818 }, { "epoch": 0.7607988852763585, "grad_norm": 0.24135506233109558, "learning_rate": 4.148020654044751e-05, "loss": 0.4033, "step": 819 }, { "epoch": 0.7617278216442174, "grad_norm": 0.29903357931708696, "learning_rate": 4.146299483648882e-05, "loss": 0.4028, "step": 820 }, { "epoch": 0.7626567580120762, "grad_norm": 0.2666535342900728, "learning_rate": 4.1445783132530125e-05, "loss": 0.4064, "step": 821 }, { "epoch": 0.763585694379935, "grad_norm": 0.33097109067206043, "learning_rate": 4.1428571428571437e-05, "loss": 0.4167, "step": 822 }, { "epoch": 0.7645146307477938, "grad_norm": 0.27705874830384963, "learning_rate": 4.1411359724612735e-05, "loss": 0.4317, "step": 823 }, { "epoch": 0.7654435671156525, "grad_norm": 0.27006115913006945, "learning_rate": 4.1394148020654047e-05, "loss": 0.4011, "step": 824 }, { "epoch": 0.7663725034835114, "grad_norm": 0.2191843080256796, "learning_rate": 4.137693631669535e-05, "loss": 0.3877, "step": 825 }, { "epoch": 0.7673014398513702, "grad_norm": 0.26765533335243286, "learning_rate": 4.135972461273666e-05, "loss": 0.416, "step": 826 }, { "epoch": 0.768230376219229, "grad_norm": 0.28444860627784857, "learning_rate": 4.134251290877797e-05, "loss": 0.4142, "step": 827 }, { "epoch": 0.7691593125870878, "grad_norm": 0.2590581112218012, "learning_rate": 4.132530120481928e-05, "loss": 0.455, "step": 828 }, { "epoch": 0.7700882489549465, "grad_norm": 0.28511612916261636, "learning_rate": 4.1308089500860585e-05, "loss": 0.4439, "step": 829 }, { "epoch": 0.7710171853228054, "grad_norm": 0.2724311438434457, "learning_rate": 4.12908777969019e-05, "loss": 0.4105, "step": 830 }, { "epoch": 0.7719461216906642, "grad_norm": 0.23702064984429358, "learning_rate": 4.12736660929432e-05, "loss": 0.4136, "step": 831 }, { "epoch": 0.772875058058523, "grad_norm": 0.2892255007964407, "learning_rate": 4.1256454388984514e-05, "loss": 0.4089, "step": 832 }, { "epoch": 0.7738039944263818, "grad_norm": 0.24990575367096712, "learning_rate": 4.123924268502582e-05, "loss": 0.443, "step": 833 }, { "epoch": 0.7747329307942405, "grad_norm": 0.3184046525164543, "learning_rate": 4.122203098106713e-05, "loss": 0.4318, "step": 834 }, { "epoch": 0.7756618671620994, "grad_norm": 0.2673099414060095, "learning_rate": 4.1204819277108436e-05, "loss": 0.4013, "step": 835 }, { "epoch": 0.7765908035299582, "grad_norm": 0.2925594820844854, "learning_rate": 4.118760757314975e-05, "loss": 0.4271, "step": 836 }, { "epoch": 0.777519739897817, "grad_norm": 0.22953985012166775, "learning_rate": 4.117039586919105e-05, "loss": 0.3745, "step": 837 }, { "epoch": 0.7784486762656758, "grad_norm": 0.33609789574689314, "learning_rate": 4.1153184165232364e-05, "loss": 0.4037, "step": 838 }, { "epoch": 0.7793776126335346, "grad_norm": 0.24182783755842846, "learning_rate": 4.113597246127367e-05, "loss": 0.3996, "step": 839 }, { "epoch": 0.7803065490013934, "grad_norm": 0.24661825820680278, "learning_rate": 4.111876075731498e-05, "loss": 0.4, "step": 840 }, { "epoch": 0.7812354853692522, "grad_norm": 0.2676051683765745, "learning_rate": 4.1101549053356286e-05, "loss": 0.4341, "step": 841 }, { "epoch": 0.782164421737111, "grad_norm": 0.2302592197123923, "learning_rate": 4.10843373493976e-05, "loss": 0.3956, "step": 842 }, { "epoch": 0.7830933581049698, "grad_norm": 0.26234103881814524, "learning_rate": 4.10671256454389e-05, "loss": 0.4011, "step": 843 }, { "epoch": 0.7840222944728286, "grad_norm": 0.2801794293079212, "learning_rate": 4.104991394148021e-05, "loss": 0.4329, "step": 844 }, { "epoch": 0.7849512308406874, "grad_norm": 0.23397262641748529, "learning_rate": 4.103270223752151e-05, "loss": 0.3709, "step": 845 }, { "epoch": 0.7858801672085463, "grad_norm": 0.3544703249382208, "learning_rate": 4.1015490533562825e-05, "loss": 0.4171, "step": 846 }, { "epoch": 0.786809103576405, "grad_norm": 0.2722275854852537, "learning_rate": 4.099827882960413e-05, "loss": 0.4337, "step": 847 }, { "epoch": 0.7877380399442638, "grad_norm": 0.26553648559179016, "learning_rate": 4.098106712564544e-05, "loss": 0.4173, "step": 848 }, { "epoch": 0.7886669763121226, "grad_norm": 0.2678696587952733, "learning_rate": 4.0963855421686746e-05, "loss": 0.4111, "step": 849 }, { "epoch": 0.7895959126799814, "grad_norm": 0.2657158376139536, "learning_rate": 4.094664371772806e-05, "loss": 0.4126, "step": 850 }, { "epoch": 0.7905248490478403, "grad_norm": 0.287284238795583, "learning_rate": 4.092943201376936e-05, "loss": 0.4207, "step": 851 }, { "epoch": 0.791453785415699, "grad_norm": 0.2754176953959211, "learning_rate": 4.0912220309810675e-05, "loss": 0.4259, "step": 852 }, { "epoch": 0.7923827217835578, "grad_norm": 0.262107512607669, "learning_rate": 4.089500860585198e-05, "loss": 0.4127, "step": 853 }, { "epoch": 0.7933116581514166, "grad_norm": 0.24162465041652548, "learning_rate": 4.087779690189329e-05, "loss": 0.4009, "step": 854 }, { "epoch": 0.7942405945192754, "grad_norm": 0.266448063011791, "learning_rate": 4.08605851979346e-05, "loss": 0.4271, "step": 855 }, { "epoch": 0.7951695308871343, "grad_norm": 0.24051279422477578, "learning_rate": 4.084337349397591e-05, "loss": 0.4089, "step": 856 }, { "epoch": 0.796098467254993, "grad_norm": 0.2729628116438206, "learning_rate": 4.0826161790017214e-05, "loss": 0.4167, "step": 857 }, { "epoch": 0.7970274036228518, "grad_norm": 0.23410719385943532, "learning_rate": 4.0808950086058525e-05, "loss": 0.4179, "step": 858 }, { "epoch": 0.7979563399907106, "grad_norm": 0.30219591202715035, "learning_rate": 4.079173838209983e-05, "loss": 0.4138, "step": 859 }, { "epoch": 0.7988852763585694, "grad_norm": 0.23985576065241604, "learning_rate": 4.077452667814114e-05, "loss": 0.4075, "step": 860 }, { "epoch": 0.7998142127264283, "grad_norm": 0.25029357115071, "learning_rate": 4.075731497418245e-05, "loss": 0.4118, "step": 861 }, { "epoch": 0.800743149094287, "grad_norm": 0.29345741652304597, "learning_rate": 4.074010327022376e-05, "loss": 0.397, "step": 862 }, { "epoch": 0.8016720854621459, "grad_norm": 0.24385532620239805, "learning_rate": 4.0722891566265064e-05, "loss": 0.4, "step": 863 }, { "epoch": 0.8026010218300046, "grad_norm": 0.2637648833416592, "learning_rate": 4.0705679862306376e-05, "loss": 0.4219, "step": 864 }, { "epoch": 0.8035299581978634, "grad_norm": 0.25527572185866787, "learning_rate": 4.0688468158347674e-05, "loss": 0.4007, "step": 865 }, { "epoch": 0.8044588945657223, "grad_norm": 0.2513201236957386, "learning_rate": 4.0671256454388986e-05, "loss": 0.3955, "step": 866 }, { "epoch": 0.805387830933581, "grad_norm": 0.25660774924232826, "learning_rate": 4.065404475043029e-05, "loss": 0.3787, "step": 867 }, { "epoch": 0.8063167673014399, "grad_norm": 0.26206437729581916, "learning_rate": 4.06368330464716e-05, "loss": 0.4196, "step": 868 }, { "epoch": 0.8072457036692986, "grad_norm": 0.24835895191817717, "learning_rate": 4.061962134251291e-05, "loss": 0.3994, "step": 869 }, { "epoch": 0.8081746400371574, "grad_norm": 0.26426141442192896, "learning_rate": 4.060240963855422e-05, "loss": 0.4254, "step": 870 }, { "epoch": 0.8091035764050163, "grad_norm": 0.24819792871771582, "learning_rate": 4.0585197934595524e-05, "loss": 0.4124, "step": 871 }, { "epoch": 0.810032512772875, "grad_norm": 0.2444630039170481, "learning_rate": 4.0567986230636836e-05, "loss": 0.4374, "step": 872 }, { "epoch": 0.8109614491407339, "grad_norm": 0.25882143197020263, "learning_rate": 4.055077452667814e-05, "loss": 0.4224, "step": 873 }, { "epoch": 0.8118903855085927, "grad_norm": 0.22956628354423742, "learning_rate": 4.053356282271945e-05, "loss": 0.4014, "step": 874 }, { "epoch": 0.8128193218764515, "grad_norm": 0.264956195080769, "learning_rate": 4.051635111876076e-05, "loss": 0.3938, "step": 875 }, { "epoch": 0.8137482582443103, "grad_norm": 0.24985343436278015, "learning_rate": 4.049913941480207e-05, "loss": 0.4223, "step": 876 }, { "epoch": 0.814677194612169, "grad_norm": 0.2864398731906139, "learning_rate": 4.0481927710843375e-05, "loss": 0.3934, "step": 877 }, { "epoch": 0.8156061309800279, "grad_norm": 0.24292187852748645, "learning_rate": 4.046471600688469e-05, "loss": 0.3944, "step": 878 }, { "epoch": 0.8165350673478867, "grad_norm": 0.2860359553745768, "learning_rate": 4.044750430292599e-05, "loss": 0.4127, "step": 879 }, { "epoch": 0.8174640037157455, "grad_norm": 0.29668643281017715, "learning_rate": 4.0430292598967303e-05, "loss": 0.3998, "step": 880 }, { "epoch": 0.8183929400836043, "grad_norm": 0.278439860888243, "learning_rate": 4.041308089500861e-05, "loss": 0.3744, "step": 881 }, { "epoch": 0.819321876451463, "grad_norm": 0.2763340216330821, "learning_rate": 4.039586919104992e-05, "loss": 0.4046, "step": 882 }, { "epoch": 0.8202508128193219, "grad_norm": 0.2578084168559668, "learning_rate": 4.0378657487091225e-05, "loss": 0.4144, "step": 883 }, { "epoch": 0.8211797491871807, "grad_norm": 0.2704411317528433, "learning_rate": 4.036144578313254e-05, "loss": 0.3938, "step": 884 }, { "epoch": 0.8221086855550395, "grad_norm": 0.2216930759924396, "learning_rate": 4.034423407917384e-05, "loss": 0.3976, "step": 885 }, { "epoch": 0.8230376219228983, "grad_norm": 0.23736907014407435, "learning_rate": 4.032702237521515e-05, "loss": 0.4032, "step": 886 }, { "epoch": 0.823966558290757, "grad_norm": 0.2751484069936401, "learning_rate": 4.030981067125645e-05, "loss": 0.4142, "step": 887 }, { "epoch": 0.8248954946586159, "grad_norm": 0.22573606403051907, "learning_rate": 4.0292598967297764e-05, "loss": 0.3922, "step": 888 }, { "epoch": 0.8258244310264747, "grad_norm": 0.2602795860148468, "learning_rate": 4.027538726333907e-05, "loss": 0.4139, "step": 889 }, { "epoch": 0.8267533673943335, "grad_norm": 0.24581668806400517, "learning_rate": 4.025817555938038e-05, "loss": 0.4173, "step": 890 }, { "epoch": 0.8276823037621923, "grad_norm": 0.23586598943899656, "learning_rate": 4.0240963855421686e-05, "loss": 0.397, "step": 891 }, { "epoch": 0.828611240130051, "grad_norm": 0.22698815324965912, "learning_rate": 4.0223752151463e-05, "loss": 0.397, "step": 892 }, { "epoch": 0.8295401764979099, "grad_norm": 0.251756454856816, "learning_rate": 4.02065404475043e-05, "loss": 0.43, "step": 893 }, { "epoch": 0.8304691128657687, "grad_norm": 0.23474799972973084, "learning_rate": 4.0189328743545614e-05, "loss": 0.4232, "step": 894 }, { "epoch": 0.8313980492336275, "grad_norm": 0.2284304060585405, "learning_rate": 4.017211703958692e-05, "loss": 0.4363, "step": 895 }, { "epoch": 0.8323269856014863, "grad_norm": 0.22189967592933565, "learning_rate": 4.015490533562823e-05, "loss": 0.3996, "step": 896 }, { "epoch": 0.8332559219693451, "grad_norm": 0.24727551944573103, "learning_rate": 4.0137693631669536e-05, "loss": 0.3881, "step": 897 }, { "epoch": 0.8341848583372039, "grad_norm": 0.26502357788364, "learning_rate": 4.012048192771085e-05, "loss": 0.4345, "step": 898 }, { "epoch": 0.8351137947050628, "grad_norm": 0.24506267621395414, "learning_rate": 4.010327022375215e-05, "loss": 0.408, "step": 899 }, { "epoch": 0.8360427310729215, "grad_norm": 0.2686607976921608, "learning_rate": 4.0086058519793465e-05, "loss": 0.4284, "step": 900 }, { "epoch": 0.8369716674407803, "grad_norm": 0.22658254041478237, "learning_rate": 4.006884681583477e-05, "loss": 0.4036, "step": 901 }, { "epoch": 0.8379006038086391, "grad_norm": 0.24269121548237235, "learning_rate": 4.005163511187608e-05, "loss": 0.4158, "step": 902 }, { "epoch": 0.8388295401764979, "grad_norm": 0.27624389649170866, "learning_rate": 4.0034423407917386e-05, "loss": 0.4067, "step": 903 }, { "epoch": 0.8397584765443568, "grad_norm": 0.24834115738955598, "learning_rate": 4.00172117039587e-05, "loss": 0.4068, "step": 904 }, { "epoch": 0.8406874129122155, "grad_norm": 0.25359081776709796, "learning_rate": 4e-05, "loss": 0.4013, "step": 905 }, { "epoch": 0.8416163492800743, "grad_norm": 0.2621593127454798, "learning_rate": 3.9982788296041315e-05, "loss": 0.4019, "step": 906 }, { "epoch": 0.8425452856479331, "grad_norm": 0.22773835872678272, "learning_rate": 3.996557659208262e-05, "loss": 0.3898, "step": 907 }, { "epoch": 0.8434742220157919, "grad_norm": 0.25454356735572764, "learning_rate": 3.9948364888123925e-05, "loss": 0.4041, "step": 908 }, { "epoch": 0.8444031583836508, "grad_norm": 0.24027047666859136, "learning_rate": 3.993115318416523e-05, "loss": 0.4158, "step": 909 }, { "epoch": 0.8453320947515095, "grad_norm": 0.2485921595211011, "learning_rate": 3.991394148020654e-05, "loss": 0.4101, "step": 910 }, { "epoch": 0.8462610311193683, "grad_norm": 0.22285216317499973, "learning_rate": 3.989672977624785e-05, "loss": 0.3969, "step": 911 }, { "epoch": 0.8471899674872271, "grad_norm": 0.2596638733050711, "learning_rate": 3.987951807228916e-05, "loss": 0.42, "step": 912 }, { "epoch": 0.8481189038550859, "grad_norm": 0.23550833352781333, "learning_rate": 3.9862306368330464e-05, "loss": 0.4136, "step": 913 }, { "epoch": 0.8490478402229448, "grad_norm": 0.24273175946950082, "learning_rate": 3.9845094664371775e-05, "loss": 0.4347, "step": 914 }, { "epoch": 0.8499767765908035, "grad_norm": 0.2571365495317302, "learning_rate": 3.982788296041308e-05, "loss": 0.4322, "step": 915 }, { "epoch": 0.8509057129586624, "grad_norm": 0.22272682206410346, "learning_rate": 3.981067125645439e-05, "loss": 0.382, "step": 916 }, { "epoch": 0.8518346493265211, "grad_norm": 0.22974977779029027, "learning_rate": 3.97934595524957e-05, "loss": 0.3874, "step": 917 }, { "epoch": 0.8527635856943799, "grad_norm": 0.23626085060684773, "learning_rate": 3.977624784853701e-05, "loss": 0.4205, "step": 918 }, { "epoch": 0.8536925220622388, "grad_norm": 0.2711925943505794, "learning_rate": 3.9759036144578314e-05, "loss": 0.4048, "step": 919 }, { "epoch": 0.8546214584300975, "grad_norm": 0.23373429510160257, "learning_rate": 3.9741824440619626e-05, "loss": 0.4183, "step": 920 }, { "epoch": 0.8555503947979564, "grad_norm": 0.269906941097679, "learning_rate": 3.972461273666093e-05, "loss": 0.4187, "step": 921 }, { "epoch": 0.8564793311658151, "grad_norm": 0.21569823380266548, "learning_rate": 3.970740103270224e-05, "loss": 0.3984, "step": 922 }, { "epoch": 0.8574082675336739, "grad_norm": 0.23805027630937817, "learning_rate": 3.969018932874355e-05, "loss": 0.4166, "step": 923 }, { "epoch": 0.8583372039015328, "grad_norm": 0.23069245503461422, "learning_rate": 3.967297762478486e-05, "loss": 0.4262, "step": 924 }, { "epoch": 0.8592661402693915, "grad_norm": 0.2428482973970677, "learning_rate": 3.9655765920826164e-05, "loss": 0.4168, "step": 925 }, { "epoch": 0.8601950766372504, "grad_norm": 0.2183870165831691, "learning_rate": 3.9638554216867476e-05, "loss": 0.4051, "step": 926 }, { "epoch": 0.8611240130051091, "grad_norm": 0.25786863909064817, "learning_rate": 3.962134251290878e-05, "loss": 0.4024, "step": 927 }, { "epoch": 0.862052949372968, "grad_norm": 0.21274681020345573, "learning_rate": 3.960413080895009e-05, "loss": 0.4056, "step": 928 }, { "epoch": 0.8629818857408268, "grad_norm": 0.2805951596096335, "learning_rate": 3.958691910499139e-05, "loss": 0.4251, "step": 929 }, { "epoch": 0.8639108221086855, "grad_norm": 0.22683405969287646, "learning_rate": 3.95697074010327e-05, "loss": 0.4217, "step": 930 }, { "epoch": 0.8648397584765444, "grad_norm": 0.26021009912177695, "learning_rate": 3.955249569707401e-05, "loss": 0.4219, "step": 931 }, { "epoch": 0.8657686948444031, "grad_norm": 0.24224641967846924, "learning_rate": 3.953528399311532e-05, "loss": 0.424, "step": 932 }, { "epoch": 0.866697631212262, "grad_norm": 0.22247989096036871, "learning_rate": 3.9518072289156625e-05, "loss": 0.3968, "step": 933 }, { "epoch": 0.8676265675801208, "grad_norm": 0.22730388207221008, "learning_rate": 3.950086058519794e-05, "loss": 0.4027, "step": 934 }, { "epoch": 0.8685555039479795, "grad_norm": 0.22984491491807468, "learning_rate": 3.948364888123924e-05, "loss": 0.4015, "step": 935 }, { "epoch": 0.8694844403158384, "grad_norm": 0.2706739171118266, "learning_rate": 3.9466437177280554e-05, "loss": 0.4057, "step": 936 }, { "epoch": 0.8704133766836971, "grad_norm": 0.2869836202377347, "learning_rate": 3.944922547332186e-05, "loss": 0.4169, "step": 937 }, { "epoch": 0.871342313051556, "grad_norm": 0.24144907861227421, "learning_rate": 3.943201376936317e-05, "loss": 0.3895, "step": 938 }, { "epoch": 0.8722712494194148, "grad_norm": 0.2318365608955253, "learning_rate": 3.9414802065404475e-05, "loss": 0.3887, "step": 939 }, { "epoch": 0.8732001857872735, "grad_norm": 0.24391204560861726, "learning_rate": 3.939759036144579e-05, "loss": 0.4144, "step": 940 }, { "epoch": 0.8741291221551324, "grad_norm": 0.24279276074096975, "learning_rate": 3.938037865748709e-05, "loss": 0.4105, "step": 941 }, { "epoch": 0.8750580585229911, "grad_norm": 0.2206737590128525, "learning_rate": 3.9363166953528404e-05, "loss": 0.4055, "step": 942 }, { "epoch": 0.87598699489085, "grad_norm": 0.2501829527519811, "learning_rate": 3.934595524956971e-05, "loss": 0.419, "step": 943 }, { "epoch": 0.8769159312587088, "grad_norm": 0.20113559459810643, "learning_rate": 3.932874354561102e-05, "loss": 0.3995, "step": 944 }, { "epoch": 0.8778448676265675, "grad_norm": 0.2590015788880731, "learning_rate": 3.9311531841652326e-05, "loss": 0.4106, "step": 945 }, { "epoch": 0.8787738039944264, "grad_norm": 0.21875502199045596, "learning_rate": 3.929432013769364e-05, "loss": 0.4137, "step": 946 }, { "epoch": 0.8797027403622851, "grad_norm": 0.22619403593853593, "learning_rate": 3.927710843373494e-05, "loss": 0.3897, "step": 947 }, { "epoch": 0.880631676730144, "grad_norm": 0.28274876745197, "learning_rate": 3.9259896729776254e-05, "loss": 0.4243, "step": 948 }, { "epoch": 0.8815606130980028, "grad_norm": 0.2550591340833801, "learning_rate": 3.924268502581756e-05, "loss": 0.4262, "step": 949 }, { "epoch": 0.8824895494658616, "grad_norm": 0.28300258572852405, "learning_rate": 3.9225473321858864e-05, "loss": 0.3981, "step": 950 }, { "epoch": 0.8834184858337204, "grad_norm": 0.2651505216641192, "learning_rate": 3.920826161790017e-05, "loss": 0.3987, "step": 951 }, { "epoch": 0.8843474222015791, "grad_norm": 0.2699525095302782, "learning_rate": 3.919104991394148e-05, "loss": 0.3963, "step": 952 }, { "epoch": 0.885276358569438, "grad_norm": 0.24632775459420023, "learning_rate": 3.9173838209982786e-05, "loss": 0.3793, "step": 953 }, { "epoch": 0.8862052949372968, "grad_norm": 0.2729975548782007, "learning_rate": 3.91566265060241e-05, "loss": 0.4259, "step": 954 }, { "epoch": 0.8871342313051556, "grad_norm": 0.23113626254645106, "learning_rate": 3.91394148020654e-05, "loss": 0.4118, "step": 955 }, { "epoch": 0.8880631676730144, "grad_norm": 0.2493442523902469, "learning_rate": 3.9122203098106715e-05, "loss": 0.4277, "step": 956 }, { "epoch": 0.8889921040408733, "grad_norm": 0.24785873510336145, "learning_rate": 3.910499139414802e-05, "loss": 0.4065, "step": 957 }, { "epoch": 0.889921040408732, "grad_norm": 0.24341153477797958, "learning_rate": 3.908777969018933e-05, "loss": 0.4037, "step": 958 }, { "epoch": 0.8908499767765908, "grad_norm": 0.23299336958401565, "learning_rate": 3.9070567986230637e-05, "loss": 0.4018, "step": 959 }, { "epoch": 0.8917789131444496, "grad_norm": 0.27717768327110737, "learning_rate": 3.905335628227195e-05, "loss": 0.4136, "step": 960 }, { "epoch": 0.8927078495123084, "grad_norm": 0.2634600076134501, "learning_rate": 3.903614457831325e-05, "loss": 0.4151, "step": 961 }, { "epoch": 0.8936367858801673, "grad_norm": 0.2431673757388936, "learning_rate": 3.9018932874354565e-05, "loss": 0.4046, "step": 962 }, { "epoch": 0.894565722248026, "grad_norm": 0.2873899288522361, "learning_rate": 3.900172117039587e-05, "loss": 0.4029, "step": 963 }, { "epoch": 0.8954946586158848, "grad_norm": 0.2068486702586044, "learning_rate": 3.898450946643718e-05, "loss": 0.3862, "step": 964 }, { "epoch": 0.8964235949837436, "grad_norm": 0.21678196583034384, "learning_rate": 3.896729776247849e-05, "loss": 0.3926, "step": 965 }, { "epoch": 0.8973525313516024, "grad_norm": 0.23384577673120308, "learning_rate": 3.89500860585198e-05, "loss": 0.3986, "step": 966 }, { "epoch": 0.8982814677194613, "grad_norm": 0.2168835780899871, "learning_rate": 3.8932874354561104e-05, "loss": 0.4117, "step": 967 }, { "epoch": 0.89921040408732, "grad_norm": 0.2182530040951482, "learning_rate": 3.8915662650602416e-05, "loss": 0.3993, "step": 968 }, { "epoch": 0.9001393404551788, "grad_norm": 0.2279085259514714, "learning_rate": 3.889845094664372e-05, "loss": 0.4179, "step": 969 }, { "epoch": 0.9010682768230376, "grad_norm": 0.2161021312161298, "learning_rate": 3.888123924268503e-05, "loss": 0.4196, "step": 970 }, { "epoch": 0.9019972131908964, "grad_norm": 0.23665046914396706, "learning_rate": 3.886402753872633e-05, "loss": 0.4151, "step": 971 }, { "epoch": 0.9029261495587553, "grad_norm": 0.2153174695085264, "learning_rate": 3.884681583476764e-05, "loss": 0.392, "step": 972 }, { "epoch": 0.903855085926614, "grad_norm": 0.22865056532137162, "learning_rate": 3.882960413080895e-05, "loss": 0.4285, "step": 973 }, { "epoch": 0.9047840222944729, "grad_norm": 0.24540292705433664, "learning_rate": 3.881239242685026e-05, "loss": 0.4085, "step": 974 }, { "epoch": 0.9057129586623316, "grad_norm": 0.21512013945160077, "learning_rate": 3.8795180722891564e-05, "loss": 0.4001, "step": 975 }, { "epoch": 0.9066418950301904, "grad_norm": 0.2543665399130075, "learning_rate": 3.8777969018932876e-05, "loss": 0.4233, "step": 976 }, { "epoch": 0.9075708313980493, "grad_norm": 0.2604590648729307, "learning_rate": 3.876075731497418e-05, "loss": 0.4315, "step": 977 }, { "epoch": 0.908499767765908, "grad_norm": 0.2426498446435362, "learning_rate": 3.874354561101549e-05, "loss": 0.4194, "step": 978 }, { "epoch": 0.9094287041337669, "grad_norm": 0.21692216609161416, "learning_rate": 3.87263339070568e-05, "loss": 0.392, "step": 979 }, { "epoch": 0.9103576405016256, "grad_norm": 0.24908430824930783, "learning_rate": 3.870912220309811e-05, "loss": 0.3719, "step": 980 }, { "epoch": 0.9112865768694844, "grad_norm": 0.2159682412651961, "learning_rate": 3.8691910499139415e-05, "loss": 0.3898, "step": 981 }, { "epoch": 0.9122155132373433, "grad_norm": 0.21127012379911123, "learning_rate": 3.8674698795180726e-05, "loss": 0.4248, "step": 982 }, { "epoch": 0.913144449605202, "grad_norm": 0.22813635269217872, "learning_rate": 3.865748709122203e-05, "loss": 0.4244, "step": 983 }, { "epoch": 0.9140733859730609, "grad_norm": 0.23743928654284238, "learning_rate": 3.864027538726334e-05, "loss": 0.4214, "step": 984 }, { "epoch": 0.9150023223409196, "grad_norm": 0.2186026650640482, "learning_rate": 3.862306368330465e-05, "loss": 0.405, "step": 985 }, { "epoch": 0.9159312587087785, "grad_norm": 0.257356309199639, "learning_rate": 3.860585197934596e-05, "loss": 0.3917, "step": 986 }, { "epoch": 0.9168601950766373, "grad_norm": 0.19464718995992264, "learning_rate": 3.8588640275387265e-05, "loss": 0.4049, "step": 987 }, { "epoch": 0.917789131444496, "grad_norm": 0.25657951632861403, "learning_rate": 3.857142857142858e-05, "loss": 0.4186, "step": 988 }, { "epoch": 0.9187180678123549, "grad_norm": 0.22465676748227606, "learning_rate": 3.855421686746988e-05, "loss": 0.4345, "step": 989 }, { "epoch": 0.9196470041802136, "grad_norm": 0.2506754334740293, "learning_rate": 3.8537005163511194e-05, "loss": 0.4025, "step": 990 }, { "epoch": 0.9205759405480725, "grad_norm": 0.2340373514090616, "learning_rate": 3.85197934595525e-05, "loss": 0.4256, "step": 991 }, { "epoch": 0.9215048769159313, "grad_norm": 0.23036227594793113, "learning_rate": 3.8502581755593804e-05, "loss": 0.4176, "step": 992 }, { "epoch": 0.92243381328379, "grad_norm": 0.23093971462681168, "learning_rate": 3.848537005163511e-05, "loss": 0.4007, "step": 993 }, { "epoch": 0.9233627496516489, "grad_norm": 0.2317551394646806, "learning_rate": 3.846815834767642e-05, "loss": 0.405, "step": 994 }, { "epoch": 0.9242916860195076, "grad_norm": 0.21158502861181783, "learning_rate": 3.8450946643717725e-05, "loss": 0.3942, "step": 995 }, { "epoch": 0.9252206223873665, "grad_norm": 0.2537032563068634, "learning_rate": 3.843373493975904e-05, "loss": 0.3791, "step": 996 }, { "epoch": 0.9261495587552253, "grad_norm": 0.2239536974059551, "learning_rate": 3.841652323580034e-05, "loss": 0.4064, "step": 997 }, { "epoch": 0.927078495123084, "grad_norm": 0.25068704163807093, "learning_rate": 3.8399311531841654e-05, "loss": 0.4273, "step": 998 }, { "epoch": 0.9280074314909429, "grad_norm": 0.2457256835693048, "learning_rate": 3.838209982788296e-05, "loss": 0.4096, "step": 999 }, { "epoch": 0.9289363678588016, "grad_norm": 0.23667198447703158, "learning_rate": 3.836488812392427e-05, "loss": 0.3934, "step": 1000 }, { "epoch": 0.9298653042266605, "grad_norm": 0.24449737642939945, "learning_rate": 3.8347676419965576e-05, "loss": 0.409, "step": 1001 }, { "epoch": 0.9307942405945193, "grad_norm": 0.24650595873011708, "learning_rate": 3.833046471600689e-05, "loss": 0.4053, "step": 1002 }, { "epoch": 0.931723176962378, "grad_norm": 0.26151738161916, "learning_rate": 3.831325301204819e-05, "loss": 0.4104, "step": 1003 }, { "epoch": 0.9326521133302369, "grad_norm": 0.2310178657482112, "learning_rate": 3.8296041308089504e-05, "loss": 0.3937, "step": 1004 }, { "epoch": 0.9335810496980956, "grad_norm": 0.23546622207591916, "learning_rate": 3.827882960413081e-05, "loss": 0.4041, "step": 1005 }, { "epoch": 0.9345099860659545, "grad_norm": 0.2607486009527593, "learning_rate": 3.826161790017212e-05, "loss": 0.4353, "step": 1006 }, { "epoch": 0.9354389224338133, "grad_norm": 0.2392967181255042, "learning_rate": 3.8244406196213426e-05, "loss": 0.3939, "step": 1007 }, { "epoch": 0.9363678588016721, "grad_norm": 0.21674134867792574, "learning_rate": 3.822719449225474e-05, "loss": 0.388, "step": 1008 }, { "epoch": 0.9372967951695309, "grad_norm": 0.24940419360087127, "learning_rate": 3.820998278829604e-05, "loss": 0.4239, "step": 1009 }, { "epoch": 0.9382257315373896, "grad_norm": 0.2596709039538196, "learning_rate": 3.8192771084337355e-05, "loss": 0.4005, "step": 1010 }, { "epoch": 0.9391546679052485, "grad_norm": 0.23606867754298808, "learning_rate": 3.817555938037866e-05, "loss": 0.4374, "step": 1011 }, { "epoch": 0.9400836042731073, "grad_norm": 0.27125016523755546, "learning_rate": 3.815834767641997e-05, "loss": 0.4426, "step": 1012 }, { "epoch": 0.9410125406409661, "grad_norm": 0.24501601889271793, "learning_rate": 3.8141135972461277e-05, "loss": 0.3874, "step": 1013 }, { "epoch": 0.9419414770088249, "grad_norm": 0.2442683405582566, "learning_rate": 3.812392426850258e-05, "loss": 0.4047, "step": 1014 }, { "epoch": 0.9428704133766836, "grad_norm": 0.26685613518926404, "learning_rate": 3.810671256454389e-05, "loss": 0.436, "step": 1015 }, { "epoch": 0.9437993497445425, "grad_norm": 0.23983650732436032, "learning_rate": 3.80895008605852e-05, "loss": 0.4119, "step": 1016 }, { "epoch": 0.9447282861124013, "grad_norm": 0.2570741433796242, "learning_rate": 3.8072289156626503e-05, "loss": 0.4236, "step": 1017 }, { "epoch": 0.9456572224802601, "grad_norm": 0.2544931404350317, "learning_rate": 3.8055077452667815e-05, "loss": 0.3768, "step": 1018 }, { "epoch": 0.9465861588481189, "grad_norm": 0.2345446843587865, "learning_rate": 3.803786574870912e-05, "loss": 0.4143, "step": 1019 }, { "epoch": 0.9475150952159777, "grad_norm": 0.24384024290773332, "learning_rate": 3.802065404475043e-05, "loss": 0.4044, "step": 1020 }, { "epoch": 0.9484440315838365, "grad_norm": 0.21208048637232893, "learning_rate": 3.800344234079174e-05, "loss": 0.4217, "step": 1021 }, { "epoch": 0.9493729679516953, "grad_norm": 0.2599635780830963, "learning_rate": 3.798623063683305e-05, "loss": 0.4122, "step": 1022 }, { "epoch": 0.9503019043195541, "grad_norm": 0.2391907225652313, "learning_rate": 3.7969018932874354e-05, "loss": 0.3861, "step": 1023 }, { "epoch": 0.9512308406874129, "grad_norm": 0.21784120153158443, "learning_rate": 3.7951807228915666e-05, "loss": 0.4017, "step": 1024 }, { "epoch": 0.9521597770552717, "grad_norm": 0.2868793994136997, "learning_rate": 3.793459552495697e-05, "loss": 0.4078, "step": 1025 }, { "epoch": 0.9530887134231305, "grad_norm": 0.2303322060460401, "learning_rate": 3.791738382099828e-05, "loss": 0.3959, "step": 1026 }, { "epoch": 0.9540176497909894, "grad_norm": 0.21929777651344276, "learning_rate": 3.790017211703959e-05, "loss": 0.4378, "step": 1027 }, { "epoch": 0.9549465861588481, "grad_norm": 0.29921819323800025, "learning_rate": 3.78829604130809e-05, "loss": 0.4114, "step": 1028 }, { "epoch": 0.9558755225267069, "grad_norm": 0.24792104039539412, "learning_rate": 3.7865748709122204e-05, "loss": 0.3957, "step": 1029 }, { "epoch": 0.9568044588945657, "grad_norm": 0.2547675598430285, "learning_rate": 3.7848537005163516e-05, "loss": 0.4139, "step": 1030 }, { "epoch": 0.9577333952624245, "grad_norm": 0.23635621173173713, "learning_rate": 3.783132530120482e-05, "loss": 0.4302, "step": 1031 }, { "epoch": 0.9586623316302834, "grad_norm": 0.23539058473689725, "learning_rate": 3.781411359724613e-05, "loss": 0.4106, "step": 1032 }, { "epoch": 0.9595912679981421, "grad_norm": 0.2252626765187598, "learning_rate": 3.779690189328744e-05, "loss": 0.4201, "step": 1033 }, { "epoch": 0.9605202043660009, "grad_norm": 0.23605984311517597, "learning_rate": 3.777969018932875e-05, "loss": 0.4014, "step": 1034 }, { "epoch": 0.9614491407338597, "grad_norm": 0.22411023295920124, "learning_rate": 3.776247848537005e-05, "loss": 0.3995, "step": 1035 }, { "epoch": 0.9623780771017185, "grad_norm": 0.22444324386702214, "learning_rate": 3.774526678141136e-05, "loss": 0.396, "step": 1036 }, { "epoch": 0.9633070134695774, "grad_norm": 0.2392457838225579, "learning_rate": 3.7728055077452665e-05, "loss": 0.3868, "step": 1037 }, { "epoch": 0.9642359498374361, "grad_norm": 0.24259642738031353, "learning_rate": 3.7710843373493976e-05, "loss": 0.4059, "step": 1038 }, { "epoch": 0.965164886205295, "grad_norm": 0.22122843588430882, "learning_rate": 3.769363166953528e-05, "loss": 0.3932, "step": 1039 }, { "epoch": 0.9660938225731538, "grad_norm": 0.39575069207466423, "learning_rate": 3.767641996557659e-05, "loss": 0.4106, "step": 1040 }, { "epoch": 0.9670227589410125, "grad_norm": 0.20970275754321213, "learning_rate": 3.76592082616179e-05, "loss": 0.3826, "step": 1041 }, { "epoch": 0.9679516953088714, "grad_norm": 0.2689513010745634, "learning_rate": 3.764199655765921e-05, "loss": 0.4246, "step": 1042 }, { "epoch": 0.9688806316767301, "grad_norm": 0.21716008885897425, "learning_rate": 3.7624784853700515e-05, "loss": 0.372, "step": 1043 }, { "epoch": 0.969809568044589, "grad_norm": 0.19032663763219618, "learning_rate": 3.760757314974183e-05, "loss": 0.3871, "step": 1044 }, { "epoch": 0.9707385044124478, "grad_norm": 0.24689207170253144, "learning_rate": 3.759036144578313e-05, "loss": 0.3887, "step": 1045 }, { "epoch": 0.9716674407803065, "grad_norm": 0.23664100739316893, "learning_rate": 3.7573149741824444e-05, "loss": 0.403, "step": 1046 }, { "epoch": 0.9725963771481654, "grad_norm": 0.22874968861174907, "learning_rate": 3.755593803786575e-05, "loss": 0.429, "step": 1047 }, { "epoch": 0.9735253135160241, "grad_norm": 0.213785249209633, "learning_rate": 3.753872633390706e-05, "loss": 0.3864, "step": 1048 }, { "epoch": 0.974454249883883, "grad_norm": 0.24065036290868722, "learning_rate": 3.7521514629948365e-05, "loss": 0.433, "step": 1049 }, { "epoch": 0.9753831862517418, "grad_norm": 0.22801923121906967, "learning_rate": 3.750430292598968e-05, "loss": 0.3929, "step": 1050 }, { "epoch": 0.9763121226196005, "grad_norm": 0.24919400482036166, "learning_rate": 3.748709122203098e-05, "loss": 0.4002, "step": 1051 }, { "epoch": 0.9772410589874594, "grad_norm": 0.24145964608043208, "learning_rate": 3.7469879518072294e-05, "loss": 0.4318, "step": 1052 }, { "epoch": 0.9781699953553181, "grad_norm": 0.20174232619370144, "learning_rate": 3.74526678141136e-05, "loss": 0.3912, "step": 1053 }, { "epoch": 0.979098931723177, "grad_norm": 0.21629692614591498, "learning_rate": 3.743545611015491e-05, "loss": 0.3904, "step": 1054 }, { "epoch": 0.9800278680910358, "grad_norm": 0.22440046201731037, "learning_rate": 3.7418244406196216e-05, "loss": 0.4137, "step": 1055 }, { "epoch": 0.9809568044588945, "grad_norm": 0.22672281532519148, "learning_rate": 3.740103270223752e-05, "loss": 0.3976, "step": 1056 }, { "epoch": 0.9818857408267534, "grad_norm": 0.20668391208886464, "learning_rate": 3.7383820998278826e-05, "loss": 0.3951, "step": 1057 }, { "epoch": 0.9828146771946121, "grad_norm": 0.2142818046995827, "learning_rate": 3.736660929432014e-05, "loss": 0.393, "step": 1058 }, { "epoch": 0.983743613562471, "grad_norm": 0.21718324928035726, "learning_rate": 3.734939759036144e-05, "loss": 0.398, "step": 1059 }, { "epoch": 0.9846725499303298, "grad_norm": 0.23550777849566212, "learning_rate": 3.7332185886402754e-05, "loss": 0.4065, "step": 1060 }, { "epoch": 0.9856014862981886, "grad_norm": 0.2167543874679464, "learning_rate": 3.731497418244406e-05, "loss": 0.3927, "step": 1061 }, { "epoch": 0.9865304226660474, "grad_norm": 0.2530841425618813, "learning_rate": 3.729776247848537e-05, "loss": 0.4099, "step": 1062 }, { "epoch": 0.9874593590339061, "grad_norm": 0.2376035990853862, "learning_rate": 3.7280550774526676e-05, "loss": 0.4056, "step": 1063 }, { "epoch": 0.988388295401765, "grad_norm": 0.22485376239690827, "learning_rate": 3.726333907056799e-05, "loss": 0.411, "step": 1064 }, { "epoch": 0.9893172317696238, "grad_norm": 0.2364195030509185, "learning_rate": 3.724612736660929e-05, "loss": 0.4024, "step": 1065 }, { "epoch": 0.9902461681374826, "grad_norm": 0.2741768987248631, "learning_rate": 3.7228915662650605e-05, "loss": 0.4172, "step": 1066 }, { "epoch": 0.9911751045053414, "grad_norm": 0.2224572039465709, "learning_rate": 3.721170395869191e-05, "loss": 0.4063, "step": 1067 }, { "epoch": 0.9921040408732001, "grad_norm": 0.25384861733232483, "learning_rate": 3.719449225473322e-05, "loss": 0.3874, "step": 1068 }, { "epoch": 0.993032977241059, "grad_norm": 0.21782354887827174, "learning_rate": 3.717728055077453e-05, "loss": 0.38, "step": 1069 }, { "epoch": 0.9939619136089178, "grad_norm": 0.23692026091455595, "learning_rate": 3.716006884681584e-05, "loss": 0.4025, "step": 1070 }, { "epoch": 0.9948908499767766, "grad_norm": 0.2628360369682366, "learning_rate": 3.7142857142857143e-05, "loss": 0.4011, "step": 1071 }, { "epoch": 0.9958197863446354, "grad_norm": 0.23586526214391343, "learning_rate": 3.7125645438898455e-05, "loss": 0.4004, "step": 1072 }, { "epoch": 0.9967487227124942, "grad_norm": 0.2699525810140143, "learning_rate": 3.710843373493976e-05, "loss": 0.4211, "step": 1073 }, { "epoch": 0.997677659080353, "grad_norm": 0.24858789264268025, "learning_rate": 3.709122203098107e-05, "loss": 0.4055, "step": 1074 }, { "epoch": 0.9986065954482118, "grad_norm": 0.21173668049329192, "learning_rate": 3.707401032702238e-05, "loss": 0.3769, "step": 1075 }, { "epoch": 0.9995355318160706, "grad_norm": 0.24262016662999178, "learning_rate": 3.705679862306369e-05, "loss": 0.4221, "step": 1076 }, { "epoch": 1.0, "grad_norm": 0.24262016662999178, "learning_rate": 3.703958691910499e-05, "loss": 0.4056, "step": 1077 }, { "epoch": 1.0009289363678588, "grad_norm": 0.42400479184527856, "learning_rate": 3.70223752151463e-05, "loss": 0.359, "step": 1078 }, { "epoch": 1.0018578727357177, "grad_norm": 0.25465365326859013, "learning_rate": 3.7005163511187604e-05, "loss": 0.3744, "step": 1079 }, { "epoch": 1.0027868091035763, "grad_norm": 0.2749579483536481, "learning_rate": 3.6987951807228916e-05, "loss": 0.3215, "step": 1080 }, { "epoch": 1.0037157454714352, "grad_norm": 0.25882510922444807, "learning_rate": 3.697074010327022e-05, "loss": 0.3664, "step": 1081 }, { "epoch": 1.004644681839294, "grad_norm": 0.25257387474158305, "learning_rate": 3.695352839931153e-05, "loss": 0.3439, "step": 1082 }, { "epoch": 1.0055736182071529, "grad_norm": 0.2818136562808609, "learning_rate": 3.693631669535284e-05, "loss": 0.3495, "step": 1083 }, { "epoch": 1.0065025545750117, "grad_norm": 0.2796090387992452, "learning_rate": 3.691910499139415e-05, "loss": 0.3577, "step": 1084 }, { "epoch": 1.0074314909428703, "grad_norm": 0.25921344914520555, "learning_rate": 3.6901893287435454e-05, "loss": 0.3474, "step": 1085 }, { "epoch": 1.0083604273107292, "grad_norm": 0.26684815525436895, "learning_rate": 3.6884681583476766e-05, "loss": 0.3476, "step": 1086 }, { "epoch": 1.009289363678588, "grad_norm": 0.27357439621922414, "learning_rate": 3.686746987951807e-05, "loss": 0.3437, "step": 1087 }, { "epoch": 1.0102183000464469, "grad_norm": 0.24447102717487307, "learning_rate": 3.685025817555938e-05, "loss": 0.3221, "step": 1088 }, { "epoch": 1.0111472364143057, "grad_norm": 0.258464222591644, "learning_rate": 3.683304647160069e-05, "loss": 0.356, "step": 1089 }, { "epoch": 1.0120761727821643, "grad_norm": 0.27925086544400457, "learning_rate": 3.6815834767642e-05, "loss": 0.3701, "step": 1090 }, { "epoch": 1.0130051091500232, "grad_norm": 0.2457044337838858, "learning_rate": 3.6798623063683305e-05, "loss": 0.3458, "step": 1091 }, { "epoch": 1.013934045517882, "grad_norm": 0.2973360259118452, "learning_rate": 3.6781411359724616e-05, "loss": 0.3503, "step": 1092 }, { "epoch": 1.0148629818857409, "grad_norm": 0.24684519541371244, "learning_rate": 3.676419965576592e-05, "loss": 0.3555, "step": 1093 }, { "epoch": 1.0157919182535997, "grad_norm": 0.2553488145462288, "learning_rate": 3.674698795180723e-05, "loss": 0.339, "step": 1094 }, { "epoch": 1.0167208546214583, "grad_norm": 0.27698275528497907, "learning_rate": 3.672977624784854e-05, "loss": 0.3555, "step": 1095 }, { "epoch": 1.0176497909893172, "grad_norm": 0.25512853436869026, "learning_rate": 3.671256454388985e-05, "loss": 0.3508, "step": 1096 }, { "epoch": 1.018578727357176, "grad_norm": 0.23687230499469353, "learning_rate": 3.6695352839931155e-05, "loss": 0.3541, "step": 1097 }, { "epoch": 1.0195076637250349, "grad_norm": 0.2363717396988948, "learning_rate": 3.667814113597246e-05, "loss": 0.3533, "step": 1098 }, { "epoch": 1.0204366000928937, "grad_norm": 0.2303201035801806, "learning_rate": 3.666092943201377e-05, "loss": 0.3632, "step": 1099 }, { "epoch": 1.0213655364607523, "grad_norm": 0.2459782206416615, "learning_rate": 3.664371772805508e-05, "loss": 0.3618, "step": 1100 }, { "epoch": 1.0222944728286112, "grad_norm": 0.296616176626298, "learning_rate": 3.662650602409639e-05, "loss": 0.3922, "step": 1101 }, { "epoch": 1.02322340919647, "grad_norm": 0.2147341891639701, "learning_rate": 3.6609294320137694e-05, "loss": 0.3519, "step": 1102 }, { "epoch": 1.0241523455643289, "grad_norm": 0.266276617171416, "learning_rate": 3.6592082616179e-05, "loss": 0.3377, "step": 1103 }, { "epoch": 1.0250812819321877, "grad_norm": 0.23338087166757537, "learning_rate": 3.657487091222031e-05, "loss": 0.3436, "step": 1104 }, { "epoch": 1.0260102183000464, "grad_norm": 0.26374694027225837, "learning_rate": 3.6557659208261616e-05, "loss": 0.3478, "step": 1105 }, { "epoch": 1.0269391546679052, "grad_norm": 0.2337037433490987, "learning_rate": 3.654044750430293e-05, "loss": 0.3517, "step": 1106 }, { "epoch": 1.027868091035764, "grad_norm": 0.2596231255028893, "learning_rate": 3.652323580034423e-05, "loss": 0.365, "step": 1107 }, { "epoch": 1.0287970274036229, "grad_norm": 0.240620940997802, "learning_rate": 3.6506024096385544e-05, "loss": 0.3628, "step": 1108 }, { "epoch": 1.0297259637714817, "grad_norm": 0.21426737173957083, "learning_rate": 3.648881239242685e-05, "loss": 0.3335, "step": 1109 }, { "epoch": 1.0306549001393404, "grad_norm": 0.2229960824044303, "learning_rate": 3.647160068846816e-05, "loss": 0.3549, "step": 1110 }, { "epoch": 1.0315838365071992, "grad_norm": 0.27079181149762227, "learning_rate": 3.6454388984509466e-05, "loss": 0.3767, "step": 1111 }, { "epoch": 1.032512772875058, "grad_norm": 0.22945807604481355, "learning_rate": 3.643717728055078e-05, "loss": 0.3442, "step": 1112 }, { "epoch": 1.033441709242917, "grad_norm": 0.20955895984599512, "learning_rate": 3.641996557659208e-05, "loss": 0.3248, "step": 1113 }, { "epoch": 1.0343706456107757, "grad_norm": 0.23194107498456232, "learning_rate": 3.6402753872633395e-05, "loss": 0.3502, "step": 1114 }, { "epoch": 1.0352995819786344, "grad_norm": 0.2564535374989993, "learning_rate": 3.63855421686747e-05, "loss": 0.3694, "step": 1115 }, { "epoch": 1.0362285183464932, "grad_norm": 0.2280355951773234, "learning_rate": 3.636833046471601e-05, "loss": 0.3373, "step": 1116 }, { "epoch": 1.037157454714352, "grad_norm": 0.23682133916062154, "learning_rate": 3.6351118760757316e-05, "loss": 0.3355, "step": 1117 }, { "epoch": 1.038086391082211, "grad_norm": 0.216976014046176, "learning_rate": 3.633390705679863e-05, "loss": 0.3476, "step": 1118 }, { "epoch": 1.0390153274500697, "grad_norm": 0.2067060037353485, "learning_rate": 3.631669535283993e-05, "loss": 0.3515, "step": 1119 }, { "epoch": 1.0399442638179284, "grad_norm": 0.26852478563809756, "learning_rate": 3.629948364888124e-05, "loss": 0.3687, "step": 1120 }, { "epoch": 1.0408732001857872, "grad_norm": 0.23440938063759156, "learning_rate": 3.628227194492255e-05, "loss": 0.3676, "step": 1121 }, { "epoch": 1.041802136553646, "grad_norm": 0.22839083712400235, "learning_rate": 3.6265060240963855e-05, "loss": 0.3483, "step": 1122 }, { "epoch": 1.042731072921505, "grad_norm": 0.2868994905545559, "learning_rate": 3.624784853700517e-05, "loss": 0.3495, "step": 1123 }, { "epoch": 1.0436600092893638, "grad_norm": 0.21384049452616608, "learning_rate": 3.623063683304647e-05, "loss": 0.3419, "step": 1124 }, { "epoch": 1.0445889456572224, "grad_norm": 0.22545699928123575, "learning_rate": 3.6213425129087784e-05, "loss": 0.3507, "step": 1125 }, { "epoch": 1.0455178820250812, "grad_norm": 0.2616098875142772, "learning_rate": 3.619621342512909e-05, "loss": 0.3684, "step": 1126 }, { "epoch": 1.04644681839294, "grad_norm": 0.20808430540664596, "learning_rate": 3.6179001721170394e-05, "loss": 0.331, "step": 1127 }, { "epoch": 1.047375754760799, "grad_norm": 0.2447603473682486, "learning_rate": 3.6161790017211705e-05, "loss": 0.3455, "step": 1128 }, { "epoch": 1.0483046911286578, "grad_norm": 0.2500131614712927, "learning_rate": 3.614457831325301e-05, "loss": 0.3361, "step": 1129 }, { "epoch": 1.0492336274965164, "grad_norm": 0.2325076708728905, "learning_rate": 3.612736660929432e-05, "loss": 0.3345, "step": 1130 }, { "epoch": 1.0501625638643752, "grad_norm": 0.3372982804928654, "learning_rate": 3.611015490533563e-05, "loss": 0.3442, "step": 1131 }, { "epoch": 1.051091500232234, "grad_norm": 0.26364292805811546, "learning_rate": 3.609294320137694e-05, "loss": 0.355, "step": 1132 }, { "epoch": 1.052020436600093, "grad_norm": 0.2527413633654713, "learning_rate": 3.6075731497418244e-05, "loss": 0.3629, "step": 1133 }, { "epoch": 1.0529493729679518, "grad_norm": 0.2554529746289192, "learning_rate": 3.6058519793459556e-05, "loss": 0.3383, "step": 1134 }, { "epoch": 1.0538783093358104, "grad_norm": 0.27047127903067114, "learning_rate": 3.604130808950086e-05, "loss": 0.3282, "step": 1135 }, { "epoch": 1.0548072457036692, "grad_norm": 0.19736989479380834, "learning_rate": 3.602409638554217e-05, "loss": 0.3713, "step": 1136 }, { "epoch": 1.055736182071528, "grad_norm": 0.2945278332205401, "learning_rate": 3.600688468158348e-05, "loss": 0.3537, "step": 1137 }, { "epoch": 1.056665118439387, "grad_norm": 0.2583946369570546, "learning_rate": 3.598967297762479e-05, "loss": 0.3963, "step": 1138 }, { "epoch": 1.0575940548072458, "grad_norm": 0.50245090499772, "learning_rate": 3.5972461273666094e-05, "loss": 0.3514, "step": 1139 }, { "epoch": 1.0585229911751044, "grad_norm": 0.21531395236728787, "learning_rate": 3.5955249569707406e-05, "loss": 0.3562, "step": 1140 }, { "epoch": 1.0594519275429632, "grad_norm": 0.24061017735313153, "learning_rate": 3.593803786574871e-05, "loss": 0.3694, "step": 1141 }, { "epoch": 1.060380863910822, "grad_norm": 0.2268314841653852, "learning_rate": 3.5920826161790016e-05, "loss": 0.3422, "step": 1142 }, { "epoch": 1.061309800278681, "grad_norm": 0.22078553445878826, "learning_rate": 3.590361445783133e-05, "loss": 0.3606, "step": 1143 }, { "epoch": 1.0622387366465398, "grad_norm": 0.22090392465288902, "learning_rate": 3.588640275387263e-05, "loss": 0.3448, "step": 1144 }, { "epoch": 1.0631676730143984, "grad_norm": 0.23293244274551053, "learning_rate": 3.5869191049913945e-05, "loss": 0.3415, "step": 1145 }, { "epoch": 1.0640966093822573, "grad_norm": 0.21096983770192937, "learning_rate": 3.585197934595525e-05, "loss": 0.3563, "step": 1146 }, { "epoch": 1.065025545750116, "grad_norm": 0.22212524809608433, "learning_rate": 3.583476764199656e-05, "loss": 0.3316, "step": 1147 }, { "epoch": 1.065954482117975, "grad_norm": 0.20899680745058694, "learning_rate": 3.5817555938037867e-05, "loss": 0.3649, "step": 1148 }, { "epoch": 1.0668834184858338, "grad_norm": 0.2224898644473964, "learning_rate": 3.580034423407918e-05, "loss": 0.3485, "step": 1149 }, { "epoch": 1.0678123548536926, "grad_norm": 0.2487544062552923, "learning_rate": 3.578313253012048e-05, "loss": 0.3522, "step": 1150 }, { "epoch": 1.0687412912215513, "grad_norm": 0.23055813456250082, "learning_rate": 3.576592082616179e-05, "loss": 0.3618, "step": 1151 }, { "epoch": 1.06967022758941, "grad_norm": 0.20028327166610577, "learning_rate": 3.57487091222031e-05, "loss": 0.3358, "step": 1152 }, { "epoch": 1.070599163957269, "grad_norm": 0.22311914856614992, "learning_rate": 3.5731497418244405e-05, "loss": 0.3549, "step": 1153 }, { "epoch": 1.0715281003251278, "grad_norm": 0.23560718370813885, "learning_rate": 3.571428571428572e-05, "loss": 0.374, "step": 1154 }, { "epoch": 1.0724570366929864, "grad_norm": 0.4148928972512194, "learning_rate": 3.569707401032702e-05, "loss": 0.3528, "step": 1155 }, { "epoch": 1.0733859730608453, "grad_norm": 0.25885861158094975, "learning_rate": 3.5679862306368334e-05, "loss": 0.3412, "step": 1156 }, { "epoch": 1.0743149094287041, "grad_norm": 0.21300196539669833, "learning_rate": 3.566265060240964e-05, "loss": 0.3355, "step": 1157 }, { "epoch": 1.075243845796563, "grad_norm": 0.235345413141929, "learning_rate": 3.564543889845095e-05, "loss": 0.3629, "step": 1158 }, { "epoch": 1.0761727821644218, "grad_norm": 0.23367347817298498, "learning_rate": 3.5628227194492256e-05, "loss": 0.3613, "step": 1159 }, { "epoch": 1.0771017185322806, "grad_norm": 0.2326105412483254, "learning_rate": 3.561101549053357e-05, "loss": 0.3461, "step": 1160 }, { "epoch": 1.0780306549001393, "grad_norm": 0.23402340603110036, "learning_rate": 3.559380378657487e-05, "loss": 0.3383, "step": 1161 }, { "epoch": 1.0789595912679981, "grad_norm": 0.2392078922693959, "learning_rate": 3.557659208261618e-05, "loss": 0.3481, "step": 1162 }, { "epoch": 1.079888527635857, "grad_norm": 0.21367450129437765, "learning_rate": 3.555938037865749e-05, "loss": 0.3552, "step": 1163 }, { "epoch": 1.0808174640037158, "grad_norm": 0.2715676998763513, "learning_rate": 3.5542168674698794e-05, "loss": 0.3379, "step": 1164 }, { "epoch": 1.0817464003715744, "grad_norm": 0.2386244987904004, "learning_rate": 3.5524956970740106e-05, "loss": 0.3681, "step": 1165 }, { "epoch": 1.0826753367394333, "grad_norm": 0.22559148972950677, "learning_rate": 3.550774526678141e-05, "loss": 0.3784, "step": 1166 }, { "epoch": 1.0836042731072921, "grad_norm": 0.2525819984114936, "learning_rate": 3.549053356282272e-05, "loss": 0.3547, "step": 1167 }, { "epoch": 1.084533209475151, "grad_norm": 0.21367915648446467, "learning_rate": 3.547332185886403e-05, "loss": 0.3396, "step": 1168 }, { "epoch": 1.0854621458430098, "grad_norm": 0.2521020982590234, "learning_rate": 3.545611015490534e-05, "loss": 0.3592, "step": 1169 }, { "epoch": 1.0863910822108687, "grad_norm": 0.21484751786346046, "learning_rate": 3.5438898450946645e-05, "loss": 0.3401, "step": 1170 }, { "epoch": 1.0873200185787273, "grad_norm": 0.22930787394886787, "learning_rate": 3.5421686746987956e-05, "loss": 0.3503, "step": 1171 }, { "epoch": 1.0882489549465861, "grad_norm": 0.21234758307608245, "learning_rate": 3.540447504302926e-05, "loss": 0.342, "step": 1172 }, { "epoch": 1.089177891314445, "grad_norm": 0.1870818694435957, "learning_rate": 3.538726333907057e-05, "loss": 0.3388, "step": 1173 }, { "epoch": 1.0901068276823038, "grad_norm": 0.2075435312021659, "learning_rate": 3.537005163511188e-05, "loss": 0.3534, "step": 1174 }, { "epoch": 1.0910357640501624, "grad_norm": 0.2080276884082451, "learning_rate": 3.535283993115319e-05, "loss": 0.3308, "step": 1175 }, { "epoch": 1.0919647004180213, "grad_norm": 0.21516330313139653, "learning_rate": 3.5335628227194495e-05, "loss": 0.3632, "step": 1176 }, { "epoch": 1.0928936367858801, "grad_norm": 0.24747236372423687, "learning_rate": 3.53184165232358e-05, "loss": 0.3819, "step": 1177 }, { "epoch": 1.093822573153739, "grad_norm": 0.2084513425036716, "learning_rate": 3.530120481927711e-05, "loss": 0.3284, "step": 1178 }, { "epoch": 1.0947515095215978, "grad_norm": 0.2206018613707994, "learning_rate": 3.528399311531842e-05, "loss": 0.3736, "step": 1179 }, { "epoch": 1.0956804458894567, "grad_norm": 0.26865785499633366, "learning_rate": 3.526678141135973e-05, "loss": 0.3811, "step": 1180 }, { "epoch": 1.0966093822573153, "grad_norm": 0.2208182545530636, "learning_rate": 3.5249569707401034e-05, "loss": 0.3613, "step": 1181 }, { "epoch": 1.0975383186251741, "grad_norm": 0.2654033751873286, "learning_rate": 3.5232358003442345e-05, "loss": 0.3426, "step": 1182 }, { "epoch": 1.098467254993033, "grad_norm": 0.22186733925795926, "learning_rate": 3.521514629948365e-05, "loss": 0.3542, "step": 1183 }, { "epoch": 1.0993961913608918, "grad_norm": 0.24207554296990225, "learning_rate": 3.5197934595524955e-05, "loss": 0.3839, "step": 1184 }, { "epoch": 1.1003251277287505, "grad_norm": 0.2720135985816513, "learning_rate": 3.518072289156627e-05, "loss": 0.3818, "step": 1185 }, { "epoch": 1.1012540640966093, "grad_norm": 0.23520239556784836, "learning_rate": 3.516351118760757e-05, "loss": 0.3573, "step": 1186 }, { "epoch": 1.1021830004644682, "grad_norm": 0.20611704267650519, "learning_rate": 3.5146299483648884e-05, "loss": 0.3636, "step": 1187 }, { "epoch": 1.103111936832327, "grad_norm": 0.23271916548723978, "learning_rate": 3.512908777969019e-05, "loss": 0.3563, "step": 1188 }, { "epoch": 1.1040408732001858, "grad_norm": 0.22320870615937669, "learning_rate": 3.51118760757315e-05, "loss": 0.353, "step": 1189 }, { "epoch": 1.1049698095680447, "grad_norm": 0.24142802201840471, "learning_rate": 3.5094664371772806e-05, "loss": 0.3674, "step": 1190 }, { "epoch": 1.1058987459359033, "grad_norm": 0.24032397770102173, "learning_rate": 3.507745266781412e-05, "loss": 0.3396, "step": 1191 }, { "epoch": 1.1068276823037622, "grad_norm": 0.23859301700052904, "learning_rate": 3.506024096385542e-05, "loss": 0.3709, "step": 1192 }, { "epoch": 1.107756618671621, "grad_norm": 0.20993929975570846, "learning_rate": 3.5043029259896734e-05, "loss": 0.3532, "step": 1193 }, { "epoch": 1.1086855550394799, "grad_norm": 0.21525862778600577, "learning_rate": 3.502581755593804e-05, "loss": 0.3414, "step": 1194 }, { "epoch": 1.1096144914073387, "grad_norm": 0.2153151742361438, "learning_rate": 3.500860585197935e-05, "loss": 0.3479, "step": 1195 }, { "epoch": 1.1105434277751973, "grad_norm": 0.2341045687466886, "learning_rate": 3.4991394148020656e-05, "loss": 0.3514, "step": 1196 }, { "epoch": 1.1114723641430562, "grad_norm": 0.22041252024232932, "learning_rate": 3.497418244406197e-05, "loss": 0.3517, "step": 1197 }, { "epoch": 1.112401300510915, "grad_norm": 0.2421034130602699, "learning_rate": 3.495697074010327e-05, "loss": 0.355, "step": 1198 }, { "epoch": 1.1133302368787739, "grad_norm": 0.2148087803076203, "learning_rate": 3.4939759036144585e-05, "loss": 0.3433, "step": 1199 }, { "epoch": 1.1142591732466327, "grad_norm": 0.23749094706036994, "learning_rate": 3.492254733218589e-05, "loss": 0.3566, "step": 1200 }, { "epoch": 1.1151881096144913, "grad_norm": 0.24636426776494177, "learning_rate": 3.4905335628227195e-05, "loss": 0.3415, "step": 1201 }, { "epoch": 1.1161170459823502, "grad_norm": 0.21408882618052283, "learning_rate": 3.488812392426851e-05, "loss": 0.348, "step": 1202 }, { "epoch": 1.117045982350209, "grad_norm": 0.20922550693268657, "learning_rate": 3.487091222030981e-05, "loss": 0.3385, "step": 1203 }, { "epoch": 1.1179749187180679, "grad_norm": 0.20482695943117954, "learning_rate": 3.485370051635112e-05, "loss": 0.3714, "step": 1204 }, { "epoch": 1.1189038550859267, "grad_norm": 0.21259189935478204, "learning_rate": 3.483648881239243e-05, "loss": 0.3583, "step": 1205 }, { "epoch": 1.1198327914537853, "grad_norm": 0.23993781373108236, "learning_rate": 3.4819277108433733e-05, "loss": 0.3549, "step": 1206 }, { "epoch": 1.1207617278216442, "grad_norm": 0.21359898911362096, "learning_rate": 3.4802065404475045e-05, "loss": 0.3915, "step": 1207 }, { "epoch": 1.121690664189503, "grad_norm": 0.24339391904887306, "learning_rate": 3.478485370051635e-05, "loss": 0.3481, "step": 1208 }, { "epoch": 1.1226196005573619, "grad_norm": 0.18696533553937125, "learning_rate": 3.476764199655766e-05, "loss": 0.3489, "step": 1209 }, { "epoch": 1.1235485369252207, "grad_norm": 0.22213428030739601, "learning_rate": 3.475043029259897e-05, "loss": 0.3442, "step": 1210 }, { "epoch": 1.1244774732930793, "grad_norm": 0.22004231403214247, "learning_rate": 3.473321858864028e-05, "loss": 0.3567, "step": 1211 }, { "epoch": 1.1254064096609382, "grad_norm": 0.24987319354407675, "learning_rate": 3.4716006884681584e-05, "loss": 0.3608, "step": 1212 }, { "epoch": 1.126335346028797, "grad_norm": 0.22593282852581212, "learning_rate": 3.4698795180722896e-05, "loss": 0.3604, "step": 1213 }, { "epoch": 1.1272642823966559, "grad_norm": 0.22913929079537423, "learning_rate": 3.46815834767642e-05, "loss": 0.3308, "step": 1214 }, { "epoch": 1.1281932187645147, "grad_norm": 0.22512936563970043, "learning_rate": 3.466437177280551e-05, "loss": 0.3539, "step": 1215 }, { "epoch": 1.1291221551323734, "grad_norm": 0.2051404174738686, "learning_rate": 3.464716006884682e-05, "loss": 0.363, "step": 1216 }, { "epoch": 1.1300510915002322, "grad_norm": 0.21490274319372904, "learning_rate": 3.462994836488813e-05, "loss": 0.3354, "step": 1217 }, { "epoch": 1.130980027868091, "grad_norm": 0.2174571977823224, "learning_rate": 3.4612736660929434e-05, "loss": 0.3513, "step": 1218 }, { "epoch": 1.1319089642359499, "grad_norm": 0.19767325158530422, "learning_rate": 3.4595524956970746e-05, "loss": 0.3435, "step": 1219 }, { "epoch": 1.1328379006038087, "grad_norm": 0.21719464735692653, "learning_rate": 3.457831325301205e-05, "loss": 0.3574, "step": 1220 }, { "epoch": 1.1337668369716674, "grad_norm": 0.20581929044171807, "learning_rate": 3.456110154905336e-05, "loss": 0.3341, "step": 1221 }, { "epoch": 1.1346957733395262, "grad_norm": 0.19844253892744268, "learning_rate": 3.454388984509467e-05, "loss": 0.3362, "step": 1222 }, { "epoch": 1.135624709707385, "grad_norm": 0.2120233032755566, "learning_rate": 3.452667814113598e-05, "loss": 0.3521, "step": 1223 }, { "epoch": 1.136553646075244, "grad_norm": 0.2038168421319054, "learning_rate": 3.4509466437177285e-05, "loss": 0.3831, "step": 1224 }, { "epoch": 1.1374825824431027, "grad_norm": 0.25162161041366465, "learning_rate": 3.449225473321859e-05, "loss": 0.3629, "step": 1225 }, { "epoch": 1.1384115188109614, "grad_norm": 0.23246017940495503, "learning_rate": 3.4475043029259895e-05, "loss": 0.3361, "step": 1226 }, { "epoch": 1.1393404551788202, "grad_norm": 0.20778768722697064, "learning_rate": 3.4457831325301206e-05, "loss": 0.3555, "step": 1227 }, { "epoch": 1.140269391546679, "grad_norm": 0.2153164058775418, "learning_rate": 3.444061962134251e-05, "loss": 0.3351, "step": 1228 }, { "epoch": 1.141198327914538, "grad_norm": 0.20123624716901134, "learning_rate": 3.442340791738382e-05, "loss": 0.3443, "step": 1229 }, { "epoch": 1.1421272642823967, "grad_norm": 0.21058058376175867, "learning_rate": 3.440619621342513e-05, "loss": 0.3587, "step": 1230 }, { "epoch": 1.1430562006502554, "grad_norm": 0.2263383068441551, "learning_rate": 3.438898450946644e-05, "loss": 0.3305, "step": 1231 }, { "epoch": 1.1439851370181142, "grad_norm": 0.21243006539833154, "learning_rate": 3.4371772805507745e-05, "loss": 0.3554, "step": 1232 }, { "epoch": 1.144914073385973, "grad_norm": 0.2102541546923477, "learning_rate": 3.435456110154906e-05, "loss": 0.3567, "step": 1233 }, { "epoch": 1.145843009753832, "grad_norm": 0.23046254004079486, "learning_rate": 3.433734939759036e-05, "loss": 0.3617, "step": 1234 }, { "epoch": 1.1467719461216908, "grad_norm": 0.2930500467524406, "learning_rate": 3.4320137693631674e-05, "loss": 0.3555, "step": 1235 }, { "epoch": 1.1477008824895494, "grad_norm": 0.23152478794927303, "learning_rate": 3.430292598967298e-05, "loss": 0.3734, "step": 1236 }, { "epoch": 1.1486298188574082, "grad_norm": 0.2549389816652615, "learning_rate": 3.428571428571429e-05, "loss": 0.3293, "step": 1237 }, { "epoch": 1.149558755225267, "grad_norm": 0.2167026826205843, "learning_rate": 3.4268502581755595e-05, "loss": 0.3411, "step": 1238 }, { "epoch": 1.150487691593126, "grad_norm": 0.2485030988375794, "learning_rate": 3.425129087779691e-05, "loss": 0.3559, "step": 1239 }, { "epoch": 1.1514166279609848, "grad_norm": 0.24163510390409695, "learning_rate": 3.423407917383821e-05, "loss": 0.3448, "step": 1240 }, { "epoch": 1.1523455643288434, "grad_norm": 0.23020837321077686, "learning_rate": 3.4216867469879524e-05, "loss": 0.3346, "step": 1241 }, { "epoch": 1.1532745006967022, "grad_norm": 0.21827258140007752, "learning_rate": 3.419965576592083e-05, "loss": 0.3594, "step": 1242 }, { "epoch": 1.154203437064561, "grad_norm": 0.2259959955019267, "learning_rate": 3.418244406196214e-05, "loss": 0.3263, "step": 1243 }, { "epoch": 1.15513237343242, "grad_norm": 0.2235516711875459, "learning_rate": 3.4165232358003446e-05, "loss": 0.3315, "step": 1244 }, { "epoch": 1.1560613098002788, "grad_norm": 0.20721762489645518, "learning_rate": 3.414802065404476e-05, "loss": 0.3604, "step": 1245 }, { "epoch": 1.1569902461681374, "grad_norm": 0.2231484555937936, "learning_rate": 3.413080895008606e-05, "loss": 0.3476, "step": 1246 }, { "epoch": 1.1579191825359962, "grad_norm": 0.22679340464244963, "learning_rate": 3.411359724612737e-05, "loss": 0.3788, "step": 1247 }, { "epoch": 1.158848118903855, "grad_norm": 0.2183757050580824, "learning_rate": 3.409638554216867e-05, "loss": 0.3308, "step": 1248 }, { "epoch": 1.159777055271714, "grad_norm": 0.23012791532153595, "learning_rate": 3.4079173838209984e-05, "loss": 0.3475, "step": 1249 }, { "epoch": 1.1607059916395728, "grad_norm": 0.2151078725178325, "learning_rate": 3.406196213425129e-05, "loss": 0.3542, "step": 1250 }, { "epoch": 1.1616349280074314, "grad_norm": 0.20530486932614164, "learning_rate": 3.40447504302926e-05, "loss": 0.3362, "step": 1251 }, { "epoch": 1.1625638643752902, "grad_norm": 0.21039678917229546, "learning_rate": 3.4027538726333906e-05, "loss": 0.3602, "step": 1252 }, { "epoch": 1.163492800743149, "grad_norm": 0.2292399032989059, "learning_rate": 3.401032702237522e-05, "loss": 0.3678, "step": 1253 }, { "epoch": 1.164421737111008, "grad_norm": 0.23493270267966537, "learning_rate": 3.399311531841652e-05, "loss": 0.3531, "step": 1254 }, { "epoch": 1.1653506734788668, "grad_norm": 0.20806480245707265, "learning_rate": 3.3975903614457835e-05, "loss": 0.3474, "step": 1255 }, { "epoch": 1.1662796098467254, "grad_norm": 0.23970461641766203, "learning_rate": 3.395869191049914e-05, "loss": 0.3555, "step": 1256 }, { "epoch": 1.1672085462145843, "grad_norm": 0.23969672692118035, "learning_rate": 3.394148020654045e-05, "loss": 0.3634, "step": 1257 }, { "epoch": 1.168137482582443, "grad_norm": 0.2285160457892427, "learning_rate": 3.392426850258176e-05, "loss": 0.3442, "step": 1258 }, { "epoch": 1.169066418950302, "grad_norm": 0.22369375804224814, "learning_rate": 3.390705679862307e-05, "loss": 0.3311, "step": 1259 }, { "epoch": 1.1699953553181608, "grad_norm": 0.21415743915884217, "learning_rate": 3.3889845094664373e-05, "loss": 0.3469, "step": 1260 }, { "epoch": 1.1709242916860194, "grad_norm": 0.19956161096816913, "learning_rate": 3.3872633390705685e-05, "loss": 0.3665, "step": 1261 }, { "epoch": 1.1718532280538783, "grad_norm": 0.22850970980578056, "learning_rate": 3.385542168674699e-05, "loss": 0.3311, "step": 1262 }, { "epoch": 1.172782164421737, "grad_norm": 0.22798171419609636, "learning_rate": 3.38382099827883e-05, "loss": 0.3487, "step": 1263 }, { "epoch": 1.173711100789596, "grad_norm": 0.23282279392195235, "learning_rate": 3.382099827882961e-05, "loss": 0.3698, "step": 1264 }, { "epoch": 1.1746400371574548, "grad_norm": 0.2703092707739479, "learning_rate": 3.380378657487092e-05, "loss": 0.3394, "step": 1265 }, { "epoch": 1.1755689735253134, "grad_norm": 0.22016376237672852, "learning_rate": 3.3786574870912224e-05, "loss": 0.3546, "step": 1266 }, { "epoch": 1.1764979098931723, "grad_norm": 0.20478813537848256, "learning_rate": 3.3769363166953536e-05, "loss": 0.3528, "step": 1267 }, { "epoch": 1.1774268462610311, "grad_norm": 0.2561543579105953, "learning_rate": 3.3752151462994834e-05, "loss": 0.383, "step": 1268 }, { "epoch": 1.17835578262889, "grad_norm": 0.2192358895821466, "learning_rate": 3.3734939759036146e-05, "loss": 0.3548, "step": 1269 }, { "epoch": 1.1792847189967488, "grad_norm": 0.21639896112955978, "learning_rate": 3.371772805507745e-05, "loss": 0.3438, "step": 1270 }, { "epoch": 1.1802136553646074, "grad_norm": 0.23897229606319104, "learning_rate": 3.370051635111876e-05, "loss": 0.3408, "step": 1271 }, { "epoch": 1.1811425917324663, "grad_norm": 0.21230871449016278, "learning_rate": 3.368330464716007e-05, "loss": 0.3529, "step": 1272 }, { "epoch": 1.1820715281003251, "grad_norm": 0.2225131278174084, "learning_rate": 3.366609294320138e-05, "loss": 0.3443, "step": 1273 }, { "epoch": 1.183000464468184, "grad_norm": 0.22179982347649585, "learning_rate": 3.3648881239242684e-05, "loss": 0.3667, "step": 1274 }, { "epoch": 1.1839294008360428, "grad_norm": 0.2363874317205004, "learning_rate": 3.3631669535283996e-05, "loss": 0.3786, "step": 1275 }, { "epoch": 1.1848583372039014, "grad_norm": 0.2193585078333849, "learning_rate": 3.36144578313253e-05, "loss": 0.3297, "step": 1276 }, { "epoch": 1.1857872735717603, "grad_norm": 0.2284696248734332, "learning_rate": 3.359724612736661e-05, "loss": 0.3437, "step": 1277 }, { "epoch": 1.1867162099396191, "grad_norm": 0.20793086153649565, "learning_rate": 3.358003442340792e-05, "loss": 0.3531, "step": 1278 }, { "epoch": 1.187645146307478, "grad_norm": 0.21567184317250074, "learning_rate": 3.356282271944923e-05, "loss": 0.3406, "step": 1279 }, { "epoch": 1.1885740826753368, "grad_norm": 0.24433871916587774, "learning_rate": 3.3545611015490535e-05, "loss": 0.3716, "step": 1280 }, { "epoch": 1.1895030190431957, "grad_norm": 0.22152344071069677, "learning_rate": 3.3528399311531847e-05, "loss": 0.3356, "step": 1281 }, { "epoch": 1.1904319554110543, "grad_norm": 0.21020957516671623, "learning_rate": 3.351118760757315e-05, "loss": 0.374, "step": 1282 }, { "epoch": 1.1913608917789131, "grad_norm": 0.5970058115054317, "learning_rate": 3.349397590361446e-05, "loss": 0.3366, "step": 1283 }, { "epoch": 1.192289828146772, "grad_norm": 0.22658123670184568, "learning_rate": 3.347676419965577e-05, "loss": 0.345, "step": 1284 }, { "epoch": 1.1932187645146308, "grad_norm": 0.20134140896250374, "learning_rate": 3.345955249569708e-05, "loss": 0.3369, "step": 1285 }, { "epoch": 1.1941477008824894, "grad_norm": 0.2203877272353749, "learning_rate": 3.3442340791738385e-05, "loss": 0.343, "step": 1286 }, { "epoch": 1.1950766372503483, "grad_norm": 0.20330351316025363, "learning_rate": 3.34251290877797e-05, "loss": 0.3546, "step": 1287 }, { "epoch": 1.1960055736182071, "grad_norm": 0.20833906150996628, "learning_rate": 3.3407917383821e-05, "loss": 0.3653, "step": 1288 }, { "epoch": 1.196934509986066, "grad_norm": 0.21852984142409648, "learning_rate": 3.339070567986231e-05, "loss": 0.3576, "step": 1289 }, { "epoch": 1.1978634463539248, "grad_norm": 0.23067032362394563, "learning_rate": 3.337349397590361e-05, "loss": 0.3237, "step": 1290 }, { "epoch": 1.1987923827217837, "grad_norm": 0.21203185556352686, "learning_rate": 3.3356282271944924e-05, "loss": 0.3413, "step": 1291 }, { "epoch": 1.1997213190896423, "grad_norm": 0.19814905094151894, "learning_rate": 3.333907056798623e-05, "loss": 0.3187, "step": 1292 }, { "epoch": 1.2006502554575011, "grad_norm": 0.23683735633528624, "learning_rate": 3.332185886402754e-05, "loss": 0.3692, "step": 1293 }, { "epoch": 1.20157919182536, "grad_norm": 0.23935032392727087, "learning_rate": 3.3304647160068846e-05, "loss": 0.341, "step": 1294 }, { "epoch": 1.2025081281932188, "grad_norm": 0.21718405839661278, "learning_rate": 3.328743545611016e-05, "loss": 0.339, "step": 1295 }, { "epoch": 1.2034370645610775, "grad_norm": 0.2333890322775293, "learning_rate": 3.327022375215146e-05, "loss": 0.3522, "step": 1296 }, { "epoch": 1.2043660009289363, "grad_norm": 0.23203001135682452, "learning_rate": 3.3253012048192774e-05, "loss": 0.3328, "step": 1297 }, { "epoch": 1.2052949372967952, "grad_norm": 0.20690983108544067, "learning_rate": 3.323580034423408e-05, "loss": 0.3527, "step": 1298 }, { "epoch": 1.206223873664654, "grad_norm": 0.2466034408066261, "learning_rate": 3.321858864027539e-05, "loss": 0.3629, "step": 1299 }, { "epoch": 1.2071528100325128, "grad_norm": 0.21156982100561744, "learning_rate": 3.3201376936316696e-05, "loss": 0.3669, "step": 1300 }, { "epoch": 1.2080817464003717, "grad_norm": 0.2160203305108374, "learning_rate": 3.318416523235801e-05, "loss": 0.3528, "step": 1301 }, { "epoch": 1.2090106827682303, "grad_norm": 0.24371827812631483, "learning_rate": 3.316695352839931e-05, "loss": 0.3665, "step": 1302 }, { "epoch": 1.2099396191360892, "grad_norm": 0.22457004262344515, "learning_rate": 3.3149741824440625e-05, "loss": 0.3458, "step": 1303 }, { "epoch": 1.210868555503948, "grad_norm": 0.22055226828004412, "learning_rate": 3.313253012048193e-05, "loss": 0.3738, "step": 1304 }, { "epoch": 1.2117974918718069, "grad_norm": 0.22285967677243848, "learning_rate": 3.311531841652324e-05, "loss": 0.3616, "step": 1305 }, { "epoch": 1.2127264282396655, "grad_norm": 0.2161533855354827, "learning_rate": 3.3098106712564546e-05, "loss": 0.3699, "step": 1306 }, { "epoch": 1.2136553646075243, "grad_norm": 0.22228225680627903, "learning_rate": 3.308089500860586e-05, "loss": 0.3363, "step": 1307 }, { "epoch": 1.2145843009753832, "grad_norm": 0.20607924665127353, "learning_rate": 3.306368330464716e-05, "loss": 0.3303, "step": 1308 }, { "epoch": 1.215513237343242, "grad_norm": 0.18066272382062054, "learning_rate": 3.3046471600688475e-05, "loss": 0.3569, "step": 1309 }, { "epoch": 1.2164421737111009, "grad_norm": 0.21118001457268404, "learning_rate": 3.302925989672978e-05, "loss": 0.347, "step": 1310 }, { "epoch": 1.2173711100789597, "grad_norm": 0.20561719917149285, "learning_rate": 3.3012048192771085e-05, "loss": 0.3546, "step": 1311 }, { "epoch": 1.2183000464468183, "grad_norm": 0.22731459097307283, "learning_rate": 3.299483648881239e-05, "loss": 0.3745, "step": 1312 }, { "epoch": 1.2192289828146772, "grad_norm": 0.2424041406528847, "learning_rate": 3.29776247848537e-05, "loss": 0.3486, "step": 1313 }, { "epoch": 1.220157919182536, "grad_norm": 0.20069413390660085, "learning_rate": 3.296041308089501e-05, "loss": 0.3383, "step": 1314 }, { "epoch": 1.2210868555503949, "grad_norm": 0.2281772093622241, "learning_rate": 3.294320137693632e-05, "loss": 0.3434, "step": 1315 }, { "epoch": 1.2220157919182535, "grad_norm": 0.19211768424635667, "learning_rate": 3.2925989672977624e-05, "loss": 0.3352, "step": 1316 }, { "epoch": 1.2229447282861123, "grad_norm": 0.2513611309761323, "learning_rate": 3.2908777969018935e-05, "loss": 0.345, "step": 1317 }, { "epoch": 1.2238736646539712, "grad_norm": 0.24961481880569977, "learning_rate": 3.289156626506024e-05, "loss": 0.3231, "step": 1318 }, { "epoch": 1.22480260102183, "grad_norm": 0.20092458352098808, "learning_rate": 3.287435456110155e-05, "loss": 0.3563, "step": 1319 }, { "epoch": 1.2257315373896889, "grad_norm": 0.2436154621559124, "learning_rate": 3.285714285714286e-05, "loss": 0.3831, "step": 1320 }, { "epoch": 1.2266604737575477, "grad_norm": 0.24743764188275466, "learning_rate": 3.283993115318417e-05, "loss": 0.3574, "step": 1321 }, { "epoch": 1.2275894101254063, "grad_norm": 0.19895641348642534, "learning_rate": 3.2822719449225474e-05, "loss": 0.3482, "step": 1322 }, { "epoch": 1.2285183464932652, "grad_norm": 0.22955056964532125, "learning_rate": 3.2805507745266786e-05, "loss": 0.3532, "step": 1323 }, { "epoch": 1.229447282861124, "grad_norm": 0.2100380867776916, "learning_rate": 3.278829604130809e-05, "loss": 0.3348, "step": 1324 }, { "epoch": 1.2303762192289829, "grad_norm": 0.18910582670755707, "learning_rate": 3.27710843373494e-05, "loss": 0.343, "step": 1325 }, { "epoch": 1.2313051555968415, "grad_norm": 0.23613291094866282, "learning_rate": 3.275387263339071e-05, "loss": 0.3567, "step": 1326 }, { "epoch": 1.2322340919647004, "grad_norm": 0.19368690581896822, "learning_rate": 3.273666092943202e-05, "loss": 0.3561, "step": 1327 }, { "epoch": 1.2331630283325592, "grad_norm": 0.20487266897523065, "learning_rate": 3.2719449225473324e-05, "loss": 0.3477, "step": 1328 }, { "epoch": 1.234091964700418, "grad_norm": 0.19205460972365668, "learning_rate": 3.2702237521514636e-05, "loss": 0.353, "step": 1329 }, { "epoch": 1.2350209010682769, "grad_norm": 0.22085132737761864, "learning_rate": 3.268502581755594e-05, "loss": 0.3499, "step": 1330 }, { "epoch": 1.2359498374361357, "grad_norm": 0.19737628387896256, "learning_rate": 3.266781411359725e-05, "loss": 0.3396, "step": 1331 }, { "epoch": 1.2368787738039944, "grad_norm": 0.19772707162756117, "learning_rate": 3.265060240963855e-05, "loss": 0.3364, "step": 1332 }, { "epoch": 1.2378077101718532, "grad_norm": 0.1990220518002121, "learning_rate": 3.263339070567986e-05, "loss": 0.3543, "step": 1333 }, { "epoch": 1.238736646539712, "grad_norm": 0.2088922466620167, "learning_rate": 3.261617900172117e-05, "loss": 0.3423, "step": 1334 }, { "epoch": 1.239665582907571, "grad_norm": 0.23503440075942228, "learning_rate": 3.259896729776248e-05, "loss": 0.378, "step": 1335 }, { "epoch": 1.2405945192754295, "grad_norm": 0.20957487854697568, "learning_rate": 3.2581755593803785e-05, "loss": 0.3497, "step": 1336 }, { "epoch": 1.2415234556432884, "grad_norm": 0.20536395635402577, "learning_rate": 3.2564543889845097e-05, "loss": 0.3621, "step": 1337 }, { "epoch": 1.2424523920111472, "grad_norm": 0.19566124028745266, "learning_rate": 3.25473321858864e-05, "loss": 0.3392, "step": 1338 }, { "epoch": 1.243381328379006, "grad_norm": 0.19859624787750643, "learning_rate": 3.253012048192771e-05, "loss": 0.3305, "step": 1339 }, { "epoch": 1.244310264746865, "grad_norm": 0.20220312062723214, "learning_rate": 3.251290877796902e-05, "loss": 0.3429, "step": 1340 }, { "epoch": 1.2452392011147237, "grad_norm": 0.20306062927254576, "learning_rate": 3.249569707401033e-05, "loss": 0.3515, "step": 1341 }, { "epoch": 1.2461681374825824, "grad_norm": 0.22899674919736335, "learning_rate": 3.2478485370051635e-05, "loss": 0.3512, "step": 1342 }, { "epoch": 1.2470970738504412, "grad_norm": 0.20719067431783017, "learning_rate": 3.246127366609295e-05, "loss": 0.3687, "step": 1343 }, { "epoch": 1.2480260102183, "grad_norm": 0.19102206124283935, "learning_rate": 3.244406196213425e-05, "loss": 0.3562, "step": 1344 }, { "epoch": 1.248954946586159, "grad_norm": 0.1986375316054701, "learning_rate": 3.2426850258175564e-05, "loss": 0.3549, "step": 1345 }, { "epoch": 1.2498838829540175, "grad_norm": 0.20320312735004203, "learning_rate": 3.240963855421687e-05, "loss": 0.344, "step": 1346 }, { "epoch": 1.2508128193218764, "grad_norm": 0.20833632211864914, "learning_rate": 3.239242685025818e-05, "loss": 0.3419, "step": 1347 }, { "epoch": 1.2517417556897352, "grad_norm": 0.20209089611428194, "learning_rate": 3.2375215146299486e-05, "loss": 0.378, "step": 1348 }, { "epoch": 1.252670692057594, "grad_norm": 0.23317589217939863, "learning_rate": 3.23580034423408e-05, "loss": 0.3472, "step": 1349 }, { "epoch": 1.253599628425453, "grad_norm": 0.23889292842454002, "learning_rate": 3.23407917383821e-05, "loss": 0.3546, "step": 1350 }, { "epoch": 1.2545285647933118, "grad_norm": 0.23185784512316487, "learning_rate": 3.2323580034423414e-05, "loss": 0.353, "step": 1351 }, { "epoch": 1.2554575011611704, "grad_norm": 0.25338080904879534, "learning_rate": 3.230636833046472e-05, "loss": 0.3369, "step": 1352 }, { "epoch": 1.2563864375290292, "grad_norm": 0.2099328571771037, "learning_rate": 3.2289156626506024e-05, "loss": 0.3393, "step": 1353 }, { "epoch": 1.257315373896888, "grad_norm": 0.24756023680513764, "learning_rate": 3.227194492254733e-05, "loss": 0.3448, "step": 1354 }, { "epoch": 1.258244310264747, "grad_norm": 0.23036929610075849, "learning_rate": 3.225473321858864e-05, "loss": 0.3428, "step": 1355 }, { "epoch": 1.2591732466326055, "grad_norm": 0.26373689904363057, "learning_rate": 3.2237521514629946e-05, "loss": 0.3525, "step": 1356 }, { "epoch": 1.2601021830004644, "grad_norm": 0.20482417525178498, "learning_rate": 3.222030981067126e-05, "loss": 0.3463, "step": 1357 }, { "epoch": 1.2610311193683232, "grad_norm": 0.24978474205561277, "learning_rate": 3.220309810671256e-05, "loss": 0.3616, "step": 1358 }, { "epoch": 1.261960055736182, "grad_norm": 0.2200819349873659, "learning_rate": 3.2185886402753875e-05, "loss": 0.3438, "step": 1359 }, { "epoch": 1.262888992104041, "grad_norm": 0.23973808094498264, "learning_rate": 3.216867469879518e-05, "loss": 0.3552, "step": 1360 }, { "epoch": 1.2638179284718998, "grad_norm": 0.22114441784933195, "learning_rate": 3.215146299483649e-05, "loss": 0.3752, "step": 1361 }, { "epoch": 1.2647468648397584, "grad_norm": 0.27751924776177905, "learning_rate": 3.2134251290877796e-05, "loss": 0.3515, "step": 1362 }, { "epoch": 1.2656758012076172, "grad_norm": 0.21042269563943283, "learning_rate": 3.211703958691911e-05, "loss": 0.3465, "step": 1363 }, { "epoch": 1.266604737575476, "grad_norm": 0.28456491769206277, "learning_rate": 3.209982788296041e-05, "loss": 0.3694, "step": 1364 }, { "epoch": 1.267533673943335, "grad_norm": 0.2307337278461796, "learning_rate": 3.2082616179001725e-05, "loss": 0.3686, "step": 1365 }, { "epoch": 1.2684626103111936, "grad_norm": 0.2295732638687118, "learning_rate": 3.206540447504303e-05, "loss": 0.3595, "step": 1366 }, { "epoch": 1.2693915466790524, "grad_norm": 0.25011196671438235, "learning_rate": 3.204819277108434e-05, "loss": 0.3676, "step": 1367 }, { "epoch": 1.2703204830469113, "grad_norm": 0.21793396400635984, "learning_rate": 3.203098106712565e-05, "loss": 0.3505, "step": 1368 }, { "epoch": 1.27124941941477, "grad_norm": 0.19814582335576053, "learning_rate": 3.201376936316696e-05, "loss": 0.333, "step": 1369 }, { "epoch": 1.272178355782629, "grad_norm": 0.22394942492694084, "learning_rate": 3.1996557659208264e-05, "loss": 0.358, "step": 1370 }, { "epoch": 1.2731072921504878, "grad_norm": 0.2021914187434177, "learning_rate": 3.1979345955249575e-05, "loss": 0.3385, "step": 1371 }, { "epoch": 1.2740362285183464, "grad_norm": 0.18777768433600583, "learning_rate": 3.196213425129088e-05, "loss": 0.3599, "step": 1372 }, { "epoch": 1.2749651648862053, "grad_norm": 0.20249518397349203, "learning_rate": 3.194492254733219e-05, "loss": 0.3548, "step": 1373 }, { "epoch": 1.275894101254064, "grad_norm": 0.2269731964103628, "learning_rate": 3.192771084337349e-05, "loss": 0.3537, "step": 1374 }, { "epoch": 1.276823037621923, "grad_norm": 0.19146870047473466, "learning_rate": 3.19104991394148e-05, "loss": 0.3555, "step": 1375 }, { "epoch": 1.2777519739897816, "grad_norm": 0.2073759588674991, "learning_rate": 3.189328743545611e-05, "loss": 0.351, "step": 1376 }, { "epoch": 1.2786809103576404, "grad_norm": 0.2157939027150171, "learning_rate": 3.187607573149742e-05, "loss": 0.368, "step": 1377 }, { "epoch": 1.2796098467254993, "grad_norm": 0.21802601041303335, "learning_rate": 3.1858864027538724e-05, "loss": 0.3697, "step": 1378 }, { "epoch": 1.2805387830933581, "grad_norm": 0.20338664825260214, "learning_rate": 3.1841652323580036e-05, "loss": 0.3553, "step": 1379 }, { "epoch": 1.281467719461217, "grad_norm": 0.21747341979747276, "learning_rate": 3.182444061962134e-05, "loss": 0.3434, "step": 1380 }, { "epoch": 1.2823966558290758, "grad_norm": 0.19657412183584266, "learning_rate": 3.180722891566265e-05, "loss": 0.354, "step": 1381 }, { "epoch": 1.2833255921969344, "grad_norm": 0.203905731577841, "learning_rate": 3.179001721170396e-05, "loss": 0.3639, "step": 1382 }, { "epoch": 1.2842545285647933, "grad_norm": 0.19642280259503783, "learning_rate": 3.177280550774527e-05, "loss": 0.3429, "step": 1383 }, { "epoch": 1.2851834649326521, "grad_norm": 0.20991675969465276, "learning_rate": 3.1755593803786574e-05, "loss": 0.3521, "step": 1384 }, { "epoch": 1.286112401300511, "grad_norm": 0.20253615601371597, "learning_rate": 3.1738382099827886e-05, "loss": 0.3473, "step": 1385 }, { "epoch": 1.2870413376683696, "grad_norm": 0.198688892854731, "learning_rate": 3.172117039586919e-05, "loss": 0.3436, "step": 1386 }, { "epoch": 1.2879702740362284, "grad_norm": 2.4661548054153672, "learning_rate": 3.17039586919105e-05, "loss": 0.3541, "step": 1387 }, { "epoch": 1.2888992104040873, "grad_norm": 0.20742528996459592, "learning_rate": 3.168674698795181e-05, "loss": 0.3517, "step": 1388 }, { "epoch": 1.2898281467719461, "grad_norm": 0.20329525205803542, "learning_rate": 3.166953528399312e-05, "loss": 0.3424, "step": 1389 }, { "epoch": 1.290757083139805, "grad_norm": 0.2112927190742448, "learning_rate": 3.1652323580034425e-05, "loss": 0.3556, "step": 1390 }, { "epoch": 1.2916860195076638, "grad_norm": 0.23636619370443224, "learning_rate": 3.163511187607574e-05, "loss": 0.3735, "step": 1391 }, { "epoch": 1.2926149558755227, "grad_norm": 0.22276923652505054, "learning_rate": 3.161790017211704e-05, "loss": 0.3579, "step": 1392 }, { "epoch": 1.2935438922433813, "grad_norm": 0.2005541094030619, "learning_rate": 3.1600688468158353e-05, "loss": 0.3554, "step": 1393 }, { "epoch": 1.2944728286112401, "grad_norm": 0.22155762592559106, "learning_rate": 3.158347676419966e-05, "loss": 0.3546, "step": 1394 }, { "epoch": 1.295401764979099, "grad_norm": 0.20946083857237227, "learning_rate": 3.1566265060240963e-05, "loss": 0.349, "step": 1395 }, { "epoch": 1.2963307013469576, "grad_norm": 0.20702763325289733, "learning_rate": 3.154905335628227e-05, "loss": 0.3649, "step": 1396 }, { "epoch": 1.2972596377148164, "grad_norm": 0.2229383054425248, "learning_rate": 3.153184165232358e-05, "loss": 0.349, "step": 1397 }, { "epoch": 1.2981885740826753, "grad_norm": 0.23124550374061495, "learning_rate": 3.1514629948364885e-05, "loss": 0.336, "step": 1398 }, { "epoch": 1.2991175104505341, "grad_norm": 0.19990916764328137, "learning_rate": 3.14974182444062e-05, "loss": 0.3619, "step": 1399 }, { "epoch": 1.300046446818393, "grad_norm": 0.21799120394800936, "learning_rate": 3.14802065404475e-05, "loss": 0.3492, "step": 1400 }, { "epoch": 1.3009753831862518, "grad_norm": 0.3118082548093683, "learning_rate": 3.1462994836488814e-05, "loss": 0.3486, "step": 1401 }, { "epoch": 1.3019043195541107, "grad_norm": 0.21875180339324876, "learning_rate": 3.144578313253012e-05, "loss": 0.345, "step": 1402 }, { "epoch": 1.3028332559219693, "grad_norm": 0.20622030938134586, "learning_rate": 3.142857142857143e-05, "loss": 0.3395, "step": 1403 }, { "epoch": 1.3037621922898281, "grad_norm": 0.22507581862454673, "learning_rate": 3.1411359724612736e-05, "loss": 0.3462, "step": 1404 }, { "epoch": 1.304691128657687, "grad_norm": 0.20997485299055227, "learning_rate": 3.139414802065405e-05, "loss": 0.3248, "step": 1405 }, { "epoch": 1.3056200650255456, "grad_norm": 0.20777883901322605, "learning_rate": 3.137693631669535e-05, "loss": 0.3632, "step": 1406 }, { "epoch": 1.3065490013934045, "grad_norm": 0.22237908122375719, "learning_rate": 3.1359724612736664e-05, "loss": 0.3547, "step": 1407 }, { "epoch": 1.3074779377612633, "grad_norm": 0.22989418042498605, "learning_rate": 3.134251290877797e-05, "loss": 0.3386, "step": 1408 }, { "epoch": 1.3084068741291222, "grad_norm": 0.18232306340600674, "learning_rate": 3.132530120481928e-05, "loss": 0.3387, "step": 1409 }, { "epoch": 1.309335810496981, "grad_norm": 0.22379285894618078, "learning_rate": 3.1308089500860586e-05, "loss": 0.351, "step": 1410 }, { "epoch": 1.3102647468648398, "grad_norm": 0.22957367785503446, "learning_rate": 3.12908777969019e-05, "loss": 0.3538, "step": 1411 }, { "epoch": 1.3111936832326987, "grad_norm": 0.2213926772964687, "learning_rate": 3.12736660929432e-05, "loss": 0.3501, "step": 1412 }, { "epoch": 1.3121226196005573, "grad_norm": 0.2199668333695165, "learning_rate": 3.1256454388984515e-05, "loss": 0.3357, "step": 1413 }, { "epoch": 1.3130515559684162, "grad_norm": 0.22045412078084195, "learning_rate": 3.123924268502582e-05, "loss": 0.3665, "step": 1414 }, { "epoch": 1.313980492336275, "grad_norm": 0.23937083341287288, "learning_rate": 3.122203098106713e-05, "loss": 0.3541, "step": 1415 }, { "epoch": 1.3149094287041336, "grad_norm": 0.22142560842558442, "learning_rate": 3.1204819277108436e-05, "loss": 0.3589, "step": 1416 }, { "epoch": 1.3158383650719925, "grad_norm": 0.25259554318810806, "learning_rate": 3.118760757314974e-05, "loss": 0.3558, "step": 1417 }, { "epoch": 1.3167673014398513, "grad_norm": 0.23469823002390106, "learning_rate": 3.1170395869191046e-05, "loss": 0.3604, "step": 1418 }, { "epoch": 1.3176962378077102, "grad_norm": 0.2318292616166613, "learning_rate": 3.115318416523236e-05, "loss": 0.3231, "step": 1419 }, { "epoch": 1.318625174175569, "grad_norm": 0.23736119043390674, "learning_rate": 3.113597246127366e-05, "loss": 0.3644, "step": 1420 }, { "epoch": 1.3195541105434279, "grad_norm": 0.2164485499905474, "learning_rate": 3.1118760757314975e-05, "loss": 0.3662, "step": 1421 }, { "epoch": 1.3204830469112867, "grad_norm": 0.21673871274658302, "learning_rate": 3.110154905335628e-05, "loss": 0.3451, "step": 1422 }, { "epoch": 1.3214119832791453, "grad_norm": 0.2472895979807864, "learning_rate": 3.108433734939759e-05, "loss": 0.3543, "step": 1423 }, { "epoch": 1.3223409196470042, "grad_norm": 0.2245120667209746, "learning_rate": 3.10671256454389e-05, "loss": 0.3624, "step": 1424 }, { "epoch": 1.323269856014863, "grad_norm": 0.24985123445714202, "learning_rate": 3.104991394148021e-05, "loss": 0.3387, "step": 1425 }, { "epoch": 1.3241987923827216, "grad_norm": 0.2142027736341417, "learning_rate": 3.1032702237521514e-05, "loss": 0.3576, "step": 1426 }, { "epoch": 1.3251277287505805, "grad_norm": 0.21501285847890808, "learning_rate": 3.1015490533562825e-05, "loss": 0.3259, "step": 1427 }, { "epoch": 1.3260566651184393, "grad_norm": 0.22544501880772352, "learning_rate": 3.099827882960413e-05, "loss": 0.3363, "step": 1428 }, { "epoch": 1.3269856014862982, "grad_norm": 0.23996880906387041, "learning_rate": 3.098106712564544e-05, "loss": 0.3681, "step": 1429 }, { "epoch": 1.327914537854157, "grad_norm": 0.22942067334804397, "learning_rate": 3.096385542168675e-05, "loss": 0.3505, "step": 1430 }, { "epoch": 1.3288434742220159, "grad_norm": 0.23432507600874153, "learning_rate": 3.094664371772806e-05, "loss": 0.3703, "step": 1431 }, { "epoch": 1.3297724105898747, "grad_norm": 0.19874580942818373, "learning_rate": 3.0929432013769364e-05, "loss": 0.3629, "step": 1432 }, { "epoch": 1.3307013469577333, "grad_norm": 0.2312124529538691, "learning_rate": 3.0912220309810676e-05, "loss": 0.3382, "step": 1433 }, { "epoch": 1.3316302833255922, "grad_norm": 0.20766547973250402, "learning_rate": 3.089500860585198e-05, "loss": 0.3572, "step": 1434 }, { "epoch": 1.332559219693451, "grad_norm": 0.20786382857619148, "learning_rate": 3.087779690189329e-05, "loss": 0.358, "step": 1435 }, { "epoch": 1.3334881560613099, "grad_norm": 0.20447235064175212, "learning_rate": 3.08605851979346e-05, "loss": 0.3364, "step": 1436 }, { "epoch": 1.3344170924291685, "grad_norm": 0.1996680493576843, "learning_rate": 3.084337349397591e-05, "loss": 0.3557, "step": 1437 }, { "epoch": 1.3353460287970274, "grad_norm": 0.19501800099094735, "learning_rate": 3.082616179001721e-05, "loss": 0.3515, "step": 1438 }, { "epoch": 1.3362749651648862, "grad_norm": 0.23158157472952676, "learning_rate": 3.080895008605852e-05, "loss": 0.3582, "step": 1439 }, { "epoch": 1.337203901532745, "grad_norm": 0.6667951671559779, "learning_rate": 3.0791738382099825e-05, "loss": 0.3487, "step": 1440 }, { "epoch": 1.3381328379006039, "grad_norm": 0.20918867974295047, "learning_rate": 3.0774526678141136e-05, "loss": 0.3528, "step": 1441 }, { "epoch": 1.3390617742684627, "grad_norm": 0.22293293277353693, "learning_rate": 3.075731497418244e-05, "loss": 0.3409, "step": 1442 }, { "epoch": 1.3399907106363214, "grad_norm": 0.1829660308906723, "learning_rate": 3.074010327022375e-05, "loss": 0.3455, "step": 1443 }, { "epoch": 1.3409196470041802, "grad_norm": 0.2000567256104809, "learning_rate": 3.072289156626506e-05, "loss": 0.3643, "step": 1444 }, { "epoch": 1.341848583372039, "grad_norm": 0.2115519953166791, "learning_rate": 3.070567986230637e-05, "loss": 0.3661, "step": 1445 }, { "epoch": 1.342777519739898, "grad_norm": 0.21732387886026813, "learning_rate": 3.0688468158347675e-05, "loss": 0.3316, "step": 1446 }, { "epoch": 1.3437064561077565, "grad_norm": 0.20075318769521555, "learning_rate": 3.067125645438899e-05, "loss": 0.3554, "step": 1447 }, { "epoch": 1.3446353924756154, "grad_norm": 0.20861534949300697, "learning_rate": 3.065404475043029e-05, "loss": 0.3505, "step": 1448 }, { "epoch": 1.3455643288434742, "grad_norm": 0.254657487659329, "learning_rate": 3.0636833046471604e-05, "loss": 0.3625, "step": 1449 }, { "epoch": 1.346493265211333, "grad_norm": 0.2070643343223196, "learning_rate": 3.061962134251291e-05, "loss": 0.3239, "step": 1450 }, { "epoch": 1.347422201579192, "grad_norm": 0.1928014077888377, "learning_rate": 3.060240963855422e-05, "loss": 0.3558, "step": 1451 }, { "epoch": 1.3483511379470507, "grad_norm": 0.2294446621025945, "learning_rate": 3.0585197934595525e-05, "loss": 0.3674, "step": 1452 }, { "epoch": 1.3492800743149094, "grad_norm": 0.18874531273124526, "learning_rate": 3.056798623063684e-05, "loss": 0.3561, "step": 1453 }, { "epoch": 1.3502090106827682, "grad_norm": 0.22251661860503666, "learning_rate": 3.055077452667814e-05, "loss": 0.3603, "step": 1454 }, { "epoch": 1.351137947050627, "grad_norm": 0.2235668510889753, "learning_rate": 3.0533562822719454e-05, "loss": 0.3517, "step": 1455 }, { "epoch": 1.352066883418486, "grad_norm": 0.21432492708979067, "learning_rate": 3.0516351118760762e-05, "loss": 0.3496, "step": 1456 }, { "epoch": 1.3529958197863445, "grad_norm": 0.20494031548874958, "learning_rate": 3.049913941480207e-05, "loss": 0.3342, "step": 1457 }, { "epoch": 1.3539247561542034, "grad_norm": 0.20672906781500622, "learning_rate": 3.048192771084338e-05, "loss": 0.366, "step": 1458 }, { "epoch": 1.3548536925220622, "grad_norm": 0.2042325486051598, "learning_rate": 3.046471600688468e-05, "loss": 0.3293, "step": 1459 }, { "epoch": 1.355782628889921, "grad_norm": 0.20799195423693348, "learning_rate": 3.044750430292599e-05, "loss": 0.3273, "step": 1460 }, { "epoch": 1.35671156525778, "grad_norm": 0.1915937763700306, "learning_rate": 3.0430292598967298e-05, "loss": 0.3195, "step": 1461 }, { "epoch": 1.3576405016256388, "grad_norm": 0.18941206561090246, "learning_rate": 3.0413080895008606e-05, "loss": 0.3603, "step": 1462 }, { "epoch": 1.3585694379934974, "grad_norm": 0.22011411157308233, "learning_rate": 3.0395869191049914e-05, "loss": 0.3389, "step": 1463 }, { "epoch": 1.3594983743613562, "grad_norm": 0.20454140375565522, "learning_rate": 3.0378657487091223e-05, "loss": 0.3412, "step": 1464 }, { "epoch": 1.360427310729215, "grad_norm": 0.204807974919683, "learning_rate": 3.036144578313253e-05, "loss": 0.3688, "step": 1465 }, { "epoch": 1.361356247097074, "grad_norm": 0.2500844338812536, "learning_rate": 3.034423407917384e-05, "loss": 0.3528, "step": 1466 }, { "epoch": 1.3622851834649325, "grad_norm": 0.19177649858240708, "learning_rate": 3.0327022375215148e-05, "loss": 0.3371, "step": 1467 }, { "epoch": 1.3632141198327914, "grad_norm": 0.23935175027569758, "learning_rate": 3.0309810671256456e-05, "loss": 0.3659, "step": 1468 }, { "epoch": 1.3641430562006502, "grad_norm": 0.2132668493989873, "learning_rate": 3.0292598967297765e-05, "loss": 0.3447, "step": 1469 }, { "epoch": 1.365071992568509, "grad_norm": 0.2060605613886273, "learning_rate": 3.0275387263339073e-05, "loss": 0.3522, "step": 1470 }, { "epoch": 1.366000928936368, "grad_norm": 0.19961249600782785, "learning_rate": 3.025817555938038e-05, "loss": 0.3369, "step": 1471 }, { "epoch": 1.3669298653042268, "grad_norm": 0.20880345374087075, "learning_rate": 3.024096385542169e-05, "loss": 0.3371, "step": 1472 }, { "epoch": 1.3678588016720854, "grad_norm": 0.18622587429732448, "learning_rate": 3.0223752151463e-05, "loss": 0.3546, "step": 1473 }, { "epoch": 1.3687877380399442, "grad_norm": 0.21866694803984443, "learning_rate": 3.0206540447504307e-05, "loss": 0.3582, "step": 1474 }, { "epoch": 1.369716674407803, "grad_norm": 0.1952217882301225, "learning_rate": 3.0189328743545615e-05, "loss": 0.338, "step": 1475 }, { "epoch": 1.370645610775662, "grad_norm": 0.1935932378498297, "learning_rate": 3.0172117039586924e-05, "loss": 0.3311, "step": 1476 }, { "epoch": 1.3715745471435206, "grad_norm": 0.21516220551975052, "learning_rate": 3.0154905335628232e-05, "loss": 0.3438, "step": 1477 }, { "epoch": 1.3725034835113794, "grad_norm": 0.18251550631463553, "learning_rate": 3.013769363166954e-05, "loss": 0.355, "step": 1478 }, { "epoch": 1.3734324198792383, "grad_norm": 0.21335773806259328, "learning_rate": 3.012048192771085e-05, "loss": 0.3555, "step": 1479 }, { "epoch": 1.374361356247097, "grad_norm": 0.19458561617263528, "learning_rate": 3.010327022375215e-05, "loss": 0.3334, "step": 1480 }, { "epoch": 1.375290292614956, "grad_norm": 0.2006684368240207, "learning_rate": 3.008605851979346e-05, "loss": 0.3541, "step": 1481 }, { "epoch": 1.3762192289828148, "grad_norm": 0.20772428446640925, "learning_rate": 3.0068846815834767e-05, "loss": 0.3289, "step": 1482 }, { "epoch": 1.3771481653506734, "grad_norm": 0.1940082452881049, "learning_rate": 3.0051635111876076e-05, "loss": 0.3464, "step": 1483 }, { "epoch": 1.3780771017185323, "grad_norm": 0.21437646933932283, "learning_rate": 3.0034423407917384e-05, "loss": 0.3589, "step": 1484 }, { "epoch": 1.379006038086391, "grad_norm": 0.20429035849628274, "learning_rate": 3.0017211703958692e-05, "loss": 0.3367, "step": 1485 }, { "epoch": 1.37993497445425, "grad_norm": 0.19153881560052258, "learning_rate": 3e-05, "loss": 0.3595, "step": 1486 }, { "epoch": 1.3808639108221086, "grad_norm": 0.21368509231169802, "learning_rate": 2.998278829604131e-05, "loss": 0.338, "step": 1487 }, { "epoch": 1.3817928471899674, "grad_norm": 0.19137787166733314, "learning_rate": 2.9965576592082618e-05, "loss": 0.3409, "step": 1488 }, { "epoch": 1.3827217835578263, "grad_norm": 0.20518740175365516, "learning_rate": 2.9948364888123926e-05, "loss": 0.3458, "step": 1489 }, { "epoch": 1.3836507199256851, "grad_norm": 0.21123625494807322, "learning_rate": 2.9931153184165234e-05, "loss": 0.3577, "step": 1490 }, { "epoch": 1.384579656293544, "grad_norm": 0.20634934940948052, "learning_rate": 2.9913941480206543e-05, "loss": 0.3472, "step": 1491 }, { "epoch": 1.3855085926614028, "grad_norm": 0.20463549838354728, "learning_rate": 2.989672977624785e-05, "loss": 0.3457, "step": 1492 }, { "epoch": 1.3864375290292614, "grad_norm": 0.24704845466896294, "learning_rate": 2.987951807228916e-05, "loss": 0.3598, "step": 1493 }, { "epoch": 1.3873664653971203, "grad_norm": 0.1988515336691101, "learning_rate": 2.9862306368330468e-05, "loss": 0.3372, "step": 1494 }, { "epoch": 1.3882954017649791, "grad_norm": 0.20847564536416982, "learning_rate": 2.9845094664371776e-05, "loss": 0.3428, "step": 1495 }, { "epoch": 1.389224338132838, "grad_norm": 0.21413822989945702, "learning_rate": 2.9827882960413085e-05, "loss": 0.3787, "step": 1496 }, { "epoch": 1.3901532745006966, "grad_norm": 0.22368254215474134, "learning_rate": 2.9810671256454393e-05, "loss": 0.3139, "step": 1497 }, { "epoch": 1.3910822108685554, "grad_norm": 0.19657862058957046, "learning_rate": 2.97934595524957e-05, "loss": 0.3386, "step": 1498 }, { "epoch": 1.3920111472364143, "grad_norm": 0.20285755404055592, "learning_rate": 2.977624784853701e-05, "loss": 0.3551, "step": 1499 }, { "epoch": 1.3929400836042731, "grad_norm": 0.24237808422101634, "learning_rate": 2.975903614457832e-05, "loss": 0.3325, "step": 1500 }, { "epoch": 1.393869019972132, "grad_norm": 0.1939612180735908, "learning_rate": 2.974182444061962e-05, "loss": 0.3643, "step": 1501 }, { "epoch": 1.3947979563399908, "grad_norm": 0.25538025094523903, "learning_rate": 2.972461273666093e-05, "loss": 0.3472, "step": 1502 }, { "epoch": 1.3957268927078494, "grad_norm": 0.2411165671533172, "learning_rate": 2.9707401032702237e-05, "loss": 0.3618, "step": 1503 }, { "epoch": 1.3966558290757083, "grad_norm": 0.18225449502294644, "learning_rate": 2.9690189328743545e-05, "loss": 0.3378, "step": 1504 }, { "epoch": 1.3975847654435671, "grad_norm": 0.2222434673585961, "learning_rate": 2.9672977624784854e-05, "loss": 0.3501, "step": 1505 }, { "epoch": 1.398513701811426, "grad_norm": 0.22332973721087676, "learning_rate": 2.9655765920826162e-05, "loss": 0.3361, "step": 1506 }, { "epoch": 1.3994426381792846, "grad_norm": 0.204654731810479, "learning_rate": 2.963855421686747e-05, "loss": 0.3594, "step": 1507 }, { "epoch": 1.4003715745471434, "grad_norm": 0.18130868120983315, "learning_rate": 2.962134251290878e-05, "loss": 0.3504, "step": 1508 }, { "epoch": 1.4013005109150023, "grad_norm": 0.22439741167196856, "learning_rate": 2.9604130808950087e-05, "loss": 0.3291, "step": 1509 }, { "epoch": 1.4022294472828611, "grad_norm": 0.2513401337007612, "learning_rate": 2.9586919104991396e-05, "loss": 0.3703, "step": 1510 }, { "epoch": 1.40315838365072, "grad_norm": 0.20610567460377152, "learning_rate": 2.9569707401032704e-05, "loss": 0.3347, "step": 1511 }, { "epoch": 1.4040873200185788, "grad_norm": 0.2134355620033519, "learning_rate": 2.9552495697074012e-05, "loss": 0.3328, "step": 1512 }, { "epoch": 1.4050162563864375, "grad_norm": 0.26437614395333625, "learning_rate": 2.953528399311532e-05, "loss": 0.3456, "step": 1513 }, { "epoch": 1.4059451927542963, "grad_norm": 0.20273923027295387, "learning_rate": 2.951807228915663e-05, "loss": 0.3522, "step": 1514 }, { "epoch": 1.4068741291221551, "grad_norm": 0.23385915941470425, "learning_rate": 2.9500860585197938e-05, "loss": 0.3654, "step": 1515 }, { "epoch": 1.407803065490014, "grad_norm": 0.24126554734693004, "learning_rate": 2.9483648881239246e-05, "loss": 0.3486, "step": 1516 }, { "epoch": 1.4087320018578726, "grad_norm": 0.23539838223695975, "learning_rate": 2.9466437177280554e-05, "loss": 0.3526, "step": 1517 }, { "epoch": 1.4096609382257315, "grad_norm": 0.202514576386571, "learning_rate": 2.9449225473321863e-05, "loss": 0.347, "step": 1518 }, { "epoch": 1.4105898745935903, "grad_norm": 0.21679264136440565, "learning_rate": 2.943201376936317e-05, "loss": 0.3446, "step": 1519 }, { "epoch": 1.4115188109614492, "grad_norm": 0.20985791433208453, "learning_rate": 2.941480206540448e-05, "loss": 0.3336, "step": 1520 }, { "epoch": 1.412447747329308, "grad_norm": 0.20082124256632042, "learning_rate": 2.9397590361445788e-05, "loss": 0.3627, "step": 1521 }, { "epoch": 1.4133766836971668, "grad_norm": 0.21260803696246838, "learning_rate": 2.9380378657487096e-05, "loss": 0.3279, "step": 1522 }, { "epoch": 1.4143056200650255, "grad_norm": 0.19228212989651475, "learning_rate": 2.9363166953528398e-05, "loss": 0.3536, "step": 1523 }, { "epoch": 1.4152345564328843, "grad_norm": 0.1964531621998223, "learning_rate": 2.9345955249569706e-05, "loss": 0.3244, "step": 1524 }, { "epoch": 1.4161634928007432, "grad_norm": 0.20227705650558617, "learning_rate": 2.9328743545611015e-05, "loss": 0.3637, "step": 1525 }, { "epoch": 1.417092429168602, "grad_norm": 0.19644430453193695, "learning_rate": 2.9311531841652323e-05, "loss": 0.3379, "step": 1526 }, { "epoch": 1.4180213655364606, "grad_norm": 0.19015970604275795, "learning_rate": 2.929432013769363e-05, "loss": 0.3505, "step": 1527 }, { "epoch": 1.4189503019043195, "grad_norm": 0.19835877082909076, "learning_rate": 2.927710843373494e-05, "loss": 0.3479, "step": 1528 }, { "epoch": 1.4198792382721783, "grad_norm": 0.20490877773794255, "learning_rate": 2.925989672977625e-05, "loss": 0.3656, "step": 1529 }, { "epoch": 1.4208081746400372, "grad_norm": 0.2700572256039223, "learning_rate": 2.9242685025817557e-05, "loss": 0.35, "step": 1530 }, { "epoch": 1.421737111007896, "grad_norm": 0.19596937388189786, "learning_rate": 2.9225473321858865e-05, "loss": 0.3539, "step": 1531 }, { "epoch": 1.4226660473757549, "grad_norm": 0.21738021003203584, "learning_rate": 2.9208261617900174e-05, "loss": 0.3581, "step": 1532 }, { "epoch": 1.4235949837436135, "grad_norm": 0.20702370533181025, "learning_rate": 2.9191049913941482e-05, "loss": 0.3408, "step": 1533 }, { "epoch": 1.4245239201114723, "grad_norm": 0.18957085537630466, "learning_rate": 2.917383820998279e-05, "loss": 0.3734, "step": 1534 }, { "epoch": 1.4254528564793312, "grad_norm": 0.22813119163304144, "learning_rate": 2.91566265060241e-05, "loss": 0.3694, "step": 1535 }, { "epoch": 1.42638179284719, "grad_norm": 0.20400390978454386, "learning_rate": 2.9139414802065407e-05, "loss": 0.3435, "step": 1536 }, { "epoch": 1.4273107292150486, "grad_norm": 0.18107256509762557, "learning_rate": 2.9122203098106716e-05, "loss": 0.3558, "step": 1537 }, { "epoch": 1.4282396655829075, "grad_norm": 0.23378269273098054, "learning_rate": 2.9104991394148024e-05, "loss": 0.3488, "step": 1538 }, { "epoch": 1.4291686019507663, "grad_norm": 0.1881671167013014, "learning_rate": 2.9087779690189332e-05, "loss": 0.3399, "step": 1539 }, { "epoch": 1.4300975383186252, "grad_norm": 0.19700102593332322, "learning_rate": 2.907056798623064e-05, "loss": 0.3562, "step": 1540 }, { "epoch": 1.431026474686484, "grad_norm": 0.2013437891388278, "learning_rate": 2.905335628227195e-05, "loss": 0.342, "step": 1541 }, { "epoch": 1.4319554110543429, "grad_norm": 0.19061395252656796, "learning_rate": 2.9036144578313258e-05, "loss": 0.3286, "step": 1542 }, { "epoch": 1.4328843474222015, "grad_norm": 0.2133955681118407, "learning_rate": 2.9018932874354566e-05, "loss": 0.355, "step": 1543 }, { "epoch": 1.4338132837900603, "grad_norm": 0.21653616877161203, "learning_rate": 2.9001721170395868e-05, "loss": 0.3637, "step": 1544 }, { "epoch": 1.4347422201579192, "grad_norm": 0.2241202614347583, "learning_rate": 2.8984509466437176e-05, "loss": 0.3392, "step": 1545 }, { "epoch": 1.435671156525778, "grad_norm": 0.2024615955265462, "learning_rate": 2.8967297762478484e-05, "loss": 0.3543, "step": 1546 }, { "epoch": 1.4366000928936367, "grad_norm": 0.1955822427239985, "learning_rate": 2.8950086058519793e-05, "loss": 0.3364, "step": 1547 }, { "epoch": 1.4375290292614955, "grad_norm": 0.20873100814874504, "learning_rate": 2.89328743545611e-05, "loss": 0.3517, "step": 1548 }, { "epoch": 1.4384579656293544, "grad_norm": 0.2112000677453288, "learning_rate": 2.891566265060241e-05, "loss": 0.3469, "step": 1549 }, { "epoch": 1.4393869019972132, "grad_norm": 0.20592979867590985, "learning_rate": 2.8898450946643718e-05, "loss": 0.3601, "step": 1550 }, { "epoch": 1.440315838365072, "grad_norm": 0.21192866699319832, "learning_rate": 2.8881239242685026e-05, "loss": 0.3469, "step": 1551 }, { "epoch": 1.4412447747329309, "grad_norm": 0.18398546591558737, "learning_rate": 2.8864027538726335e-05, "loss": 0.3441, "step": 1552 }, { "epoch": 1.4421737111007895, "grad_norm": 0.2017578725992054, "learning_rate": 2.8846815834767643e-05, "loss": 0.3617, "step": 1553 }, { "epoch": 1.4431026474686484, "grad_norm": 0.20148355147165561, "learning_rate": 2.882960413080895e-05, "loss": 0.3574, "step": 1554 }, { "epoch": 1.4440315838365072, "grad_norm": 0.18503169901772, "learning_rate": 2.881239242685026e-05, "loss": 0.3602, "step": 1555 }, { "epoch": 1.444960520204366, "grad_norm": 0.19444893478794364, "learning_rate": 2.879518072289157e-05, "loss": 0.3423, "step": 1556 }, { "epoch": 1.4458894565722247, "grad_norm": 0.17839868562302205, "learning_rate": 2.8777969018932877e-05, "loss": 0.3228, "step": 1557 }, { "epoch": 1.4468183929400835, "grad_norm": 0.17929509233834304, "learning_rate": 2.8760757314974185e-05, "loss": 0.3096, "step": 1558 }, { "epoch": 1.4477473293079424, "grad_norm": 0.18214616689218266, "learning_rate": 2.8743545611015494e-05, "loss": 0.3522, "step": 1559 }, { "epoch": 1.4486762656758012, "grad_norm": 0.2012672967397137, "learning_rate": 2.8726333907056802e-05, "loss": 0.358, "step": 1560 }, { "epoch": 1.44960520204366, "grad_norm": 0.19930879626694126, "learning_rate": 2.870912220309811e-05, "loss": 0.3431, "step": 1561 }, { "epoch": 1.450534138411519, "grad_norm": 0.17903764713191608, "learning_rate": 2.869191049913942e-05, "loss": 0.3396, "step": 1562 }, { "epoch": 1.4514630747793777, "grad_norm": 0.19529230647348247, "learning_rate": 2.8674698795180727e-05, "loss": 0.3592, "step": 1563 }, { "epoch": 1.4523920111472364, "grad_norm": 0.19975868842982136, "learning_rate": 2.8657487091222036e-05, "loss": 0.355, "step": 1564 }, { "epoch": 1.4533209475150952, "grad_norm": 0.20358611197286738, "learning_rate": 2.8640275387263337e-05, "loss": 0.3527, "step": 1565 }, { "epoch": 1.454249883882954, "grad_norm": 0.19856276668969844, "learning_rate": 2.8623063683304646e-05, "loss": 0.3266, "step": 1566 }, { "epoch": 1.4551788202508127, "grad_norm": 0.19124828137152422, "learning_rate": 2.8605851979345954e-05, "loss": 0.3324, "step": 1567 }, { "epoch": 1.4561077566186715, "grad_norm": 0.20725907417756995, "learning_rate": 2.8588640275387262e-05, "loss": 0.3357, "step": 1568 }, { "epoch": 1.4570366929865304, "grad_norm": 0.2176938171580828, "learning_rate": 2.857142857142857e-05, "loss": 0.3435, "step": 1569 }, { "epoch": 1.4579656293543892, "grad_norm": 0.21108437489228465, "learning_rate": 2.855421686746988e-05, "loss": 0.3466, "step": 1570 }, { "epoch": 1.458894565722248, "grad_norm": 0.22480057073204013, "learning_rate": 2.8537005163511188e-05, "loss": 0.3606, "step": 1571 }, { "epoch": 1.459823502090107, "grad_norm": 0.20864206196181312, "learning_rate": 2.8519793459552496e-05, "loss": 0.339, "step": 1572 }, { "epoch": 1.4607524384579658, "grad_norm": 0.21922099540081025, "learning_rate": 2.8502581755593804e-05, "loss": 0.3444, "step": 1573 }, { "epoch": 1.4616813748258244, "grad_norm": 0.2276494800159974, "learning_rate": 2.8485370051635113e-05, "loss": 0.3599, "step": 1574 }, { "epoch": 1.4626103111936832, "grad_norm": 0.2288239429440535, "learning_rate": 2.846815834767642e-05, "loss": 0.3491, "step": 1575 }, { "epoch": 1.463539247561542, "grad_norm": 0.17841881990882816, "learning_rate": 2.845094664371773e-05, "loss": 0.343, "step": 1576 }, { "epoch": 1.4644681839294007, "grad_norm": 0.21286928712739886, "learning_rate": 2.8433734939759038e-05, "loss": 0.3521, "step": 1577 }, { "epoch": 1.4653971202972595, "grad_norm": 0.21833463235565898, "learning_rate": 2.8416523235800346e-05, "loss": 0.3438, "step": 1578 }, { "epoch": 1.4663260566651184, "grad_norm": 0.18733866679034886, "learning_rate": 2.8399311531841655e-05, "loss": 0.3382, "step": 1579 }, { "epoch": 1.4672549930329772, "grad_norm": 0.20885255362383456, "learning_rate": 2.8382099827882963e-05, "loss": 0.3352, "step": 1580 }, { "epoch": 1.468183929400836, "grad_norm": 0.2110857036563971, "learning_rate": 2.836488812392427e-05, "loss": 0.3451, "step": 1581 }, { "epoch": 1.469112865768695, "grad_norm": 0.21499377814488893, "learning_rate": 2.834767641996558e-05, "loss": 0.344, "step": 1582 }, { "epoch": 1.4700418021365538, "grad_norm": 0.2241849573695223, "learning_rate": 2.833046471600689e-05, "loss": 0.3292, "step": 1583 }, { "epoch": 1.4709707385044124, "grad_norm": 0.21699607258286505, "learning_rate": 2.8313253012048197e-05, "loss": 0.334, "step": 1584 }, { "epoch": 1.4718996748722712, "grad_norm": 0.22542594584645712, "learning_rate": 2.8296041308089505e-05, "loss": 0.3477, "step": 1585 }, { "epoch": 1.47282861124013, "grad_norm": 0.1970265391341249, "learning_rate": 2.8278829604130807e-05, "loss": 0.3338, "step": 1586 }, { "epoch": 1.4737575476079887, "grad_norm": 0.21144181981047877, "learning_rate": 2.8261617900172115e-05, "loss": 0.3657, "step": 1587 }, { "epoch": 1.4746864839758476, "grad_norm": 0.22030587436035384, "learning_rate": 2.8244406196213424e-05, "loss": 0.3263, "step": 1588 }, { "epoch": 1.4756154203437064, "grad_norm": 0.21592987334276875, "learning_rate": 2.8227194492254732e-05, "loss": 0.3691, "step": 1589 }, { "epoch": 1.4765443567115653, "grad_norm": 0.20142452673876848, "learning_rate": 2.820998278829604e-05, "loss": 0.3485, "step": 1590 }, { "epoch": 1.477473293079424, "grad_norm": 0.21246785539377971, "learning_rate": 2.819277108433735e-05, "loss": 0.3379, "step": 1591 }, { "epoch": 1.478402229447283, "grad_norm": 0.20419932770516422, "learning_rate": 2.8175559380378657e-05, "loss": 0.3664, "step": 1592 }, { "epoch": 1.4793311658151418, "grad_norm": 0.23868328892673205, "learning_rate": 2.8158347676419966e-05, "loss": 0.3579, "step": 1593 }, { "epoch": 1.4802601021830004, "grad_norm": 0.18142148540515865, "learning_rate": 2.8141135972461274e-05, "loss": 0.3293, "step": 1594 }, { "epoch": 1.4811890385508593, "grad_norm": 0.190821964829027, "learning_rate": 2.8123924268502582e-05, "loss": 0.3336, "step": 1595 }, { "epoch": 1.482117974918718, "grad_norm": 0.1990392036313888, "learning_rate": 2.810671256454389e-05, "loss": 0.3639, "step": 1596 }, { "epoch": 1.483046911286577, "grad_norm": 0.20621738698691292, "learning_rate": 2.80895008605852e-05, "loss": 0.3524, "step": 1597 }, { "epoch": 1.4839758476544356, "grad_norm": 0.21378160126323356, "learning_rate": 2.8072289156626508e-05, "loss": 0.3507, "step": 1598 }, { "epoch": 1.4849047840222944, "grad_norm": 0.2159950509848571, "learning_rate": 2.8055077452667816e-05, "loss": 0.3549, "step": 1599 }, { "epoch": 1.4858337203901533, "grad_norm": 0.18702549069355595, "learning_rate": 2.8037865748709124e-05, "loss": 0.3366, "step": 1600 }, { "epoch": 1.4867626567580121, "grad_norm": 0.1994967299413536, "learning_rate": 2.8020654044750433e-05, "loss": 0.3256, "step": 1601 }, { "epoch": 1.487691593125871, "grad_norm": 0.21942189823000416, "learning_rate": 2.800344234079174e-05, "loss": 0.3581, "step": 1602 }, { "epoch": 1.4886205294937298, "grad_norm": 0.1766879531892212, "learning_rate": 2.798623063683305e-05, "loss": 0.3457, "step": 1603 }, { "epoch": 1.4895494658615884, "grad_norm": 0.19258517625663005, "learning_rate": 2.7969018932874358e-05, "loss": 0.3425, "step": 1604 }, { "epoch": 1.4904784022294473, "grad_norm": 0.2101457920763198, "learning_rate": 2.7951807228915666e-05, "loss": 0.3442, "step": 1605 }, { "epoch": 1.4914073385973061, "grad_norm": 0.17432791499600817, "learning_rate": 2.7934595524956975e-05, "loss": 0.3537, "step": 1606 }, { "epoch": 1.492336274965165, "grad_norm": 0.21787927461103915, "learning_rate": 2.7917383820998277e-05, "loss": 0.3375, "step": 1607 }, { "epoch": 1.4932652113330236, "grad_norm": 0.2135791752018399, "learning_rate": 2.7900172117039585e-05, "loss": 0.3642, "step": 1608 }, { "epoch": 1.4941941477008824, "grad_norm": 0.19691607764966712, "learning_rate": 2.7882960413080893e-05, "loss": 0.3492, "step": 1609 }, { "epoch": 1.4951230840687413, "grad_norm": 0.19492765300345521, "learning_rate": 2.7865748709122202e-05, "loss": 0.3657, "step": 1610 }, { "epoch": 1.4960520204366001, "grad_norm": 0.20105927888888867, "learning_rate": 2.784853700516351e-05, "loss": 0.3522, "step": 1611 }, { "epoch": 1.496980956804459, "grad_norm": 0.2022257310559514, "learning_rate": 2.783132530120482e-05, "loss": 0.3352, "step": 1612 }, { "epoch": 1.4979098931723178, "grad_norm": 0.18412786976848391, "learning_rate": 2.7814113597246127e-05, "loss": 0.3569, "step": 1613 }, { "epoch": 1.4988388295401764, "grad_norm": 0.18521152850671202, "learning_rate": 2.7796901893287435e-05, "loss": 0.3291, "step": 1614 }, { "epoch": 1.4997677659080353, "grad_norm": 0.18484257371830262, "learning_rate": 2.7779690189328744e-05, "loss": 0.3471, "step": 1615 }, { "epoch": 1.5006967022758941, "grad_norm": 0.20613823228990855, "learning_rate": 2.7762478485370052e-05, "loss": 0.3523, "step": 1616 }, { "epoch": 1.5016256386437528, "grad_norm": 0.21700406423639299, "learning_rate": 2.774526678141136e-05, "loss": 0.3402, "step": 1617 }, { "epoch": 1.5025545750116116, "grad_norm": 0.1825892710960065, "learning_rate": 2.772805507745267e-05, "loss": 0.3463, "step": 1618 }, { "epoch": 1.5034835113794704, "grad_norm": 0.21767334282347994, "learning_rate": 2.7710843373493977e-05, "loss": 0.3591, "step": 1619 }, { "epoch": 1.5044124477473293, "grad_norm": 0.2240701912543056, "learning_rate": 2.7693631669535286e-05, "loss": 0.3491, "step": 1620 }, { "epoch": 1.5053413841151881, "grad_norm": 0.20794265732288209, "learning_rate": 2.7676419965576594e-05, "loss": 0.3597, "step": 1621 }, { "epoch": 1.506270320483047, "grad_norm": 0.1942386308673407, "learning_rate": 2.7659208261617903e-05, "loss": 0.3553, "step": 1622 }, { "epoch": 1.5071992568509058, "grad_norm": 0.23953241896174912, "learning_rate": 2.764199655765921e-05, "loss": 0.3353, "step": 1623 }, { "epoch": 1.5081281932187647, "grad_norm": 0.19064283645632496, "learning_rate": 2.762478485370052e-05, "loss": 0.3309, "step": 1624 }, { "epoch": 1.5090571295866233, "grad_norm": 0.19455048912942696, "learning_rate": 2.7607573149741828e-05, "loss": 0.3512, "step": 1625 }, { "epoch": 1.5099860659544821, "grad_norm": 0.24734707148619361, "learning_rate": 2.7590361445783136e-05, "loss": 0.3369, "step": 1626 }, { "epoch": 1.5109150023223408, "grad_norm": 0.19596045941652765, "learning_rate": 2.7573149741824445e-05, "loss": 0.3447, "step": 1627 }, { "epoch": 1.5118439386901996, "grad_norm": 0.20593155094983895, "learning_rate": 2.7555938037865753e-05, "loss": 0.3281, "step": 1628 }, { "epoch": 1.5127728750580585, "grad_norm": 0.18985986735657961, "learning_rate": 2.7538726333907055e-05, "loss": 0.3444, "step": 1629 }, { "epoch": 1.5137018114259173, "grad_norm": 0.20048398993083572, "learning_rate": 2.7521514629948363e-05, "loss": 0.3352, "step": 1630 }, { "epoch": 1.5146307477937762, "grad_norm": 0.2340324824637472, "learning_rate": 2.750430292598967e-05, "loss": 0.3547, "step": 1631 }, { "epoch": 1.515559684161635, "grad_norm": 0.1979649285811828, "learning_rate": 2.748709122203098e-05, "loss": 0.3282, "step": 1632 }, { "epoch": 1.5164886205294938, "grad_norm": 0.22728561652019158, "learning_rate": 2.7469879518072288e-05, "loss": 0.3433, "step": 1633 }, { "epoch": 1.5174175568973527, "grad_norm": 0.1909605625979193, "learning_rate": 2.7452667814113597e-05, "loss": 0.3481, "step": 1634 }, { "epoch": 1.5183464932652113, "grad_norm": 0.21650749973633887, "learning_rate": 2.7435456110154905e-05, "loss": 0.3316, "step": 1635 }, { "epoch": 1.5192754296330702, "grad_norm": 0.20766956553484378, "learning_rate": 2.7418244406196213e-05, "loss": 0.3553, "step": 1636 }, { "epoch": 1.5202043660009288, "grad_norm": 0.19184440178799098, "learning_rate": 2.7401032702237522e-05, "loss": 0.3479, "step": 1637 }, { "epoch": 1.5211333023687876, "grad_norm": 0.19041440103788876, "learning_rate": 2.738382099827883e-05, "loss": 0.3229, "step": 1638 }, { "epoch": 1.5220622387366465, "grad_norm": 0.19777552122771166, "learning_rate": 2.736660929432014e-05, "loss": 0.3535, "step": 1639 }, { "epoch": 1.5229911751045053, "grad_norm": 0.2004208302269382, "learning_rate": 2.7349397590361447e-05, "loss": 0.34, "step": 1640 }, { "epoch": 1.5239201114723642, "grad_norm": 0.20336163065057344, "learning_rate": 2.7332185886402755e-05, "loss": 0.3521, "step": 1641 }, { "epoch": 1.524849047840223, "grad_norm": 0.19703431204687524, "learning_rate": 2.7314974182444064e-05, "loss": 0.3414, "step": 1642 }, { "epoch": 1.5257779842080819, "grad_norm": 0.19845951237979154, "learning_rate": 2.7297762478485372e-05, "loss": 0.3677, "step": 1643 }, { "epoch": 1.5267069205759407, "grad_norm": 0.19034132547856192, "learning_rate": 2.728055077452668e-05, "loss": 0.3406, "step": 1644 }, { "epoch": 1.5276358569437993, "grad_norm": 0.18091025641587147, "learning_rate": 2.726333907056799e-05, "loss": 0.3313, "step": 1645 }, { "epoch": 1.5285647933116582, "grad_norm": 0.18588920730588274, "learning_rate": 2.7246127366609297e-05, "loss": 0.3379, "step": 1646 }, { "epoch": 1.5294937296795168, "grad_norm": 0.183194983638107, "learning_rate": 2.7228915662650606e-05, "loss": 0.3514, "step": 1647 }, { "epoch": 1.5304226660473756, "grad_norm": 0.20313603959857246, "learning_rate": 2.7211703958691914e-05, "loss": 0.3148, "step": 1648 }, { "epoch": 1.5313516024152345, "grad_norm": 0.18413859078792266, "learning_rate": 2.7194492254733223e-05, "loss": 0.3406, "step": 1649 }, { "epoch": 1.5322805387830933, "grad_norm": 0.2218660778677224, "learning_rate": 2.7177280550774524e-05, "loss": 0.3473, "step": 1650 }, { "epoch": 1.5332094751509522, "grad_norm": 0.20256666218542452, "learning_rate": 2.7160068846815833e-05, "loss": 0.327, "step": 1651 }, { "epoch": 1.534138411518811, "grad_norm": 0.20973362475741678, "learning_rate": 2.714285714285714e-05, "loss": 0.3382, "step": 1652 }, { "epoch": 1.5350673478866699, "grad_norm": 0.21420987345417866, "learning_rate": 2.712564543889845e-05, "loss": 0.3549, "step": 1653 }, { "epoch": 1.5359962842545287, "grad_norm": 0.20547490967457238, "learning_rate": 2.7108433734939758e-05, "loss": 0.3692, "step": 1654 }, { "epoch": 1.5369252206223873, "grad_norm": 0.20579644597140412, "learning_rate": 2.7091222030981066e-05, "loss": 0.3603, "step": 1655 }, { "epoch": 1.5378541569902462, "grad_norm": 0.17767519418921987, "learning_rate": 2.7074010327022375e-05, "loss": 0.3207, "step": 1656 }, { "epoch": 1.5387830933581048, "grad_norm": 0.21067954281906232, "learning_rate": 2.7056798623063683e-05, "loss": 0.3668, "step": 1657 }, { "epoch": 1.5397120297259637, "grad_norm": 0.1962228244666741, "learning_rate": 2.703958691910499e-05, "loss": 0.3492, "step": 1658 }, { "epoch": 1.5406409660938225, "grad_norm": 0.19557765329976629, "learning_rate": 2.70223752151463e-05, "loss": 0.3474, "step": 1659 }, { "epoch": 1.5415699024616814, "grad_norm": 0.18500343001861966, "learning_rate": 2.7005163511187608e-05, "loss": 0.3416, "step": 1660 }, { "epoch": 1.5424988388295402, "grad_norm": 0.2023975455255552, "learning_rate": 2.6987951807228917e-05, "loss": 0.3406, "step": 1661 }, { "epoch": 1.543427775197399, "grad_norm": 0.19197373382978503, "learning_rate": 2.6970740103270225e-05, "loss": 0.3494, "step": 1662 }, { "epoch": 1.544356711565258, "grad_norm": 0.200189895799683, "learning_rate": 2.6953528399311533e-05, "loss": 0.3437, "step": 1663 }, { "epoch": 1.5452856479331167, "grad_norm": 0.2224630313261313, "learning_rate": 2.6936316695352842e-05, "loss": 0.362, "step": 1664 }, { "epoch": 1.5462145843009754, "grad_norm": 0.22940474152039902, "learning_rate": 2.691910499139415e-05, "loss": 0.3469, "step": 1665 }, { "epoch": 1.5471435206688342, "grad_norm": 0.2206713442498346, "learning_rate": 2.690189328743546e-05, "loss": 0.3565, "step": 1666 }, { "epoch": 1.5480724570366928, "grad_norm": 0.19182785647681846, "learning_rate": 2.6884681583476767e-05, "loss": 0.3279, "step": 1667 }, { "epoch": 1.5490013934045517, "grad_norm": 0.23133211363227132, "learning_rate": 2.6867469879518075e-05, "loss": 0.3571, "step": 1668 }, { "epoch": 1.5499303297724105, "grad_norm": 0.19494744045107942, "learning_rate": 2.6850258175559384e-05, "loss": 0.325, "step": 1669 }, { "epoch": 1.5508592661402694, "grad_norm": 0.19903204750297201, "learning_rate": 2.6833046471600692e-05, "loss": 0.3385, "step": 1670 }, { "epoch": 1.5517882025081282, "grad_norm": 0.1941975771860589, "learning_rate": 2.6815834767641994e-05, "loss": 0.3472, "step": 1671 }, { "epoch": 1.552717138875987, "grad_norm": 0.20247277746107317, "learning_rate": 2.6798623063683302e-05, "loss": 0.3473, "step": 1672 }, { "epoch": 1.553646075243846, "grad_norm": 0.21609956845102268, "learning_rate": 2.678141135972461e-05, "loss": 0.3259, "step": 1673 }, { "epoch": 1.5545750116117047, "grad_norm": 0.19593104013837537, "learning_rate": 2.676419965576592e-05, "loss": 0.354, "step": 1674 }, { "epoch": 1.5555039479795634, "grad_norm": 0.18796258655354342, "learning_rate": 2.6746987951807227e-05, "loss": 0.3506, "step": 1675 }, { "epoch": 1.5564328843474222, "grad_norm": 0.2059985043490576, "learning_rate": 2.6729776247848536e-05, "loss": 0.3533, "step": 1676 }, { "epoch": 1.5573618207152808, "grad_norm": 0.2268856053908675, "learning_rate": 2.6712564543889844e-05, "loss": 0.3617, "step": 1677 }, { "epoch": 1.5582907570831397, "grad_norm": 0.19081957209925454, "learning_rate": 2.6695352839931153e-05, "loss": 0.3425, "step": 1678 }, { "epoch": 1.5592196934509985, "grad_norm": 0.21370626444738328, "learning_rate": 2.667814113597246e-05, "loss": 0.3435, "step": 1679 }, { "epoch": 1.5601486298188574, "grad_norm": 0.20241828803392062, "learning_rate": 2.666092943201377e-05, "loss": 0.3471, "step": 1680 }, { "epoch": 1.5610775661867162, "grad_norm": 0.2126563580465179, "learning_rate": 2.6643717728055078e-05, "loss": 0.3345, "step": 1681 }, { "epoch": 1.562006502554575, "grad_norm": 0.19313249003359623, "learning_rate": 2.6626506024096386e-05, "loss": 0.3367, "step": 1682 }, { "epoch": 1.562935438922434, "grad_norm": 0.20080971884495982, "learning_rate": 2.6609294320137695e-05, "loss": 0.3512, "step": 1683 }, { "epoch": 1.5638643752902928, "grad_norm": 0.2326488096574912, "learning_rate": 2.6592082616179003e-05, "loss": 0.3507, "step": 1684 }, { "epoch": 1.5647933116581514, "grad_norm": 0.20822523814151891, "learning_rate": 2.657487091222031e-05, "loss": 0.3483, "step": 1685 }, { "epoch": 1.5657222480260102, "grad_norm": 0.21602169294812015, "learning_rate": 2.655765920826162e-05, "loss": 0.3451, "step": 1686 }, { "epoch": 1.5666511843938689, "grad_norm": 0.2026495811587196, "learning_rate": 2.6540447504302928e-05, "loss": 0.3401, "step": 1687 }, { "epoch": 1.5675801207617277, "grad_norm": 0.20373496856303852, "learning_rate": 2.6523235800344237e-05, "loss": 0.3472, "step": 1688 }, { "epoch": 1.5685090571295865, "grad_norm": 0.202815214563734, "learning_rate": 2.6506024096385545e-05, "loss": 0.3443, "step": 1689 }, { "epoch": 1.5694379934974454, "grad_norm": 0.20807192948042533, "learning_rate": 2.6488812392426853e-05, "loss": 0.3244, "step": 1690 }, { "epoch": 1.5703669298653042, "grad_norm": 0.19319517687258816, "learning_rate": 2.6471600688468162e-05, "loss": 0.3382, "step": 1691 }, { "epoch": 1.571295866233163, "grad_norm": 0.22053885061161396, "learning_rate": 2.6454388984509463e-05, "loss": 0.3666, "step": 1692 }, { "epoch": 1.572224802601022, "grad_norm": 0.207890820090718, "learning_rate": 2.6437177280550772e-05, "loss": 0.3436, "step": 1693 }, { "epoch": 1.5731537389688808, "grad_norm": 0.19245242691550665, "learning_rate": 2.641996557659208e-05, "loss": 0.3539, "step": 1694 }, { "epoch": 1.5740826753367394, "grad_norm": 0.21843726413189252, "learning_rate": 2.640275387263339e-05, "loss": 0.3421, "step": 1695 }, { "epoch": 1.5750116117045982, "grad_norm": 0.220646507516627, "learning_rate": 2.6385542168674697e-05, "loss": 0.3541, "step": 1696 }, { "epoch": 1.5759405480724569, "grad_norm": 0.22537203693214305, "learning_rate": 2.6368330464716005e-05, "loss": 0.3277, "step": 1697 }, { "epoch": 1.5768694844403157, "grad_norm": 0.20461650445631827, "learning_rate": 2.6351118760757314e-05, "loss": 0.3387, "step": 1698 }, { "epoch": 1.5777984208081746, "grad_norm": 0.2136483563667655, "learning_rate": 2.6333907056798622e-05, "loss": 0.3426, "step": 1699 }, { "epoch": 1.5787273571760334, "grad_norm": 0.19981232379812758, "learning_rate": 2.631669535283993e-05, "loss": 0.3454, "step": 1700 }, { "epoch": 1.5796562935438923, "grad_norm": 0.23165196856655018, "learning_rate": 2.629948364888124e-05, "loss": 0.3775, "step": 1701 }, { "epoch": 1.580585229911751, "grad_norm": 0.2055708819872716, "learning_rate": 2.6282271944922547e-05, "loss": 0.3398, "step": 1702 }, { "epoch": 1.58151416627961, "grad_norm": 0.20549199744598382, "learning_rate": 2.6265060240963856e-05, "loss": 0.3303, "step": 1703 }, { "epoch": 1.5824431026474688, "grad_norm": 0.19735611925338659, "learning_rate": 2.6247848537005164e-05, "loss": 0.328, "step": 1704 }, { "epoch": 1.5833720390153274, "grad_norm": 0.18310842492165016, "learning_rate": 2.6230636833046473e-05, "loss": 0.343, "step": 1705 }, { "epoch": 1.5843009753831863, "grad_norm": 0.1873807736634079, "learning_rate": 2.621342512908778e-05, "loss": 0.3481, "step": 1706 }, { "epoch": 1.585229911751045, "grad_norm": 0.20000318958063903, "learning_rate": 2.619621342512909e-05, "loss": 0.3356, "step": 1707 }, { "epoch": 1.5861588481189037, "grad_norm": 0.2015136507544033, "learning_rate": 2.6179001721170398e-05, "loss": 0.3324, "step": 1708 }, { "epoch": 1.5870877844867626, "grad_norm": 0.18262820920953574, "learning_rate": 2.6161790017211706e-05, "loss": 0.3532, "step": 1709 }, { "epoch": 1.5880167208546214, "grad_norm": 0.23344245143312825, "learning_rate": 2.6144578313253015e-05, "loss": 0.3775, "step": 1710 }, { "epoch": 1.5889456572224803, "grad_norm": 0.2199511607804858, "learning_rate": 2.6127366609294323e-05, "loss": 0.3168, "step": 1711 }, { "epoch": 1.5898745935903391, "grad_norm": 0.212983265606066, "learning_rate": 2.611015490533563e-05, "loss": 0.3437, "step": 1712 }, { "epoch": 1.590803529958198, "grad_norm": 0.213128136916506, "learning_rate": 2.6092943201376936e-05, "loss": 0.3754, "step": 1713 }, { "epoch": 1.5917324663260568, "grad_norm": 0.2448986605848652, "learning_rate": 2.6075731497418245e-05, "loss": 0.352, "step": 1714 }, { "epoch": 1.5926614026939154, "grad_norm": 0.20674274942405582, "learning_rate": 2.605851979345955e-05, "loss": 0.3496, "step": 1715 }, { "epoch": 1.5935903390617743, "grad_norm": 0.18776586917192614, "learning_rate": 2.6041308089500858e-05, "loss": 0.3322, "step": 1716 }, { "epoch": 1.5945192754296331, "grad_norm": 0.20953790690241902, "learning_rate": 2.6024096385542167e-05, "loss": 0.331, "step": 1717 }, { "epoch": 1.5954482117974917, "grad_norm": 0.20314003333327893, "learning_rate": 2.6006884681583475e-05, "loss": 0.3357, "step": 1718 }, { "epoch": 1.5963771481653506, "grad_norm": 0.20922503716379712, "learning_rate": 2.5989672977624783e-05, "loss": 0.3572, "step": 1719 }, { "epoch": 1.5973060845332094, "grad_norm": 0.20988177401915958, "learning_rate": 2.5972461273666092e-05, "loss": 0.352, "step": 1720 }, { "epoch": 1.5982350209010683, "grad_norm": 0.21315542074548124, "learning_rate": 2.59552495697074e-05, "loss": 0.3626, "step": 1721 }, { "epoch": 1.5991639572689271, "grad_norm": 0.20813871660567065, "learning_rate": 2.593803786574871e-05, "loss": 0.3474, "step": 1722 }, { "epoch": 1.600092893636786, "grad_norm": 0.1872599355028666, "learning_rate": 2.5920826161790017e-05, "loss": 0.3445, "step": 1723 }, { "epoch": 1.6010218300046448, "grad_norm": 0.1911940128780578, "learning_rate": 2.5903614457831325e-05, "loss": 0.3391, "step": 1724 }, { "epoch": 1.6019507663725034, "grad_norm": 0.2092461298477511, "learning_rate": 2.5886402753872634e-05, "loss": 0.3539, "step": 1725 }, { "epoch": 1.6028797027403623, "grad_norm": 0.20596276428098503, "learning_rate": 2.5869191049913942e-05, "loss": 0.3539, "step": 1726 }, { "epoch": 1.6038086391082211, "grad_norm": 0.1932818475108648, "learning_rate": 2.585197934595525e-05, "loss": 0.3526, "step": 1727 }, { "epoch": 1.6047375754760798, "grad_norm": 0.21283715898106823, "learning_rate": 2.583476764199656e-05, "loss": 0.3468, "step": 1728 }, { "epoch": 1.6056665118439386, "grad_norm": 0.20794295159851667, "learning_rate": 2.5817555938037867e-05, "loss": 0.3496, "step": 1729 }, { "epoch": 1.6065954482117974, "grad_norm": 0.20156095064197782, "learning_rate": 2.5800344234079176e-05, "loss": 0.3396, "step": 1730 }, { "epoch": 1.6075243845796563, "grad_norm": 0.18701079703686946, "learning_rate": 2.5783132530120484e-05, "loss": 0.3491, "step": 1731 }, { "epoch": 1.6084533209475151, "grad_norm": 0.19536223716914838, "learning_rate": 2.5765920826161793e-05, "loss": 0.3377, "step": 1732 }, { "epoch": 1.609382257315374, "grad_norm": 0.17791846865411473, "learning_rate": 2.57487091222031e-05, "loss": 0.3472, "step": 1733 }, { "epoch": 1.6103111936832328, "grad_norm": 0.2129004966083023, "learning_rate": 2.573149741824441e-05, "loss": 0.329, "step": 1734 }, { "epoch": 1.6112401300510915, "grad_norm": 0.18814499058025813, "learning_rate": 2.5714285714285714e-05, "loss": 0.3429, "step": 1735 }, { "epoch": 1.6121690664189503, "grad_norm": 0.1809854014670112, "learning_rate": 2.5697074010327023e-05, "loss": 0.3237, "step": 1736 }, { "epoch": 1.6130980027868091, "grad_norm": 0.18538402717591057, "learning_rate": 2.567986230636833e-05, "loss": 0.3497, "step": 1737 }, { "epoch": 1.6140269391546678, "grad_norm": 0.18559136167726656, "learning_rate": 2.566265060240964e-05, "loss": 0.354, "step": 1738 }, { "epoch": 1.6149558755225266, "grad_norm": 0.20263301316144824, "learning_rate": 2.5645438898450948e-05, "loss": 0.3672, "step": 1739 }, { "epoch": 1.6158848118903855, "grad_norm": 0.196423315030862, "learning_rate": 2.5628227194492253e-05, "loss": 0.3408, "step": 1740 }, { "epoch": 1.6168137482582443, "grad_norm": 0.19368812454813836, "learning_rate": 2.561101549053356e-05, "loss": 0.348, "step": 1741 }, { "epoch": 1.6177426846261032, "grad_norm": 0.19885581508283579, "learning_rate": 2.559380378657487e-05, "loss": 0.3371, "step": 1742 }, { "epoch": 1.618671620993962, "grad_norm": 0.21769073543185363, "learning_rate": 2.5576592082616178e-05, "loss": 0.3652, "step": 1743 }, { "epoch": 1.6196005573618208, "grad_norm": 0.215179619394168, "learning_rate": 2.5559380378657487e-05, "loss": 0.3412, "step": 1744 }, { "epoch": 1.6205294937296795, "grad_norm": 0.1898709834286627, "learning_rate": 2.5542168674698795e-05, "loss": 0.3352, "step": 1745 }, { "epoch": 1.6214584300975383, "grad_norm": 0.24343732302612947, "learning_rate": 2.5524956970740103e-05, "loss": 0.3271, "step": 1746 }, { "epoch": 1.6223873664653972, "grad_norm": 0.19406338350847918, "learning_rate": 2.5507745266781412e-05, "loss": 0.3207, "step": 1747 }, { "epoch": 1.6233163028332558, "grad_norm": 0.18606967526005735, "learning_rate": 2.549053356282272e-05, "loss": 0.3402, "step": 1748 }, { "epoch": 1.6242452392011146, "grad_norm": 0.24944407718719433, "learning_rate": 2.547332185886403e-05, "loss": 0.3327, "step": 1749 }, { "epoch": 1.6251741755689735, "grad_norm": 0.20574480952491447, "learning_rate": 2.5456110154905337e-05, "loss": 0.3353, "step": 1750 }, { "epoch": 1.6261031119368323, "grad_norm": 0.1899167111957658, "learning_rate": 2.5438898450946645e-05, "loss": 0.3533, "step": 1751 }, { "epoch": 1.6270320483046912, "grad_norm": 0.2355371100922719, "learning_rate": 2.5421686746987954e-05, "loss": 0.3442, "step": 1752 }, { "epoch": 1.62796098467255, "grad_norm": 0.22707474270252473, "learning_rate": 2.5404475043029262e-05, "loss": 0.3485, "step": 1753 }, { "epoch": 1.6288899210404089, "grad_norm": 0.23297359604796236, "learning_rate": 2.538726333907057e-05, "loss": 0.3492, "step": 1754 }, { "epoch": 1.6298188574082675, "grad_norm": 0.18949532826143803, "learning_rate": 2.537005163511188e-05, "loss": 0.3479, "step": 1755 }, { "epoch": 1.6307477937761263, "grad_norm": 0.18362249976581027, "learning_rate": 2.5352839931153184e-05, "loss": 0.3581, "step": 1756 }, { "epoch": 1.6316767301439852, "grad_norm": 0.2554058616714679, "learning_rate": 2.5335628227194492e-05, "loss": 0.3546, "step": 1757 }, { "epoch": 1.6326056665118438, "grad_norm": 0.22540032891893913, "learning_rate": 2.53184165232358e-05, "loss": 0.3619, "step": 1758 }, { "epoch": 1.6335346028797026, "grad_norm": 0.20418915234656554, "learning_rate": 2.530120481927711e-05, "loss": 0.361, "step": 1759 }, { "epoch": 1.6344635392475615, "grad_norm": 0.19977889622190362, "learning_rate": 2.5283993115318418e-05, "loss": 0.3377, "step": 1760 }, { "epoch": 1.6353924756154203, "grad_norm": 0.22689586847036763, "learning_rate": 2.5266781411359726e-05, "loss": 0.3282, "step": 1761 }, { "epoch": 1.6363214119832792, "grad_norm": 0.20137363650236248, "learning_rate": 2.5249569707401034e-05, "loss": 0.3444, "step": 1762 }, { "epoch": 1.637250348351138, "grad_norm": 0.20785537438471768, "learning_rate": 2.5232358003442343e-05, "loss": 0.3476, "step": 1763 }, { "epoch": 1.6381792847189969, "grad_norm": 0.21247019504395587, "learning_rate": 2.521514629948365e-05, "loss": 0.3579, "step": 1764 }, { "epoch": 1.6391082210868555, "grad_norm": 0.20923349346429376, "learning_rate": 2.5197934595524956e-05, "loss": 0.3547, "step": 1765 }, { "epoch": 1.6400371574547143, "grad_norm": 0.19648308507644077, "learning_rate": 2.5180722891566265e-05, "loss": 0.3606, "step": 1766 }, { "epoch": 1.6409660938225732, "grad_norm": 0.2302898022257594, "learning_rate": 2.5163511187607573e-05, "loss": 0.3521, "step": 1767 }, { "epoch": 1.6418950301904318, "grad_norm": 0.2324588872518842, "learning_rate": 2.514629948364888e-05, "loss": 0.3551, "step": 1768 }, { "epoch": 1.6428239665582907, "grad_norm": 0.22587569032785934, "learning_rate": 2.512908777969019e-05, "loss": 0.3874, "step": 1769 }, { "epoch": 1.6437529029261495, "grad_norm": 0.2281758443165584, "learning_rate": 2.5111876075731498e-05, "loss": 0.3501, "step": 1770 }, { "epoch": 1.6446818392940084, "grad_norm": 0.20674500113194846, "learning_rate": 2.5094664371772807e-05, "loss": 0.3321, "step": 1771 }, { "epoch": 1.6456107756618672, "grad_norm": 0.18216341746652498, "learning_rate": 2.5077452667814115e-05, "loss": 0.3451, "step": 1772 }, { "epoch": 1.646539712029726, "grad_norm": 0.24083907570929008, "learning_rate": 2.5060240963855423e-05, "loss": 0.3571, "step": 1773 }, { "epoch": 1.647468648397585, "grad_norm": 0.21053850263699328, "learning_rate": 2.5043029259896732e-05, "loss": 0.3393, "step": 1774 }, { "epoch": 1.6483975847654435, "grad_norm": 0.21190369434405645, "learning_rate": 2.502581755593804e-05, "loss": 0.3562, "step": 1775 }, { "epoch": 1.6493265211333024, "grad_norm": 0.22135502539564378, "learning_rate": 2.500860585197935e-05, "loss": 0.3525, "step": 1776 }, { "epoch": 1.6502554575011612, "grad_norm": 0.22367773820582154, "learning_rate": 2.4991394148020654e-05, "loss": 0.3466, "step": 1777 }, { "epoch": 1.6511843938690198, "grad_norm": 0.21451556990315485, "learning_rate": 2.4974182444061962e-05, "loss": 0.3338, "step": 1778 }, { "epoch": 1.6521133302368787, "grad_norm": 0.2465298762993094, "learning_rate": 2.495697074010327e-05, "loss": 0.333, "step": 1779 }, { "epoch": 1.6530422666047375, "grad_norm": 0.2547044770751165, "learning_rate": 2.493975903614458e-05, "loss": 0.4001, "step": 1780 }, { "epoch": 1.6539712029725964, "grad_norm": 0.22774781080068754, "learning_rate": 2.4922547332185887e-05, "loss": 0.3461, "step": 1781 }, { "epoch": 1.6549001393404552, "grad_norm": 0.21204449913798434, "learning_rate": 2.4905335628227196e-05, "loss": 0.3274, "step": 1782 }, { "epoch": 1.655829075708314, "grad_norm": 0.21580956854064373, "learning_rate": 2.4888123924268504e-05, "loss": 0.3359, "step": 1783 }, { "epoch": 1.656758012076173, "grad_norm": 0.24076908609466494, "learning_rate": 2.4870912220309813e-05, "loss": 0.3588, "step": 1784 }, { "epoch": 1.6576869484440317, "grad_norm": 0.2125733983681432, "learning_rate": 2.485370051635112e-05, "loss": 0.3295, "step": 1785 }, { "epoch": 1.6586158848118904, "grad_norm": 0.1890708531709907, "learning_rate": 2.483648881239243e-05, "loss": 0.3418, "step": 1786 }, { "epoch": 1.6595448211797492, "grad_norm": 0.19675942962384732, "learning_rate": 2.4819277108433738e-05, "loss": 0.3472, "step": 1787 }, { "epoch": 1.6604737575476078, "grad_norm": 0.2170796522409692, "learning_rate": 2.4802065404475046e-05, "loss": 0.3457, "step": 1788 }, { "epoch": 1.6614026939154667, "grad_norm": 0.21149308045148213, "learning_rate": 2.478485370051635e-05, "loss": 0.3759, "step": 1789 }, { "epoch": 1.6623316302833255, "grad_norm": 0.20813548096362552, "learning_rate": 2.476764199655766e-05, "loss": 0.3582, "step": 1790 }, { "epoch": 1.6632605666511844, "grad_norm": 0.19034774277541827, "learning_rate": 2.4750430292598968e-05, "loss": 0.3489, "step": 1791 }, { "epoch": 1.6641895030190432, "grad_norm": 0.19380501073483952, "learning_rate": 2.4733218588640276e-05, "loss": 0.3339, "step": 1792 }, { "epoch": 1.665118439386902, "grad_norm": 0.1834320423794678, "learning_rate": 2.4716006884681585e-05, "loss": 0.3617, "step": 1793 }, { "epoch": 1.666047375754761, "grad_norm": 0.24296412127242056, "learning_rate": 2.4698795180722893e-05, "loss": 0.3596, "step": 1794 }, { "epoch": 1.6669763121226198, "grad_norm": 0.2280578339681279, "learning_rate": 2.46815834767642e-05, "loss": 0.3584, "step": 1795 }, { "epoch": 1.6679052484904784, "grad_norm": 0.19120766695593186, "learning_rate": 2.466437177280551e-05, "loss": 0.3479, "step": 1796 }, { "epoch": 1.6688341848583372, "grad_norm": 0.2310771090381013, "learning_rate": 2.464716006884682e-05, "loss": 0.3555, "step": 1797 }, { "epoch": 1.6697631212261959, "grad_norm": 0.18985210461541444, "learning_rate": 2.4629948364888127e-05, "loss": 0.3356, "step": 1798 }, { "epoch": 1.6706920575940547, "grad_norm": 0.19786178896000547, "learning_rate": 2.4612736660929435e-05, "loss": 0.3374, "step": 1799 }, { "epoch": 1.6716209939619135, "grad_norm": 0.23092029223763727, "learning_rate": 2.4595524956970744e-05, "loss": 0.3447, "step": 1800 }, { "epoch": 1.6725499303297724, "grad_norm": 0.1922509738346118, "learning_rate": 2.4578313253012052e-05, "loss": 0.3292, "step": 1801 }, { "epoch": 1.6734788666976312, "grad_norm": 0.21074032243268462, "learning_rate": 2.4561101549053357e-05, "loss": 0.3591, "step": 1802 }, { "epoch": 1.67440780306549, "grad_norm": 0.2210145887813637, "learning_rate": 2.4543889845094665e-05, "loss": 0.3408, "step": 1803 }, { "epoch": 1.675336739433349, "grad_norm": 0.1955049882758258, "learning_rate": 2.4526678141135974e-05, "loss": 0.3252, "step": 1804 }, { "epoch": 1.6762656758012078, "grad_norm": 0.20630880882991626, "learning_rate": 2.4509466437177282e-05, "loss": 0.3343, "step": 1805 }, { "epoch": 1.6771946121690664, "grad_norm": 0.2068467693733224, "learning_rate": 2.449225473321859e-05, "loss": 0.3718, "step": 1806 }, { "epoch": 1.6781235485369252, "grad_norm": 0.21746922099285282, "learning_rate": 2.44750430292599e-05, "loss": 0.3473, "step": 1807 }, { "epoch": 1.6790524849047839, "grad_norm": 0.22531439643757564, "learning_rate": 2.4457831325301207e-05, "loss": 0.3378, "step": 1808 }, { "epoch": 1.6799814212726427, "grad_norm": 0.16961930850382043, "learning_rate": 2.4440619621342516e-05, "loss": 0.3307, "step": 1809 }, { "epoch": 1.6809103576405016, "grad_norm": 0.2184425585470951, "learning_rate": 2.4423407917383824e-05, "loss": 0.3444, "step": 1810 }, { "epoch": 1.6818392940083604, "grad_norm": 0.20341565894023106, "learning_rate": 2.4406196213425133e-05, "loss": 0.341, "step": 1811 }, { "epoch": 1.6827682303762193, "grad_norm": 0.20083870829005598, "learning_rate": 2.438898450946644e-05, "loss": 0.3561, "step": 1812 }, { "epoch": 1.683697166744078, "grad_norm": 0.22620645826766164, "learning_rate": 2.437177280550775e-05, "loss": 0.3585, "step": 1813 }, { "epoch": 1.684626103111937, "grad_norm": 0.1941948141031554, "learning_rate": 2.4354561101549054e-05, "loss": 0.3443, "step": 1814 }, { "epoch": 1.6855550394797958, "grad_norm": 0.2277987840149738, "learning_rate": 2.4337349397590363e-05, "loss": 0.3595, "step": 1815 }, { "epoch": 1.6864839758476544, "grad_norm": 0.19485898183092634, "learning_rate": 2.432013769363167e-05, "loss": 0.322, "step": 1816 }, { "epoch": 1.6874129122155133, "grad_norm": 0.18603940637738384, "learning_rate": 2.430292598967298e-05, "loss": 0.3582, "step": 1817 }, { "epoch": 1.6883418485833719, "grad_norm": 0.23071978533124235, "learning_rate": 2.4285714285714288e-05, "loss": 0.3644, "step": 1818 }, { "epoch": 1.6892707849512307, "grad_norm": 0.20747311082414033, "learning_rate": 2.4268502581755596e-05, "loss": 0.3373, "step": 1819 }, { "epoch": 1.6901997213190896, "grad_norm": 0.19539815282798248, "learning_rate": 2.4251290877796905e-05, "loss": 0.3348, "step": 1820 }, { "epoch": 1.6911286576869484, "grad_norm": 0.21365984840479363, "learning_rate": 2.4234079173838213e-05, "loss": 0.344, "step": 1821 }, { "epoch": 1.6920575940548073, "grad_norm": 0.21504209447331304, "learning_rate": 2.421686746987952e-05, "loss": 0.3439, "step": 1822 }, { "epoch": 1.6929865304226661, "grad_norm": 0.1982570782906239, "learning_rate": 2.419965576592083e-05, "loss": 0.3507, "step": 1823 }, { "epoch": 1.693915466790525, "grad_norm": 0.19337549611458893, "learning_rate": 2.418244406196214e-05, "loss": 0.3591, "step": 1824 }, { "epoch": 1.6948444031583838, "grad_norm": 0.20348014807541867, "learning_rate": 2.4165232358003443e-05, "loss": 0.325, "step": 1825 }, { "epoch": 1.6957733395262424, "grad_norm": 0.20540644281014037, "learning_rate": 2.4148020654044752e-05, "loss": 0.3411, "step": 1826 }, { "epoch": 1.6967022758941013, "grad_norm": 0.20236522736446938, "learning_rate": 2.413080895008606e-05, "loss": 0.3551, "step": 1827 }, { "epoch": 1.69763121226196, "grad_norm": 0.20198047155898852, "learning_rate": 2.411359724612737e-05, "loss": 0.3477, "step": 1828 }, { "epoch": 1.6985601486298187, "grad_norm": 0.21306903457439758, "learning_rate": 2.4096385542168677e-05, "loss": 0.3423, "step": 1829 }, { "epoch": 1.6994890849976776, "grad_norm": 0.19666890014745034, "learning_rate": 2.4079173838209985e-05, "loss": 0.3444, "step": 1830 }, { "epoch": 1.7004180213655364, "grad_norm": 0.19584912929425907, "learning_rate": 2.4061962134251294e-05, "loss": 0.3541, "step": 1831 }, { "epoch": 1.7013469577333953, "grad_norm": 0.20466324373230668, "learning_rate": 2.4044750430292602e-05, "loss": 0.3537, "step": 1832 }, { "epoch": 1.7022758941012541, "grad_norm": 0.20414327533805723, "learning_rate": 2.402753872633391e-05, "loss": 0.3491, "step": 1833 }, { "epoch": 1.703204830469113, "grad_norm": 0.17877641915198147, "learning_rate": 2.401032702237522e-05, "loss": 0.3306, "step": 1834 }, { "epoch": 1.7041337668369718, "grad_norm": 0.19756753438210706, "learning_rate": 2.3993115318416524e-05, "loss": 0.3522, "step": 1835 }, { "epoch": 1.7050627032048304, "grad_norm": 0.20624963074350838, "learning_rate": 2.3975903614457832e-05, "loss": 0.3271, "step": 1836 }, { "epoch": 1.7059916395726893, "grad_norm": 0.18278177540349813, "learning_rate": 2.395869191049914e-05, "loss": 0.3285, "step": 1837 }, { "epoch": 1.706920575940548, "grad_norm": 0.18016020662628543, "learning_rate": 2.394148020654045e-05, "loss": 0.3512, "step": 1838 }, { "epoch": 1.7078495123084068, "grad_norm": 0.19957513588627965, "learning_rate": 2.3924268502581758e-05, "loss": 0.342, "step": 1839 }, { "epoch": 1.7087784486762656, "grad_norm": 0.19966706741908602, "learning_rate": 2.3907056798623066e-05, "loss": 0.346, "step": 1840 }, { "epoch": 1.7097073850441245, "grad_norm": 0.18844002311194907, "learning_rate": 2.3889845094664374e-05, "loss": 0.3448, "step": 1841 }, { "epoch": 1.7106363214119833, "grad_norm": 0.19040969107600308, "learning_rate": 2.3872633390705683e-05, "loss": 0.3606, "step": 1842 }, { "epoch": 1.7115652577798421, "grad_norm": 0.19757417243583347, "learning_rate": 2.385542168674699e-05, "loss": 0.3299, "step": 1843 }, { "epoch": 1.712494194147701, "grad_norm": 0.19669847871271992, "learning_rate": 2.38382099827883e-05, "loss": 0.3628, "step": 1844 }, { "epoch": 1.7134231305155598, "grad_norm": 0.1817638016455849, "learning_rate": 2.3820998278829608e-05, "loss": 0.3723, "step": 1845 }, { "epoch": 1.7143520668834185, "grad_norm": 0.19044959623520683, "learning_rate": 2.3803786574870913e-05, "loss": 0.336, "step": 1846 }, { "epoch": 1.7152810032512773, "grad_norm": 0.20454387824331371, "learning_rate": 2.378657487091222e-05, "loss": 0.3561, "step": 1847 }, { "epoch": 1.716209939619136, "grad_norm": 0.2101073375770777, "learning_rate": 2.376936316695353e-05, "loss": 0.3373, "step": 1848 }, { "epoch": 1.7171388759869948, "grad_norm": 0.200320939170345, "learning_rate": 2.3752151462994838e-05, "loss": 0.3404, "step": 1849 }, { "epoch": 1.7180678123548536, "grad_norm": 0.1952985352033432, "learning_rate": 2.3734939759036147e-05, "loss": 0.3518, "step": 1850 }, { "epoch": 1.7189967487227125, "grad_norm": 0.19447811004540178, "learning_rate": 2.3717728055077455e-05, "loss": 0.3428, "step": 1851 }, { "epoch": 1.7199256850905713, "grad_norm": 0.23780369028264597, "learning_rate": 2.3700516351118763e-05, "loss": 0.3454, "step": 1852 }, { "epoch": 1.7208546214584302, "grad_norm": 0.21663178518511156, "learning_rate": 2.3683304647160072e-05, "loss": 0.3585, "step": 1853 }, { "epoch": 1.721783557826289, "grad_norm": 0.1920341932268259, "learning_rate": 2.366609294320138e-05, "loss": 0.3373, "step": 1854 }, { "epoch": 1.7227124941941478, "grad_norm": 0.20088003096202445, "learning_rate": 2.364888123924269e-05, "loss": 0.3408, "step": 1855 }, { "epoch": 1.7236414305620065, "grad_norm": 0.19365614048712398, "learning_rate": 2.3631669535283997e-05, "loss": 0.3297, "step": 1856 }, { "epoch": 1.7245703669298653, "grad_norm": 0.1906200226199188, "learning_rate": 2.3614457831325302e-05, "loss": 0.352, "step": 1857 }, { "epoch": 1.725499303297724, "grad_norm": 0.20155735822067727, "learning_rate": 2.359724612736661e-05, "loss": 0.34, "step": 1858 }, { "epoch": 1.7264282396655828, "grad_norm": 0.19158534386586176, "learning_rate": 2.358003442340792e-05, "loss": 0.3243, "step": 1859 }, { "epoch": 1.7273571760334416, "grad_norm": 0.20500769628661117, "learning_rate": 2.3562822719449227e-05, "loss": 0.3318, "step": 1860 }, { "epoch": 1.7282861124013005, "grad_norm": 0.211428858561031, "learning_rate": 2.3545611015490536e-05, "loss": 0.3332, "step": 1861 }, { "epoch": 1.7292150487691593, "grad_norm": 0.18958780180075827, "learning_rate": 2.3528399311531844e-05, "loss": 0.3391, "step": 1862 }, { "epoch": 1.7301439851370182, "grad_norm": 0.19932551881534102, "learning_rate": 2.3511187607573152e-05, "loss": 0.3354, "step": 1863 }, { "epoch": 1.731072921504877, "grad_norm": 0.19686069428098435, "learning_rate": 2.349397590361446e-05, "loss": 0.3343, "step": 1864 }, { "epoch": 1.7320018578727359, "grad_norm": 0.1874048143051562, "learning_rate": 2.347676419965577e-05, "loss": 0.3331, "step": 1865 }, { "epoch": 1.7329307942405945, "grad_norm": 0.22051015667350352, "learning_rate": 2.3459552495697078e-05, "loss": 0.3406, "step": 1866 }, { "epoch": 1.7338597306084533, "grad_norm": 0.19652476043071881, "learning_rate": 2.3442340791738383e-05, "loss": 0.3413, "step": 1867 }, { "epoch": 1.7347886669763122, "grad_norm": 0.18429305065219817, "learning_rate": 2.342512908777969e-05, "loss": 0.3392, "step": 1868 }, { "epoch": 1.7357176033441708, "grad_norm": 0.21153170060670934, "learning_rate": 2.3407917383821e-05, "loss": 0.3537, "step": 1869 }, { "epoch": 1.7366465397120296, "grad_norm": 0.19456879357822598, "learning_rate": 2.3390705679862308e-05, "loss": 0.3405, "step": 1870 }, { "epoch": 1.7375754760798885, "grad_norm": 0.19705924651045284, "learning_rate": 2.3373493975903616e-05, "loss": 0.3518, "step": 1871 }, { "epoch": 1.7385044124477473, "grad_norm": 0.23371156033466298, "learning_rate": 2.3356282271944925e-05, "loss": 0.348, "step": 1872 }, { "epoch": 1.7394333488156062, "grad_norm": 0.17949864900216111, "learning_rate": 2.3339070567986233e-05, "loss": 0.3411, "step": 1873 }, { "epoch": 1.740362285183465, "grad_norm": 0.19886700512246908, "learning_rate": 2.332185886402754e-05, "loss": 0.3447, "step": 1874 }, { "epoch": 1.7412912215513239, "grad_norm": 0.2084202011104834, "learning_rate": 2.330464716006885e-05, "loss": 0.3571, "step": 1875 }, { "epoch": 1.7422201579191825, "grad_norm": 0.18145128888963527, "learning_rate": 2.3287435456110158e-05, "loss": 0.339, "step": 1876 }, { "epoch": 1.7431490942870413, "grad_norm": 0.19420189258092047, "learning_rate": 2.3270223752151467e-05, "loss": 0.3496, "step": 1877 }, { "epoch": 1.7440780306549002, "grad_norm": 0.19829094715505624, "learning_rate": 2.325301204819277e-05, "loss": 0.3137, "step": 1878 }, { "epoch": 1.7450069670227588, "grad_norm": 0.20101000694540003, "learning_rate": 2.323580034423408e-05, "loss": 0.3544, "step": 1879 }, { "epoch": 1.7459359033906177, "grad_norm": 0.1905563892060915, "learning_rate": 2.321858864027539e-05, "loss": 0.3345, "step": 1880 }, { "epoch": 1.7468648397584765, "grad_norm": 0.19097935275198497, "learning_rate": 2.3201376936316697e-05, "loss": 0.3483, "step": 1881 }, { "epoch": 1.7477937761263354, "grad_norm": 0.1991523378048746, "learning_rate": 2.3184165232358005e-05, "loss": 0.3345, "step": 1882 }, { "epoch": 1.7487227124941942, "grad_norm": 0.1960091282122507, "learning_rate": 2.3166953528399314e-05, "loss": 0.3309, "step": 1883 }, { "epoch": 1.749651648862053, "grad_norm": 0.18402497230694953, "learning_rate": 2.3149741824440622e-05, "loss": 0.3414, "step": 1884 }, { "epoch": 1.750580585229912, "grad_norm": 0.18159596100294748, "learning_rate": 2.313253012048193e-05, "loss": 0.3571, "step": 1885 }, { "epoch": 1.7515095215977705, "grad_norm": 0.20590477472666366, "learning_rate": 2.311531841652324e-05, "loss": 0.3403, "step": 1886 }, { "epoch": 1.7524384579656294, "grad_norm": 0.18739986818056353, "learning_rate": 2.3098106712564547e-05, "loss": 0.368, "step": 1887 }, { "epoch": 1.7533673943334882, "grad_norm": 0.1922357485644034, "learning_rate": 2.3080895008605852e-05, "loss": 0.3622, "step": 1888 }, { "epoch": 1.7542963307013468, "grad_norm": 0.1907104113911057, "learning_rate": 2.306368330464716e-05, "loss": 0.337, "step": 1889 }, { "epoch": 1.7552252670692057, "grad_norm": 0.18926167592617857, "learning_rate": 2.304647160068847e-05, "loss": 0.3637, "step": 1890 }, { "epoch": 1.7561542034370645, "grad_norm": 0.19061055691996645, "learning_rate": 2.3029259896729777e-05, "loss": 0.3605, "step": 1891 }, { "epoch": 1.7570831398049234, "grad_norm": 0.301161005754076, "learning_rate": 2.3012048192771086e-05, "loss": 0.3274, "step": 1892 }, { "epoch": 1.7580120761727822, "grad_norm": 0.17079160860066211, "learning_rate": 2.2994836488812394e-05, "loss": 0.3499, "step": 1893 }, { "epoch": 1.758941012540641, "grad_norm": 0.20510016168695175, "learning_rate": 2.2977624784853703e-05, "loss": 0.3718, "step": 1894 }, { "epoch": 1.7598699489085, "grad_norm": 0.1860961861877371, "learning_rate": 2.296041308089501e-05, "loss": 0.3446, "step": 1895 }, { "epoch": 1.7607988852763585, "grad_norm": 0.1823482266004319, "learning_rate": 2.294320137693632e-05, "loss": 0.3444, "step": 1896 }, { "epoch": 1.7617278216442174, "grad_norm": 0.181834177614641, "learning_rate": 2.2925989672977628e-05, "loss": 0.352, "step": 1897 }, { "epoch": 1.7626567580120762, "grad_norm": 0.189848159943081, "learning_rate": 2.2908777969018936e-05, "loss": 0.3502, "step": 1898 }, { "epoch": 1.7635856943799348, "grad_norm": 0.19580044122071402, "learning_rate": 2.289156626506024e-05, "loss": 0.343, "step": 1899 }, { "epoch": 1.7645146307477937, "grad_norm": 0.1958254403858763, "learning_rate": 2.287435456110155e-05, "loss": 0.3301, "step": 1900 }, { "epoch": 1.7654435671156525, "grad_norm": 0.1714082079009884, "learning_rate": 2.2857142857142858e-05, "loss": 0.3136, "step": 1901 }, { "epoch": 1.7663725034835114, "grad_norm": 0.188207144023739, "learning_rate": 2.2839931153184166e-05, "loss": 0.3496, "step": 1902 }, { "epoch": 1.7673014398513702, "grad_norm": 0.19149325753512772, "learning_rate": 2.2822719449225475e-05, "loss": 0.3471, "step": 1903 }, { "epoch": 1.768230376219229, "grad_norm": 0.20844243247605607, "learning_rate": 2.2805507745266783e-05, "loss": 0.3635, "step": 1904 }, { "epoch": 1.769159312587088, "grad_norm": 0.21302605597258154, "learning_rate": 2.278829604130809e-05, "loss": 0.3586, "step": 1905 }, { "epoch": 1.7700882489549465, "grad_norm": 0.18803392186519952, "learning_rate": 2.27710843373494e-05, "loss": 0.3236, "step": 1906 }, { "epoch": 1.7710171853228054, "grad_norm": 0.19461739487306387, "learning_rate": 2.275387263339071e-05, "loss": 0.3336, "step": 1907 }, { "epoch": 1.7719461216906642, "grad_norm": 0.19855123181901438, "learning_rate": 2.2736660929432017e-05, "loss": 0.328, "step": 1908 }, { "epoch": 1.7728750580585229, "grad_norm": 0.193296807754737, "learning_rate": 2.2719449225473325e-05, "loss": 0.3429, "step": 1909 }, { "epoch": 1.7738039944263817, "grad_norm": 0.18250714695301445, "learning_rate": 2.270223752151463e-05, "loss": 0.3402, "step": 1910 }, { "epoch": 1.7747329307942405, "grad_norm": 0.2207728048782478, "learning_rate": 2.268502581755594e-05, "loss": 0.3648, "step": 1911 }, { "epoch": 1.7756618671620994, "grad_norm": 0.20763889726318494, "learning_rate": 2.2667814113597247e-05, "loss": 0.335, "step": 1912 }, { "epoch": 1.7765908035299582, "grad_norm": 0.1802847803327561, "learning_rate": 2.2650602409638555e-05, "loss": 0.3659, "step": 1913 }, { "epoch": 1.777519739897817, "grad_norm": 0.2017058059062457, "learning_rate": 2.2633390705679864e-05, "loss": 0.3229, "step": 1914 }, { "epoch": 1.778448676265676, "grad_norm": 0.18380005279157755, "learning_rate": 2.2616179001721172e-05, "loss": 0.3297, "step": 1915 }, { "epoch": 1.7793776126335346, "grad_norm": 0.20980299060848648, "learning_rate": 2.259896729776248e-05, "loss": 0.3603, "step": 1916 }, { "epoch": 1.7803065490013934, "grad_norm": 0.21845300686706248, "learning_rate": 2.258175559380379e-05, "loss": 0.3526, "step": 1917 }, { "epoch": 1.7812354853692522, "grad_norm": 0.18306805762943487, "learning_rate": 2.2564543889845097e-05, "loss": 0.3427, "step": 1918 }, { "epoch": 1.7821644217371109, "grad_norm": 0.22267478907178537, "learning_rate": 2.2547332185886406e-05, "loss": 0.3502, "step": 1919 }, { "epoch": 1.7830933581049697, "grad_norm": 0.2114169092592862, "learning_rate": 2.253012048192771e-05, "loss": 0.337, "step": 1920 }, { "epoch": 1.7840222944728286, "grad_norm": 0.2114698629778862, "learning_rate": 2.251290877796902e-05, "loss": 0.3331, "step": 1921 }, { "epoch": 1.7849512308406874, "grad_norm": 0.1737401942597242, "learning_rate": 2.2495697074010328e-05, "loss": 0.3402, "step": 1922 }, { "epoch": 1.7858801672085463, "grad_norm": 0.21860276556477812, "learning_rate": 2.2478485370051636e-05, "loss": 0.3785, "step": 1923 }, { "epoch": 1.786809103576405, "grad_norm": 0.21990594737894525, "learning_rate": 2.2461273666092944e-05, "loss": 0.333, "step": 1924 }, { "epoch": 1.787738039944264, "grad_norm": 0.17965595351731087, "learning_rate": 2.2444061962134253e-05, "loss": 0.3161, "step": 1925 }, { "epoch": 1.7886669763121226, "grad_norm": 0.18037444567864072, "learning_rate": 2.242685025817556e-05, "loss": 0.3218, "step": 1926 }, { "epoch": 1.7895959126799814, "grad_norm": 0.19458012919376982, "learning_rate": 2.240963855421687e-05, "loss": 0.3445, "step": 1927 }, { "epoch": 1.7905248490478403, "grad_norm": 0.1870071334265635, "learning_rate": 2.2392426850258178e-05, "loss": 0.3467, "step": 1928 }, { "epoch": 1.7914537854156989, "grad_norm": 0.20143024738216037, "learning_rate": 2.2375215146299486e-05, "loss": 0.3667, "step": 1929 }, { "epoch": 1.7923827217835577, "grad_norm": 0.18517571958598458, "learning_rate": 2.2358003442340795e-05, "loss": 0.347, "step": 1930 }, { "epoch": 1.7933116581514166, "grad_norm": 0.18972740321401171, "learning_rate": 2.23407917383821e-05, "loss": 0.3561, "step": 1931 }, { "epoch": 1.7942405945192754, "grad_norm": 0.18681340130076599, "learning_rate": 2.2323580034423408e-05, "loss": 0.3526, "step": 1932 }, { "epoch": 1.7951695308871343, "grad_norm": 0.17839062435491831, "learning_rate": 2.2306368330464717e-05, "loss": 0.3492, "step": 1933 }, { "epoch": 1.7960984672549931, "grad_norm": 0.19307814521598352, "learning_rate": 2.2289156626506025e-05, "loss": 0.3543, "step": 1934 }, { "epoch": 1.797027403622852, "grad_norm": 0.18708013233718124, "learning_rate": 2.2271944922547333e-05, "loss": 0.3459, "step": 1935 }, { "epoch": 1.7979563399907106, "grad_norm": 0.17553651305256152, "learning_rate": 2.2254733218588642e-05, "loss": 0.3598, "step": 1936 }, { "epoch": 1.7988852763585694, "grad_norm": 0.18618042918495897, "learning_rate": 2.223752151462995e-05, "loss": 0.3248, "step": 1937 }, { "epoch": 1.7998142127264283, "grad_norm": 0.17742356291112643, "learning_rate": 2.222030981067126e-05, "loss": 0.3367, "step": 1938 }, { "epoch": 1.800743149094287, "grad_norm": 0.1783779824099535, "learning_rate": 2.2203098106712567e-05, "loss": 0.3459, "step": 1939 }, { "epoch": 1.8016720854621457, "grad_norm": 0.1873864409342349, "learning_rate": 2.2185886402753875e-05, "loss": 0.3411, "step": 1940 }, { "epoch": 1.8026010218300046, "grad_norm": 0.19820459741704952, "learning_rate": 2.216867469879518e-05, "loss": 0.3622, "step": 1941 }, { "epoch": 1.8035299581978634, "grad_norm": 0.19579829256800121, "learning_rate": 2.215146299483649e-05, "loss": 0.3488, "step": 1942 }, { "epoch": 1.8044588945657223, "grad_norm": 0.18994419202784169, "learning_rate": 2.2134251290877797e-05, "loss": 0.3274, "step": 1943 }, { "epoch": 1.8053878309335811, "grad_norm": 0.17560731190213005, "learning_rate": 2.2117039586919106e-05, "loss": 0.3249, "step": 1944 }, { "epoch": 1.80631676730144, "grad_norm": 0.18332672388432894, "learning_rate": 2.2099827882960414e-05, "loss": 0.3287, "step": 1945 }, { "epoch": 1.8072457036692986, "grad_norm": 0.17542344527664755, "learning_rate": 2.2082616179001722e-05, "loss": 0.3438, "step": 1946 }, { "epoch": 1.8081746400371574, "grad_norm": 0.19087927519310427, "learning_rate": 2.206540447504303e-05, "loss": 0.3422, "step": 1947 }, { "epoch": 1.8091035764050163, "grad_norm": 0.17849449852961385, "learning_rate": 2.204819277108434e-05, "loss": 0.3461, "step": 1948 }, { "epoch": 1.810032512772875, "grad_norm": 0.1994999151338678, "learning_rate": 2.2030981067125648e-05, "loss": 0.3394, "step": 1949 }, { "epoch": 1.8109614491407338, "grad_norm": 0.1907734719321139, "learning_rate": 2.2013769363166956e-05, "loss": 0.3503, "step": 1950 }, { "epoch": 1.8118903855085926, "grad_norm": 0.19648653560232324, "learning_rate": 2.1996557659208264e-05, "loss": 0.3316, "step": 1951 }, { "epoch": 1.8128193218764515, "grad_norm": 0.1896427151890389, "learning_rate": 2.197934595524957e-05, "loss": 0.3392, "step": 1952 }, { "epoch": 1.8137482582443103, "grad_norm": 0.19027376064820928, "learning_rate": 2.1962134251290878e-05, "loss": 0.3537, "step": 1953 }, { "epoch": 1.8146771946121691, "grad_norm": 0.1855808710590735, "learning_rate": 2.1944922547332186e-05, "loss": 0.3187, "step": 1954 }, { "epoch": 1.815606130980028, "grad_norm": 0.1984498093903945, "learning_rate": 2.1927710843373495e-05, "loss": 0.3613, "step": 1955 }, { "epoch": 1.8165350673478868, "grad_norm": 0.2242505101068666, "learning_rate": 2.1910499139414803e-05, "loss": 0.3266, "step": 1956 }, { "epoch": 1.8174640037157455, "grad_norm": 0.18137823080536702, "learning_rate": 2.189328743545611e-05, "loss": 0.3399, "step": 1957 }, { "epoch": 1.8183929400836043, "grad_norm": 0.22546513335231178, "learning_rate": 2.187607573149742e-05, "loss": 0.3301, "step": 1958 }, { "epoch": 1.819321876451463, "grad_norm": 0.20216140374779656, "learning_rate": 2.185886402753873e-05, "loss": 0.3401, "step": 1959 }, { "epoch": 1.8202508128193218, "grad_norm": 0.19247489072112667, "learning_rate": 2.1841652323580037e-05, "loss": 0.3451, "step": 1960 }, { "epoch": 1.8211797491871806, "grad_norm": 0.217041904392123, "learning_rate": 2.1824440619621345e-05, "loss": 0.3609, "step": 1961 }, { "epoch": 1.8221086855550395, "grad_norm": 0.20709511532226713, "learning_rate": 2.1807228915662654e-05, "loss": 0.3389, "step": 1962 }, { "epoch": 1.8230376219228983, "grad_norm": 0.20249280813186266, "learning_rate": 2.179001721170396e-05, "loss": 0.3367, "step": 1963 }, { "epoch": 1.8239665582907572, "grad_norm": 0.21759029061763033, "learning_rate": 2.1772805507745267e-05, "loss": 0.3532, "step": 1964 }, { "epoch": 1.824895494658616, "grad_norm": 0.19698131495510754, "learning_rate": 2.1755593803786575e-05, "loss": 0.3453, "step": 1965 }, { "epoch": 1.8258244310264748, "grad_norm": 0.222032209900618, "learning_rate": 2.1738382099827884e-05, "loss": 0.3358, "step": 1966 }, { "epoch": 1.8267533673943335, "grad_norm": 0.2098001700888135, "learning_rate": 2.1721170395869192e-05, "loss": 0.342, "step": 1967 }, { "epoch": 1.8276823037621923, "grad_norm": 0.20422523735242673, "learning_rate": 2.17039586919105e-05, "loss": 0.321, "step": 1968 }, { "epoch": 1.828611240130051, "grad_norm": 0.2012686527729587, "learning_rate": 2.168674698795181e-05, "loss": 0.3442, "step": 1969 }, { "epoch": 1.8295401764979098, "grad_norm": 0.21283216710918032, "learning_rate": 2.1669535283993117e-05, "loss": 0.3588, "step": 1970 }, { "epoch": 1.8304691128657686, "grad_norm": 0.19417470172069212, "learning_rate": 2.1652323580034426e-05, "loss": 0.3283, "step": 1971 }, { "epoch": 1.8313980492336275, "grad_norm": 0.20490789352440664, "learning_rate": 2.1635111876075734e-05, "loss": 0.3637, "step": 1972 }, { "epoch": 1.8323269856014863, "grad_norm": 0.2069872797582916, "learning_rate": 2.161790017211704e-05, "loss": 0.3346, "step": 1973 }, { "epoch": 1.8332559219693452, "grad_norm": 0.18465378730661033, "learning_rate": 2.1600688468158348e-05, "loss": 0.3514, "step": 1974 }, { "epoch": 1.834184858337204, "grad_norm": 0.1772459699016406, "learning_rate": 2.1583476764199656e-05, "loss": 0.3446, "step": 1975 }, { "epoch": 1.8351137947050629, "grad_norm": 0.2246086295913219, "learning_rate": 2.1566265060240964e-05, "loss": 0.3426, "step": 1976 }, { "epoch": 1.8360427310729215, "grad_norm": 0.19999809607634841, "learning_rate": 2.1549053356282273e-05, "loss": 0.3266, "step": 1977 }, { "epoch": 1.8369716674407803, "grad_norm": 0.18937275360959321, "learning_rate": 2.153184165232358e-05, "loss": 0.3362, "step": 1978 }, { "epoch": 1.837900603808639, "grad_norm": 0.1894381875679942, "learning_rate": 2.151462994836489e-05, "loss": 0.3555, "step": 1979 }, { "epoch": 1.8388295401764978, "grad_norm": 0.2163255652366772, "learning_rate": 2.1497418244406198e-05, "loss": 0.3509, "step": 1980 }, { "epoch": 1.8397584765443566, "grad_norm": 0.21158701402876343, "learning_rate": 2.1480206540447506e-05, "loss": 0.3419, "step": 1981 }, { "epoch": 1.8406874129122155, "grad_norm": 0.1832944886654045, "learning_rate": 2.1462994836488815e-05, "loss": 0.3519, "step": 1982 }, { "epoch": 1.8416163492800743, "grad_norm": 0.20724791335919457, "learning_rate": 2.1445783132530123e-05, "loss": 0.3332, "step": 1983 }, { "epoch": 1.8425452856479332, "grad_norm": 0.2026632632941415, "learning_rate": 2.1428571428571428e-05, "loss": 0.3535, "step": 1984 }, { "epoch": 1.843474222015792, "grad_norm": 0.18716762184495547, "learning_rate": 2.1411359724612737e-05, "loss": 0.3388, "step": 1985 }, { "epoch": 1.8444031583836509, "grad_norm": 0.19811432373555643, "learning_rate": 2.1394148020654045e-05, "loss": 0.338, "step": 1986 }, { "epoch": 1.8453320947515095, "grad_norm": 0.2078115035058908, "learning_rate": 2.1376936316695353e-05, "loss": 0.3175, "step": 1987 }, { "epoch": 1.8462610311193683, "grad_norm": 0.19475185295752664, "learning_rate": 2.1359724612736662e-05, "loss": 0.3284, "step": 1988 }, { "epoch": 1.847189967487227, "grad_norm": 0.19643704859931127, "learning_rate": 2.134251290877797e-05, "loss": 0.3579, "step": 1989 }, { "epoch": 1.8481189038550858, "grad_norm": 0.19642321023564513, "learning_rate": 2.132530120481928e-05, "loss": 0.3215, "step": 1990 }, { "epoch": 1.8490478402229447, "grad_norm": 0.18968037438997556, "learning_rate": 2.1308089500860587e-05, "loss": 0.3206, "step": 1991 }, { "epoch": 1.8499767765908035, "grad_norm": 0.19787881644954247, "learning_rate": 2.1290877796901895e-05, "loss": 0.3362, "step": 1992 }, { "epoch": 1.8509057129586624, "grad_norm": 0.19024488827058927, "learning_rate": 2.1273666092943204e-05, "loss": 0.3398, "step": 1993 }, { "epoch": 1.8518346493265212, "grad_norm": 0.19182808398964474, "learning_rate": 2.125645438898451e-05, "loss": 0.3428, "step": 1994 }, { "epoch": 1.85276358569438, "grad_norm": 0.21009107426495674, "learning_rate": 2.1239242685025817e-05, "loss": 0.3434, "step": 1995 }, { "epoch": 1.853692522062239, "grad_norm": 0.1914880152985732, "learning_rate": 2.1222030981067126e-05, "loss": 0.327, "step": 1996 }, { "epoch": 1.8546214584300975, "grad_norm": 0.1913970478509932, "learning_rate": 2.1204819277108434e-05, "loss": 0.34, "step": 1997 }, { "epoch": 1.8555503947979564, "grad_norm": 0.2155756840317901, "learning_rate": 2.1187607573149742e-05, "loss": 0.3245, "step": 1998 }, { "epoch": 1.856479331165815, "grad_norm": 0.18445074398921807, "learning_rate": 2.117039586919105e-05, "loss": 0.3429, "step": 1999 }, { "epoch": 1.8574082675336738, "grad_norm": 0.18560311928741094, "learning_rate": 2.115318416523236e-05, "loss": 0.328, "step": 2000 }, { "epoch": 1.8583372039015327, "grad_norm": 0.17532760828844168, "learning_rate": 2.1135972461273668e-05, "loss": 0.3499, "step": 2001 }, { "epoch": 1.8592661402693915, "grad_norm": 0.18320848398237594, "learning_rate": 2.1118760757314976e-05, "loss": 0.3371, "step": 2002 }, { "epoch": 1.8601950766372504, "grad_norm": 0.18645021050922914, "learning_rate": 2.1101549053356284e-05, "loss": 0.3712, "step": 2003 }, { "epoch": 1.8611240130051092, "grad_norm": 0.1930981725638318, "learning_rate": 2.1084337349397593e-05, "loss": 0.3631, "step": 2004 }, { "epoch": 1.862052949372968, "grad_norm": 0.23294279233206924, "learning_rate": 2.1067125645438898e-05, "loss": 0.3627, "step": 2005 }, { "epoch": 1.862981885740827, "grad_norm": 0.22252132694320956, "learning_rate": 2.1049913941480206e-05, "loss": 0.3251, "step": 2006 }, { "epoch": 1.8639108221086855, "grad_norm": 0.18325558308090584, "learning_rate": 2.1032702237521515e-05, "loss": 0.3516, "step": 2007 }, { "epoch": 1.8648397584765444, "grad_norm": 0.18803938364991366, "learning_rate": 2.1015490533562823e-05, "loss": 0.335, "step": 2008 }, { "epoch": 1.865768694844403, "grad_norm": 0.21560136159819376, "learning_rate": 2.099827882960413e-05, "loss": 0.3502, "step": 2009 }, { "epoch": 1.8666976312122618, "grad_norm": 0.2075112616048167, "learning_rate": 2.098106712564544e-05, "loss": 0.3448, "step": 2010 }, { "epoch": 1.8676265675801207, "grad_norm": 0.1794063031438016, "learning_rate": 2.0963855421686748e-05, "loss": 0.3589, "step": 2011 }, { "epoch": 1.8685555039479795, "grad_norm": 0.2088734523300138, "learning_rate": 2.0946643717728057e-05, "loss": 0.3401, "step": 2012 }, { "epoch": 1.8694844403158384, "grad_norm": 0.20514199980300488, "learning_rate": 2.0929432013769365e-05, "loss": 0.3562, "step": 2013 }, { "epoch": 1.8704133766836972, "grad_norm": 0.1888327050014192, "learning_rate": 2.0912220309810673e-05, "loss": 0.3104, "step": 2014 }, { "epoch": 1.871342313051556, "grad_norm": 0.18729822949477454, "learning_rate": 2.0895008605851982e-05, "loss": 0.3547, "step": 2015 }, { "epoch": 1.872271249419415, "grad_norm": 0.22612417730319762, "learning_rate": 2.0877796901893287e-05, "loss": 0.3499, "step": 2016 }, { "epoch": 1.8732001857872735, "grad_norm": 0.19668595579111908, "learning_rate": 2.0860585197934595e-05, "loss": 0.3296, "step": 2017 }, { "epoch": 1.8741291221551324, "grad_norm": 0.18571506463595566, "learning_rate": 2.0843373493975904e-05, "loss": 0.3312, "step": 2018 }, { "epoch": 1.875058058522991, "grad_norm": 0.18777769470240502, "learning_rate": 2.0826161790017212e-05, "loss": 0.3234, "step": 2019 }, { "epoch": 1.8759869948908499, "grad_norm": 0.18583762914371052, "learning_rate": 2.080895008605852e-05, "loss": 0.3271, "step": 2020 }, { "epoch": 1.8769159312587087, "grad_norm": 0.1960771159495725, "learning_rate": 2.079173838209983e-05, "loss": 0.3581, "step": 2021 }, { "epoch": 1.8778448676265675, "grad_norm": 0.19599306930659696, "learning_rate": 2.0774526678141137e-05, "loss": 0.357, "step": 2022 }, { "epoch": 1.8787738039944264, "grad_norm": 0.1951564097933878, "learning_rate": 2.0757314974182446e-05, "loss": 0.3591, "step": 2023 }, { "epoch": 1.8797027403622852, "grad_norm": 0.19235765747681716, "learning_rate": 2.0740103270223754e-05, "loss": 0.3609, "step": 2024 }, { "epoch": 1.880631676730144, "grad_norm": 0.19639988332184477, "learning_rate": 2.0722891566265062e-05, "loss": 0.3397, "step": 2025 }, { "epoch": 1.881560613098003, "grad_norm": 0.19618499182075172, "learning_rate": 2.0705679862306367e-05, "loss": 0.3319, "step": 2026 }, { "epoch": 1.8824895494658616, "grad_norm": 0.18391264325568676, "learning_rate": 2.0688468158347676e-05, "loss": 0.3668, "step": 2027 }, { "epoch": 1.8834184858337204, "grad_norm": 0.1865315427514266, "learning_rate": 2.0671256454388984e-05, "loss": 0.357, "step": 2028 }, { "epoch": 1.884347422201579, "grad_norm": 0.1965123947388705, "learning_rate": 2.0654044750430293e-05, "loss": 0.3317, "step": 2029 }, { "epoch": 1.8852763585694379, "grad_norm": 0.19090795755847184, "learning_rate": 2.06368330464716e-05, "loss": 0.3409, "step": 2030 }, { "epoch": 1.8862052949372967, "grad_norm": 0.2011381053988122, "learning_rate": 2.061962134251291e-05, "loss": 0.3579, "step": 2031 }, { "epoch": 1.8871342313051556, "grad_norm": 0.17714274475948777, "learning_rate": 2.0602409638554218e-05, "loss": 0.3486, "step": 2032 }, { "epoch": 1.8880631676730144, "grad_norm": 0.20684528947484848, "learning_rate": 2.0585197934595526e-05, "loss": 0.3315, "step": 2033 }, { "epoch": 1.8889921040408733, "grad_norm": 0.20594746537045297, "learning_rate": 2.0567986230636835e-05, "loss": 0.3518, "step": 2034 }, { "epoch": 1.889921040408732, "grad_norm": 0.20843330387434464, "learning_rate": 2.0550774526678143e-05, "loss": 0.3377, "step": 2035 }, { "epoch": 1.890849976776591, "grad_norm": 0.19119948120789587, "learning_rate": 2.053356282271945e-05, "loss": 0.3508, "step": 2036 }, { "epoch": 1.8917789131444496, "grad_norm": 0.20499362964321996, "learning_rate": 2.0516351118760756e-05, "loss": 0.3386, "step": 2037 }, { "epoch": 1.8927078495123084, "grad_norm": 0.20951246709031993, "learning_rate": 2.0499139414802065e-05, "loss": 0.3454, "step": 2038 }, { "epoch": 1.8936367858801673, "grad_norm": 0.1933469877666157, "learning_rate": 2.0481927710843373e-05, "loss": 0.3498, "step": 2039 }, { "epoch": 1.8945657222480259, "grad_norm": 0.2033030466421948, "learning_rate": 2.046471600688468e-05, "loss": 0.3496, "step": 2040 }, { "epoch": 1.8954946586158847, "grad_norm": 0.21435793489097013, "learning_rate": 2.044750430292599e-05, "loss": 0.35, "step": 2041 }, { "epoch": 1.8964235949837436, "grad_norm": 0.21062698065040608, "learning_rate": 2.04302925989673e-05, "loss": 0.3598, "step": 2042 }, { "epoch": 1.8973525313516024, "grad_norm": 0.19925766321798463, "learning_rate": 2.0413080895008607e-05, "loss": 0.3301, "step": 2043 }, { "epoch": 1.8982814677194613, "grad_norm": 0.19489181490424323, "learning_rate": 2.0395869191049915e-05, "loss": 0.325, "step": 2044 }, { "epoch": 1.8992104040873201, "grad_norm": 0.20174771996042817, "learning_rate": 2.0378657487091224e-05, "loss": 0.3308, "step": 2045 }, { "epoch": 1.900139340455179, "grad_norm": 0.20408788253809196, "learning_rate": 2.0361445783132532e-05, "loss": 0.3211, "step": 2046 }, { "epoch": 1.9010682768230376, "grad_norm": 0.1878516530540329, "learning_rate": 2.0344234079173837e-05, "loss": 0.3414, "step": 2047 }, { "epoch": 1.9019972131908964, "grad_norm": 0.21257975400270182, "learning_rate": 2.0327022375215145e-05, "loss": 0.3781, "step": 2048 }, { "epoch": 1.9029261495587553, "grad_norm": 0.20595091660025142, "learning_rate": 2.0309810671256454e-05, "loss": 0.3373, "step": 2049 }, { "epoch": 1.903855085926614, "grad_norm": 0.19877200066200787, "learning_rate": 2.0292598967297762e-05, "loss": 0.3417, "step": 2050 }, { "epoch": 1.9047840222944727, "grad_norm": 0.20104969915961177, "learning_rate": 2.027538726333907e-05, "loss": 0.3467, "step": 2051 }, { "epoch": 1.9057129586623316, "grad_norm": 0.2327660603720691, "learning_rate": 2.025817555938038e-05, "loss": 0.3568, "step": 2052 }, { "epoch": 1.9066418950301904, "grad_norm": 0.18750059187569587, "learning_rate": 2.0240963855421687e-05, "loss": 0.3427, "step": 2053 }, { "epoch": 1.9075708313980493, "grad_norm": 0.18529718656829258, "learning_rate": 2.0223752151462996e-05, "loss": 0.3657, "step": 2054 }, { "epoch": 1.9084997677659081, "grad_norm": 0.21378367120455413, "learning_rate": 2.0206540447504304e-05, "loss": 0.3448, "step": 2055 }, { "epoch": 1.909428704133767, "grad_norm": 0.19816996642108167, "learning_rate": 2.0189328743545613e-05, "loss": 0.3379, "step": 2056 }, { "epoch": 1.9103576405016256, "grad_norm": 0.17952569303828486, "learning_rate": 2.017211703958692e-05, "loss": 0.3495, "step": 2057 }, { "epoch": 1.9112865768694844, "grad_norm": 0.1887115752212257, "learning_rate": 2.0154905335628226e-05, "loss": 0.3528, "step": 2058 }, { "epoch": 1.9122155132373433, "grad_norm": 0.19795487285379446, "learning_rate": 2.0137693631669534e-05, "loss": 0.312, "step": 2059 }, { "epoch": 1.913144449605202, "grad_norm": 0.1913042487899833, "learning_rate": 2.0120481927710843e-05, "loss": 0.3474, "step": 2060 }, { "epoch": 1.9140733859730608, "grad_norm": 0.18702333471919602, "learning_rate": 2.010327022375215e-05, "loss": 0.3313, "step": 2061 }, { "epoch": 1.9150023223409196, "grad_norm": 0.18216724495400727, "learning_rate": 2.008605851979346e-05, "loss": 0.3273, "step": 2062 }, { "epoch": 1.9159312587087785, "grad_norm": 0.1870733559834162, "learning_rate": 2.0068846815834768e-05, "loss": 0.328, "step": 2063 }, { "epoch": 1.9168601950766373, "grad_norm": 0.20069109494783774, "learning_rate": 2.0051635111876076e-05, "loss": 0.3468, "step": 2064 }, { "epoch": 1.9177891314444961, "grad_norm": 0.1788738115806139, "learning_rate": 2.0034423407917385e-05, "loss": 0.3186, "step": 2065 }, { "epoch": 1.918718067812355, "grad_norm": 0.18264576767730586, "learning_rate": 2.0017211703958693e-05, "loss": 0.3399, "step": 2066 }, { "epoch": 1.9196470041802136, "grad_norm": 0.1798033926304608, "learning_rate": 2e-05, "loss": 0.3475, "step": 2067 }, { "epoch": 1.9205759405480725, "grad_norm": 0.20622934405855825, "learning_rate": 1.998278829604131e-05, "loss": 0.3448, "step": 2068 }, { "epoch": 1.9215048769159313, "grad_norm": 0.20482612442847245, "learning_rate": 1.9965576592082615e-05, "loss": 0.3479, "step": 2069 }, { "epoch": 1.92243381328379, "grad_norm": 0.19506902393707024, "learning_rate": 1.9948364888123923e-05, "loss": 0.3428, "step": 2070 }, { "epoch": 1.9233627496516488, "grad_norm": 0.20966049670336326, "learning_rate": 1.9931153184165232e-05, "loss": 0.344, "step": 2071 }, { "epoch": 1.9242916860195076, "grad_norm": 0.1865501634784815, "learning_rate": 1.991394148020654e-05, "loss": 0.3616, "step": 2072 }, { "epoch": 1.9252206223873665, "grad_norm": 0.196751064921281, "learning_rate": 1.989672977624785e-05, "loss": 0.3318, "step": 2073 }, { "epoch": 1.9261495587552253, "grad_norm": 0.1804349938052275, "learning_rate": 1.9879518072289157e-05, "loss": 0.3416, "step": 2074 }, { "epoch": 1.9270784951230842, "grad_norm": 0.18106072316653846, "learning_rate": 1.9862306368330465e-05, "loss": 0.3336, "step": 2075 }, { "epoch": 1.928007431490943, "grad_norm": 0.18984032415869853, "learning_rate": 1.9845094664371774e-05, "loss": 0.35, "step": 2076 }, { "epoch": 1.9289363678588016, "grad_norm": 0.21925027722818818, "learning_rate": 1.9827882960413082e-05, "loss": 0.3428, "step": 2077 }, { "epoch": 1.9298653042266605, "grad_norm": 0.17538934019640326, "learning_rate": 1.981067125645439e-05, "loss": 0.3489, "step": 2078 }, { "epoch": 1.9307942405945193, "grad_norm": 0.21247914753683844, "learning_rate": 1.9793459552495696e-05, "loss": 0.3436, "step": 2079 }, { "epoch": 1.931723176962378, "grad_norm": 0.20170277926352845, "learning_rate": 1.9776247848537004e-05, "loss": 0.3665, "step": 2080 }, { "epoch": 1.9326521133302368, "grad_norm": 0.21489372804535314, "learning_rate": 1.9759036144578312e-05, "loss": 0.3271, "step": 2081 }, { "epoch": 1.9335810496980956, "grad_norm": 0.19404730364157935, "learning_rate": 1.974182444061962e-05, "loss": 0.3395, "step": 2082 }, { "epoch": 1.9345099860659545, "grad_norm": 0.1768311883365791, "learning_rate": 1.972461273666093e-05, "loss": 0.3304, "step": 2083 }, { "epoch": 1.9354389224338133, "grad_norm": 0.19513776903048077, "learning_rate": 1.9707401032702238e-05, "loss": 0.3355, "step": 2084 }, { "epoch": 1.9363678588016722, "grad_norm": 0.20552603770095754, "learning_rate": 1.9690189328743546e-05, "loss": 0.3586, "step": 2085 }, { "epoch": 1.937296795169531, "grad_norm": 0.20606350494140052, "learning_rate": 1.9672977624784854e-05, "loss": 0.341, "step": 2086 }, { "epoch": 1.9382257315373896, "grad_norm": 0.21117517653181553, "learning_rate": 1.9655765920826163e-05, "loss": 0.3556, "step": 2087 }, { "epoch": 1.9391546679052485, "grad_norm": 0.19889606660583597, "learning_rate": 1.963855421686747e-05, "loss": 0.3754, "step": 2088 }, { "epoch": 1.9400836042731073, "grad_norm": 0.2315272588969166, "learning_rate": 1.962134251290878e-05, "loss": 0.3625, "step": 2089 }, { "epoch": 1.941012540640966, "grad_norm": 0.1801392600297905, "learning_rate": 1.9604130808950085e-05, "loss": 0.3678, "step": 2090 }, { "epoch": 1.9419414770088248, "grad_norm": 0.20819332360704426, "learning_rate": 1.9586919104991393e-05, "loss": 0.3413, "step": 2091 }, { "epoch": 1.9428704133766836, "grad_norm": 0.20855084760144948, "learning_rate": 1.95697074010327e-05, "loss": 0.3479, "step": 2092 }, { "epoch": 1.9437993497445425, "grad_norm": 0.1934591269154208, "learning_rate": 1.955249569707401e-05, "loss": 0.3292, "step": 2093 }, { "epoch": 1.9447282861124013, "grad_norm": 0.19888631185262237, "learning_rate": 1.9535283993115318e-05, "loss": 0.3517, "step": 2094 }, { "epoch": 1.9456572224802602, "grad_norm": 0.23696777928124235, "learning_rate": 1.9518072289156627e-05, "loss": 0.3603, "step": 2095 }, { "epoch": 1.946586158848119, "grad_norm": 0.1930501835027152, "learning_rate": 1.9500860585197935e-05, "loss": 0.3511, "step": 2096 }, { "epoch": 1.9475150952159777, "grad_norm": 0.1847048438548015, "learning_rate": 1.9483648881239243e-05, "loss": 0.3256, "step": 2097 }, { "epoch": 1.9484440315838365, "grad_norm": 0.17904191429627417, "learning_rate": 1.9466437177280552e-05, "loss": 0.3526, "step": 2098 }, { "epoch": 1.9493729679516953, "grad_norm": 0.21564625082297997, "learning_rate": 1.944922547332186e-05, "loss": 0.3336, "step": 2099 }, { "epoch": 1.950301904319554, "grad_norm": 0.18093745627216662, "learning_rate": 1.9432013769363165e-05, "loss": 0.337, "step": 2100 }, { "epoch": 1.9512308406874128, "grad_norm": 0.20861506817234635, "learning_rate": 1.9414802065404474e-05, "loss": 0.3545, "step": 2101 }, { "epoch": 1.9521597770552717, "grad_norm": 0.18651110392988504, "learning_rate": 1.9397590361445782e-05, "loss": 0.3382, "step": 2102 }, { "epoch": 1.9530887134231305, "grad_norm": 0.19498741795933872, "learning_rate": 1.938037865748709e-05, "loss": 0.3196, "step": 2103 }, { "epoch": 1.9540176497909894, "grad_norm": 0.19938426190865807, "learning_rate": 1.93631669535284e-05, "loss": 0.3439, "step": 2104 }, { "epoch": 1.9549465861588482, "grad_norm": 0.20167566769499962, "learning_rate": 1.9345955249569707e-05, "loss": 0.3422, "step": 2105 }, { "epoch": 1.955875522526707, "grad_norm": 0.1858484130472836, "learning_rate": 1.9328743545611016e-05, "loss": 0.3334, "step": 2106 }, { "epoch": 1.9568044588945657, "grad_norm": 0.19150012147084414, "learning_rate": 1.9311531841652324e-05, "loss": 0.3199, "step": 2107 }, { "epoch": 1.9577333952624245, "grad_norm": 0.20387383733567602, "learning_rate": 1.9294320137693632e-05, "loss": 0.3531, "step": 2108 }, { "epoch": 1.9586623316302834, "grad_norm": 0.18764227954582366, "learning_rate": 1.927710843373494e-05, "loss": 0.3485, "step": 2109 }, { "epoch": 1.959591267998142, "grad_norm": 0.19184067561980478, "learning_rate": 1.925989672977625e-05, "loss": 0.3375, "step": 2110 }, { "epoch": 1.9605202043660008, "grad_norm": 0.18361672644385202, "learning_rate": 1.9242685025817554e-05, "loss": 0.3411, "step": 2111 }, { "epoch": 1.9614491407338597, "grad_norm": 0.19465647573588937, "learning_rate": 1.9225473321858863e-05, "loss": 0.3273, "step": 2112 }, { "epoch": 1.9623780771017185, "grad_norm": 0.18345387692416829, "learning_rate": 1.920826161790017e-05, "loss": 0.3534, "step": 2113 }, { "epoch": 1.9633070134695774, "grad_norm": 0.18440352667913198, "learning_rate": 1.919104991394148e-05, "loss": 0.3338, "step": 2114 }, { "epoch": 1.9642359498374362, "grad_norm": 0.1905629308788521, "learning_rate": 1.9173838209982788e-05, "loss": 0.348, "step": 2115 }, { "epoch": 1.965164886205295, "grad_norm": 0.17738331253996667, "learning_rate": 1.9156626506024096e-05, "loss": 0.338, "step": 2116 }, { "epoch": 1.966093822573154, "grad_norm": 0.19317990178767253, "learning_rate": 1.9139414802065405e-05, "loss": 0.3463, "step": 2117 }, { "epoch": 1.9670227589410125, "grad_norm": 0.19118447547256318, "learning_rate": 1.9122203098106713e-05, "loss": 0.3205, "step": 2118 }, { "epoch": 1.9679516953088714, "grad_norm": 0.1941495864820665, "learning_rate": 1.910499139414802e-05, "loss": 0.3539, "step": 2119 }, { "epoch": 1.96888063167673, "grad_norm": 0.19760267671496076, "learning_rate": 1.908777969018933e-05, "loss": 0.3464, "step": 2120 }, { "epoch": 1.9698095680445888, "grad_norm": 0.20254146860119207, "learning_rate": 1.9070567986230638e-05, "loss": 0.3499, "step": 2121 }, { "epoch": 1.9707385044124477, "grad_norm": 0.1843345665414209, "learning_rate": 1.9053356282271943e-05, "loss": 0.3208, "step": 2122 }, { "epoch": 1.9716674407803065, "grad_norm": 0.18931583414329914, "learning_rate": 1.9036144578313252e-05, "loss": 0.3314, "step": 2123 }, { "epoch": 1.9725963771481654, "grad_norm": 0.17544270577499563, "learning_rate": 1.901893287435456e-05, "loss": 0.3287, "step": 2124 }, { "epoch": 1.9735253135160242, "grad_norm": 0.19688051277717727, "learning_rate": 1.900172117039587e-05, "loss": 0.3522, "step": 2125 }, { "epoch": 1.974454249883883, "grad_norm": 0.19873447267182606, "learning_rate": 1.8984509466437177e-05, "loss": 0.3312, "step": 2126 }, { "epoch": 1.975383186251742, "grad_norm": 0.19087379498613669, "learning_rate": 1.8967297762478485e-05, "loss": 0.3676, "step": 2127 }, { "epoch": 1.9763121226196005, "grad_norm": 0.19614887292658442, "learning_rate": 1.8950086058519794e-05, "loss": 0.3592, "step": 2128 }, { "epoch": 1.9772410589874594, "grad_norm": 0.20900078157895577, "learning_rate": 1.8932874354561102e-05, "loss": 0.3431, "step": 2129 }, { "epoch": 1.978169995355318, "grad_norm": 0.19572251469376092, "learning_rate": 1.891566265060241e-05, "loss": 0.3266, "step": 2130 }, { "epoch": 1.9790989317231769, "grad_norm": 0.17443311056477603, "learning_rate": 1.889845094664372e-05, "loss": 0.3338, "step": 2131 }, { "epoch": 1.9800278680910357, "grad_norm": 0.20884670381056047, "learning_rate": 1.8881239242685024e-05, "loss": 0.3434, "step": 2132 }, { "epoch": 1.9809568044588945, "grad_norm": 0.19322917648542368, "learning_rate": 1.8864027538726332e-05, "loss": 0.3201, "step": 2133 }, { "epoch": 1.9818857408267534, "grad_norm": 0.18161054306051905, "learning_rate": 1.884681583476764e-05, "loss": 0.3369, "step": 2134 }, { "epoch": 1.9828146771946122, "grad_norm": 0.19995407241341367, "learning_rate": 1.882960413080895e-05, "loss": 0.3286, "step": 2135 }, { "epoch": 1.983743613562471, "grad_norm": 0.1989383783308988, "learning_rate": 1.8812392426850258e-05, "loss": 0.3627, "step": 2136 }, { "epoch": 1.98467254993033, "grad_norm": 0.20119453420571734, "learning_rate": 1.8795180722891566e-05, "loss": 0.3256, "step": 2137 }, { "epoch": 1.9856014862981886, "grad_norm": 0.2182021666836405, "learning_rate": 1.8777969018932874e-05, "loss": 0.3461, "step": 2138 }, { "epoch": 1.9865304226660474, "grad_norm": 0.2102127576334096, "learning_rate": 1.8760757314974183e-05, "loss": 0.3416, "step": 2139 }, { "epoch": 1.987459359033906, "grad_norm": 0.18281161210727, "learning_rate": 1.874354561101549e-05, "loss": 0.3288, "step": 2140 }, { "epoch": 1.9883882954017649, "grad_norm": 0.21447362464818037, "learning_rate": 1.87263339070568e-05, "loss": 0.3259, "step": 2141 }, { "epoch": 1.9893172317696237, "grad_norm": 0.2115018838882039, "learning_rate": 1.8709122203098108e-05, "loss": 0.3453, "step": 2142 }, { "epoch": 1.9902461681374826, "grad_norm": 0.1960898048792348, "learning_rate": 1.8691910499139413e-05, "loss": 0.3445, "step": 2143 }, { "epoch": 1.9911751045053414, "grad_norm": 0.1930005268145748, "learning_rate": 1.867469879518072e-05, "loss": 0.3257, "step": 2144 }, { "epoch": 1.9921040408732003, "grad_norm": 0.22568106040346528, "learning_rate": 1.865748709122203e-05, "loss": 0.3308, "step": 2145 }, { "epoch": 1.993032977241059, "grad_norm": 0.1984125411749985, "learning_rate": 1.8640275387263338e-05, "loss": 0.3465, "step": 2146 }, { "epoch": 1.993961913608918, "grad_norm": 0.19270028218557986, "learning_rate": 1.8623063683304647e-05, "loss": 0.3766, "step": 2147 }, { "epoch": 1.9948908499767766, "grad_norm": 0.21961399982931906, "learning_rate": 1.8605851979345955e-05, "loss": 0.364, "step": 2148 }, { "epoch": 1.9958197863446354, "grad_norm": 0.19683946052220597, "learning_rate": 1.8588640275387263e-05, "loss": 0.3494, "step": 2149 }, { "epoch": 1.996748722712494, "grad_norm": 0.2216547899837589, "learning_rate": 1.8571428571428572e-05, "loss": 0.3343, "step": 2150 }, { "epoch": 1.9976776590803529, "grad_norm": 0.18513919900783316, "learning_rate": 1.855421686746988e-05, "loss": 0.3333, "step": 2151 }, { "epoch": 1.9986065954482117, "grad_norm": 0.19337964070214114, "learning_rate": 1.853700516351119e-05, "loss": 0.349, "step": 2152 }, { "epoch": 1.9995355318160706, "grad_norm": 0.22054539434718806, "learning_rate": 1.8519793459552494e-05, "loss": 0.3425, "step": 2153 }, { "epoch": 2.0, "grad_norm": 0.3469287314164476, "learning_rate": 1.8502581755593802e-05, "loss": 0.3178, "step": 2154 }, { "epoch": 2.000928936367859, "grad_norm": 0.2157058376159686, "learning_rate": 1.848537005163511e-05, "loss": 0.2774, "step": 2155 }, { "epoch": 2.0018578727357177, "grad_norm": 0.23148826903740533, "learning_rate": 1.846815834767642e-05, "loss": 0.2857, "step": 2156 }, { "epoch": 2.0027868091035765, "grad_norm": 0.2528270886829438, "learning_rate": 1.8450946643717727e-05, "loss": 0.2855, "step": 2157 }, { "epoch": 2.0037157454714354, "grad_norm": 0.24255204959272156, "learning_rate": 1.8433734939759036e-05, "loss": 0.2678, "step": 2158 }, { "epoch": 2.0046446818392942, "grad_norm": 0.21988723244876518, "learning_rate": 1.8416523235800344e-05, "loss": 0.2666, "step": 2159 }, { "epoch": 2.0055736182071526, "grad_norm": 0.230258825927274, "learning_rate": 1.8399311531841652e-05, "loss": 0.2855, "step": 2160 }, { "epoch": 2.0065025545750115, "grad_norm": 0.24750969996912872, "learning_rate": 1.838209982788296e-05, "loss": 0.274, "step": 2161 }, { "epoch": 2.0074314909428703, "grad_norm": 0.21415269672959325, "learning_rate": 1.836488812392427e-05, "loss": 0.2559, "step": 2162 }, { "epoch": 2.008360427310729, "grad_norm": 0.24277353543089242, "learning_rate": 1.8347676419965578e-05, "loss": 0.2881, "step": 2163 }, { "epoch": 2.009289363678588, "grad_norm": 0.22462647600480273, "learning_rate": 1.8330464716006886e-05, "loss": 0.273, "step": 2164 }, { "epoch": 2.010218300046447, "grad_norm": 0.23734616553826293, "learning_rate": 1.8313253012048194e-05, "loss": 0.2876, "step": 2165 }, { "epoch": 2.0111472364143057, "grad_norm": 0.24635100347524322, "learning_rate": 1.82960413080895e-05, "loss": 0.2943, "step": 2166 }, { "epoch": 2.0120761727821646, "grad_norm": 0.23031332092273404, "learning_rate": 1.8278829604130808e-05, "loss": 0.2724, "step": 2167 }, { "epoch": 2.0130051091500234, "grad_norm": 0.20356233261897416, "learning_rate": 1.8261617900172116e-05, "loss": 0.2768, "step": 2168 }, { "epoch": 2.0139340455178822, "grad_norm": 0.24232305321170966, "learning_rate": 1.8244406196213425e-05, "loss": 0.278, "step": 2169 }, { "epoch": 2.0148629818857406, "grad_norm": 0.20514711352879814, "learning_rate": 1.8227194492254733e-05, "loss": 0.2889, "step": 2170 }, { "epoch": 2.0157919182535995, "grad_norm": 0.21172830872100393, "learning_rate": 1.820998278829604e-05, "loss": 0.2795, "step": 2171 }, { "epoch": 2.0167208546214583, "grad_norm": 0.19343812296165566, "learning_rate": 1.819277108433735e-05, "loss": 0.27, "step": 2172 }, { "epoch": 2.017649790989317, "grad_norm": 0.22135939544753655, "learning_rate": 1.8175559380378658e-05, "loss": 0.2653, "step": 2173 }, { "epoch": 2.018578727357176, "grad_norm": 0.21690056177642822, "learning_rate": 1.8158347676419967e-05, "loss": 0.2739, "step": 2174 }, { "epoch": 2.019507663725035, "grad_norm": 0.21806288455919925, "learning_rate": 1.8141135972461275e-05, "loss": 0.2973, "step": 2175 }, { "epoch": 2.0204366000928937, "grad_norm": 0.22552713294809715, "learning_rate": 1.8123924268502583e-05, "loss": 0.2854, "step": 2176 }, { "epoch": 2.0213655364607526, "grad_norm": 0.2055813988009532, "learning_rate": 1.8106712564543892e-05, "loss": 0.2806, "step": 2177 }, { "epoch": 2.0222944728286114, "grad_norm": 0.20183039247919923, "learning_rate": 1.8089500860585197e-05, "loss": 0.2812, "step": 2178 }, { "epoch": 2.0232234091964703, "grad_norm": 0.19722763146647482, "learning_rate": 1.8072289156626505e-05, "loss": 0.2677, "step": 2179 }, { "epoch": 2.0241523455643287, "grad_norm": 0.21974854118511616, "learning_rate": 1.8055077452667814e-05, "loss": 0.2869, "step": 2180 }, { "epoch": 2.0250812819321875, "grad_norm": 0.20162885066764683, "learning_rate": 1.8037865748709122e-05, "loss": 0.2793, "step": 2181 }, { "epoch": 2.0260102183000464, "grad_norm": 0.18375708438721025, "learning_rate": 1.802065404475043e-05, "loss": 0.2693, "step": 2182 }, { "epoch": 2.026939154667905, "grad_norm": 0.2180582338151327, "learning_rate": 1.800344234079174e-05, "loss": 0.2778, "step": 2183 }, { "epoch": 2.027868091035764, "grad_norm": 0.1904035088310135, "learning_rate": 1.7986230636833047e-05, "loss": 0.267, "step": 2184 }, { "epoch": 2.028797027403623, "grad_norm": 0.21014267773827153, "learning_rate": 1.7969018932874356e-05, "loss": 0.2888, "step": 2185 }, { "epoch": 2.0297259637714817, "grad_norm": 0.19891687553756965, "learning_rate": 1.7951807228915664e-05, "loss": 0.2874, "step": 2186 }, { "epoch": 2.0306549001393406, "grad_norm": 0.20789519384380667, "learning_rate": 1.7934595524956972e-05, "loss": 0.2623, "step": 2187 }, { "epoch": 2.0315838365071994, "grad_norm": 0.20429364046527473, "learning_rate": 1.791738382099828e-05, "loss": 0.2784, "step": 2188 }, { "epoch": 2.0325127728750583, "grad_norm": 0.19034811630615903, "learning_rate": 1.790017211703959e-05, "loss": 0.292, "step": 2189 }, { "epoch": 2.0334417092429167, "grad_norm": 0.20841543986466363, "learning_rate": 1.7882960413080894e-05, "loss": 0.2819, "step": 2190 }, { "epoch": 2.0343706456107755, "grad_norm": 0.1955048400721127, "learning_rate": 1.7865748709122203e-05, "loss": 0.2641, "step": 2191 }, { "epoch": 2.0352995819786344, "grad_norm": 0.1830392111751671, "learning_rate": 1.784853700516351e-05, "loss": 0.2671, "step": 2192 }, { "epoch": 2.036228518346493, "grad_norm": 0.1902652316296126, "learning_rate": 1.783132530120482e-05, "loss": 0.2767, "step": 2193 }, { "epoch": 2.037157454714352, "grad_norm": 0.19416968904801624, "learning_rate": 1.7814113597246128e-05, "loss": 0.2768, "step": 2194 }, { "epoch": 2.038086391082211, "grad_norm": 0.22500831983539799, "learning_rate": 1.7796901893287436e-05, "loss": 0.2738, "step": 2195 }, { "epoch": 2.0390153274500697, "grad_norm": 0.1928120950604119, "learning_rate": 1.7779690189328745e-05, "loss": 0.2811, "step": 2196 }, { "epoch": 2.0399442638179286, "grad_norm": 0.17208435269872213, "learning_rate": 1.7762478485370053e-05, "loss": 0.2755, "step": 2197 }, { "epoch": 2.0408732001857874, "grad_norm": 0.18090731260063056, "learning_rate": 1.774526678141136e-05, "loss": 0.2544, "step": 2198 }, { "epoch": 2.0418021365536463, "grad_norm": 0.183776818234785, "learning_rate": 1.772805507745267e-05, "loss": 0.2866, "step": 2199 }, { "epoch": 2.0427310729215047, "grad_norm": 0.18371201328625772, "learning_rate": 1.7710843373493978e-05, "loss": 0.2704, "step": 2200 }, { "epoch": 2.0436600092893635, "grad_norm": 0.18123129067098215, "learning_rate": 1.7693631669535287e-05, "loss": 0.3001, "step": 2201 }, { "epoch": 2.0445889456572224, "grad_norm": 0.1811746843377487, "learning_rate": 1.7676419965576595e-05, "loss": 0.2676, "step": 2202 }, { "epoch": 2.0455178820250812, "grad_norm": 0.18554403639433137, "learning_rate": 1.76592082616179e-05, "loss": 0.2738, "step": 2203 }, { "epoch": 2.04644681839294, "grad_norm": 0.19747699767180618, "learning_rate": 1.764199655765921e-05, "loss": 0.2763, "step": 2204 }, { "epoch": 2.047375754760799, "grad_norm": 0.1935650970013363, "learning_rate": 1.7624784853700517e-05, "loss": 0.2715, "step": 2205 }, { "epoch": 2.0483046911286578, "grad_norm": 0.1854401984756031, "learning_rate": 1.7607573149741825e-05, "loss": 0.2789, "step": 2206 }, { "epoch": 2.0492336274965166, "grad_norm": 0.1878981380085049, "learning_rate": 1.7590361445783134e-05, "loss": 0.2747, "step": 2207 }, { "epoch": 2.0501625638643755, "grad_norm": 0.18824729243879953, "learning_rate": 1.7573149741824442e-05, "loss": 0.2766, "step": 2208 }, { "epoch": 2.0510915002322343, "grad_norm": 0.18511176681612157, "learning_rate": 1.755593803786575e-05, "loss": 0.2808, "step": 2209 }, { "epoch": 2.0520204366000927, "grad_norm": 0.17356200032688698, "learning_rate": 1.753872633390706e-05, "loss": 0.2739, "step": 2210 }, { "epoch": 2.0529493729679515, "grad_norm": 0.1751736860021714, "learning_rate": 1.7521514629948367e-05, "loss": 0.2685, "step": 2211 }, { "epoch": 2.0538783093358104, "grad_norm": 0.18256208438125998, "learning_rate": 1.7504302925989676e-05, "loss": 0.2604, "step": 2212 }, { "epoch": 2.0548072457036692, "grad_norm": 0.1767328307937035, "learning_rate": 1.7487091222030984e-05, "loss": 0.2673, "step": 2213 }, { "epoch": 2.055736182071528, "grad_norm": 0.18036991264224542, "learning_rate": 1.7469879518072292e-05, "loss": 0.2735, "step": 2214 }, { "epoch": 2.056665118439387, "grad_norm": 0.17618684914959876, "learning_rate": 1.7452667814113597e-05, "loss": 0.2671, "step": 2215 }, { "epoch": 2.0575940548072458, "grad_norm": 0.22278489819230726, "learning_rate": 1.7435456110154906e-05, "loss": 0.2953, "step": 2216 }, { "epoch": 2.0585229911751046, "grad_norm": 0.1957544879322214, "learning_rate": 1.7418244406196214e-05, "loss": 0.29, "step": 2217 }, { "epoch": 2.0594519275429635, "grad_norm": 0.20251022082815323, "learning_rate": 1.7401032702237523e-05, "loss": 0.2655, "step": 2218 }, { "epoch": 2.0603808639108223, "grad_norm": 0.18210373540439084, "learning_rate": 1.738382099827883e-05, "loss": 0.2719, "step": 2219 }, { "epoch": 2.0613098002786807, "grad_norm": 0.19081176507689804, "learning_rate": 1.736660929432014e-05, "loss": 0.289, "step": 2220 }, { "epoch": 2.0622387366465396, "grad_norm": 0.17312113263605755, "learning_rate": 1.7349397590361448e-05, "loss": 0.2707, "step": 2221 }, { "epoch": 2.0631676730143984, "grad_norm": 0.2042563838051068, "learning_rate": 1.7332185886402756e-05, "loss": 0.2734, "step": 2222 }, { "epoch": 2.0640966093822573, "grad_norm": 0.19562693428663167, "learning_rate": 1.7314974182444065e-05, "loss": 0.2735, "step": 2223 }, { "epoch": 2.065025545750116, "grad_norm": 0.207464339658486, "learning_rate": 1.7297762478485373e-05, "loss": 0.2771, "step": 2224 }, { "epoch": 2.065954482117975, "grad_norm": 0.20297657867438057, "learning_rate": 1.728055077452668e-05, "loss": 0.2945, "step": 2225 }, { "epoch": 2.066883418485834, "grad_norm": 0.19308466101824381, "learning_rate": 1.726333907056799e-05, "loss": 0.2761, "step": 2226 }, { "epoch": 2.0678123548536926, "grad_norm": 0.1940231498846366, "learning_rate": 1.7246127366609295e-05, "loss": 0.2781, "step": 2227 }, { "epoch": 2.0687412912215515, "grad_norm": 0.19959094783677078, "learning_rate": 1.7228915662650603e-05, "loss": 0.283, "step": 2228 }, { "epoch": 2.0696702275894103, "grad_norm": 0.17718713745827044, "learning_rate": 1.721170395869191e-05, "loss": 0.2731, "step": 2229 }, { "epoch": 2.0705991639572687, "grad_norm": 0.20114808299866702, "learning_rate": 1.719449225473322e-05, "loss": 0.2682, "step": 2230 }, { "epoch": 2.0715281003251276, "grad_norm": 0.18891743595084226, "learning_rate": 1.717728055077453e-05, "loss": 0.268, "step": 2231 }, { "epoch": 2.0724570366929864, "grad_norm": 0.19305694440730528, "learning_rate": 1.7160068846815837e-05, "loss": 0.2864, "step": 2232 }, { "epoch": 2.0733859730608453, "grad_norm": 0.18612754148970131, "learning_rate": 1.7142857142857145e-05, "loss": 0.2804, "step": 2233 }, { "epoch": 2.074314909428704, "grad_norm": 0.21870942336091884, "learning_rate": 1.7125645438898454e-05, "loss": 0.2803, "step": 2234 }, { "epoch": 2.075243845796563, "grad_norm": 0.1905749129757181, "learning_rate": 1.7108433734939762e-05, "loss": 0.2769, "step": 2235 }, { "epoch": 2.076172782164422, "grad_norm": 0.17958345546132004, "learning_rate": 1.709122203098107e-05, "loss": 0.2676, "step": 2236 }, { "epoch": 2.0771017185322806, "grad_norm": 0.17978922180228651, "learning_rate": 1.707401032702238e-05, "loss": 0.2566, "step": 2237 }, { "epoch": 2.0780306549001395, "grad_norm": 0.19619208069729835, "learning_rate": 1.7056798623063684e-05, "loss": 0.2847, "step": 2238 }, { "epoch": 2.0789595912679983, "grad_norm": 0.2028901050066484, "learning_rate": 1.7039586919104992e-05, "loss": 0.2839, "step": 2239 }, { "epoch": 2.0798885276358567, "grad_norm": 0.20553977912065274, "learning_rate": 1.70223752151463e-05, "loss": 0.27, "step": 2240 }, { "epoch": 2.0808174640037156, "grad_norm": 0.2175363885394602, "learning_rate": 1.700516351118761e-05, "loss": 0.2827, "step": 2241 }, { "epoch": 2.0817464003715744, "grad_norm": 0.17374801708666573, "learning_rate": 1.6987951807228917e-05, "loss": 0.2514, "step": 2242 }, { "epoch": 2.0826753367394333, "grad_norm": 0.2009093330228981, "learning_rate": 1.6970740103270226e-05, "loss": 0.2695, "step": 2243 }, { "epoch": 2.083604273107292, "grad_norm": 0.18927752447294688, "learning_rate": 1.6953528399311534e-05, "loss": 0.2576, "step": 2244 }, { "epoch": 2.084533209475151, "grad_norm": 0.23624065141040287, "learning_rate": 1.6936316695352843e-05, "loss": 0.2998, "step": 2245 }, { "epoch": 2.08546214584301, "grad_norm": 0.18922983252395634, "learning_rate": 1.691910499139415e-05, "loss": 0.2805, "step": 2246 }, { "epoch": 2.0863910822108687, "grad_norm": 0.19589618442669648, "learning_rate": 1.690189328743546e-05, "loss": 0.2683, "step": 2247 }, { "epoch": 2.0873200185787275, "grad_norm": 0.19640479008012807, "learning_rate": 1.6884681583476768e-05, "loss": 0.2744, "step": 2248 }, { "epoch": 2.0882489549465864, "grad_norm": 0.19901030382264118, "learning_rate": 1.6867469879518073e-05, "loss": 0.264, "step": 2249 }, { "epoch": 2.0891778913144448, "grad_norm": 0.1952682814125198, "learning_rate": 1.685025817555938e-05, "loss": 0.2958, "step": 2250 }, { "epoch": 2.0901068276823036, "grad_norm": 0.19766258576899515, "learning_rate": 1.683304647160069e-05, "loss": 0.2786, "step": 2251 }, { "epoch": 2.0910357640501624, "grad_norm": 0.19455969769783835, "learning_rate": 1.6815834767641998e-05, "loss": 0.2798, "step": 2252 }, { "epoch": 2.0919647004180213, "grad_norm": 0.18818819171389964, "learning_rate": 1.6798623063683306e-05, "loss": 0.2781, "step": 2253 }, { "epoch": 2.09289363678588, "grad_norm": 0.1915501131624007, "learning_rate": 1.6781411359724615e-05, "loss": 0.2744, "step": 2254 }, { "epoch": 2.093822573153739, "grad_norm": 0.18831604411563369, "learning_rate": 1.6764199655765923e-05, "loss": 0.2619, "step": 2255 }, { "epoch": 2.094751509521598, "grad_norm": 0.19348282994201058, "learning_rate": 1.674698795180723e-05, "loss": 0.2751, "step": 2256 }, { "epoch": 2.0956804458894567, "grad_norm": 0.19615082676185192, "learning_rate": 1.672977624784854e-05, "loss": 0.2888, "step": 2257 }, { "epoch": 2.0966093822573155, "grad_norm": 0.1847380252200025, "learning_rate": 1.671256454388985e-05, "loss": 0.2854, "step": 2258 }, { "epoch": 2.0975383186251744, "grad_norm": 0.20348389567918942, "learning_rate": 1.6695352839931153e-05, "loss": 0.2873, "step": 2259 }, { "epoch": 2.0984672549930328, "grad_norm": 0.19400279600413042, "learning_rate": 1.6678141135972462e-05, "loss": 0.2813, "step": 2260 }, { "epoch": 2.0993961913608916, "grad_norm": 0.18111282515696545, "learning_rate": 1.666092943201377e-05, "loss": 0.2983, "step": 2261 }, { "epoch": 2.1003251277287505, "grad_norm": 0.18813347341361206, "learning_rate": 1.664371772805508e-05, "loss": 0.269, "step": 2262 }, { "epoch": 2.1012540640966093, "grad_norm": 0.17424998589403623, "learning_rate": 1.6626506024096387e-05, "loss": 0.2607, "step": 2263 }, { "epoch": 2.102183000464468, "grad_norm": 0.17051368168715583, "learning_rate": 1.6609294320137695e-05, "loss": 0.2632, "step": 2264 }, { "epoch": 2.103111936832327, "grad_norm": 0.19333768098843307, "learning_rate": 1.6592082616179004e-05, "loss": 0.2739, "step": 2265 }, { "epoch": 2.104040873200186, "grad_norm": 0.17969196315731728, "learning_rate": 1.6574870912220312e-05, "loss": 0.2642, "step": 2266 }, { "epoch": 2.1049698095680447, "grad_norm": 0.17290109863089814, "learning_rate": 1.655765920826162e-05, "loss": 0.274, "step": 2267 }, { "epoch": 2.1058987459359035, "grad_norm": 0.17856097137064214, "learning_rate": 1.654044750430293e-05, "loss": 0.2735, "step": 2268 }, { "epoch": 2.1068276823037624, "grad_norm": 0.18795624301701083, "learning_rate": 1.6523235800344237e-05, "loss": 0.2811, "step": 2269 }, { "epoch": 2.107756618671621, "grad_norm": 0.17878323498135645, "learning_rate": 1.6506024096385542e-05, "loss": 0.2757, "step": 2270 }, { "epoch": 2.1086855550394796, "grad_norm": 0.1754157334553124, "learning_rate": 1.648881239242685e-05, "loss": 0.2719, "step": 2271 }, { "epoch": 2.1096144914073385, "grad_norm": 0.18969010978993706, "learning_rate": 1.647160068846816e-05, "loss": 0.2805, "step": 2272 }, { "epoch": 2.1105434277751973, "grad_norm": 0.2042293911687927, "learning_rate": 1.6454388984509468e-05, "loss": 0.2869, "step": 2273 }, { "epoch": 2.111472364143056, "grad_norm": 0.1733527665317661, "learning_rate": 1.6437177280550776e-05, "loss": 0.2684, "step": 2274 }, { "epoch": 2.112401300510915, "grad_norm": 0.18133085027013407, "learning_rate": 1.6419965576592084e-05, "loss": 0.2783, "step": 2275 }, { "epoch": 2.113330236878774, "grad_norm": 0.19680333317378604, "learning_rate": 1.6402753872633393e-05, "loss": 0.2804, "step": 2276 }, { "epoch": 2.1142591732466327, "grad_norm": 0.18081135463697318, "learning_rate": 1.63855421686747e-05, "loss": 0.2605, "step": 2277 }, { "epoch": 2.1151881096144916, "grad_norm": 0.19740057774259165, "learning_rate": 1.636833046471601e-05, "loss": 0.2789, "step": 2278 }, { "epoch": 2.1161170459823504, "grad_norm": 0.18568459190301742, "learning_rate": 1.6351118760757318e-05, "loss": 0.2828, "step": 2279 }, { "epoch": 2.117045982350209, "grad_norm": 0.19365683468329514, "learning_rate": 1.6333907056798626e-05, "loss": 0.2688, "step": 2280 }, { "epoch": 2.1179749187180676, "grad_norm": 0.20252419703650162, "learning_rate": 1.631669535283993e-05, "loss": 0.2713, "step": 2281 }, { "epoch": 2.1189038550859265, "grad_norm": 0.1792890569752184, "learning_rate": 1.629948364888124e-05, "loss": 0.262, "step": 2282 }, { "epoch": 2.1198327914537853, "grad_norm": 0.20524389028390555, "learning_rate": 1.6282271944922548e-05, "loss": 0.2843, "step": 2283 }, { "epoch": 2.120761727821644, "grad_norm": 0.28341598447437294, "learning_rate": 1.6265060240963857e-05, "loss": 0.3053, "step": 2284 }, { "epoch": 2.121690664189503, "grad_norm": 0.20203034410437481, "learning_rate": 1.6247848537005165e-05, "loss": 0.2854, "step": 2285 }, { "epoch": 2.122619600557362, "grad_norm": 0.1953939465453228, "learning_rate": 1.6230636833046473e-05, "loss": 0.2899, "step": 2286 }, { "epoch": 2.1235485369252207, "grad_norm": 0.1793459966677846, "learning_rate": 1.6213425129087782e-05, "loss": 0.2796, "step": 2287 }, { "epoch": 2.1244774732930796, "grad_norm": 0.17783901924314915, "learning_rate": 1.619621342512909e-05, "loss": 0.2626, "step": 2288 }, { "epoch": 2.1254064096609384, "grad_norm": 0.22343298968404166, "learning_rate": 1.61790017211704e-05, "loss": 0.2811, "step": 2289 }, { "epoch": 2.126335346028797, "grad_norm": 0.1814389246560736, "learning_rate": 1.6161790017211707e-05, "loss": 0.2598, "step": 2290 }, { "epoch": 2.1272642823966557, "grad_norm": 0.18055709417523266, "learning_rate": 1.6144578313253012e-05, "loss": 0.2798, "step": 2291 }, { "epoch": 2.1281932187645145, "grad_norm": 0.17888962889126264, "learning_rate": 1.612736660929432e-05, "loss": 0.2682, "step": 2292 }, { "epoch": 2.1291221551323734, "grad_norm": 0.19721301430366406, "learning_rate": 1.611015490533563e-05, "loss": 0.2719, "step": 2293 }, { "epoch": 2.130051091500232, "grad_norm": 0.19444402714079692, "learning_rate": 1.6092943201376937e-05, "loss": 0.2783, "step": 2294 }, { "epoch": 2.130980027868091, "grad_norm": 0.1838632683011167, "learning_rate": 1.6075731497418246e-05, "loss": 0.2768, "step": 2295 }, { "epoch": 2.13190896423595, "grad_norm": 0.1884248262044888, "learning_rate": 1.6058519793459554e-05, "loss": 0.2707, "step": 2296 }, { "epoch": 2.1328379006038087, "grad_norm": 0.19204342794555168, "learning_rate": 1.6041308089500863e-05, "loss": 0.2775, "step": 2297 }, { "epoch": 2.1337668369716676, "grad_norm": 0.18608945592808687, "learning_rate": 1.602409638554217e-05, "loss": 0.2685, "step": 2298 }, { "epoch": 2.1346957733395264, "grad_norm": 0.18587485562812903, "learning_rate": 1.600688468158348e-05, "loss": 0.2786, "step": 2299 }, { "epoch": 2.1356247097073853, "grad_norm": 0.19587986603415075, "learning_rate": 1.5989672977624788e-05, "loss": 0.2856, "step": 2300 }, { "epoch": 2.1365536460752437, "grad_norm": 0.1942871240310058, "learning_rate": 1.5972461273666096e-05, "loss": 0.2803, "step": 2301 }, { "epoch": 2.1374825824431025, "grad_norm": 0.18749941680014653, "learning_rate": 1.59552495697074e-05, "loss": 0.2917, "step": 2302 }, { "epoch": 2.1384115188109614, "grad_norm": 0.1887106230362805, "learning_rate": 1.593803786574871e-05, "loss": 0.2872, "step": 2303 }, { "epoch": 2.13934045517882, "grad_norm": 0.18424120413217845, "learning_rate": 1.5920826161790018e-05, "loss": 0.2744, "step": 2304 }, { "epoch": 2.140269391546679, "grad_norm": 0.2091266962240203, "learning_rate": 1.5903614457831326e-05, "loss": 0.2872, "step": 2305 }, { "epoch": 2.141198327914538, "grad_norm": 0.167622736533787, "learning_rate": 1.5886402753872635e-05, "loss": 0.2579, "step": 2306 }, { "epoch": 2.1421272642823967, "grad_norm": 0.18300097585692288, "learning_rate": 1.5869191049913943e-05, "loss": 0.2979, "step": 2307 }, { "epoch": 2.1430562006502556, "grad_norm": 0.18348416995061526, "learning_rate": 1.585197934595525e-05, "loss": 0.261, "step": 2308 }, { "epoch": 2.1439851370181144, "grad_norm": 0.1890149322297578, "learning_rate": 1.583476764199656e-05, "loss": 0.2918, "step": 2309 }, { "epoch": 2.144914073385973, "grad_norm": 0.1925329558848419, "learning_rate": 1.581755593803787e-05, "loss": 0.2823, "step": 2310 }, { "epoch": 2.1458430097538317, "grad_norm": 0.1805093088723906, "learning_rate": 1.5800344234079177e-05, "loss": 0.279, "step": 2311 }, { "epoch": 2.1467719461216905, "grad_norm": 0.1896794503755551, "learning_rate": 1.5783132530120482e-05, "loss": 0.2791, "step": 2312 }, { "epoch": 2.1477008824895494, "grad_norm": 0.19321251388893304, "learning_rate": 1.576592082616179e-05, "loss": 0.2947, "step": 2313 }, { "epoch": 2.1486298188574082, "grad_norm": 0.17234766420937184, "learning_rate": 1.57487091222031e-05, "loss": 0.2516, "step": 2314 }, { "epoch": 2.149558755225267, "grad_norm": 0.1822533993953394, "learning_rate": 1.5731497418244407e-05, "loss": 0.2823, "step": 2315 }, { "epoch": 2.150487691593126, "grad_norm": 0.17706007655705194, "learning_rate": 1.5714285714285715e-05, "loss": 0.2874, "step": 2316 }, { "epoch": 2.1514166279609848, "grad_norm": 0.19007988270275075, "learning_rate": 1.5697074010327024e-05, "loss": 0.2964, "step": 2317 }, { "epoch": 2.1523455643288436, "grad_norm": 0.18529312530513586, "learning_rate": 1.5679862306368332e-05, "loss": 0.2622, "step": 2318 }, { "epoch": 2.1532745006967025, "grad_norm": 0.17979646607166183, "learning_rate": 1.566265060240964e-05, "loss": 0.2873, "step": 2319 }, { "epoch": 2.1542034370645613, "grad_norm": 0.17790060931246865, "learning_rate": 1.564543889845095e-05, "loss": 0.2684, "step": 2320 }, { "epoch": 2.1551323734324197, "grad_norm": 0.18215704599038443, "learning_rate": 1.5628227194492257e-05, "loss": 0.2541, "step": 2321 }, { "epoch": 2.1560613098002785, "grad_norm": 0.1892490048553774, "learning_rate": 1.5611015490533566e-05, "loss": 0.2669, "step": 2322 }, { "epoch": 2.1569902461681374, "grad_norm": 0.19249753393355534, "learning_rate": 1.559380378657487e-05, "loss": 0.2761, "step": 2323 }, { "epoch": 2.1579191825359962, "grad_norm": 0.1839816309966501, "learning_rate": 1.557659208261618e-05, "loss": 0.2677, "step": 2324 }, { "epoch": 2.158848118903855, "grad_norm": 0.1810375798407296, "learning_rate": 1.5559380378657488e-05, "loss": 0.2773, "step": 2325 }, { "epoch": 2.159777055271714, "grad_norm": 0.18831165733890587, "learning_rate": 1.5542168674698796e-05, "loss": 0.2848, "step": 2326 }, { "epoch": 2.1607059916395728, "grad_norm": 0.19377240510871852, "learning_rate": 1.5524956970740104e-05, "loss": 0.2941, "step": 2327 }, { "epoch": 2.1616349280074316, "grad_norm": 0.1761716521658807, "learning_rate": 1.5507745266781413e-05, "loss": 0.2829, "step": 2328 }, { "epoch": 2.1625638643752905, "grad_norm": 0.19274957074290722, "learning_rate": 1.549053356282272e-05, "loss": 0.2732, "step": 2329 }, { "epoch": 2.163492800743149, "grad_norm": 0.18470851789842668, "learning_rate": 1.547332185886403e-05, "loss": 0.2829, "step": 2330 }, { "epoch": 2.1644217371110077, "grad_norm": 0.17644911709437527, "learning_rate": 1.5456110154905338e-05, "loss": 0.2639, "step": 2331 }, { "epoch": 2.1653506734788666, "grad_norm": 0.1860663550583413, "learning_rate": 1.5438898450946646e-05, "loss": 0.265, "step": 2332 }, { "epoch": 2.1662796098467254, "grad_norm": 0.1732712738965505, "learning_rate": 1.5421686746987955e-05, "loss": 0.275, "step": 2333 }, { "epoch": 2.1672085462145843, "grad_norm": 0.20738176414089443, "learning_rate": 1.540447504302926e-05, "loss": 0.2922, "step": 2334 }, { "epoch": 2.168137482582443, "grad_norm": 0.18615130026642834, "learning_rate": 1.5387263339070568e-05, "loss": 0.2749, "step": 2335 }, { "epoch": 2.169066418950302, "grad_norm": 0.18330688472974185, "learning_rate": 1.5370051635111877e-05, "loss": 0.2829, "step": 2336 }, { "epoch": 2.169995355318161, "grad_norm": 0.19173214451709955, "learning_rate": 1.5352839931153185e-05, "loss": 0.2915, "step": 2337 }, { "epoch": 2.1709242916860196, "grad_norm": 0.186766371199974, "learning_rate": 1.5335628227194493e-05, "loss": 0.2658, "step": 2338 }, { "epoch": 2.1718532280538785, "grad_norm": 0.18396142682701574, "learning_rate": 1.5318416523235802e-05, "loss": 0.2899, "step": 2339 }, { "epoch": 2.1727821644217373, "grad_norm": 0.1780638489393759, "learning_rate": 1.530120481927711e-05, "loss": 0.2787, "step": 2340 }, { "epoch": 2.1737111007895957, "grad_norm": 0.1752883172604511, "learning_rate": 1.528399311531842e-05, "loss": 0.2748, "step": 2341 }, { "epoch": 2.1746400371574546, "grad_norm": 0.177303228639095, "learning_rate": 1.5266781411359727e-05, "loss": 0.2783, "step": 2342 }, { "epoch": 2.1755689735253134, "grad_norm": 0.18826552255300416, "learning_rate": 1.5249569707401035e-05, "loss": 0.2684, "step": 2343 }, { "epoch": 2.1764979098931723, "grad_norm": 0.17442496731786208, "learning_rate": 1.523235800344234e-05, "loss": 0.2749, "step": 2344 }, { "epoch": 2.177426846261031, "grad_norm": 0.1737699610108496, "learning_rate": 1.5215146299483649e-05, "loss": 0.2703, "step": 2345 }, { "epoch": 2.17835578262889, "grad_norm": 0.17176047456146679, "learning_rate": 1.5197934595524957e-05, "loss": 0.2683, "step": 2346 }, { "epoch": 2.179284718996749, "grad_norm": 0.17869107629479228, "learning_rate": 1.5180722891566266e-05, "loss": 0.2689, "step": 2347 }, { "epoch": 2.1802136553646077, "grad_norm": 0.188008817213917, "learning_rate": 1.5163511187607574e-05, "loss": 0.2735, "step": 2348 }, { "epoch": 2.1811425917324665, "grad_norm": 0.17523593662500742, "learning_rate": 1.5146299483648882e-05, "loss": 0.2763, "step": 2349 }, { "epoch": 2.182071528100325, "grad_norm": 0.18284465023530216, "learning_rate": 1.512908777969019e-05, "loss": 0.2792, "step": 2350 }, { "epoch": 2.1830004644681837, "grad_norm": 0.1868107881910502, "learning_rate": 1.51118760757315e-05, "loss": 0.2841, "step": 2351 }, { "epoch": 2.1839294008360426, "grad_norm": 0.1814772554278054, "learning_rate": 1.5094664371772808e-05, "loss": 0.2818, "step": 2352 }, { "epoch": 2.1848583372039014, "grad_norm": 0.1982916282455446, "learning_rate": 1.5077452667814116e-05, "loss": 0.2909, "step": 2353 }, { "epoch": 2.1857872735717603, "grad_norm": 0.18547018214356015, "learning_rate": 1.5060240963855424e-05, "loss": 0.2633, "step": 2354 }, { "epoch": 2.186716209939619, "grad_norm": 0.18629229106963163, "learning_rate": 1.504302925989673e-05, "loss": 0.2714, "step": 2355 }, { "epoch": 2.187645146307478, "grad_norm": 0.17792953090605365, "learning_rate": 1.5025817555938038e-05, "loss": 0.2804, "step": 2356 }, { "epoch": 2.188574082675337, "grad_norm": 0.19411643483321742, "learning_rate": 1.5008605851979346e-05, "loss": 0.2834, "step": 2357 }, { "epoch": 2.1895030190431957, "grad_norm": 0.17838088091084456, "learning_rate": 1.4991394148020655e-05, "loss": 0.2894, "step": 2358 }, { "epoch": 2.1904319554110545, "grad_norm": 0.18798746896736152, "learning_rate": 1.4974182444061963e-05, "loss": 0.2825, "step": 2359 }, { "epoch": 2.1913608917789134, "grad_norm": 0.18713834475410582, "learning_rate": 1.4956970740103271e-05, "loss": 0.2753, "step": 2360 }, { "epoch": 2.1922898281467718, "grad_norm": 0.21091756592860836, "learning_rate": 1.493975903614458e-05, "loss": 0.2841, "step": 2361 }, { "epoch": 2.1932187645146306, "grad_norm": 0.18950367949638064, "learning_rate": 1.4922547332185888e-05, "loss": 0.2786, "step": 2362 }, { "epoch": 2.1941477008824894, "grad_norm": 0.18906056192602488, "learning_rate": 1.4905335628227197e-05, "loss": 0.2792, "step": 2363 }, { "epoch": 2.1950766372503483, "grad_norm": 0.18613054799021314, "learning_rate": 1.4888123924268505e-05, "loss": 0.2681, "step": 2364 }, { "epoch": 2.196005573618207, "grad_norm": 0.18070972999393983, "learning_rate": 1.487091222030981e-05, "loss": 0.2689, "step": 2365 }, { "epoch": 2.196934509986066, "grad_norm": 0.17512758804049097, "learning_rate": 1.4853700516351118e-05, "loss": 0.2702, "step": 2366 }, { "epoch": 2.197863446353925, "grad_norm": 0.1977826508419132, "learning_rate": 1.4836488812392427e-05, "loss": 0.2714, "step": 2367 }, { "epoch": 2.1987923827217837, "grad_norm": 0.19045916194894494, "learning_rate": 1.4819277108433735e-05, "loss": 0.2898, "step": 2368 }, { "epoch": 2.1997213190896425, "grad_norm": 0.1894952240845188, "learning_rate": 1.4802065404475044e-05, "loss": 0.2793, "step": 2369 }, { "epoch": 2.200650255457501, "grad_norm": 0.18712603681731005, "learning_rate": 1.4784853700516352e-05, "loss": 0.2875, "step": 2370 }, { "epoch": 2.2015791918253598, "grad_norm": 0.18017707880699574, "learning_rate": 1.476764199655766e-05, "loss": 0.2676, "step": 2371 }, { "epoch": 2.2025081281932186, "grad_norm": 0.1872086868065948, "learning_rate": 1.4750430292598969e-05, "loss": 0.2812, "step": 2372 }, { "epoch": 2.2034370645610775, "grad_norm": 0.18279582654005014, "learning_rate": 1.4733218588640277e-05, "loss": 0.2775, "step": 2373 }, { "epoch": 2.2043660009289363, "grad_norm": 0.1742546303606481, "learning_rate": 1.4716006884681586e-05, "loss": 0.2672, "step": 2374 }, { "epoch": 2.205294937296795, "grad_norm": 0.19829823980108882, "learning_rate": 1.4698795180722894e-05, "loss": 0.2932, "step": 2375 }, { "epoch": 2.206223873664654, "grad_norm": 0.1907823494693345, "learning_rate": 1.4681583476764199e-05, "loss": 0.2759, "step": 2376 }, { "epoch": 2.207152810032513, "grad_norm": 0.1760244498190223, "learning_rate": 1.4664371772805507e-05, "loss": 0.2675, "step": 2377 }, { "epoch": 2.2080817464003717, "grad_norm": 0.17827743105039992, "learning_rate": 1.4647160068846816e-05, "loss": 0.2737, "step": 2378 }, { "epoch": 2.2090106827682305, "grad_norm": 0.19304042505289626, "learning_rate": 1.4629948364888124e-05, "loss": 0.2798, "step": 2379 }, { "epoch": 2.2099396191360894, "grad_norm": 0.16940952191421077, "learning_rate": 1.4612736660929433e-05, "loss": 0.2736, "step": 2380 }, { "epoch": 2.210868555503948, "grad_norm": 2.077281485156273, "learning_rate": 1.4595524956970741e-05, "loss": 0.3002, "step": 2381 }, { "epoch": 2.2117974918718066, "grad_norm": 0.19341272255759445, "learning_rate": 1.457831325301205e-05, "loss": 0.2777, "step": 2382 }, { "epoch": 2.2127264282396655, "grad_norm": 0.20112600841398107, "learning_rate": 1.4561101549053358e-05, "loss": 0.2887, "step": 2383 }, { "epoch": 2.2136553646075243, "grad_norm": 0.19413557575902854, "learning_rate": 1.4543889845094666e-05, "loss": 0.2947, "step": 2384 }, { "epoch": 2.214584300975383, "grad_norm": 0.19780119460533946, "learning_rate": 1.4526678141135975e-05, "loss": 0.2839, "step": 2385 }, { "epoch": 2.215513237343242, "grad_norm": 0.17913396155866435, "learning_rate": 1.4509466437177283e-05, "loss": 0.2733, "step": 2386 }, { "epoch": 2.216442173711101, "grad_norm": 0.18886643483257676, "learning_rate": 1.4492254733218588e-05, "loss": 0.2864, "step": 2387 }, { "epoch": 2.2173711100789597, "grad_norm": 0.1995227456484706, "learning_rate": 1.4475043029259896e-05, "loss": 0.2838, "step": 2388 }, { "epoch": 2.2183000464468186, "grad_norm": 0.18895265517020549, "learning_rate": 1.4457831325301205e-05, "loss": 0.2706, "step": 2389 }, { "epoch": 2.2192289828146774, "grad_norm": 0.19334993843744272, "learning_rate": 1.4440619621342513e-05, "loss": 0.2787, "step": 2390 }, { "epoch": 2.220157919182536, "grad_norm": 0.20236064176192772, "learning_rate": 1.4423407917383822e-05, "loss": 0.2788, "step": 2391 }, { "epoch": 2.2210868555503946, "grad_norm": 0.3655805333065711, "learning_rate": 1.440619621342513e-05, "loss": 0.2708, "step": 2392 }, { "epoch": 2.2220157919182535, "grad_norm": 0.18758244632384213, "learning_rate": 1.4388984509466438e-05, "loss": 0.2825, "step": 2393 }, { "epoch": 2.2229447282861123, "grad_norm": 0.17757060628418905, "learning_rate": 1.4371772805507747e-05, "loss": 0.2779, "step": 2394 }, { "epoch": 2.223873664653971, "grad_norm": 0.19104858972751676, "learning_rate": 1.4354561101549055e-05, "loss": 0.272, "step": 2395 }, { "epoch": 2.22480260102183, "grad_norm": 0.19793684191220914, "learning_rate": 1.4337349397590364e-05, "loss": 0.2728, "step": 2396 }, { "epoch": 2.225731537389689, "grad_norm": 0.18934791385839767, "learning_rate": 1.4320137693631669e-05, "loss": 0.2796, "step": 2397 }, { "epoch": 2.2266604737575477, "grad_norm": 0.17614906901226796, "learning_rate": 1.4302925989672977e-05, "loss": 0.2812, "step": 2398 }, { "epoch": 2.2275894101254066, "grad_norm": 0.17586759106445218, "learning_rate": 1.4285714285714285e-05, "loss": 0.2677, "step": 2399 }, { "epoch": 2.2285183464932654, "grad_norm": 0.1872402948258275, "learning_rate": 1.4268502581755594e-05, "loss": 0.2555, "step": 2400 }, { "epoch": 2.229447282861124, "grad_norm": 0.19771821410158671, "learning_rate": 1.4251290877796902e-05, "loss": 0.2677, "step": 2401 }, { "epoch": 2.2303762192289827, "grad_norm": 0.17818595832024386, "learning_rate": 1.423407917383821e-05, "loss": 0.2607, "step": 2402 }, { "epoch": 2.2313051555968415, "grad_norm": 0.1776547560819404, "learning_rate": 1.4216867469879519e-05, "loss": 0.2749, "step": 2403 }, { "epoch": 2.2322340919647004, "grad_norm": 0.170957875139368, "learning_rate": 1.4199655765920827e-05, "loss": 0.2591, "step": 2404 }, { "epoch": 2.233163028332559, "grad_norm": 0.1742647844913809, "learning_rate": 1.4182444061962136e-05, "loss": 0.2738, "step": 2405 }, { "epoch": 2.234091964700418, "grad_norm": 0.1925283348131071, "learning_rate": 1.4165232358003444e-05, "loss": 0.2812, "step": 2406 }, { "epoch": 2.235020901068277, "grad_norm": 0.19555384576301638, "learning_rate": 1.4148020654044753e-05, "loss": 0.2945, "step": 2407 }, { "epoch": 2.2359498374361357, "grad_norm": 0.17682561155125856, "learning_rate": 1.4130808950086058e-05, "loss": 0.2614, "step": 2408 }, { "epoch": 2.2368787738039946, "grad_norm": 0.21751007119184168, "learning_rate": 1.4113597246127366e-05, "loss": 0.2985, "step": 2409 }, { "epoch": 2.2378077101718534, "grad_norm": 0.18932523578727964, "learning_rate": 1.4096385542168674e-05, "loss": 0.2706, "step": 2410 }, { "epoch": 2.238736646539712, "grad_norm": 0.1741531414035046, "learning_rate": 1.4079173838209983e-05, "loss": 0.2775, "step": 2411 }, { "epoch": 2.2396655829075707, "grad_norm": 0.1790079893951182, "learning_rate": 1.4061962134251291e-05, "loss": 0.2716, "step": 2412 }, { "epoch": 2.2405945192754295, "grad_norm": 0.1897657068346121, "learning_rate": 1.40447504302926e-05, "loss": 0.2887, "step": 2413 }, { "epoch": 2.2415234556432884, "grad_norm": 0.17563878014722972, "learning_rate": 1.4027538726333908e-05, "loss": 0.2678, "step": 2414 }, { "epoch": 2.242452392011147, "grad_norm": 0.1734405490085664, "learning_rate": 1.4010327022375216e-05, "loss": 0.2863, "step": 2415 }, { "epoch": 2.243381328379006, "grad_norm": 0.18332196872544093, "learning_rate": 1.3993115318416525e-05, "loss": 0.2734, "step": 2416 }, { "epoch": 2.244310264746865, "grad_norm": 0.17947567690760682, "learning_rate": 1.3975903614457833e-05, "loss": 0.27, "step": 2417 }, { "epoch": 2.2452392011147237, "grad_norm": 0.18735527285985934, "learning_rate": 1.3958691910499138e-05, "loss": 0.2851, "step": 2418 }, { "epoch": 2.2461681374825826, "grad_norm": 0.19049022365781365, "learning_rate": 1.3941480206540447e-05, "loss": 0.2795, "step": 2419 }, { "epoch": 2.2470970738504414, "grad_norm": 0.17554046191220224, "learning_rate": 1.3924268502581755e-05, "loss": 0.2703, "step": 2420 }, { "epoch": 2.2480260102183, "grad_norm": 0.18762826003233593, "learning_rate": 1.3907056798623063e-05, "loss": 0.2928, "step": 2421 }, { "epoch": 2.2489549465861587, "grad_norm": 0.18234373655440445, "learning_rate": 1.3889845094664372e-05, "loss": 0.2817, "step": 2422 }, { "epoch": 2.2498838829540175, "grad_norm": 0.18049105683824865, "learning_rate": 1.387263339070568e-05, "loss": 0.2744, "step": 2423 }, { "epoch": 2.2508128193218764, "grad_norm": 0.1770550838421612, "learning_rate": 1.3855421686746989e-05, "loss": 0.2694, "step": 2424 }, { "epoch": 2.2517417556897352, "grad_norm": 0.1756612487036032, "learning_rate": 1.3838209982788297e-05, "loss": 0.2647, "step": 2425 }, { "epoch": 2.252670692057594, "grad_norm": 0.1811227177376827, "learning_rate": 1.3820998278829605e-05, "loss": 0.2732, "step": 2426 }, { "epoch": 2.253599628425453, "grad_norm": 0.1895660967425297, "learning_rate": 1.3803786574870914e-05, "loss": 0.2883, "step": 2427 }, { "epoch": 2.2545285647933118, "grad_norm": 0.1759719292934546, "learning_rate": 1.3786574870912222e-05, "loss": 0.2704, "step": 2428 }, { "epoch": 2.2554575011611706, "grad_norm": 0.1852248917014425, "learning_rate": 1.3769363166953527e-05, "loss": 0.2714, "step": 2429 }, { "epoch": 2.2563864375290295, "grad_norm": 0.1872463960739118, "learning_rate": 1.3752151462994836e-05, "loss": 0.2884, "step": 2430 }, { "epoch": 2.257315373896888, "grad_norm": 0.19160635539373605, "learning_rate": 1.3734939759036144e-05, "loss": 0.2653, "step": 2431 }, { "epoch": 2.2582443102647467, "grad_norm": 0.17691605824841325, "learning_rate": 1.3717728055077452e-05, "loss": 0.2587, "step": 2432 }, { "epoch": 2.2591732466326055, "grad_norm": 0.1840601443984862, "learning_rate": 1.3700516351118761e-05, "loss": 0.2863, "step": 2433 }, { "epoch": 2.2601021830004644, "grad_norm": 0.17288469873242204, "learning_rate": 1.368330464716007e-05, "loss": 0.2547, "step": 2434 }, { "epoch": 2.2610311193683232, "grad_norm": 0.20393520871661627, "learning_rate": 1.3666092943201378e-05, "loss": 0.2918, "step": 2435 }, { "epoch": 2.261960055736182, "grad_norm": 0.185024404089295, "learning_rate": 1.3648881239242686e-05, "loss": 0.2779, "step": 2436 }, { "epoch": 2.262888992104041, "grad_norm": 0.1901769547372566, "learning_rate": 1.3631669535283994e-05, "loss": 0.2931, "step": 2437 }, { "epoch": 2.2638179284718998, "grad_norm": 0.2214837074293375, "learning_rate": 1.3614457831325303e-05, "loss": 0.3066, "step": 2438 }, { "epoch": 2.2647468648397586, "grad_norm": 0.18890674277222674, "learning_rate": 1.3597246127366611e-05, "loss": 0.273, "step": 2439 }, { "epoch": 2.2656758012076175, "grad_norm": 0.17542645415927108, "learning_rate": 1.3580034423407916e-05, "loss": 0.2746, "step": 2440 }, { "epoch": 2.2666047375754763, "grad_norm": 0.19723610546547288, "learning_rate": 1.3562822719449225e-05, "loss": 0.2862, "step": 2441 }, { "epoch": 2.2675336739433347, "grad_norm": 0.18409020452903432, "learning_rate": 1.3545611015490533e-05, "loss": 0.2756, "step": 2442 }, { "epoch": 2.2684626103111936, "grad_norm": 0.17662475275167705, "learning_rate": 1.3528399311531841e-05, "loss": 0.2678, "step": 2443 }, { "epoch": 2.2693915466790524, "grad_norm": 0.1791475766551058, "learning_rate": 1.351118760757315e-05, "loss": 0.275, "step": 2444 }, { "epoch": 2.2703204830469113, "grad_norm": 0.1854099470575039, "learning_rate": 1.3493975903614458e-05, "loss": 0.2728, "step": 2445 }, { "epoch": 2.27124941941477, "grad_norm": 0.1852589646237944, "learning_rate": 1.3476764199655767e-05, "loss": 0.277, "step": 2446 }, { "epoch": 2.272178355782629, "grad_norm": 0.18383715705433082, "learning_rate": 1.3459552495697075e-05, "loss": 0.2889, "step": 2447 }, { "epoch": 2.273107292150488, "grad_norm": 0.1951503760520118, "learning_rate": 1.3442340791738383e-05, "loss": 0.2805, "step": 2448 }, { "epoch": 2.2740362285183466, "grad_norm": 0.20073372805491246, "learning_rate": 1.3425129087779692e-05, "loss": 0.2731, "step": 2449 }, { "epoch": 2.2749651648862055, "grad_norm": 0.16310800213885338, "learning_rate": 1.3407917383820997e-05, "loss": 0.2639, "step": 2450 }, { "epoch": 2.275894101254064, "grad_norm": 0.17606453823569151, "learning_rate": 1.3390705679862305e-05, "loss": 0.2694, "step": 2451 }, { "epoch": 2.2768230376219227, "grad_norm": 0.21268495266034193, "learning_rate": 1.3373493975903614e-05, "loss": 0.273, "step": 2452 }, { "epoch": 2.2777519739897816, "grad_norm": 0.17975510622715113, "learning_rate": 1.3356282271944922e-05, "loss": 0.2721, "step": 2453 }, { "epoch": 2.2786809103576404, "grad_norm": 0.17495866372601926, "learning_rate": 1.333907056798623e-05, "loss": 0.2608, "step": 2454 }, { "epoch": 2.2796098467254993, "grad_norm": 0.1799744906204507, "learning_rate": 1.3321858864027539e-05, "loss": 0.2773, "step": 2455 }, { "epoch": 2.280538783093358, "grad_norm": 0.20386977048097904, "learning_rate": 1.3304647160068847e-05, "loss": 0.298, "step": 2456 }, { "epoch": 2.281467719461217, "grad_norm": 0.17543050916264705, "learning_rate": 1.3287435456110156e-05, "loss": 0.2682, "step": 2457 }, { "epoch": 2.282396655829076, "grad_norm": 0.18115557846011818, "learning_rate": 1.3270223752151464e-05, "loss": 0.2963, "step": 2458 }, { "epoch": 2.2833255921969347, "grad_norm": 0.1825233749262127, "learning_rate": 1.3253012048192772e-05, "loss": 0.2759, "step": 2459 }, { "epoch": 2.2842545285647935, "grad_norm": 0.18810580131761556, "learning_rate": 1.3235800344234081e-05, "loss": 0.2736, "step": 2460 }, { "epoch": 2.2851834649326523, "grad_norm": 0.20406387820144806, "learning_rate": 1.3218588640275386e-05, "loss": 0.2885, "step": 2461 }, { "epoch": 2.2861124013005107, "grad_norm": 0.17866786419081768, "learning_rate": 1.3201376936316694e-05, "loss": 0.27, "step": 2462 }, { "epoch": 2.2870413376683696, "grad_norm": 0.19558916886417338, "learning_rate": 1.3184165232358003e-05, "loss": 0.2745, "step": 2463 }, { "epoch": 2.2879702740362284, "grad_norm": 0.17178727146349201, "learning_rate": 1.3166953528399311e-05, "loss": 0.2695, "step": 2464 }, { "epoch": 2.2888992104040873, "grad_norm": 0.18186780650265671, "learning_rate": 1.314974182444062e-05, "loss": 0.2768, "step": 2465 }, { "epoch": 2.289828146771946, "grad_norm": 0.1851530367223113, "learning_rate": 1.3132530120481928e-05, "loss": 0.2867, "step": 2466 }, { "epoch": 2.290757083139805, "grad_norm": 0.16670097051368127, "learning_rate": 1.3115318416523236e-05, "loss": 0.2793, "step": 2467 }, { "epoch": 2.291686019507664, "grad_norm": 0.1856617082893609, "learning_rate": 1.3098106712564545e-05, "loss": 0.2852, "step": 2468 }, { "epoch": 2.2926149558755227, "grad_norm": 0.16657158904737376, "learning_rate": 1.3080895008605853e-05, "loss": 0.2648, "step": 2469 }, { "epoch": 2.2935438922433815, "grad_norm": 0.1817338062264846, "learning_rate": 1.3063683304647162e-05, "loss": 0.2888, "step": 2470 }, { "epoch": 2.29447282861124, "grad_norm": 0.16841896976761195, "learning_rate": 1.3046471600688468e-05, "loss": 0.2765, "step": 2471 }, { "epoch": 2.2954017649790988, "grad_norm": 0.16883158219004724, "learning_rate": 1.3029259896729775e-05, "loss": 0.2684, "step": 2472 }, { "epoch": 2.2963307013469576, "grad_norm": 0.17717975998990523, "learning_rate": 1.3012048192771083e-05, "loss": 0.268, "step": 2473 }, { "epoch": 2.2972596377148164, "grad_norm": 0.18560225793278562, "learning_rate": 1.2994836488812392e-05, "loss": 0.3009, "step": 2474 }, { "epoch": 2.2981885740826753, "grad_norm": 0.17167104424094876, "learning_rate": 1.29776247848537e-05, "loss": 0.2759, "step": 2475 }, { "epoch": 2.299117510450534, "grad_norm": 0.18419703054460113, "learning_rate": 1.2960413080895009e-05, "loss": 0.2842, "step": 2476 }, { "epoch": 2.300046446818393, "grad_norm": 0.18156266735689566, "learning_rate": 1.2943201376936317e-05, "loss": 0.2676, "step": 2477 }, { "epoch": 2.300975383186252, "grad_norm": 0.18273226393990982, "learning_rate": 1.2925989672977625e-05, "loss": 0.2852, "step": 2478 }, { "epoch": 2.3019043195541107, "grad_norm": 0.18012479146698604, "learning_rate": 1.2908777969018934e-05, "loss": 0.2817, "step": 2479 }, { "epoch": 2.3028332559219695, "grad_norm": 0.18611898476583089, "learning_rate": 1.2891566265060242e-05, "loss": 0.2734, "step": 2480 }, { "epoch": 2.3037621922898284, "grad_norm": 0.2045432229041082, "learning_rate": 1.287435456110155e-05, "loss": 0.2868, "step": 2481 }, { "epoch": 2.3046911286576868, "grad_norm": 0.18166021704998359, "learning_rate": 1.2857142857142857e-05, "loss": 0.2742, "step": 2482 }, { "epoch": 2.3056200650255456, "grad_norm": 0.18583776084377682, "learning_rate": 1.2839931153184166e-05, "loss": 0.2878, "step": 2483 }, { "epoch": 2.3065490013934045, "grad_norm": 0.18056875244820153, "learning_rate": 1.2822719449225474e-05, "loss": 0.2722, "step": 2484 }, { "epoch": 2.3074779377612633, "grad_norm": 0.1775475098336737, "learning_rate": 1.280550774526678e-05, "loss": 0.2621, "step": 2485 }, { "epoch": 2.308406874129122, "grad_norm": 0.19416763735743434, "learning_rate": 1.2788296041308089e-05, "loss": 0.2777, "step": 2486 }, { "epoch": 2.309335810496981, "grad_norm": 0.17930364275371594, "learning_rate": 1.2771084337349398e-05, "loss": 0.2686, "step": 2487 }, { "epoch": 2.31026474686484, "grad_norm": 0.19195008866900118, "learning_rate": 1.2753872633390706e-05, "loss": 0.2767, "step": 2488 }, { "epoch": 2.3111936832326987, "grad_norm": 0.2102981226089247, "learning_rate": 1.2736660929432014e-05, "loss": 0.2975, "step": 2489 }, { "epoch": 2.3121226196005575, "grad_norm": 0.1838111249569394, "learning_rate": 1.2719449225473323e-05, "loss": 0.2701, "step": 2490 }, { "epoch": 2.313051555968416, "grad_norm": 0.1834891735582498, "learning_rate": 1.2702237521514631e-05, "loss": 0.2668, "step": 2491 }, { "epoch": 2.313980492336275, "grad_norm": 0.1748837067912934, "learning_rate": 1.268502581755594e-05, "loss": 0.2536, "step": 2492 }, { "epoch": 2.3149094287041336, "grad_norm": 0.18170866744646788, "learning_rate": 1.2667814113597246e-05, "loss": 0.2791, "step": 2493 }, { "epoch": 2.3158383650719925, "grad_norm": 0.17517417710104063, "learning_rate": 1.2650602409638555e-05, "loss": 0.2697, "step": 2494 }, { "epoch": 2.3167673014398513, "grad_norm": 0.1696007724114442, "learning_rate": 1.2633390705679863e-05, "loss": 0.2701, "step": 2495 }, { "epoch": 2.31769623780771, "grad_norm": 0.19983708306954123, "learning_rate": 1.2616179001721171e-05, "loss": 0.2759, "step": 2496 }, { "epoch": 2.318625174175569, "grad_norm": 0.18007852898360852, "learning_rate": 1.2598967297762478e-05, "loss": 0.2738, "step": 2497 }, { "epoch": 2.319554110543428, "grad_norm": 0.17640021787489446, "learning_rate": 1.2581755593803787e-05, "loss": 0.2568, "step": 2498 }, { "epoch": 2.3204830469112867, "grad_norm": 0.17426432406739611, "learning_rate": 1.2564543889845095e-05, "loss": 0.2813, "step": 2499 }, { "epoch": 2.3214119832791456, "grad_norm": 0.1819597764792421, "learning_rate": 1.2547332185886403e-05, "loss": 0.2843, "step": 2500 }, { "epoch": 2.3223409196470044, "grad_norm": 0.6807628441338488, "learning_rate": 1.2530120481927712e-05, "loss": 0.2916, "step": 2501 }, { "epoch": 2.323269856014863, "grad_norm": 0.18961215732510447, "learning_rate": 1.251290877796902e-05, "loss": 0.2885, "step": 2502 }, { "epoch": 2.3241987923827216, "grad_norm": 0.185281322046881, "learning_rate": 1.2495697074010327e-05, "loss": 0.261, "step": 2503 }, { "epoch": 2.3251277287505805, "grad_norm": 0.18212697020003069, "learning_rate": 1.2478485370051635e-05, "loss": 0.2824, "step": 2504 }, { "epoch": 2.3260566651184393, "grad_norm": 0.19341133551329, "learning_rate": 1.2461273666092944e-05, "loss": 0.2861, "step": 2505 }, { "epoch": 2.326985601486298, "grad_norm": 0.17022345986335427, "learning_rate": 1.2444061962134252e-05, "loss": 0.2693, "step": 2506 }, { "epoch": 2.327914537854157, "grad_norm": 0.18536850713483183, "learning_rate": 1.242685025817556e-05, "loss": 0.2746, "step": 2507 }, { "epoch": 2.328843474222016, "grad_norm": 0.16506173986908923, "learning_rate": 1.2409638554216869e-05, "loss": 0.265, "step": 2508 }, { "epoch": 2.3297724105898747, "grad_norm": 0.17182803727776838, "learning_rate": 1.2392426850258176e-05, "loss": 0.2736, "step": 2509 }, { "epoch": 2.3307013469577336, "grad_norm": 0.1742820002887796, "learning_rate": 1.2375215146299484e-05, "loss": 0.2618, "step": 2510 }, { "epoch": 2.331630283325592, "grad_norm": 0.19642680627052156, "learning_rate": 1.2358003442340792e-05, "loss": 0.274, "step": 2511 }, { "epoch": 2.332559219693451, "grad_norm": 0.18642669549275, "learning_rate": 1.23407917383821e-05, "loss": 0.284, "step": 2512 }, { "epoch": 2.3334881560613097, "grad_norm": 0.17054316401763836, "learning_rate": 1.232358003442341e-05, "loss": 0.2869, "step": 2513 }, { "epoch": 2.3344170924291685, "grad_norm": 0.18369081567830525, "learning_rate": 1.2306368330464718e-05, "loss": 0.2767, "step": 2514 }, { "epoch": 2.3353460287970274, "grad_norm": 0.20114946960190172, "learning_rate": 1.2289156626506026e-05, "loss": 0.2814, "step": 2515 }, { "epoch": 2.336274965164886, "grad_norm": 0.18437735158767538, "learning_rate": 1.2271944922547333e-05, "loss": 0.2919, "step": 2516 }, { "epoch": 2.337203901532745, "grad_norm": 0.18161588561862224, "learning_rate": 1.2254733218588641e-05, "loss": 0.287, "step": 2517 }, { "epoch": 2.338132837900604, "grad_norm": 0.18870854980187435, "learning_rate": 1.223752151462995e-05, "loss": 0.2805, "step": 2518 }, { "epoch": 2.3390617742684627, "grad_norm": 0.1768654024578103, "learning_rate": 1.2220309810671258e-05, "loss": 0.269, "step": 2519 }, { "epoch": 2.3399907106363216, "grad_norm": 0.17490366879789768, "learning_rate": 1.2203098106712566e-05, "loss": 0.2783, "step": 2520 }, { "epoch": 2.3409196470041804, "grad_norm": 0.18896883370592954, "learning_rate": 1.2185886402753875e-05, "loss": 0.268, "step": 2521 }, { "epoch": 2.341848583372039, "grad_norm": 0.1801247883571675, "learning_rate": 1.2168674698795181e-05, "loss": 0.2719, "step": 2522 }, { "epoch": 2.3427775197398977, "grad_norm": 0.18694993552175548, "learning_rate": 1.215146299483649e-05, "loss": 0.2757, "step": 2523 }, { "epoch": 2.3437064561077565, "grad_norm": 0.18275519642880572, "learning_rate": 1.2134251290877798e-05, "loss": 0.263, "step": 2524 }, { "epoch": 2.3446353924756154, "grad_norm": 0.17351016085246834, "learning_rate": 1.2117039586919107e-05, "loss": 0.2565, "step": 2525 }, { "epoch": 2.345564328843474, "grad_norm": 0.18379515482313682, "learning_rate": 1.2099827882960415e-05, "loss": 0.2696, "step": 2526 }, { "epoch": 2.346493265211333, "grad_norm": 0.1720315048361915, "learning_rate": 1.2082616179001722e-05, "loss": 0.2641, "step": 2527 }, { "epoch": 2.347422201579192, "grad_norm": 0.18590275177538543, "learning_rate": 1.206540447504303e-05, "loss": 0.2733, "step": 2528 }, { "epoch": 2.3483511379470507, "grad_norm": 0.169610311172933, "learning_rate": 1.2048192771084338e-05, "loss": 0.2702, "step": 2529 }, { "epoch": 2.3492800743149096, "grad_norm": 0.1963508763656982, "learning_rate": 1.2030981067125647e-05, "loss": 0.2652, "step": 2530 }, { "epoch": 2.350209010682768, "grad_norm": 0.18587076519396542, "learning_rate": 1.2013769363166955e-05, "loss": 0.2789, "step": 2531 }, { "epoch": 2.351137947050627, "grad_norm": 0.1854453938930749, "learning_rate": 1.1996557659208262e-05, "loss": 0.2732, "step": 2532 }, { "epoch": 2.3520668834184857, "grad_norm": 0.1942839700749339, "learning_rate": 1.197934595524957e-05, "loss": 0.2803, "step": 2533 }, { "epoch": 2.3529958197863445, "grad_norm": 0.19659561231939945, "learning_rate": 1.1962134251290879e-05, "loss": 0.2913, "step": 2534 }, { "epoch": 2.3539247561542034, "grad_norm": 0.17480801654404732, "learning_rate": 1.1944922547332187e-05, "loss": 0.271, "step": 2535 }, { "epoch": 2.3548536925220622, "grad_norm": 0.1944663657157406, "learning_rate": 1.1927710843373496e-05, "loss": 0.2641, "step": 2536 }, { "epoch": 2.355782628889921, "grad_norm": 0.18816619868751366, "learning_rate": 1.1910499139414804e-05, "loss": 0.2833, "step": 2537 }, { "epoch": 2.35671156525778, "grad_norm": 0.20018181591126397, "learning_rate": 1.189328743545611e-05, "loss": 0.2852, "step": 2538 }, { "epoch": 2.3576405016256388, "grad_norm": 0.1756427462956202, "learning_rate": 1.1876075731497419e-05, "loss": 0.2778, "step": 2539 }, { "epoch": 2.3585694379934976, "grad_norm": 0.18983605510608587, "learning_rate": 1.1858864027538727e-05, "loss": 0.2769, "step": 2540 }, { "epoch": 2.3594983743613565, "grad_norm": 0.20928473702526423, "learning_rate": 1.1841652323580036e-05, "loss": 0.2856, "step": 2541 }, { "epoch": 2.360427310729215, "grad_norm": 0.18099417160933323, "learning_rate": 1.1824440619621344e-05, "loss": 0.2758, "step": 2542 }, { "epoch": 2.3613562470970737, "grad_norm": 0.1788276756913518, "learning_rate": 1.1807228915662651e-05, "loss": 0.2809, "step": 2543 }, { "epoch": 2.3622851834649325, "grad_norm": 0.18196101771025752, "learning_rate": 1.179001721170396e-05, "loss": 0.2714, "step": 2544 }, { "epoch": 2.3632141198327914, "grad_norm": 0.16952857307776525, "learning_rate": 1.1772805507745268e-05, "loss": 0.2562, "step": 2545 }, { "epoch": 2.3641430562006502, "grad_norm": 0.18206933762294697, "learning_rate": 1.1755593803786576e-05, "loss": 0.2761, "step": 2546 }, { "epoch": 2.365071992568509, "grad_norm": 0.19380265765568364, "learning_rate": 1.1738382099827885e-05, "loss": 0.2869, "step": 2547 }, { "epoch": 2.366000928936368, "grad_norm": 0.16983950092360575, "learning_rate": 1.1721170395869191e-05, "loss": 0.2745, "step": 2548 }, { "epoch": 2.3669298653042268, "grad_norm": 0.19191163491346017, "learning_rate": 1.17039586919105e-05, "loss": 0.2946, "step": 2549 }, { "epoch": 2.3678588016720856, "grad_norm": 0.17125497389397198, "learning_rate": 1.1686746987951808e-05, "loss": 0.2735, "step": 2550 }, { "epoch": 2.368787738039944, "grad_norm": 0.19949567009374117, "learning_rate": 1.1669535283993117e-05, "loss": 0.2946, "step": 2551 }, { "epoch": 2.369716674407803, "grad_norm": 0.18799349450204403, "learning_rate": 1.1652323580034425e-05, "loss": 0.2838, "step": 2552 }, { "epoch": 2.3706456107756617, "grad_norm": 0.17431131561492344, "learning_rate": 1.1635111876075733e-05, "loss": 0.2662, "step": 2553 }, { "epoch": 2.3715745471435206, "grad_norm": 0.17729505501913617, "learning_rate": 1.161790017211704e-05, "loss": 0.2845, "step": 2554 }, { "epoch": 2.3725034835113794, "grad_norm": 0.17567637231257308, "learning_rate": 1.1600688468158348e-05, "loss": 0.2828, "step": 2555 }, { "epoch": 2.3734324198792383, "grad_norm": 0.18325562355641456, "learning_rate": 1.1583476764199657e-05, "loss": 0.2881, "step": 2556 }, { "epoch": 2.374361356247097, "grad_norm": 0.20307829278142425, "learning_rate": 1.1566265060240965e-05, "loss": 0.2797, "step": 2557 }, { "epoch": 2.375290292614956, "grad_norm": 0.1760215816447019, "learning_rate": 1.1549053356282274e-05, "loss": 0.2866, "step": 2558 }, { "epoch": 2.376219228982815, "grad_norm": 0.16869374727160663, "learning_rate": 1.153184165232358e-05, "loss": 0.2677, "step": 2559 }, { "epoch": 2.3771481653506736, "grad_norm": 0.19634176148081195, "learning_rate": 1.1514629948364889e-05, "loss": 0.2826, "step": 2560 }, { "epoch": 2.3780771017185325, "grad_norm": 0.1901365638963679, "learning_rate": 1.1497418244406197e-05, "loss": 0.2717, "step": 2561 }, { "epoch": 2.3790060380863913, "grad_norm": 0.17289531409457187, "learning_rate": 1.1480206540447506e-05, "loss": 0.2745, "step": 2562 }, { "epoch": 2.3799349744542497, "grad_norm": 0.1863744077538756, "learning_rate": 1.1462994836488814e-05, "loss": 0.2861, "step": 2563 }, { "epoch": 2.3808639108221086, "grad_norm": 0.19647354430195266, "learning_rate": 1.144578313253012e-05, "loss": 0.2953, "step": 2564 }, { "epoch": 2.3817928471899674, "grad_norm": 0.16997392062698125, "learning_rate": 1.1428571428571429e-05, "loss": 0.2662, "step": 2565 }, { "epoch": 2.3827217835578263, "grad_norm": 0.17741241720060805, "learning_rate": 1.1411359724612737e-05, "loss": 0.2801, "step": 2566 }, { "epoch": 2.383650719925685, "grad_norm": 0.1734889437651768, "learning_rate": 1.1394148020654046e-05, "loss": 0.2715, "step": 2567 }, { "epoch": 2.384579656293544, "grad_norm": 0.18052716242882721, "learning_rate": 1.1376936316695354e-05, "loss": 0.2861, "step": 2568 }, { "epoch": 2.385508592661403, "grad_norm": 0.17565238572476038, "learning_rate": 1.1359724612736663e-05, "loss": 0.2714, "step": 2569 }, { "epoch": 2.3864375290292617, "grad_norm": 0.1706278981440593, "learning_rate": 1.134251290877797e-05, "loss": 0.2762, "step": 2570 }, { "epoch": 2.38736646539712, "grad_norm": 0.17926154917329037, "learning_rate": 1.1325301204819278e-05, "loss": 0.2854, "step": 2571 }, { "epoch": 2.388295401764979, "grad_norm": 0.1802656856974113, "learning_rate": 1.1308089500860586e-05, "loss": 0.2807, "step": 2572 }, { "epoch": 2.3892243381328377, "grad_norm": 0.17506633593361373, "learning_rate": 1.1290877796901895e-05, "loss": 0.2779, "step": 2573 }, { "epoch": 2.3901532745006966, "grad_norm": 0.1728471092867977, "learning_rate": 1.1273666092943203e-05, "loss": 0.2765, "step": 2574 }, { "epoch": 2.3910822108685554, "grad_norm": 0.19565702280776281, "learning_rate": 1.125645438898451e-05, "loss": 0.2635, "step": 2575 }, { "epoch": 2.3920111472364143, "grad_norm": 0.18314447255310315, "learning_rate": 1.1239242685025818e-05, "loss": 0.2737, "step": 2576 }, { "epoch": 2.392940083604273, "grad_norm": 0.1750548586458759, "learning_rate": 1.1222030981067126e-05, "loss": 0.2642, "step": 2577 }, { "epoch": 2.393869019972132, "grad_norm": 0.1832451160171968, "learning_rate": 1.1204819277108435e-05, "loss": 0.2745, "step": 2578 }, { "epoch": 2.394797956339991, "grad_norm": 0.18031801543289583, "learning_rate": 1.1187607573149743e-05, "loss": 0.2684, "step": 2579 }, { "epoch": 2.3957268927078497, "grad_norm": 0.1799011721479443, "learning_rate": 1.117039586919105e-05, "loss": 0.2807, "step": 2580 }, { "epoch": 2.3966558290757085, "grad_norm": 0.17467422843455963, "learning_rate": 1.1153184165232358e-05, "loss": 0.2661, "step": 2581 }, { "epoch": 2.3975847654435674, "grad_norm": 0.182510619647452, "learning_rate": 1.1135972461273667e-05, "loss": 0.2661, "step": 2582 }, { "epoch": 2.3985137018114258, "grad_norm": 0.19000712669188646, "learning_rate": 1.1118760757314975e-05, "loss": 0.2785, "step": 2583 }, { "epoch": 2.3994426381792846, "grad_norm": 0.16903359729376602, "learning_rate": 1.1101549053356284e-05, "loss": 0.2704, "step": 2584 }, { "epoch": 2.4003715745471434, "grad_norm": 0.18217095684501244, "learning_rate": 1.108433734939759e-05, "loss": 0.2634, "step": 2585 }, { "epoch": 2.4013005109150023, "grad_norm": 0.17717495586427512, "learning_rate": 1.1067125645438899e-05, "loss": 0.2551, "step": 2586 }, { "epoch": 2.402229447282861, "grad_norm": 0.1838135913410062, "learning_rate": 1.1049913941480207e-05, "loss": 0.2876, "step": 2587 }, { "epoch": 2.40315838365072, "grad_norm": 0.17031644253992032, "learning_rate": 1.1032702237521515e-05, "loss": 0.2624, "step": 2588 }, { "epoch": 2.404087320018579, "grad_norm": 0.17556071203202087, "learning_rate": 1.1015490533562824e-05, "loss": 0.2753, "step": 2589 }, { "epoch": 2.4050162563864377, "grad_norm": 0.1834407267830562, "learning_rate": 1.0998278829604132e-05, "loss": 0.2746, "step": 2590 }, { "epoch": 2.4059451927542965, "grad_norm": 0.19926877874180607, "learning_rate": 1.0981067125645439e-05, "loss": 0.29, "step": 2591 }, { "epoch": 2.406874129122155, "grad_norm": 0.17906776663032192, "learning_rate": 1.0963855421686747e-05, "loss": 0.2844, "step": 2592 }, { "epoch": 2.4078030654900138, "grad_norm": 0.16699051055445974, "learning_rate": 1.0946643717728056e-05, "loss": 0.2588, "step": 2593 }, { "epoch": 2.4087320018578726, "grad_norm": 0.1751714906923023, "learning_rate": 1.0929432013769364e-05, "loss": 0.2717, "step": 2594 }, { "epoch": 2.4096609382257315, "grad_norm": 0.17331285320106854, "learning_rate": 1.0912220309810673e-05, "loss": 0.251, "step": 2595 }, { "epoch": 2.4105898745935903, "grad_norm": 0.1739226946282106, "learning_rate": 1.089500860585198e-05, "loss": 0.2883, "step": 2596 }, { "epoch": 2.411518810961449, "grad_norm": 0.19012644739523568, "learning_rate": 1.0877796901893288e-05, "loss": 0.2806, "step": 2597 }, { "epoch": 2.412447747329308, "grad_norm": 0.18129586130288267, "learning_rate": 1.0860585197934596e-05, "loss": 0.2769, "step": 2598 }, { "epoch": 2.413376683697167, "grad_norm": 0.17128766688292568, "learning_rate": 1.0843373493975904e-05, "loss": 0.2725, "step": 2599 }, { "epoch": 2.4143056200650257, "grad_norm": 0.1723024173039281, "learning_rate": 1.0826161790017213e-05, "loss": 0.2712, "step": 2600 }, { "epoch": 2.4152345564328845, "grad_norm": 0.18914686577326056, "learning_rate": 1.080895008605852e-05, "loss": 0.2823, "step": 2601 }, { "epoch": 2.4161634928007434, "grad_norm": 0.17962076856198506, "learning_rate": 1.0791738382099828e-05, "loss": 0.2738, "step": 2602 }, { "epoch": 2.417092429168602, "grad_norm": 0.18223450453434567, "learning_rate": 1.0774526678141136e-05, "loss": 0.2817, "step": 2603 }, { "epoch": 2.4180213655364606, "grad_norm": 0.17204461955432634, "learning_rate": 1.0757314974182445e-05, "loss": 0.2784, "step": 2604 }, { "epoch": 2.4189503019043195, "grad_norm": 0.18734269004984067, "learning_rate": 1.0740103270223753e-05, "loss": 0.2798, "step": 2605 }, { "epoch": 2.4198792382721783, "grad_norm": 0.18627841159310524, "learning_rate": 1.0722891566265062e-05, "loss": 0.2736, "step": 2606 }, { "epoch": 2.420808174640037, "grad_norm": 0.18288440859506122, "learning_rate": 1.0705679862306368e-05, "loss": 0.2762, "step": 2607 }, { "epoch": 2.421737111007896, "grad_norm": 0.16671344490112985, "learning_rate": 1.0688468158347677e-05, "loss": 0.2583, "step": 2608 }, { "epoch": 2.422666047375755, "grad_norm": 0.16774244865276433, "learning_rate": 1.0671256454388985e-05, "loss": 0.2667, "step": 2609 }, { "epoch": 2.4235949837436137, "grad_norm": 0.1906502737894114, "learning_rate": 1.0654044750430293e-05, "loss": 0.2729, "step": 2610 }, { "epoch": 2.4245239201114726, "grad_norm": 0.19139371036181707, "learning_rate": 1.0636833046471602e-05, "loss": 0.2917, "step": 2611 }, { "epoch": 2.425452856479331, "grad_norm": 0.1830525571294439, "learning_rate": 1.0619621342512909e-05, "loss": 0.273, "step": 2612 }, { "epoch": 2.42638179284719, "grad_norm": 0.17846541802522353, "learning_rate": 1.0602409638554217e-05, "loss": 0.2697, "step": 2613 }, { "epoch": 2.4273107292150486, "grad_norm": 0.1912546863200559, "learning_rate": 1.0585197934595525e-05, "loss": 0.2614, "step": 2614 }, { "epoch": 2.4282396655829075, "grad_norm": 0.18087505849060975, "learning_rate": 1.0567986230636834e-05, "loss": 0.2615, "step": 2615 }, { "epoch": 2.4291686019507663, "grad_norm": 0.18672321455235916, "learning_rate": 1.0550774526678142e-05, "loss": 0.2829, "step": 2616 }, { "epoch": 2.430097538318625, "grad_norm": 0.17819321647845873, "learning_rate": 1.0533562822719449e-05, "loss": 0.2781, "step": 2617 }, { "epoch": 2.431026474686484, "grad_norm": 0.1802526099580208, "learning_rate": 1.0516351118760757e-05, "loss": 0.2799, "step": 2618 }, { "epoch": 2.431955411054343, "grad_norm": 0.1732325664289292, "learning_rate": 1.0499139414802066e-05, "loss": 0.267, "step": 2619 }, { "epoch": 2.4328843474222017, "grad_norm": 0.1862818608754717, "learning_rate": 1.0481927710843374e-05, "loss": 0.257, "step": 2620 }, { "epoch": 2.4338132837900606, "grad_norm": 0.19978393037967157, "learning_rate": 1.0464716006884682e-05, "loss": 0.2982, "step": 2621 }, { "epoch": 2.4347422201579194, "grad_norm": 0.16756039896886504, "learning_rate": 1.0447504302925991e-05, "loss": 0.2713, "step": 2622 }, { "epoch": 2.435671156525778, "grad_norm": 0.1812223551466504, "learning_rate": 1.0430292598967298e-05, "loss": 0.271, "step": 2623 }, { "epoch": 2.4366000928936367, "grad_norm": 0.18945960502714276, "learning_rate": 1.0413080895008606e-05, "loss": 0.2799, "step": 2624 }, { "epoch": 2.4375290292614955, "grad_norm": 0.17519311685029668, "learning_rate": 1.0395869191049914e-05, "loss": 0.2765, "step": 2625 }, { "epoch": 2.4384579656293544, "grad_norm": 0.18324447068348415, "learning_rate": 1.0378657487091223e-05, "loss": 0.2762, "step": 2626 }, { "epoch": 2.439386901997213, "grad_norm": 0.17760314979609604, "learning_rate": 1.0361445783132531e-05, "loss": 0.2686, "step": 2627 }, { "epoch": 2.440315838365072, "grad_norm": 0.17054771582356815, "learning_rate": 1.0344234079173838e-05, "loss": 0.2615, "step": 2628 }, { "epoch": 2.441244774732931, "grad_norm": 0.1826470589868783, "learning_rate": 1.0327022375215146e-05, "loss": 0.2942, "step": 2629 }, { "epoch": 2.4421737111007897, "grad_norm": 0.1949199675365578, "learning_rate": 1.0309810671256455e-05, "loss": 0.2861, "step": 2630 }, { "epoch": 2.4431026474686486, "grad_norm": 0.1960583056723856, "learning_rate": 1.0292598967297763e-05, "loss": 0.272, "step": 2631 }, { "epoch": 2.444031583836507, "grad_norm": 0.17233454553942215, "learning_rate": 1.0275387263339071e-05, "loss": 0.2778, "step": 2632 }, { "epoch": 2.444960520204366, "grad_norm": 0.1954500467677132, "learning_rate": 1.0258175559380378e-05, "loss": 0.2769, "step": 2633 }, { "epoch": 2.4458894565722247, "grad_norm": 0.1833072097746944, "learning_rate": 1.0240963855421687e-05, "loss": 0.2931, "step": 2634 }, { "epoch": 2.4468183929400835, "grad_norm": 0.18965157114105274, "learning_rate": 1.0223752151462995e-05, "loss": 0.2877, "step": 2635 }, { "epoch": 2.4477473293079424, "grad_norm": 0.20978733790727866, "learning_rate": 1.0206540447504303e-05, "loss": 0.2854, "step": 2636 }, { "epoch": 2.448676265675801, "grad_norm": 0.17423882357451637, "learning_rate": 1.0189328743545612e-05, "loss": 0.2801, "step": 2637 }, { "epoch": 2.44960520204366, "grad_norm": 0.17370390492230156, "learning_rate": 1.0172117039586919e-05, "loss": 0.2756, "step": 2638 }, { "epoch": 2.450534138411519, "grad_norm": 0.16986015376541683, "learning_rate": 1.0154905335628227e-05, "loss": 0.2684, "step": 2639 }, { "epoch": 2.4514630747793777, "grad_norm": 0.18055842985320075, "learning_rate": 1.0137693631669535e-05, "loss": 0.2691, "step": 2640 }, { "epoch": 2.4523920111472366, "grad_norm": 0.1879144310139025, "learning_rate": 1.0120481927710844e-05, "loss": 0.2829, "step": 2641 }, { "epoch": 2.4533209475150954, "grad_norm": 0.16956291193537515, "learning_rate": 1.0103270223752152e-05, "loss": 0.2836, "step": 2642 }, { "epoch": 2.454249883882954, "grad_norm": 0.17981281771166743, "learning_rate": 1.008605851979346e-05, "loss": 0.2885, "step": 2643 }, { "epoch": 2.4551788202508127, "grad_norm": 0.17947003101922124, "learning_rate": 1.0068846815834767e-05, "loss": 0.2693, "step": 2644 }, { "epoch": 2.4561077566186715, "grad_norm": 0.1940400400563871, "learning_rate": 1.0051635111876076e-05, "loss": 0.2846, "step": 2645 }, { "epoch": 2.4570366929865304, "grad_norm": 0.18487882046134174, "learning_rate": 1.0034423407917384e-05, "loss": 0.2577, "step": 2646 }, { "epoch": 2.4579656293543892, "grad_norm": 0.17290264600688812, "learning_rate": 1.0017211703958692e-05, "loss": 0.2787, "step": 2647 }, { "epoch": 2.458894565722248, "grad_norm": 0.17023515664022007, "learning_rate": 1e-05, "loss": 0.2673, "step": 2648 }, { "epoch": 2.459823502090107, "grad_norm": 0.1776155129313746, "learning_rate": 9.982788296041308e-06, "loss": 0.2759, "step": 2649 }, { "epoch": 2.4607524384579658, "grad_norm": 0.1922956087310196, "learning_rate": 9.965576592082616e-06, "loss": 0.2784, "step": 2650 }, { "epoch": 2.4616813748258246, "grad_norm": 0.19175819536969194, "learning_rate": 9.948364888123924e-06, "loss": 0.271, "step": 2651 }, { "epoch": 2.462610311193683, "grad_norm": 0.17557011388898683, "learning_rate": 9.931153184165233e-06, "loss": 0.2803, "step": 2652 }, { "epoch": 2.463539247561542, "grad_norm": 0.17863208746544765, "learning_rate": 9.913941480206541e-06, "loss": 0.2717, "step": 2653 }, { "epoch": 2.4644681839294007, "grad_norm": 0.18402079757460876, "learning_rate": 9.896729776247848e-06, "loss": 0.2705, "step": 2654 }, { "epoch": 2.4653971202972595, "grad_norm": 0.18480632945427394, "learning_rate": 9.879518072289156e-06, "loss": 0.2939, "step": 2655 }, { "epoch": 2.4663260566651184, "grad_norm": 0.1699533541431954, "learning_rate": 9.862306368330465e-06, "loss": 0.2773, "step": 2656 }, { "epoch": 2.4672549930329772, "grad_norm": 0.17129023030549725, "learning_rate": 9.845094664371773e-06, "loss": 0.2748, "step": 2657 }, { "epoch": 2.468183929400836, "grad_norm": 0.17801465299435518, "learning_rate": 9.827882960413081e-06, "loss": 0.2862, "step": 2658 }, { "epoch": 2.469112865768695, "grad_norm": 0.16935169755624885, "learning_rate": 9.81067125645439e-06, "loss": 0.2682, "step": 2659 }, { "epoch": 2.4700418021365538, "grad_norm": 0.2000237770209295, "learning_rate": 9.793459552495697e-06, "loss": 0.2813, "step": 2660 }, { "epoch": 2.4709707385044126, "grad_norm": 0.18784791548400018, "learning_rate": 9.776247848537005e-06, "loss": 0.2829, "step": 2661 }, { "epoch": 2.4718996748722715, "grad_norm": 0.17733443093063525, "learning_rate": 9.759036144578313e-06, "loss": 0.278, "step": 2662 }, { "epoch": 2.47282861124013, "grad_norm": 0.17325580348112374, "learning_rate": 9.741824440619622e-06, "loss": 0.2773, "step": 2663 }, { "epoch": 2.4737575476079887, "grad_norm": 0.17758599669012587, "learning_rate": 9.72461273666093e-06, "loss": 0.2691, "step": 2664 }, { "epoch": 2.4746864839758476, "grad_norm": 0.2084107357635697, "learning_rate": 9.707401032702237e-06, "loss": 0.2772, "step": 2665 }, { "epoch": 2.4756154203437064, "grad_norm": 0.18325821857011707, "learning_rate": 9.690189328743545e-06, "loss": 0.2731, "step": 2666 }, { "epoch": 2.4765443567115653, "grad_norm": 0.18181226539042997, "learning_rate": 9.672977624784854e-06, "loss": 0.2745, "step": 2667 }, { "epoch": 2.477473293079424, "grad_norm": 0.18724076787159039, "learning_rate": 9.655765920826162e-06, "loss": 0.2882, "step": 2668 }, { "epoch": 2.478402229447283, "grad_norm": 0.19008040074552002, "learning_rate": 9.63855421686747e-06, "loss": 0.296, "step": 2669 }, { "epoch": 2.479331165815142, "grad_norm": 0.1683701319992699, "learning_rate": 9.621342512908777e-06, "loss": 0.2805, "step": 2670 }, { "epoch": 2.4802601021830006, "grad_norm": 0.17949764929549564, "learning_rate": 9.604130808950086e-06, "loss": 0.2715, "step": 2671 }, { "epoch": 2.481189038550859, "grad_norm": 0.18184976004626205, "learning_rate": 9.586919104991394e-06, "loss": 0.2773, "step": 2672 }, { "epoch": 2.482117974918718, "grad_norm": 0.17775807892598655, "learning_rate": 9.569707401032702e-06, "loss": 0.283, "step": 2673 }, { "epoch": 2.4830469112865767, "grad_norm": 0.18923652551120143, "learning_rate": 9.55249569707401e-06, "loss": 0.2845, "step": 2674 }, { "epoch": 2.4839758476544356, "grad_norm": 0.17906213434493118, "learning_rate": 9.535283993115319e-06, "loss": 0.2729, "step": 2675 }, { "epoch": 2.4849047840222944, "grad_norm": 0.18853045947261501, "learning_rate": 9.518072289156626e-06, "loss": 0.2749, "step": 2676 }, { "epoch": 2.4858337203901533, "grad_norm": 0.1725220515656567, "learning_rate": 9.500860585197934e-06, "loss": 0.2556, "step": 2677 }, { "epoch": 2.486762656758012, "grad_norm": 0.18222882747239028, "learning_rate": 9.483648881239243e-06, "loss": 0.2761, "step": 2678 }, { "epoch": 2.487691593125871, "grad_norm": 0.18770105080124452, "learning_rate": 9.466437177280551e-06, "loss": 0.2759, "step": 2679 }, { "epoch": 2.48862052949373, "grad_norm": 0.1797766621607397, "learning_rate": 9.44922547332186e-06, "loss": 0.2931, "step": 2680 }, { "epoch": 2.4895494658615887, "grad_norm": 0.1842354032637381, "learning_rate": 9.432013769363166e-06, "loss": 0.2826, "step": 2681 }, { "epoch": 2.4904784022294475, "grad_norm": 0.164114716412252, "learning_rate": 9.414802065404475e-06, "loss": 0.2659, "step": 2682 }, { "epoch": 2.491407338597306, "grad_norm": 0.16405362051068478, "learning_rate": 9.397590361445783e-06, "loss": 0.2658, "step": 2683 }, { "epoch": 2.4923362749651647, "grad_norm": 0.17956863474404955, "learning_rate": 9.380378657487091e-06, "loss": 0.2781, "step": 2684 }, { "epoch": 2.4932652113330236, "grad_norm": 0.17737289818146046, "learning_rate": 9.3631669535284e-06, "loss": 0.296, "step": 2685 }, { "epoch": 2.4941941477008824, "grad_norm": 0.18585541574048775, "learning_rate": 9.345955249569706e-06, "loss": 0.3013, "step": 2686 }, { "epoch": 2.4951230840687413, "grad_norm": 0.1643243554438021, "learning_rate": 9.328743545611015e-06, "loss": 0.2623, "step": 2687 }, { "epoch": 2.4960520204366, "grad_norm": 0.17270213602505902, "learning_rate": 9.311531841652323e-06, "loss": 0.2766, "step": 2688 }, { "epoch": 2.496980956804459, "grad_norm": 0.16782827955554608, "learning_rate": 9.294320137693632e-06, "loss": 0.2665, "step": 2689 }, { "epoch": 2.497909893172318, "grad_norm": 0.17122748158137155, "learning_rate": 9.27710843373494e-06, "loss": 0.2705, "step": 2690 }, { "epoch": 2.4988388295401767, "grad_norm": 0.17869194306118438, "learning_rate": 9.259896729776247e-06, "loss": 0.281, "step": 2691 }, { "epoch": 2.499767765908035, "grad_norm": 0.1695879134783124, "learning_rate": 9.242685025817555e-06, "loss": 0.2537, "step": 2692 }, { "epoch": 2.500696702275894, "grad_norm": 0.1662584266248051, "learning_rate": 9.225473321858864e-06, "loss": 0.2632, "step": 2693 }, { "epoch": 2.5016256386437528, "grad_norm": 0.17093570405435482, "learning_rate": 9.208261617900172e-06, "loss": 0.2792, "step": 2694 }, { "epoch": 2.5025545750116116, "grad_norm": 0.17307510808065985, "learning_rate": 9.19104991394148e-06, "loss": 0.2769, "step": 2695 }, { "epoch": 2.5034835113794704, "grad_norm": 0.17692551001812182, "learning_rate": 9.173838209982789e-06, "loss": 0.2648, "step": 2696 }, { "epoch": 2.5044124477473293, "grad_norm": 0.17879082159343984, "learning_rate": 9.156626506024097e-06, "loss": 0.2763, "step": 2697 }, { "epoch": 2.505341384115188, "grad_norm": 0.19597822806679713, "learning_rate": 9.139414802065404e-06, "loss": 0.2869, "step": 2698 }, { "epoch": 2.506270320483047, "grad_norm": 0.17477995496206758, "learning_rate": 9.122203098106712e-06, "loss": 0.2787, "step": 2699 }, { "epoch": 2.507199256850906, "grad_norm": 0.19752698933537774, "learning_rate": 9.10499139414802e-06, "loss": 0.2796, "step": 2700 }, { "epoch": 2.5081281932187647, "grad_norm": 0.19121547602167308, "learning_rate": 9.087779690189329e-06, "loss": 0.2763, "step": 2701 }, { "epoch": 2.5090571295866235, "grad_norm": 0.19101906802675103, "learning_rate": 9.070567986230637e-06, "loss": 0.3006, "step": 2702 }, { "epoch": 2.5099860659544824, "grad_norm": 0.17615835392415316, "learning_rate": 9.053356282271946e-06, "loss": 0.2846, "step": 2703 }, { "epoch": 2.5109150023223408, "grad_norm": 0.1762519374159477, "learning_rate": 9.036144578313253e-06, "loss": 0.2693, "step": 2704 }, { "epoch": 2.5118439386901996, "grad_norm": 0.20903449277468536, "learning_rate": 9.018932874354561e-06, "loss": 0.2829, "step": 2705 }, { "epoch": 2.5127728750580585, "grad_norm": 0.17834560785229894, "learning_rate": 9.00172117039587e-06, "loss": 0.2674, "step": 2706 }, { "epoch": 2.5137018114259173, "grad_norm": 0.1793333142413588, "learning_rate": 8.984509466437178e-06, "loss": 0.2592, "step": 2707 }, { "epoch": 2.514630747793776, "grad_norm": 0.18220038218018145, "learning_rate": 8.967297762478486e-06, "loss": 0.2738, "step": 2708 }, { "epoch": 2.515559684161635, "grad_norm": 0.1767274685035216, "learning_rate": 8.950086058519795e-06, "loss": 0.2777, "step": 2709 }, { "epoch": 2.516488620529494, "grad_norm": 0.1838314403491137, "learning_rate": 8.932874354561101e-06, "loss": 0.2811, "step": 2710 }, { "epoch": 2.5174175568973527, "grad_norm": 0.2000434374752791, "learning_rate": 8.91566265060241e-06, "loss": 0.2744, "step": 2711 }, { "epoch": 2.518346493265211, "grad_norm": 0.1821868758548907, "learning_rate": 8.898450946643718e-06, "loss": 0.2768, "step": 2712 }, { "epoch": 2.51927542963307, "grad_norm": 0.29230691057904484, "learning_rate": 8.881239242685026e-06, "loss": 0.2734, "step": 2713 }, { "epoch": 2.520204366000929, "grad_norm": 0.167813313251146, "learning_rate": 8.864027538726335e-06, "loss": 0.2601, "step": 2714 }, { "epoch": 2.5211333023687876, "grad_norm": 0.1769085034080416, "learning_rate": 8.846815834767643e-06, "loss": 0.2726, "step": 2715 }, { "epoch": 2.5220622387366465, "grad_norm": 0.20220842083808388, "learning_rate": 8.82960413080895e-06, "loss": 0.2817, "step": 2716 }, { "epoch": 2.5229911751045053, "grad_norm": 0.18444616770204614, "learning_rate": 8.812392426850258e-06, "loss": 0.2773, "step": 2717 }, { "epoch": 2.523920111472364, "grad_norm": 0.18296071897423907, "learning_rate": 8.795180722891567e-06, "loss": 0.2797, "step": 2718 }, { "epoch": 2.524849047840223, "grad_norm": 0.1817067050377542, "learning_rate": 8.777969018932875e-06, "loss": 0.2641, "step": 2719 }, { "epoch": 2.525777984208082, "grad_norm": 0.1794572761384637, "learning_rate": 8.760757314974184e-06, "loss": 0.2813, "step": 2720 }, { "epoch": 2.5267069205759407, "grad_norm": 0.1676740839883334, "learning_rate": 8.743545611015492e-06, "loss": 0.2808, "step": 2721 }, { "epoch": 2.5276358569437996, "grad_norm": 0.17116798658772298, "learning_rate": 8.726333907056799e-06, "loss": 0.2778, "step": 2722 }, { "epoch": 2.5285647933116584, "grad_norm": 0.17943680035895654, "learning_rate": 8.709122203098107e-06, "loss": 0.2713, "step": 2723 }, { "epoch": 2.529493729679517, "grad_norm": 0.18500653140503157, "learning_rate": 8.691910499139416e-06, "loss": 0.2748, "step": 2724 }, { "epoch": 2.5304226660473756, "grad_norm": 0.18656247536015494, "learning_rate": 8.674698795180724e-06, "loss": 0.2931, "step": 2725 }, { "epoch": 2.5313516024152345, "grad_norm": 0.17519545206291068, "learning_rate": 8.657487091222032e-06, "loss": 0.2636, "step": 2726 }, { "epoch": 2.5322805387830933, "grad_norm": 0.18579678715189885, "learning_rate": 8.64027538726334e-06, "loss": 0.2609, "step": 2727 }, { "epoch": 2.533209475150952, "grad_norm": 0.17685544278047252, "learning_rate": 8.623063683304647e-06, "loss": 0.269, "step": 2728 }, { "epoch": 2.534138411518811, "grad_norm": 0.1716684760379682, "learning_rate": 8.605851979345956e-06, "loss": 0.273, "step": 2729 }, { "epoch": 2.53506734788667, "grad_norm": 0.18997722560178743, "learning_rate": 8.588640275387264e-06, "loss": 0.2735, "step": 2730 }, { "epoch": 2.5359962842545287, "grad_norm": 0.1725056612650081, "learning_rate": 8.571428571428573e-06, "loss": 0.2782, "step": 2731 }, { "epoch": 2.536925220622387, "grad_norm": 0.1846867810643451, "learning_rate": 8.554216867469881e-06, "loss": 0.281, "step": 2732 }, { "epoch": 2.537854156990246, "grad_norm": 0.1746259889225677, "learning_rate": 8.53700516351119e-06, "loss": 0.2802, "step": 2733 }, { "epoch": 2.538783093358105, "grad_norm": 0.18034001099175637, "learning_rate": 8.519793459552496e-06, "loss": 0.2983, "step": 2734 }, { "epoch": 2.5397120297259637, "grad_norm": 0.17285283661416923, "learning_rate": 8.502581755593805e-06, "loss": 0.267, "step": 2735 }, { "epoch": 2.5406409660938225, "grad_norm": 0.16623526260482924, "learning_rate": 8.485370051635113e-06, "loss": 0.2714, "step": 2736 }, { "epoch": 2.5415699024616814, "grad_norm": 0.17140669097240416, "learning_rate": 8.468158347676421e-06, "loss": 0.2764, "step": 2737 }, { "epoch": 2.54249883882954, "grad_norm": 0.17006832454927703, "learning_rate": 8.45094664371773e-06, "loss": 0.2911, "step": 2738 }, { "epoch": 2.543427775197399, "grad_norm": 0.16385080260835502, "learning_rate": 8.433734939759036e-06, "loss": 0.2677, "step": 2739 }, { "epoch": 2.544356711565258, "grad_norm": 0.1798411378111758, "learning_rate": 8.416523235800345e-06, "loss": 0.2876, "step": 2740 }, { "epoch": 2.5452856479331167, "grad_norm": 0.17749376274284356, "learning_rate": 8.399311531841653e-06, "loss": 0.2809, "step": 2741 }, { "epoch": 2.5462145843009756, "grad_norm": 0.17811015681571632, "learning_rate": 8.382099827882962e-06, "loss": 0.2758, "step": 2742 }, { "epoch": 2.5471435206688344, "grad_norm": 0.1791864260525822, "learning_rate": 8.36488812392427e-06, "loss": 0.2868, "step": 2743 }, { "epoch": 2.548072457036693, "grad_norm": 0.18193816987523823, "learning_rate": 8.347676419965577e-06, "loss": 0.2759, "step": 2744 }, { "epoch": 2.5490013934045517, "grad_norm": 0.18066533942488144, "learning_rate": 8.330464716006885e-06, "loss": 0.2671, "step": 2745 }, { "epoch": 2.5499303297724105, "grad_norm": 0.17448722669503738, "learning_rate": 8.313253012048194e-06, "loss": 0.2747, "step": 2746 }, { "epoch": 2.5508592661402694, "grad_norm": 0.1818779797983094, "learning_rate": 8.296041308089502e-06, "loss": 0.2836, "step": 2747 }, { "epoch": 2.551788202508128, "grad_norm": 0.1802872411024581, "learning_rate": 8.27882960413081e-06, "loss": 0.2762, "step": 2748 }, { "epoch": 2.552717138875987, "grad_norm": 0.17919426431481447, "learning_rate": 8.261617900172119e-06, "loss": 0.2844, "step": 2749 }, { "epoch": 2.553646075243846, "grad_norm": 0.17435336893217318, "learning_rate": 8.244406196213425e-06, "loss": 0.2803, "step": 2750 }, { "epoch": 2.5545750116117047, "grad_norm": 0.19650206355264208, "learning_rate": 8.227194492254734e-06, "loss": 0.2995, "step": 2751 }, { "epoch": 2.555503947979563, "grad_norm": 0.1751970358659085, "learning_rate": 8.209982788296042e-06, "loss": 0.272, "step": 2752 }, { "epoch": 2.556432884347422, "grad_norm": 0.1690503222258136, "learning_rate": 8.19277108433735e-06, "loss": 0.2739, "step": 2753 }, { "epoch": 2.557361820715281, "grad_norm": 0.17620039647154925, "learning_rate": 8.175559380378659e-06, "loss": 0.2805, "step": 2754 }, { "epoch": 2.5582907570831397, "grad_norm": 0.17388899053880985, "learning_rate": 8.158347676419966e-06, "loss": 0.2704, "step": 2755 }, { "epoch": 2.5592196934509985, "grad_norm": 0.17685736263136762, "learning_rate": 8.141135972461274e-06, "loss": 0.2649, "step": 2756 }, { "epoch": 2.5601486298188574, "grad_norm": 0.17315801191584784, "learning_rate": 8.123924268502583e-06, "loss": 0.2783, "step": 2757 }, { "epoch": 2.5610775661867162, "grad_norm": 0.17710989290259285, "learning_rate": 8.106712564543891e-06, "loss": 0.2833, "step": 2758 }, { "epoch": 2.562006502554575, "grad_norm": 0.17693674550109328, "learning_rate": 8.0895008605852e-06, "loss": 0.28, "step": 2759 }, { "epoch": 2.562935438922434, "grad_norm": 0.18006819440338806, "learning_rate": 8.072289156626506e-06, "loss": 0.2793, "step": 2760 }, { "epoch": 2.5638643752902928, "grad_norm": 0.17402028938476224, "learning_rate": 8.055077452667814e-06, "loss": 0.2806, "step": 2761 }, { "epoch": 2.5647933116581516, "grad_norm": 0.1756389111979832, "learning_rate": 8.037865748709123e-06, "loss": 0.2797, "step": 2762 }, { "epoch": 2.5657222480260105, "grad_norm": 0.1798104235898651, "learning_rate": 8.020654044750431e-06, "loss": 0.2801, "step": 2763 }, { "epoch": 2.566651184393869, "grad_norm": 0.17595147246541074, "learning_rate": 8.00344234079174e-06, "loss": 0.2796, "step": 2764 }, { "epoch": 2.5675801207617277, "grad_norm": 0.1866139051974165, "learning_rate": 7.986230636833048e-06, "loss": 0.2792, "step": 2765 }, { "epoch": 2.5685090571295865, "grad_norm": 0.17242021849415873, "learning_rate": 7.969018932874355e-06, "loss": 0.2799, "step": 2766 }, { "epoch": 2.5694379934974454, "grad_norm": 0.1722563276561945, "learning_rate": 7.951807228915663e-06, "loss": 0.2658, "step": 2767 }, { "epoch": 2.5703669298653042, "grad_norm": 0.18138158515978361, "learning_rate": 7.934595524956972e-06, "loss": 0.2937, "step": 2768 }, { "epoch": 2.571295866233163, "grad_norm": 0.17876412039688724, "learning_rate": 7.91738382099828e-06, "loss": 0.2749, "step": 2769 }, { "epoch": 2.572224802601022, "grad_norm": 0.17551798053405512, "learning_rate": 7.900172117039588e-06, "loss": 0.2755, "step": 2770 }, { "epoch": 2.5731537389688808, "grad_norm": 0.17206297732377063, "learning_rate": 7.882960413080895e-06, "loss": 0.2729, "step": 2771 }, { "epoch": 2.574082675336739, "grad_norm": 0.17868585578978832, "learning_rate": 7.865748709122203e-06, "loss": 0.2958, "step": 2772 }, { "epoch": 2.575011611704598, "grad_norm": 0.17935135793930987, "learning_rate": 7.848537005163512e-06, "loss": 0.2679, "step": 2773 }, { "epoch": 2.575940548072457, "grad_norm": 0.16765676035780458, "learning_rate": 7.83132530120482e-06, "loss": 0.2648, "step": 2774 }, { "epoch": 2.5768694844403157, "grad_norm": 0.18548752107627353, "learning_rate": 7.814113597246129e-06, "loss": 0.2953, "step": 2775 }, { "epoch": 2.5777984208081746, "grad_norm": 0.18625401386712584, "learning_rate": 7.796901893287435e-06, "loss": 0.2903, "step": 2776 }, { "epoch": 2.5787273571760334, "grad_norm": 0.17797790691377569, "learning_rate": 7.779690189328744e-06, "loss": 0.2587, "step": 2777 }, { "epoch": 2.5796562935438923, "grad_norm": 0.17169551543009043, "learning_rate": 7.762478485370052e-06, "loss": 0.2591, "step": 2778 }, { "epoch": 2.580585229911751, "grad_norm": 0.17885288708688196, "learning_rate": 7.74526678141136e-06, "loss": 0.2644, "step": 2779 }, { "epoch": 2.58151416627961, "grad_norm": 0.17569046883577338, "learning_rate": 7.728055077452669e-06, "loss": 0.2802, "step": 2780 }, { "epoch": 2.582443102647469, "grad_norm": 0.18426157444373517, "learning_rate": 7.710843373493977e-06, "loss": 0.2734, "step": 2781 }, { "epoch": 2.5833720390153276, "grad_norm": 0.18868415686038092, "learning_rate": 7.693631669535284e-06, "loss": 0.2862, "step": 2782 }, { "epoch": 2.5843009753831865, "grad_norm": 0.1826628963868718, "learning_rate": 7.676419965576592e-06, "loss": 0.275, "step": 2783 }, { "epoch": 2.5852299117510453, "grad_norm": 0.18305810292468144, "learning_rate": 7.659208261617901e-06, "loss": 0.2783, "step": 2784 }, { "epoch": 2.5861588481189037, "grad_norm": 0.19042265177468026, "learning_rate": 7.64199655765921e-06, "loss": 0.2836, "step": 2785 }, { "epoch": 2.5870877844867626, "grad_norm": 0.17182388584361694, "learning_rate": 7.624784853700518e-06, "loss": 0.2758, "step": 2786 }, { "epoch": 2.5880167208546214, "grad_norm": 0.16171186896342973, "learning_rate": 7.607573149741824e-06, "loss": 0.27, "step": 2787 }, { "epoch": 2.5889456572224803, "grad_norm": 0.16522516692804265, "learning_rate": 7.590361445783133e-06, "loss": 0.2518, "step": 2788 }, { "epoch": 2.589874593590339, "grad_norm": 0.18331019909753513, "learning_rate": 7.573149741824441e-06, "loss": 0.2866, "step": 2789 }, { "epoch": 2.590803529958198, "grad_norm": 0.18035348067205553, "learning_rate": 7.55593803786575e-06, "loss": 0.2613, "step": 2790 }, { "epoch": 2.591732466326057, "grad_norm": 0.18295229566848326, "learning_rate": 7.538726333907058e-06, "loss": 0.272, "step": 2791 }, { "epoch": 2.592661402693915, "grad_norm": 0.16902236952248076, "learning_rate": 7.521514629948365e-06, "loss": 0.2676, "step": 2792 }, { "epoch": 2.593590339061774, "grad_norm": 0.18044289901142607, "learning_rate": 7.504302925989673e-06, "loss": 0.2907, "step": 2793 }, { "epoch": 2.594519275429633, "grad_norm": 0.18658630020919675, "learning_rate": 7.4870912220309815e-06, "loss": 0.2898, "step": 2794 }, { "epoch": 2.5954482117974917, "grad_norm": 0.1659203986129596, "learning_rate": 7.46987951807229e-06, "loss": 0.2767, "step": 2795 }, { "epoch": 2.5963771481653506, "grad_norm": 0.18224464598969245, "learning_rate": 7.452667814113598e-06, "loss": 0.2744, "step": 2796 }, { "epoch": 2.5973060845332094, "grad_norm": 0.18798345715683007, "learning_rate": 7.435456110154905e-06, "loss": 0.2803, "step": 2797 }, { "epoch": 2.5982350209010683, "grad_norm": 0.17498384353384416, "learning_rate": 7.418244406196213e-06, "loss": 0.2752, "step": 2798 }, { "epoch": 2.599163957268927, "grad_norm": 0.17390827410315354, "learning_rate": 7.401032702237522e-06, "loss": 0.2713, "step": 2799 }, { "epoch": 2.600092893636786, "grad_norm": 0.17905924555277583, "learning_rate": 7.38382099827883e-06, "loss": 0.272, "step": 2800 }, { "epoch": 2.601021830004645, "grad_norm": 0.19237045567028638, "learning_rate": 7.366609294320139e-06, "loss": 0.2754, "step": 2801 }, { "epoch": 2.6019507663725037, "grad_norm": 0.19081730075922945, "learning_rate": 7.349397590361447e-06, "loss": 0.2784, "step": 2802 }, { "epoch": 2.6028797027403625, "grad_norm": 0.1773815781690385, "learning_rate": 7.332185886402754e-06, "loss": 0.2812, "step": 2803 }, { "epoch": 2.6038086391082214, "grad_norm": 0.18905163088336832, "learning_rate": 7.314974182444062e-06, "loss": 0.2798, "step": 2804 }, { "epoch": 2.6047375754760798, "grad_norm": 0.17873348755643859, "learning_rate": 7.2977624784853705e-06, "loss": 0.2716, "step": 2805 }, { "epoch": 2.6056665118439386, "grad_norm": 0.17030522579544236, "learning_rate": 7.280550774526679e-06, "loss": 0.2685, "step": 2806 }, { "epoch": 2.6065954482117974, "grad_norm": 0.1862035284607427, "learning_rate": 7.263339070567987e-06, "loss": 0.2788, "step": 2807 }, { "epoch": 2.6075243845796563, "grad_norm": 0.17337568922147115, "learning_rate": 7.246127366609294e-06, "loss": 0.2642, "step": 2808 }, { "epoch": 2.608453320947515, "grad_norm": 0.19450967596793517, "learning_rate": 7.228915662650602e-06, "loss": 0.2727, "step": 2809 }, { "epoch": 2.609382257315374, "grad_norm": 0.1893548613084817, "learning_rate": 7.211703958691911e-06, "loss": 0.2791, "step": 2810 }, { "epoch": 2.610311193683233, "grad_norm": 0.18755670041525413, "learning_rate": 7.194492254733219e-06, "loss": 0.2893, "step": 2811 }, { "epoch": 2.6112401300510912, "grad_norm": 0.1737293113823649, "learning_rate": 7.177280550774528e-06, "loss": 0.2654, "step": 2812 }, { "epoch": 2.61216906641895, "grad_norm": 0.17980094681636927, "learning_rate": 7.160068846815834e-06, "loss": 0.2801, "step": 2813 }, { "epoch": 2.613098002786809, "grad_norm": 0.17904359523084443, "learning_rate": 7.142857142857143e-06, "loss": 0.2767, "step": 2814 }, { "epoch": 2.6140269391546678, "grad_norm": 0.17364837747203463, "learning_rate": 7.125645438898451e-06, "loss": 0.2702, "step": 2815 }, { "epoch": 2.6149558755225266, "grad_norm": 0.17617804961029349, "learning_rate": 7.1084337349397595e-06, "loss": 0.2642, "step": 2816 }, { "epoch": 2.6158848118903855, "grad_norm": 0.1730303401260583, "learning_rate": 7.091222030981068e-06, "loss": 0.2778, "step": 2817 }, { "epoch": 2.6168137482582443, "grad_norm": 0.1831938804451245, "learning_rate": 7.074010327022376e-06, "loss": 0.2811, "step": 2818 }, { "epoch": 2.617742684626103, "grad_norm": 0.1753374582449086, "learning_rate": 7.056798623063683e-06, "loss": 0.2717, "step": 2819 }, { "epoch": 2.618671620993962, "grad_norm": 0.18343820977770026, "learning_rate": 7.039586919104991e-06, "loss": 0.3012, "step": 2820 }, { "epoch": 2.619600557361821, "grad_norm": 0.18184849598170855, "learning_rate": 7.0223752151463e-06, "loss": 0.2697, "step": 2821 }, { "epoch": 2.6205294937296797, "grad_norm": 0.167860177827926, "learning_rate": 7.005163511187608e-06, "loss": 0.2784, "step": 2822 }, { "epoch": 2.6214584300975385, "grad_norm": 0.17080098742603222, "learning_rate": 6.987951807228917e-06, "loss": 0.2577, "step": 2823 }, { "epoch": 2.6223873664653974, "grad_norm": 0.1883622438215358, "learning_rate": 6.970740103270223e-06, "loss": 0.2879, "step": 2824 }, { "epoch": 2.623316302833256, "grad_norm": 0.1744115106332342, "learning_rate": 6.953528399311532e-06, "loss": 0.2739, "step": 2825 }, { "epoch": 2.6242452392011146, "grad_norm": 0.16819532461761638, "learning_rate": 6.93631669535284e-06, "loss": 0.262, "step": 2826 }, { "epoch": 2.6251741755689735, "grad_norm": 0.1760702927558675, "learning_rate": 6.9191049913941485e-06, "loss": 0.2698, "step": 2827 }, { "epoch": 2.6261031119368323, "grad_norm": 0.17497265930471442, "learning_rate": 6.901893287435457e-06, "loss": 0.268, "step": 2828 }, { "epoch": 2.627032048304691, "grad_norm": 0.23004773700222614, "learning_rate": 6.884681583476764e-06, "loss": 0.2662, "step": 2829 }, { "epoch": 2.62796098467255, "grad_norm": 0.1757075749648394, "learning_rate": 6.867469879518072e-06, "loss": 0.2788, "step": 2830 }, { "epoch": 2.628889921040409, "grad_norm": 0.1817582870520524, "learning_rate": 6.8502581755593804e-06, "loss": 0.2864, "step": 2831 }, { "epoch": 2.6298188574082673, "grad_norm": 0.17943345206651734, "learning_rate": 6.833046471600689e-06, "loss": 0.2709, "step": 2832 }, { "epoch": 2.630747793776126, "grad_norm": 0.18193170849342033, "learning_rate": 6.815834767641997e-06, "loss": 0.2776, "step": 2833 }, { "epoch": 2.631676730143985, "grad_norm": 0.17203280602668908, "learning_rate": 6.798623063683306e-06, "loss": 0.2619, "step": 2834 }, { "epoch": 2.632605666511844, "grad_norm": 0.17778486134836455, "learning_rate": 6.781411359724612e-06, "loss": 0.2825, "step": 2835 }, { "epoch": 2.6335346028797026, "grad_norm": 0.17844868569560363, "learning_rate": 6.764199655765921e-06, "loss": 0.2858, "step": 2836 }, { "epoch": 2.6344635392475615, "grad_norm": 0.17726568802995427, "learning_rate": 6.746987951807229e-06, "loss": 0.2811, "step": 2837 }, { "epoch": 2.6353924756154203, "grad_norm": 0.16952025898385292, "learning_rate": 6.7297762478485375e-06, "loss": 0.271, "step": 2838 }, { "epoch": 2.636321411983279, "grad_norm": 0.16547292535807162, "learning_rate": 6.712564543889846e-06, "loss": 0.2685, "step": 2839 }, { "epoch": 2.637250348351138, "grad_norm": 0.17302245425655283, "learning_rate": 6.695352839931153e-06, "loss": 0.271, "step": 2840 }, { "epoch": 2.638179284718997, "grad_norm": 0.16762567096195974, "learning_rate": 6.678141135972461e-06, "loss": 0.266, "step": 2841 }, { "epoch": 2.6391082210868557, "grad_norm": 0.1699246368225437, "learning_rate": 6.6609294320137694e-06, "loss": 0.2696, "step": 2842 }, { "epoch": 2.6400371574547146, "grad_norm": 0.16340294143468365, "learning_rate": 6.643717728055078e-06, "loss": 0.2732, "step": 2843 }, { "epoch": 2.6409660938225734, "grad_norm": 0.17885057916542138, "learning_rate": 6.626506024096386e-06, "loss": 0.2896, "step": 2844 }, { "epoch": 2.641895030190432, "grad_norm": 0.17028443868648646, "learning_rate": 6.609294320137693e-06, "loss": 0.2771, "step": 2845 }, { "epoch": 2.6428239665582907, "grad_norm": 0.17146793678084205, "learning_rate": 6.592082616179001e-06, "loss": 0.277, "step": 2846 }, { "epoch": 2.6437529029261495, "grad_norm": 0.18046897103380846, "learning_rate": 6.57487091222031e-06, "loss": 0.2712, "step": 2847 }, { "epoch": 2.6446818392940084, "grad_norm": 0.17765626048887875, "learning_rate": 6.557659208261618e-06, "loss": 0.2755, "step": 2848 }, { "epoch": 2.645610775661867, "grad_norm": 0.16752452222262293, "learning_rate": 6.5404475043029266e-06, "loss": 0.2676, "step": 2849 }, { "epoch": 2.646539712029726, "grad_norm": 0.17019441380452832, "learning_rate": 6.523235800344234e-06, "loss": 0.27, "step": 2850 }, { "epoch": 2.647468648397585, "grad_norm": 0.20022193692643978, "learning_rate": 6.506024096385542e-06, "loss": 0.2719, "step": 2851 }, { "epoch": 2.6483975847654433, "grad_norm": 0.18034574748663765, "learning_rate": 6.48881239242685e-06, "loss": 0.2815, "step": 2852 }, { "epoch": 2.649326521133302, "grad_norm": 0.17256922959908166, "learning_rate": 6.4716006884681585e-06, "loss": 0.2822, "step": 2853 }, { "epoch": 2.650255457501161, "grad_norm": 0.17557669549784571, "learning_rate": 6.454388984509467e-06, "loss": 0.2819, "step": 2854 }, { "epoch": 2.65118439386902, "grad_norm": 0.18357507050027355, "learning_rate": 6.437177280550775e-06, "loss": 0.2856, "step": 2855 }, { "epoch": 2.6521133302368787, "grad_norm": 0.17883677795624237, "learning_rate": 6.419965576592083e-06, "loss": 0.278, "step": 2856 }, { "epoch": 2.6530422666047375, "grad_norm": 0.1626862574701004, "learning_rate": 6.40275387263339e-06, "loss": 0.2574, "step": 2857 }, { "epoch": 2.6539712029725964, "grad_norm": 0.16383951586471682, "learning_rate": 6.385542168674699e-06, "loss": 0.2752, "step": 2858 }, { "epoch": 2.654900139340455, "grad_norm": 0.16774157905432333, "learning_rate": 6.368330464716007e-06, "loss": 0.2788, "step": 2859 }, { "epoch": 2.655829075708314, "grad_norm": 0.17064891524490505, "learning_rate": 6.3511187607573156e-06, "loss": 0.2744, "step": 2860 }, { "epoch": 2.656758012076173, "grad_norm": 0.18019181299443282, "learning_rate": 6.333907056798623e-06, "loss": 0.2749, "step": 2861 }, { "epoch": 2.6576869484440317, "grad_norm": 0.18065522881499407, "learning_rate": 6.3166953528399315e-06, "loss": 0.2781, "step": 2862 }, { "epoch": 2.6586158848118906, "grad_norm": 0.16699071545783015, "learning_rate": 6.299483648881239e-06, "loss": 0.2699, "step": 2863 }, { "epoch": 2.6595448211797494, "grad_norm": 0.17960486746690257, "learning_rate": 6.2822719449225475e-06, "loss": 0.2701, "step": 2864 }, { "epoch": 2.660473757547608, "grad_norm": 0.16751383073112, "learning_rate": 6.265060240963856e-06, "loss": 0.2706, "step": 2865 }, { "epoch": 2.6614026939154667, "grad_norm": 0.17012354333185065, "learning_rate": 6.2478485370051634e-06, "loss": 0.2603, "step": 2866 }, { "epoch": 2.6623316302833255, "grad_norm": 0.16323188266871058, "learning_rate": 6.230636833046472e-06, "loss": 0.2587, "step": 2867 }, { "epoch": 2.6632605666511844, "grad_norm": 0.16800338995254763, "learning_rate": 6.21342512908778e-06, "loss": 0.2844, "step": 2868 }, { "epoch": 2.6641895030190432, "grad_norm": 0.17523825857623324, "learning_rate": 6.196213425129088e-06, "loss": 0.2664, "step": 2869 }, { "epoch": 2.665118439386902, "grad_norm": 0.17259113929735792, "learning_rate": 6.179001721170396e-06, "loss": 0.2682, "step": 2870 }, { "epoch": 2.666047375754761, "grad_norm": 0.15843773315912996, "learning_rate": 6.161790017211705e-06, "loss": 0.2526, "step": 2871 }, { "epoch": 2.6669763121226198, "grad_norm": 0.1667368341714137, "learning_rate": 6.144578313253013e-06, "loss": 0.2836, "step": 2872 }, { "epoch": 2.667905248490478, "grad_norm": 0.18356977684527812, "learning_rate": 6.1273666092943205e-06, "loss": 0.2879, "step": 2873 }, { "epoch": 2.668834184858337, "grad_norm": 0.1718844280644391, "learning_rate": 6.110154905335629e-06, "loss": 0.2691, "step": 2874 }, { "epoch": 2.669763121226196, "grad_norm": 0.17542563761145286, "learning_rate": 6.092943201376937e-06, "loss": 0.2835, "step": 2875 }, { "epoch": 2.6706920575940547, "grad_norm": 0.18112601734776523, "learning_rate": 6.075731497418245e-06, "loss": 0.2813, "step": 2876 }, { "epoch": 2.6716209939619135, "grad_norm": 0.17517423987429506, "learning_rate": 6.058519793459553e-06, "loss": 0.2769, "step": 2877 }, { "epoch": 2.6725499303297724, "grad_norm": 0.17260026662133704, "learning_rate": 6.041308089500861e-06, "loss": 0.2764, "step": 2878 }, { "epoch": 2.6734788666976312, "grad_norm": 0.17274079216676472, "learning_rate": 6.024096385542169e-06, "loss": 0.2633, "step": 2879 }, { "epoch": 2.67440780306549, "grad_norm": 0.1749655577705151, "learning_rate": 6.006884681583478e-06, "loss": 0.2801, "step": 2880 }, { "epoch": 2.675336739433349, "grad_norm": 0.18151574461910705, "learning_rate": 5.989672977624785e-06, "loss": 0.2866, "step": 2881 }, { "epoch": 2.6762656758012078, "grad_norm": 0.1603999496686991, "learning_rate": 5.972461273666094e-06, "loss": 0.2582, "step": 2882 }, { "epoch": 2.6771946121690666, "grad_norm": 0.1757361166235524, "learning_rate": 5.955249569707402e-06, "loss": 0.2844, "step": 2883 }, { "epoch": 2.6781235485369255, "grad_norm": 0.17402452532430263, "learning_rate": 5.9380378657487095e-06, "loss": 0.2765, "step": 2884 }, { "epoch": 2.679052484904784, "grad_norm": 0.1709408589895161, "learning_rate": 5.920826161790018e-06, "loss": 0.2838, "step": 2885 }, { "epoch": 2.6799814212726427, "grad_norm": 0.17361142927299925, "learning_rate": 5.9036144578313255e-06, "loss": 0.2868, "step": 2886 }, { "epoch": 2.6809103576405016, "grad_norm": 0.1597414246640021, "learning_rate": 5.886402753872634e-06, "loss": 0.2611, "step": 2887 }, { "epoch": 2.6818392940083604, "grad_norm": 0.16013784375486953, "learning_rate": 5.869191049913942e-06, "loss": 0.2557, "step": 2888 }, { "epoch": 2.6827682303762193, "grad_norm": 0.15920257619823422, "learning_rate": 5.85197934595525e-06, "loss": 0.2537, "step": 2889 }, { "epoch": 2.683697166744078, "grad_norm": 0.19602228120008208, "learning_rate": 5.834767641996558e-06, "loss": 0.2767, "step": 2890 }, { "epoch": 2.684626103111937, "grad_norm": 0.16026967401774392, "learning_rate": 5.817555938037867e-06, "loss": 0.2652, "step": 2891 }, { "epoch": 2.685555039479796, "grad_norm": 0.16763626760535624, "learning_rate": 5.800344234079174e-06, "loss": 0.2797, "step": 2892 }, { "epoch": 2.686483975847654, "grad_norm": 0.1706032665023045, "learning_rate": 5.783132530120483e-06, "loss": 0.2744, "step": 2893 }, { "epoch": 2.687412912215513, "grad_norm": 0.18732977118836014, "learning_rate": 5.76592082616179e-06, "loss": 0.2924, "step": 2894 }, { "epoch": 2.688341848583372, "grad_norm": 0.17185758153715494, "learning_rate": 5.7487091222030986e-06, "loss": 0.2655, "step": 2895 }, { "epoch": 2.6892707849512307, "grad_norm": 0.17530485016367162, "learning_rate": 5.731497418244407e-06, "loss": 0.2847, "step": 2896 }, { "epoch": 2.6901997213190896, "grad_norm": 0.15863803214886732, "learning_rate": 5.7142857142857145e-06, "loss": 0.2704, "step": 2897 }, { "epoch": 2.6911286576869484, "grad_norm": 0.1691861266842899, "learning_rate": 5.697074010327023e-06, "loss": 0.29, "step": 2898 }, { "epoch": 2.6920575940548073, "grad_norm": 0.16970518454179528, "learning_rate": 5.679862306368331e-06, "loss": 0.2614, "step": 2899 }, { "epoch": 2.692986530422666, "grad_norm": 0.1741414021917801, "learning_rate": 5.662650602409639e-06, "loss": 0.2822, "step": 2900 }, { "epoch": 2.693915466790525, "grad_norm": 0.1681316865512381, "learning_rate": 5.645438898450947e-06, "loss": 0.2577, "step": 2901 }, { "epoch": 2.694844403158384, "grad_norm": 0.16184757987994441, "learning_rate": 5.628227194492255e-06, "loss": 0.2583, "step": 2902 }, { "epoch": 2.6957733395262427, "grad_norm": 0.16281128048095134, "learning_rate": 5.611015490533563e-06, "loss": 0.2641, "step": 2903 }, { "epoch": 2.6967022758941015, "grad_norm": 0.16822832216234318, "learning_rate": 5.593803786574872e-06, "loss": 0.2674, "step": 2904 }, { "epoch": 2.69763121226196, "grad_norm": 0.1660704396611372, "learning_rate": 5.576592082616179e-06, "loss": 0.2693, "step": 2905 }, { "epoch": 2.6985601486298187, "grad_norm": 0.17603534664952974, "learning_rate": 5.5593803786574876e-06, "loss": 0.2913, "step": 2906 }, { "epoch": 2.6994890849976776, "grad_norm": 0.16173314477849363, "learning_rate": 5.542168674698795e-06, "loss": 0.2565, "step": 2907 }, { "epoch": 2.7004180213655364, "grad_norm": 0.1663575560714465, "learning_rate": 5.5249569707401035e-06, "loss": 0.2748, "step": 2908 }, { "epoch": 2.7013469577333953, "grad_norm": 0.1721550238413437, "learning_rate": 5.507745266781412e-06, "loss": 0.2912, "step": 2909 }, { "epoch": 2.702275894101254, "grad_norm": 0.16768308926561104, "learning_rate": 5.4905335628227195e-06, "loss": 0.2666, "step": 2910 }, { "epoch": 2.703204830469113, "grad_norm": 0.18026118938082336, "learning_rate": 5.473321858864028e-06, "loss": 0.2742, "step": 2911 }, { "epoch": 2.704133766836972, "grad_norm": 0.17114811293445578, "learning_rate": 5.456110154905336e-06, "loss": 0.2869, "step": 2912 }, { "epoch": 2.70506270320483, "grad_norm": 0.1897353218307368, "learning_rate": 5.438898450946644e-06, "loss": 0.2891, "step": 2913 }, { "epoch": 2.705991639572689, "grad_norm": 0.17270248612365974, "learning_rate": 5.421686746987952e-06, "loss": 0.262, "step": 2914 }, { "epoch": 2.706920575940548, "grad_norm": 0.16787775314760653, "learning_rate": 5.40447504302926e-06, "loss": 0.2698, "step": 2915 }, { "epoch": 2.7078495123084068, "grad_norm": 0.1969016770528086, "learning_rate": 5.387263339070568e-06, "loss": 0.2822, "step": 2916 }, { "epoch": 2.7087784486762656, "grad_norm": 0.16996615370604629, "learning_rate": 5.370051635111877e-06, "loss": 0.2695, "step": 2917 }, { "epoch": 2.7097073850441245, "grad_norm": 0.16671749967465915, "learning_rate": 5.352839931153184e-06, "loss": 0.2673, "step": 2918 }, { "epoch": 2.7106363214119833, "grad_norm": 0.17148870032680638, "learning_rate": 5.3356282271944925e-06, "loss": 0.2736, "step": 2919 }, { "epoch": 2.711565257779842, "grad_norm": 0.16182282721435845, "learning_rate": 5.318416523235801e-06, "loss": 0.255, "step": 2920 }, { "epoch": 2.712494194147701, "grad_norm": 0.18004793360917773, "learning_rate": 5.3012048192771085e-06, "loss": 0.3005, "step": 2921 }, { "epoch": 2.71342313051556, "grad_norm": 0.16893534756516349, "learning_rate": 5.283993115318417e-06, "loss": 0.2614, "step": 2922 }, { "epoch": 2.7143520668834187, "grad_norm": 0.17993627923927366, "learning_rate": 5.2667814113597244e-06, "loss": 0.2829, "step": 2923 }, { "epoch": 2.7152810032512775, "grad_norm": 0.16985404866566564, "learning_rate": 5.249569707401033e-06, "loss": 0.2714, "step": 2924 }, { "epoch": 2.716209939619136, "grad_norm": 0.16997003285554874, "learning_rate": 5.232358003442341e-06, "loss": 0.2723, "step": 2925 }, { "epoch": 2.7171388759869948, "grad_norm": 0.17280395561719686, "learning_rate": 5.215146299483649e-06, "loss": 0.2806, "step": 2926 }, { "epoch": 2.7180678123548536, "grad_norm": 0.18591179771276334, "learning_rate": 5.197934595524957e-06, "loss": 0.2839, "step": 2927 }, { "epoch": 2.7189967487227125, "grad_norm": 0.16848714989466346, "learning_rate": 5.180722891566266e-06, "loss": 0.27, "step": 2928 }, { "epoch": 2.7199256850905713, "grad_norm": 0.17396159174476428, "learning_rate": 5.163511187607573e-06, "loss": 0.2887, "step": 2929 }, { "epoch": 2.72085462145843, "grad_norm": 0.16362910998344926, "learning_rate": 5.1462994836488815e-06, "loss": 0.2677, "step": 2930 }, { "epoch": 2.721783557826289, "grad_norm": 0.17616701253337705, "learning_rate": 5.129087779690189e-06, "loss": 0.2805, "step": 2931 }, { "epoch": 2.722712494194148, "grad_norm": 0.16554867783146326, "learning_rate": 5.1118760757314975e-06, "loss": 0.2604, "step": 2932 }, { "epoch": 2.7236414305620062, "grad_norm": 0.1830057396941038, "learning_rate": 5.094664371772806e-06, "loss": 0.2661, "step": 2933 }, { "epoch": 2.724570366929865, "grad_norm": 0.16804873136561682, "learning_rate": 5.0774526678141135e-06, "loss": 0.2761, "step": 2934 }, { "epoch": 2.725499303297724, "grad_norm": 0.17881514628477865, "learning_rate": 5.060240963855422e-06, "loss": 0.2833, "step": 2935 }, { "epoch": 2.726428239665583, "grad_norm": 0.18028085375147532, "learning_rate": 5.04302925989673e-06, "loss": 0.28, "step": 2936 }, { "epoch": 2.7273571760334416, "grad_norm": 0.17128865792041942, "learning_rate": 5.025817555938038e-06, "loss": 0.2827, "step": 2937 }, { "epoch": 2.7282861124013005, "grad_norm": 0.17508069891904077, "learning_rate": 5.008605851979346e-06, "loss": 0.2846, "step": 2938 }, { "epoch": 2.7292150487691593, "grad_norm": 0.1624442446983855, "learning_rate": 4.991394148020654e-06, "loss": 0.2605, "step": 2939 }, { "epoch": 2.730143985137018, "grad_norm": 0.16717663468832003, "learning_rate": 4.974182444061962e-06, "loss": 0.2562, "step": 2940 }, { "epoch": 2.731072921504877, "grad_norm": 0.17435792481065424, "learning_rate": 4.9569707401032706e-06, "loss": 0.2641, "step": 2941 }, { "epoch": 2.732001857872736, "grad_norm": 0.18206523972349564, "learning_rate": 4.939759036144578e-06, "loss": 0.2906, "step": 2942 }, { "epoch": 2.7329307942405947, "grad_norm": 0.1660518040509173, "learning_rate": 4.9225473321858865e-06, "loss": 0.2616, "step": 2943 }, { "epoch": 2.7338597306084536, "grad_norm": 0.17641521165468432, "learning_rate": 4.905335628227195e-06, "loss": 0.279, "step": 2944 }, { "epoch": 2.7347886669763124, "grad_norm": 0.17021194005960247, "learning_rate": 4.8881239242685025e-06, "loss": 0.2662, "step": 2945 }, { "epoch": 2.735717603344171, "grad_norm": 0.18680530424480912, "learning_rate": 4.870912220309811e-06, "loss": 0.279, "step": 2946 }, { "epoch": 2.7366465397120296, "grad_norm": 0.16624112372533945, "learning_rate": 4.853700516351118e-06, "loss": 0.2603, "step": 2947 }, { "epoch": 2.7375754760798885, "grad_norm": 0.1658532363601099, "learning_rate": 4.836488812392427e-06, "loss": 0.2573, "step": 2948 }, { "epoch": 2.7385044124477473, "grad_norm": 0.17508939946629776, "learning_rate": 4.819277108433735e-06, "loss": 0.2748, "step": 2949 }, { "epoch": 2.739433348815606, "grad_norm": 0.1697082547434485, "learning_rate": 4.802065404475043e-06, "loss": 0.277, "step": 2950 }, { "epoch": 2.740362285183465, "grad_norm": 0.17265474187461444, "learning_rate": 4.784853700516351e-06, "loss": 0.2766, "step": 2951 }, { "epoch": 2.741291221551324, "grad_norm": 0.1801581954287595, "learning_rate": 4.7676419965576596e-06, "loss": 0.2865, "step": 2952 }, { "epoch": 2.7422201579191823, "grad_norm": 0.1671113061115391, "learning_rate": 4.750430292598967e-06, "loss": 0.2789, "step": 2953 }, { "epoch": 2.743149094287041, "grad_norm": 0.1810191012298334, "learning_rate": 4.7332185886402755e-06, "loss": 0.2829, "step": 2954 }, { "epoch": 2.7440780306549, "grad_norm": 0.1873382578762812, "learning_rate": 4.716006884681583e-06, "loss": 0.3014, "step": 2955 }, { "epoch": 2.745006967022759, "grad_norm": 0.16836258371111407, "learning_rate": 4.6987951807228915e-06, "loss": 0.2777, "step": 2956 }, { "epoch": 2.7459359033906177, "grad_norm": 0.17397396442719273, "learning_rate": 4.6815834767642e-06, "loss": 0.2794, "step": 2957 }, { "epoch": 2.7468648397584765, "grad_norm": 0.17012803273497898, "learning_rate": 4.6643717728055074e-06, "loss": 0.2735, "step": 2958 }, { "epoch": 2.7477937761263354, "grad_norm": 0.1726453541385025, "learning_rate": 4.647160068846816e-06, "loss": 0.2728, "step": 2959 }, { "epoch": 2.748722712494194, "grad_norm": 0.1607579980714856, "learning_rate": 4.629948364888123e-06, "loss": 0.2652, "step": 2960 }, { "epoch": 2.749651648862053, "grad_norm": 0.19173078813688318, "learning_rate": 4.612736660929432e-06, "loss": 0.2889, "step": 2961 }, { "epoch": 2.750580585229912, "grad_norm": 0.16695194838007188, "learning_rate": 4.59552495697074e-06, "loss": 0.2711, "step": 2962 }, { "epoch": 2.7515095215977707, "grad_norm": 0.1644552191700174, "learning_rate": 4.578313253012049e-06, "loss": 0.2612, "step": 2963 }, { "epoch": 2.7524384579656296, "grad_norm": 0.16861408449465942, "learning_rate": 4.561101549053356e-06, "loss": 0.2672, "step": 2964 }, { "epoch": 2.7533673943334884, "grad_norm": 0.16556411207676844, "learning_rate": 4.5438898450946645e-06, "loss": 0.2843, "step": 2965 }, { "epoch": 2.754296330701347, "grad_norm": 0.1653424279361049, "learning_rate": 4.526678141135973e-06, "loss": 0.2698, "step": 2966 }, { "epoch": 2.7552252670692057, "grad_norm": 0.17268687936725427, "learning_rate": 4.5094664371772805e-06, "loss": 0.2753, "step": 2967 }, { "epoch": 2.7561542034370645, "grad_norm": 0.16679010719063728, "learning_rate": 4.492254733218589e-06, "loss": 0.2658, "step": 2968 }, { "epoch": 2.7570831398049234, "grad_norm": 0.1731384049435442, "learning_rate": 4.475043029259897e-06, "loss": 0.2776, "step": 2969 }, { "epoch": 2.758012076172782, "grad_norm": 0.17817981617485937, "learning_rate": 4.457831325301205e-06, "loss": 0.2723, "step": 2970 }, { "epoch": 2.758941012540641, "grad_norm": 0.17631736307329407, "learning_rate": 4.440619621342513e-06, "loss": 0.29, "step": 2971 }, { "epoch": 2.7598699489085, "grad_norm": 0.16557688303929388, "learning_rate": 4.423407917383822e-06, "loss": 0.2715, "step": 2972 }, { "epoch": 2.7607988852763583, "grad_norm": 0.16830413475646644, "learning_rate": 4.406196213425129e-06, "loss": 0.2755, "step": 2973 }, { "epoch": 2.761727821644217, "grad_norm": 0.16634439441042223, "learning_rate": 4.388984509466438e-06, "loss": 0.2821, "step": 2974 }, { "epoch": 2.762656758012076, "grad_norm": 0.17593720968564058, "learning_rate": 4.371772805507746e-06, "loss": 0.2661, "step": 2975 }, { "epoch": 2.763585694379935, "grad_norm": 0.16971061649067173, "learning_rate": 4.3545611015490536e-06, "loss": 0.2611, "step": 2976 }, { "epoch": 2.7645146307477937, "grad_norm": 0.18290133340663603, "learning_rate": 4.337349397590362e-06, "loss": 0.296, "step": 2977 }, { "epoch": 2.7654435671156525, "grad_norm": 0.16094186761731016, "learning_rate": 4.32013769363167e-06, "loss": 0.2599, "step": 2978 }, { "epoch": 2.7663725034835114, "grad_norm": 0.1700225427982041, "learning_rate": 4.302925989672978e-06, "loss": 0.2851, "step": 2979 }, { "epoch": 2.7673014398513702, "grad_norm": 0.169245573986097, "learning_rate": 4.285714285714286e-06, "loss": 0.2543, "step": 2980 }, { "epoch": 2.768230376219229, "grad_norm": 0.16461588181504275, "learning_rate": 4.268502581755595e-06, "loss": 0.2671, "step": 2981 }, { "epoch": 2.769159312587088, "grad_norm": 0.16325022819287804, "learning_rate": 4.251290877796902e-06, "loss": 0.2917, "step": 2982 }, { "epoch": 2.7700882489549468, "grad_norm": 0.17634053929655194, "learning_rate": 4.234079173838211e-06, "loss": 0.289, "step": 2983 }, { "epoch": 2.7710171853228056, "grad_norm": 0.17119702526297131, "learning_rate": 4.216867469879518e-06, "loss": 0.283, "step": 2984 }, { "epoch": 2.7719461216906645, "grad_norm": 0.16474730190343942, "learning_rate": 4.199655765920827e-06, "loss": 0.2639, "step": 2985 }, { "epoch": 2.772875058058523, "grad_norm": 0.33285250375326086, "learning_rate": 4.182444061962135e-06, "loss": 0.267, "step": 2986 }, { "epoch": 2.7738039944263817, "grad_norm": 0.16364044631808572, "learning_rate": 4.1652323580034426e-06, "loss": 0.2717, "step": 2987 }, { "epoch": 2.7747329307942405, "grad_norm": 0.16681061485901677, "learning_rate": 4.148020654044751e-06, "loss": 0.2683, "step": 2988 }, { "epoch": 2.7756618671620994, "grad_norm": 0.1651724380142913, "learning_rate": 4.130808950086059e-06, "loss": 0.2704, "step": 2989 }, { "epoch": 2.7765908035299582, "grad_norm": 0.1709084288247249, "learning_rate": 4.113597246127367e-06, "loss": 0.2819, "step": 2990 }, { "epoch": 2.777519739897817, "grad_norm": 0.16357948699205382, "learning_rate": 4.096385542168675e-06, "loss": 0.2875, "step": 2991 }, { "epoch": 2.778448676265676, "grad_norm": 0.17167993006882395, "learning_rate": 4.079173838209983e-06, "loss": 0.2745, "step": 2992 }, { "epoch": 2.7793776126335343, "grad_norm": 0.1675771078403747, "learning_rate": 4.061962134251291e-06, "loss": 0.2823, "step": 2993 }, { "epoch": 2.780306549001393, "grad_norm": 0.16584264394434609, "learning_rate": 4.0447504302926e-06, "loss": 0.2744, "step": 2994 }, { "epoch": 2.781235485369252, "grad_norm": 0.16662698036780293, "learning_rate": 4.027538726333907e-06, "loss": 0.2688, "step": 2995 }, { "epoch": 2.782164421737111, "grad_norm": 0.16794738348660668, "learning_rate": 4.010327022375216e-06, "loss": 0.2888, "step": 2996 }, { "epoch": 2.7830933581049697, "grad_norm": 0.17789218429995182, "learning_rate": 3.993115318416524e-06, "loss": 0.2859, "step": 2997 }, { "epoch": 2.7840222944728286, "grad_norm": 0.16091287332048385, "learning_rate": 3.975903614457832e-06, "loss": 0.2746, "step": 2998 }, { "epoch": 2.7849512308406874, "grad_norm": 0.18944455859801637, "learning_rate": 3.95869191049914e-06, "loss": 0.2716, "step": 2999 }, { "epoch": 2.7858801672085463, "grad_norm": 0.2954117399236272, "learning_rate": 3.9414802065404475e-06, "loss": 0.2902, "step": 3000 }, { "epoch": 2.786809103576405, "grad_norm": 0.16573385628794401, "learning_rate": 3.924268502581756e-06, "loss": 0.2726, "step": 3001 }, { "epoch": 2.787738039944264, "grad_norm": 0.17004238681555403, "learning_rate": 3.907056798623064e-06, "loss": 0.2831, "step": 3002 }, { "epoch": 2.788666976312123, "grad_norm": 0.17663477423381163, "learning_rate": 3.889845094664372e-06, "loss": 0.2651, "step": 3003 }, { "epoch": 2.7895959126799816, "grad_norm": 0.17378992478360958, "learning_rate": 3.87263339070568e-06, "loss": 0.2743, "step": 3004 }, { "epoch": 2.7905248490478405, "grad_norm": 0.16494900421379727, "learning_rate": 3.855421686746989e-06, "loss": 0.2689, "step": 3005 }, { "epoch": 2.791453785415699, "grad_norm": 0.1603907236344023, "learning_rate": 3.838209982788296e-06, "loss": 0.2657, "step": 3006 }, { "epoch": 2.7923827217835577, "grad_norm": 0.1567572624521738, "learning_rate": 3.820998278829605e-06, "loss": 0.2637, "step": 3007 }, { "epoch": 2.7933116581514166, "grad_norm": 0.18404937236372684, "learning_rate": 3.803786574870912e-06, "loss": 0.2755, "step": 3008 }, { "epoch": 2.7942405945192754, "grad_norm": 0.16868307932572335, "learning_rate": 3.7865748709122206e-06, "loss": 0.2778, "step": 3009 }, { "epoch": 2.7951695308871343, "grad_norm": 0.17073649004440103, "learning_rate": 3.769363166953529e-06, "loss": 0.264, "step": 3010 }, { "epoch": 2.796098467254993, "grad_norm": 0.16505500534365, "learning_rate": 3.7521514629948365e-06, "loss": 0.2672, "step": 3011 }, { "epoch": 2.797027403622852, "grad_norm": 0.16381885442693822, "learning_rate": 3.734939759036145e-06, "loss": 0.272, "step": 3012 }, { "epoch": 2.7979563399907104, "grad_norm": 0.16292544827439895, "learning_rate": 3.7177280550774525e-06, "loss": 0.2737, "step": 3013 }, { "epoch": 2.798885276358569, "grad_norm": 0.17228455569690837, "learning_rate": 3.700516351118761e-06, "loss": 0.2811, "step": 3014 }, { "epoch": 2.799814212726428, "grad_norm": 0.19524306180865297, "learning_rate": 3.6833046471600693e-06, "loss": 0.2713, "step": 3015 }, { "epoch": 2.800743149094287, "grad_norm": 0.16070401680293828, "learning_rate": 3.666092943201377e-06, "loss": 0.2692, "step": 3016 }, { "epoch": 2.8016720854621457, "grad_norm": 0.16705220705216192, "learning_rate": 3.6488812392426853e-06, "loss": 0.2744, "step": 3017 }, { "epoch": 2.8026010218300046, "grad_norm": 0.16539813777059287, "learning_rate": 3.6316695352839937e-06, "loss": 0.2747, "step": 3018 }, { "epoch": 2.8035299581978634, "grad_norm": 0.16555633870974384, "learning_rate": 3.614457831325301e-06, "loss": 0.2899, "step": 3019 }, { "epoch": 2.8044588945657223, "grad_norm": 0.16101648042383737, "learning_rate": 3.5972461273666096e-06, "loss": 0.268, "step": 3020 }, { "epoch": 2.805387830933581, "grad_norm": 0.16506138181326488, "learning_rate": 3.580034423407917e-06, "loss": 0.2653, "step": 3021 }, { "epoch": 2.80631676730144, "grad_norm": 0.1741809446855172, "learning_rate": 3.5628227194492256e-06, "loss": 0.2626, "step": 3022 }, { "epoch": 2.807245703669299, "grad_norm": 0.16483290521015864, "learning_rate": 3.545611015490534e-06, "loss": 0.286, "step": 3023 }, { "epoch": 2.8081746400371577, "grad_norm": 0.1571851560823419, "learning_rate": 3.5283993115318415e-06, "loss": 0.2495, "step": 3024 }, { "epoch": 2.8091035764050165, "grad_norm": 0.16235471506982116, "learning_rate": 3.51118760757315e-06, "loss": 0.2778, "step": 3025 }, { "epoch": 2.810032512772875, "grad_norm": 0.15756406208488002, "learning_rate": 3.4939759036144583e-06, "loss": 0.2643, "step": 3026 }, { "epoch": 2.8109614491407338, "grad_norm": 0.17055705625316472, "learning_rate": 3.476764199655766e-06, "loss": 0.2996, "step": 3027 }, { "epoch": 2.8118903855085926, "grad_norm": 0.1796767292475761, "learning_rate": 3.4595524956970743e-06, "loss": 0.276, "step": 3028 }, { "epoch": 2.8128193218764515, "grad_norm": 0.1717885163183135, "learning_rate": 3.442340791738382e-06, "loss": 0.2933, "step": 3029 }, { "epoch": 2.8137482582443103, "grad_norm": 0.17849394515997952, "learning_rate": 3.4251290877796902e-06, "loss": 0.2835, "step": 3030 }, { "epoch": 2.814677194612169, "grad_norm": 0.1724053731347566, "learning_rate": 3.4079173838209986e-06, "loss": 0.2829, "step": 3031 }, { "epoch": 2.815606130980028, "grad_norm": 0.17700506216444933, "learning_rate": 3.390705679862306e-06, "loss": 0.2836, "step": 3032 }, { "epoch": 2.816535067347887, "grad_norm": 0.1711437698846605, "learning_rate": 3.3734939759036146e-06, "loss": 0.2848, "step": 3033 }, { "epoch": 2.8174640037157452, "grad_norm": 0.1681253963815512, "learning_rate": 3.356282271944923e-06, "loss": 0.2792, "step": 3034 }, { "epoch": 2.818392940083604, "grad_norm": 0.15981392784911405, "learning_rate": 3.3390705679862305e-06, "loss": 0.2699, "step": 3035 }, { "epoch": 2.819321876451463, "grad_norm": 0.1740927276763424, "learning_rate": 3.321858864027539e-06, "loss": 0.2841, "step": 3036 }, { "epoch": 2.8202508128193218, "grad_norm": 0.17717015170291478, "learning_rate": 3.3046471600688465e-06, "loss": 0.2819, "step": 3037 }, { "epoch": 2.8211797491871806, "grad_norm": 0.1677501092435031, "learning_rate": 3.287435456110155e-06, "loss": 0.273, "step": 3038 }, { "epoch": 2.8221086855550395, "grad_norm": 0.16673210395588484, "learning_rate": 3.2702237521514633e-06, "loss": 0.2693, "step": 3039 }, { "epoch": 2.8230376219228983, "grad_norm": 0.16028997391776514, "learning_rate": 3.253012048192771e-06, "loss": 0.2624, "step": 3040 }, { "epoch": 2.823966558290757, "grad_norm": 0.15950671745325568, "learning_rate": 3.2358003442340792e-06, "loss": 0.266, "step": 3041 }, { "epoch": 2.824895494658616, "grad_norm": 0.18954997486322914, "learning_rate": 3.2185886402753876e-06, "loss": 0.2705, "step": 3042 }, { "epoch": 2.825824431026475, "grad_norm": 0.18313245510676568, "learning_rate": 3.201376936316695e-06, "loss": 0.2903, "step": 3043 }, { "epoch": 2.8267533673943337, "grad_norm": 0.17647582446439805, "learning_rate": 3.1841652323580036e-06, "loss": 0.299, "step": 3044 }, { "epoch": 2.8276823037621925, "grad_norm": 0.18666596433252766, "learning_rate": 3.1669535283993116e-06, "loss": 0.2691, "step": 3045 }, { "epoch": 2.828611240130051, "grad_norm": 0.1621297164024695, "learning_rate": 3.1497418244406195e-06, "loss": 0.2581, "step": 3046 }, { "epoch": 2.82954017649791, "grad_norm": 0.16497540494920462, "learning_rate": 3.132530120481928e-06, "loss": 0.2674, "step": 3047 }, { "epoch": 2.8304691128657686, "grad_norm": 0.18187531575636953, "learning_rate": 3.115318416523236e-06, "loss": 0.3102, "step": 3048 }, { "epoch": 2.8313980492336275, "grad_norm": 0.19013570305586183, "learning_rate": 3.098106712564544e-06, "loss": 0.2816, "step": 3049 }, { "epoch": 2.8323269856014863, "grad_norm": 0.16717134338818446, "learning_rate": 3.0808950086058523e-06, "loss": 0.2771, "step": 3050 }, { "epoch": 2.833255921969345, "grad_norm": 0.16409077988979776, "learning_rate": 3.0636833046471603e-06, "loss": 0.2907, "step": 3051 }, { "epoch": 2.834184858337204, "grad_norm": 0.16156022525948335, "learning_rate": 3.0464716006884687e-06, "loss": 0.2707, "step": 3052 }, { "epoch": 2.835113794705063, "grad_norm": 0.1628946794644337, "learning_rate": 3.0292598967297766e-06, "loss": 0.2525, "step": 3053 }, { "epoch": 2.8360427310729213, "grad_norm": 0.17817137797577384, "learning_rate": 3.0120481927710846e-06, "loss": 0.287, "step": 3054 }, { "epoch": 2.83697166744078, "grad_norm": 0.1574612277457272, "learning_rate": 2.9948364888123926e-06, "loss": 0.2691, "step": 3055 }, { "epoch": 2.837900603808639, "grad_norm": 0.1596146649010269, "learning_rate": 2.977624784853701e-06, "loss": 0.2618, "step": 3056 }, { "epoch": 2.838829540176498, "grad_norm": 0.1666069390299578, "learning_rate": 2.960413080895009e-06, "loss": 0.2788, "step": 3057 }, { "epoch": 2.8397584765443566, "grad_norm": 0.16942939086250466, "learning_rate": 2.943201376936317e-06, "loss": 0.278, "step": 3058 }, { "epoch": 2.8406874129122155, "grad_norm": 0.16780692378544415, "learning_rate": 2.925989672977625e-06, "loss": 0.2864, "step": 3059 }, { "epoch": 2.8416163492800743, "grad_norm": 0.16519686217403176, "learning_rate": 2.9087779690189333e-06, "loss": 0.2743, "step": 3060 }, { "epoch": 2.842545285647933, "grad_norm": 0.18887259058754785, "learning_rate": 2.8915662650602413e-06, "loss": 0.28, "step": 3061 }, { "epoch": 2.843474222015792, "grad_norm": 0.16104392704343692, "learning_rate": 2.8743545611015493e-06, "loss": 0.2651, "step": 3062 }, { "epoch": 2.844403158383651, "grad_norm": 0.16267194525224002, "learning_rate": 2.8571428571428573e-06, "loss": 0.2743, "step": 3063 }, { "epoch": 2.8453320947515097, "grad_norm": 0.16408592399590138, "learning_rate": 2.8399311531841657e-06, "loss": 0.2703, "step": 3064 }, { "epoch": 2.8462610311193686, "grad_norm": 0.17434242637401323, "learning_rate": 2.8227194492254736e-06, "loss": 0.2797, "step": 3065 }, { "epoch": 2.847189967487227, "grad_norm": 0.16647472284974027, "learning_rate": 2.8055077452667816e-06, "loss": 0.2623, "step": 3066 }, { "epoch": 2.848118903855086, "grad_norm": 0.16968808669769664, "learning_rate": 2.7882960413080896e-06, "loss": 0.278, "step": 3067 }, { "epoch": 2.8490478402229447, "grad_norm": 0.17519157938858387, "learning_rate": 2.7710843373493976e-06, "loss": 0.2864, "step": 3068 }, { "epoch": 2.8499767765908035, "grad_norm": 0.15968806347904377, "learning_rate": 2.753872633390706e-06, "loss": 0.2781, "step": 3069 }, { "epoch": 2.8509057129586624, "grad_norm": 0.1690915914984628, "learning_rate": 2.736660929432014e-06, "loss": 0.2761, "step": 3070 }, { "epoch": 2.851834649326521, "grad_norm": 0.16230516310874854, "learning_rate": 2.719449225473322e-06, "loss": 0.268, "step": 3071 }, { "epoch": 2.85276358569438, "grad_norm": 0.17552261229791508, "learning_rate": 2.70223752151463e-06, "loss": 0.2862, "step": 3072 }, { "epoch": 2.853692522062239, "grad_norm": 0.16542085819719532, "learning_rate": 2.6850258175559383e-06, "loss": 0.2707, "step": 3073 }, { "epoch": 2.8546214584300973, "grad_norm": 0.17209167653272706, "learning_rate": 2.6678141135972463e-06, "loss": 0.2745, "step": 3074 }, { "epoch": 2.855550394797956, "grad_norm": 0.1801208337324367, "learning_rate": 2.6506024096385542e-06, "loss": 0.275, "step": 3075 }, { "epoch": 2.856479331165815, "grad_norm": 0.1658175112453202, "learning_rate": 2.6333907056798622e-06, "loss": 0.28, "step": 3076 }, { "epoch": 2.857408267533674, "grad_norm": 0.1682675080594553, "learning_rate": 2.6161790017211706e-06, "loss": 0.272, "step": 3077 }, { "epoch": 2.8583372039015327, "grad_norm": 0.16424665159704155, "learning_rate": 2.5989672977624786e-06, "loss": 0.2722, "step": 3078 }, { "epoch": 2.8592661402693915, "grad_norm": 0.17646757447622446, "learning_rate": 2.5817555938037866e-06, "loss": 0.2743, "step": 3079 }, { "epoch": 2.8601950766372504, "grad_norm": 0.16214982779968065, "learning_rate": 2.5645438898450946e-06, "loss": 0.2636, "step": 3080 }, { "epoch": 2.861124013005109, "grad_norm": 0.16427321944645545, "learning_rate": 2.547332185886403e-06, "loss": 0.2687, "step": 3081 }, { "epoch": 2.862052949372968, "grad_norm": 0.1651934825129748, "learning_rate": 2.530120481927711e-06, "loss": 0.2716, "step": 3082 }, { "epoch": 2.862981885740827, "grad_norm": 0.1801065723272109, "learning_rate": 2.512908777969019e-06, "loss": 0.2907, "step": 3083 }, { "epoch": 2.8639108221086857, "grad_norm": 0.1625252701117974, "learning_rate": 2.495697074010327e-06, "loss": 0.2694, "step": 3084 }, { "epoch": 2.8648397584765446, "grad_norm": 0.1693264127024217, "learning_rate": 2.4784853700516353e-06, "loss": 0.2719, "step": 3085 }, { "epoch": 2.865768694844403, "grad_norm": 0.18597914567062376, "learning_rate": 2.4612736660929433e-06, "loss": 0.2759, "step": 3086 }, { "epoch": 2.866697631212262, "grad_norm": 0.1680321028316844, "learning_rate": 2.4440619621342512e-06, "loss": 0.2582, "step": 3087 }, { "epoch": 2.8676265675801207, "grad_norm": 0.17018851398423457, "learning_rate": 2.426850258175559e-06, "loss": 0.2741, "step": 3088 }, { "epoch": 2.8685555039479795, "grad_norm": 0.17493183425794315, "learning_rate": 2.4096385542168676e-06, "loss": 0.2824, "step": 3089 }, { "epoch": 2.8694844403158384, "grad_norm": 0.17392176050130073, "learning_rate": 2.3924268502581756e-06, "loss": 0.2861, "step": 3090 }, { "epoch": 2.8704133766836972, "grad_norm": 0.16101372989259485, "learning_rate": 2.3752151462994836e-06, "loss": 0.2729, "step": 3091 }, { "epoch": 2.871342313051556, "grad_norm": 0.1811947560081589, "learning_rate": 2.3580034423407915e-06, "loss": 0.29, "step": 3092 }, { "epoch": 2.872271249419415, "grad_norm": 0.16656066225329413, "learning_rate": 2.3407917383821e-06, "loss": 0.2582, "step": 3093 }, { "epoch": 2.8732001857872733, "grad_norm": 0.15996052741989927, "learning_rate": 2.323580034423408e-06, "loss": 0.2597, "step": 3094 }, { "epoch": 2.874129122155132, "grad_norm": 0.16380652320388664, "learning_rate": 2.306368330464716e-06, "loss": 0.2705, "step": 3095 }, { "epoch": 2.875058058522991, "grad_norm": 0.1628946736235343, "learning_rate": 2.2891566265060243e-06, "loss": 0.2723, "step": 3096 }, { "epoch": 2.87598699489085, "grad_norm": 0.17560963055242468, "learning_rate": 2.2719449225473323e-06, "loss": 0.2798, "step": 3097 }, { "epoch": 2.8769159312587087, "grad_norm": 0.16780655023233185, "learning_rate": 2.2547332185886402e-06, "loss": 0.2657, "step": 3098 }, { "epoch": 2.8778448676265675, "grad_norm": 0.17403931947708165, "learning_rate": 2.2375215146299486e-06, "loss": 0.2785, "step": 3099 }, { "epoch": 2.8787738039944264, "grad_norm": 0.17010520777366847, "learning_rate": 2.2203098106712566e-06, "loss": 0.2614, "step": 3100 }, { "epoch": 2.8797027403622852, "grad_norm": 0.16812351916262241, "learning_rate": 2.2030981067125646e-06, "loss": 0.281, "step": 3101 }, { "epoch": 2.880631676730144, "grad_norm": 0.16691194577486648, "learning_rate": 2.185886402753873e-06, "loss": 0.2662, "step": 3102 }, { "epoch": 2.881560613098003, "grad_norm": 0.1759493066311429, "learning_rate": 2.168674698795181e-06, "loss": 0.2747, "step": 3103 }, { "epoch": 2.8824895494658618, "grad_norm": 0.1707537478590751, "learning_rate": 2.151462994836489e-06, "loss": 0.2707, "step": 3104 }, { "epoch": 2.8834184858337206, "grad_norm": 0.18135021613947166, "learning_rate": 2.1342512908777974e-06, "loss": 0.2905, "step": 3105 }, { "epoch": 2.884347422201579, "grad_norm": 0.16500668342734362, "learning_rate": 2.1170395869191053e-06, "loss": 0.287, "step": 3106 }, { "epoch": 2.885276358569438, "grad_norm": 0.1719432107365479, "learning_rate": 2.0998278829604133e-06, "loss": 0.2813, "step": 3107 }, { "epoch": 2.8862052949372967, "grad_norm": 0.15985021847497927, "learning_rate": 2.0826161790017213e-06, "loss": 0.263, "step": 3108 }, { "epoch": 2.8871342313051556, "grad_norm": 0.16354165474456403, "learning_rate": 2.0654044750430297e-06, "loss": 0.2836, "step": 3109 }, { "epoch": 2.8880631676730144, "grad_norm": 0.16878198938643335, "learning_rate": 2.0481927710843377e-06, "loss": 0.2785, "step": 3110 }, { "epoch": 2.8889921040408733, "grad_norm": 0.16589172961440266, "learning_rate": 2.0309810671256456e-06, "loss": 0.2784, "step": 3111 }, { "epoch": 2.889921040408732, "grad_norm": 0.17438950890163252, "learning_rate": 2.0137693631669536e-06, "loss": 0.2714, "step": 3112 }, { "epoch": 2.890849976776591, "grad_norm": 0.16242581403999212, "learning_rate": 1.996557659208262e-06, "loss": 0.2629, "step": 3113 }, { "epoch": 2.8917789131444493, "grad_norm": 0.1744527543075875, "learning_rate": 1.97934595524957e-06, "loss": 0.2707, "step": 3114 }, { "epoch": 2.892707849512308, "grad_norm": 0.16426414428304387, "learning_rate": 1.962134251290878e-06, "loss": 0.2816, "step": 3115 }, { "epoch": 2.893636785880167, "grad_norm": 0.1621871796090516, "learning_rate": 1.944922547332186e-06, "loss": 0.2751, "step": 3116 }, { "epoch": 2.894565722248026, "grad_norm": 0.1775919652596439, "learning_rate": 1.9277108433734943e-06, "loss": 0.2895, "step": 3117 }, { "epoch": 2.8954946586158847, "grad_norm": 0.17006770205203484, "learning_rate": 1.9104991394148023e-06, "loss": 0.2688, "step": 3118 }, { "epoch": 2.8964235949837436, "grad_norm": 0.16894126178473537, "learning_rate": 1.8932874354561103e-06, "loss": 0.2609, "step": 3119 }, { "epoch": 2.8973525313516024, "grad_norm": 0.16582301619354106, "learning_rate": 1.8760757314974183e-06, "loss": 0.2732, "step": 3120 }, { "epoch": 2.8982814677194613, "grad_norm": 0.17231531336949352, "learning_rate": 1.8588640275387262e-06, "loss": 0.2734, "step": 3121 }, { "epoch": 2.89921040408732, "grad_norm": 0.16228298439571703, "learning_rate": 1.8416523235800346e-06, "loss": 0.2662, "step": 3122 }, { "epoch": 2.900139340455179, "grad_norm": 0.1655198446070943, "learning_rate": 1.8244406196213426e-06, "loss": 0.2678, "step": 3123 }, { "epoch": 2.901068276823038, "grad_norm": 0.17406281157074208, "learning_rate": 1.8072289156626506e-06, "loss": 0.2802, "step": 3124 }, { "epoch": 2.9019972131908967, "grad_norm": 0.16674227317723664, "learning_rate": 1.7900172117039586e-06, "loss": 0.266, "step": 3125 }, { "epoch": 2.9029261495587555, "grad_norm": 0.1706960267209148, "learning_rate": 1.772805507745267e-06, "loss": 0.2974, "step": 3126 }, { "epoch": 2.903855085926614, "grad_norm": 0.1658872040790518, "learning_rate": 1.755593803786575e-06, "loss": 0.2694, "step": 3127 }, { "epoch": 2.9047840222944727, "grad_norm": 0.16658354730048636, "learning_rate": 1.738382099827883e-06, "loss": 0.2587, "step": 3128 }, { "epoch": 2.9057129586623316, "grad_norm": 0.16444858436297213, "learning_rate": 1.721170395869191e-06, "loss": 0.2813, "step": 3129 }, { "epoch": 2.9066418950301904, "grad_norm": 0.16336194076854949, "learning_rate": 1.7039586919104993e-06, "loss": 0.2732, "step": 3130 }, { "epoch": 2.9075708313980493, "grad_norm": 0.171145553329007, "learning_rate": 1.6867469879518073e-06, "loss": 0.2723, "step": 3131 }, { "epoch": 2.908499767765908, "grad_norm": 0.16347568718682626, "learning_rate": 1.6695352839931153e-06, "loss": 0.2762, "step": 3132 }, { "epoch": 2.909428704133767, "grad_norm": 0.16509176449938368, "learning_rate": 1.6523235800344232e-06, "loss": 0.2684, "step": 3133 }, { "epoch": 2.9103576405016254, "grad_norm": 0.16259817795344064, "learning_rate": 1.6351118760757316e-06, "loss": 0.2672, "step": 3134 }, { "epoch": 2.911286576869484, "grad_norm": 0.16536212949320228, "learning_rate": 1.6179001721170396e-06, "loss": 0.2712, "step": 3135 }, { "epoch": 2.912215513237343, "grad_norm": 0.17142664756129727, "learning_rate": 1.6006884681583476e-06, "loss": 0.2781, "step": 3136 }, { "epoch": 2.913144449605202, "grad_norm": 0.16836163730587553, "learning_rate": 1.5834767641996558e-06, "loss": 0.2808, "step": 3137 }, { "epoch": 2.9140733859730608, "grad_norm": 0.16607118988603203, "learning_rate": 1.566265060240964e-06, "loss": 0.2766, "step": 3138 }, { "epoch": 2.9150023223409196, "grad_norm": 0.1672718943537726, "learning_rate": 1.549053356282272e-06, "loss": 0.2704, "step": 3139 }, { "epoch": 2.9159312587087785, "grad_norm": 0.16835726638255913, "learning_rate": 1.5318416523235801e-06, "loss": 0.2769, "step": 3140 }, { "epoch": 2.9168601950766373, "grad_norm": 0.161048165231964, "learning_rate": 1.5146299483648883e-06, "loss": 0.2837, "step": 3141 }, { "epoch": 2.917789131444496, "grad_norm": 0.16282977150245906, "learning_rate": 1.4974182444061963e-06, "loss": 0.2638, "step": 3142 }, { "epoch": 2.918718067812355, "grad_norm": 0.17323001324895482, "learning_rate": 1.4802065404475045e-06, "loss": 0.2815, "step": 3143 }, { "epoch": 2.919647004180214, "grad_norm": 0.16388686409084324, "learning_rate": 1.4629948364888125e-06, "loss": 0.2827, "step": 3144 }, { "epoch": 2.9205759405480727, "grad_norm": 0.16384234361312092, "learning_rate": 1.4457831325301207e-06, "loss": 0.282, "step": 3145 }, { "epoch": 2.9215048769159315, "grad_norm": 0.16568266746026178, "learning_rate": 1.4285714285714286e-06, "loss": 0.2704, "step": 3146 }, { "epoch": 2.92243381328379, "grad_norm": 0.16915537105295747, "learning_rate": 1.4113597246127368e-06, "loss": 0.2796, "step": 3147 }, { "epoch": 2.9233627496516488, "grad_norm": 0.15803350350704365, "learning_rate": 1.3941480206540448e-06, "loss": 0.2632, "step": 3148 }, { "epoch": 2.9242916860195076, "grad_norm": 0.1724406036070289, "learning_rate": 1.376936316695353e-06, "loss": 0.2811, "step": 3149 }, { "epoch": 2.9252206223873665, "grad_norm": 0.16469469642126497, "learning_rate": 1.359724612736661e-06, "loss": 0.255, "step": 3150 }, { "epoch": 2.9261495587552253, "grad_norm": 0.18760969517301657, "learning_rate": 1.3425129087779691e-06, "loss": 0.2682, "step": 3151 }, { "epoch": 2.927078495123084, "grad_norm": 0.16665139220788266, "learning_rate": 1.3253012048192771e-06, "loss": 0.268, "step": 3152 }, { "epoch": 2.928007431490943, "grad_norm": 0.17530978095431596, "learning_rate": 1.3080895008605853e-06, "loss": 0.2624, "step": 3153 }, { "epoch": 2.9289363678588014, "grad_norm": 0.1631271421954177, "learning_rate": 1.2908777969018933e-06, "loss": 0.2727, "step": 3154 }, { "epoch": 2.9298653042266602, "grad_norm": 0.15966257157616312, "learning_rate": 1.2736660929432015e-06, "loss": 0.2688, "step": 3155 }, { "epoch": 2.930794240594519, "grad_norm": 0.15325566348484762, "learning_rate": 1.2564543889845095e-06, "loss": 0.2559, "step": 3156 }, { "epoch": 2.931723176962378, "grad_norm": 0.16790370946361619, "learning_rate": 1.2392426850258176e-06, "loss": 0.286, "step": 3157 }, { "epoch": 2.932652113330237, "grad_norm": 0.16679108887381452, "learning_rate": 1.2220309810671256e-06, "loss": 0.2863, "step": 3158 }, { "epoch": 2.9335810496980956, "grad_norm": 0.17093563722555055, "learning_rate": 1.2048192771084338e-06, "loss": 0.283, "step": 3159 }, { "epoch": 2.9345099860659545, "grad_norm": 0.1671496062211597, "learning_rate": 1.1876075731497418e-06, "loss": 0.2792, "step": 3160 }, { "epoch": 2.9354389224338133, "grad_norm": 0.1599449296648143, "learning_rate": 1.17039586919105e-06, "loss": 0.2448, "step": 3161 }, { "epoch": 2.936367858801672, "grad_norm": 0.15847396108522563, "learning_rate": 1.153184165232358e-06, "loss": 0.2704, "step": 3162 }, { "epoch": 2.937296795169531, "grad_norm": 0.16199214200992662, "learning_rate": 1.1359724612736661e-06, "loss": 0.2516, "step": 3163 }, { "epoch": 2.93822573153739, "grad_norm": 0.1626770624143151, "learning_rate": 1.1187607573149743e-06, "loss": 0.2682, "step": 3164 }, { "epoch": 2.9391546679052487, "grad_norm": 0.16395321212716582, "learning_rate": 1.1015490533562823e-06, "loss": 0.2722, "step": 3165 }, { "epoch": 2.9400836042731076, "grad_norm": 0.15821372199935677, "learning_rate": 1.0843373493975905e-06, "loss": 0.2687, "step": 3166 }, { "epoch": 2.941012540640966, "grad_norm": 0.15843662175473597, "learning_rate": 1.0671256454388987e-06, "loss": 0.2681, "step": 3167 }, { "epoch": 2.941941477008825, "grad_norm": 0.16333393084594955, "learning_rate": 1.0499139414802067e-06, "loss": 0.2856, "step": 3168 }, { "epoch": 2.9428704133766836, "grad_norm": 0.17542548208560677, "learning_rate": 1.0327022375215148e-06, "loss": 0.2798, "step": 3169 }, { "epoch": 2.9437993497445425, "grad_norm": 0.17024423281462964, "learning_rate": 1.0154905335628228e-06, "loss": 0.2742, "step": 3170 }, { "epoch": 2.9447282861124013, "grad_norm": 0.1576061324066094, "learning_rate": 9.98278829604131e-07, "loss": 0.2781, "step": 3171 }, { "epoch": 2.94565722248026, "grad_norm": 0.1601221693884007, "learning_rate": 9.81067125645439e-07, "loss": 0.2693, "step": 3172 }, { "epoch": 2.946586158848119, "grad_norm": 0.16786872565481428, "learning_rate": 9.638554216867472e-07, "loss": 0.2753, "step": 3173 }, { "epoch": 2.9475150952159774, "grad_norm": 0.16660337771357872, "learning_rate": 9.466437177280551e-07, "loss": 0.2758, "step": 3174 }, { "epoch": 2.9484440315838363, "grad_norm": 0.1691333863317382, "learning_rate": 9.294320137693631e-07, "loss": 0.2795, "step": 3175 }, { "epoch": 2.949372967951695, "grad_norm": 0.1722484989104376, "learning_rate": 9.122203098106713e-07, "loss": 0.2802, "step": 3176 }, { "epoch": 2.950301904319554, "grad_norm": 0.16958831531505558, "learning_rate": 8.950086058519793e-07, "loss": 0.2901, "step": 3177 }, { "epoch": 2.951230840687413, "grad_norm": 0.17023309066648035, "learning_rate": 8.777969018932875e-07, "loss": 0.2717, "step": 3178 }, { "epoch": 2.9521597770552717, "grad_norm": 0.15809523592675392, "learning_rate": 8.605851979345955e-07, "loss": 0.265, "step": 3179 }, { "epoch": 2.9530887134231305, "grad_norm": 0.1675472901902543, "learning_rate": 8.433734939759036e-07, "loss": 0.2849, "step": 3180 }, { "epoch": 2.9540176497909894, "grad_norm": 0.1654082507246669, "learning_rate": 8.261617900172116e-07, "loss": 0.2791, "step": 3181 }, { "epoch": 2.954946586158848, "grad_norm": 0.162707952994883, "learning_rate": 8.089500860585198e-07, "loss": 0.2813, "step": 3182 }, { "epoch": 2.955875522526707, "grad_norm": 0.16446726694121308, "learning_rate": 7.917383820998279e-07, "loss": 0.2872, "step": 3183 }, { "epoch": 2.956804458894566, "grad_norm": 0.1637824540494551, "learning_rate": 7.74526678141136e-07, "loss": 0.2795, "step": 3184 }, { "epoch": 2.9577333952624247, "grad_norm": 0.18638420051088697, "learning_rate": 7.573149741824442e-07, "loss": 0.2709, "step": 3185 }, { "epoch": 2.9586623316302836, "grad_norm": 0.15707600467510602, "learning_rate": 7.401032702237522e-07, "loss": 0.2628, "step": 3186 }, { "epoch": 2.959591267998142, "grad_norm": 0.16140242751307413, "learning_rate": 7.228915662650603e-07, "loss": 0.2735, "step": 3187 }, { "epoch": 2.960520204366001, "grad_norm": 0.16654689577483472, "learning_rate": 7.056798623063684e-07, "loss": 0.2831, "step": 3188 }, { "epoch": 2.9614491407338597, "grad_norm": 0.16057309604016964, "learning_rate": 6.884681583476765e-07, "loss": 0.2635, "step": 3189 }, { "epoch": 2.9623780771017185, "grad_norm": 0.16490464189295742, "learning_rate": 6.712564543889846e-07, "loss": 0.2752, "step": 3190 }, { "epoch": 2.9633070134695774, "grad_norm": 0.1671405024647208, "learning_rate": 6.540447504302927e-07, "loss": 0.269, "step": 3191 }, { "epoch": 2.964235949837436, "grad_norm": 0.1710576565622639, "learning_rate": 6.368330464716007e-07, "loss": 0.2843, "step": 3192 }, { "epoch": 2.965164886205295, "grad_norm": 0.16276635918619017, "learning_rate": 6.196213425129088e-07, "loss": 0.2813, "step": 3193 }, { "epoch": 2.966093822573154, "grad_norm": 0.15281133331696603, "learning_rate": 6.024096385542169e-07, "loss": 0.2639, "step": 3194 }, { "epoch": 2.9670227589410123, "grad_norm": 0.16323794995058483, "learning_rate": 5.85197934595525e-07, "loss": 0.2663, "step": 3195 }, { "epoch": 2.967951695308871, "grad_norm": 0.1624282394962894, "learning_rate": 5.679862306368331e-07, "loss": 0.2752, "step": 3196 }, { "epoch": 2.96888063167673, "grad_norm": 0.16134159864744876, "learning_rate": 5.507745266781412e-07, "loss": 0.2702, "step": 3197 }, { "epoch": 2.969809568044589, "grad_norm": 0.16589417651717428, "learning_rate": 5.335628227194493e-07, "loss": 0.2828, "step": 3198 }, { "epoch": 2.9707385044124477, "grad_norm": 0.16183155315903366, "learning_rate": 5.163511187607574e-07, "loss": 0.2852, "step": 3199 }, { "epoch": 2.9716674407803065, "grad_norm": 0.16678141742547176, "learning_rate": 4.991394148020655e-07, "loss": 0.2813, "step": 3200 }, { "epoch": 2.9725963771481654, "grad_norm": 0.1619658189257669, "learning_rate": 4.819277108433736e-07, "loss": 0.2649, "step": 3201 }, { "epoch": 2.9735253135160242, "grad_norm": 0.15993238852155908, "learning_rate": 4.6471600688468156e-07, "loss": 0.2764, "step": 3202 }, { "epoch": 2.974454249883883, "grad_norm": 0.16228626601674104, "learning_rate": 4.4750430292598964e-07, "loss": 0.2738, "step": 3203 }, { "epoch": 2.975383186251742, "grad_norm": 0.16945807739892618, "learning_rate": 4.3029259896729773e-07, "loss": 0.2875, "step": 3204 }, { "epoch": 2.9763121226196008, "grad_norm": 0.16673802928277734, "learning_rate": 4.130808950086058e-07, "loss": 0.2751, "step": 3205 }, { "epoch": 2.9772410589874596, "grad_norm": 0.16066420702808745, "learning_rate": 3.9586919104991394e-07, "loss": 0.2691, "step": 3206 }, { "epoch": 2.978169995355318, "grad_norm": 0.167644174523662, "learning_rate": 3.786574870912221e-07, "loss": 0.276, "step": 3207 }, { "epoch": 2.979098931723177, "grad_norm": 0.16405491071346914, "learning_rate": 3.6144578313253016e-07, "loss": 0.27, "step": 3208 }, { "epoch": 2.9800278680910357, "grad_norm": 0.15776111203718893, "learning_rate": 3.4423407917383825e-07, "loss": 0.2754, "step": 3209 }, { "epoch": 2.9809568044588945, "grad_norm": 0.16291385160189412, "learning_rate": 3.2702237521514633e-07, "loss": 0.2801, "step": 3210 }, { "epoch": 2.9818857408267534, "grad_norm": 0.15798932483422015, "learning_rate": 3.098106712564544e-07, "loss": 0.2655, "step": 3211 }, { "epoch": 2.9828146771946122, "grad_norm": 0.16945630512529875, "learning_rate": 2.925989672977625e-07, "loss": 0.274, "step": 3212 }, { "epoch": 2.983743613562471, "grad_norm": 0.1708554921170849, "learning_rate": 2.753872633390706e-07, "loss": 0.272, "step": 3213 }, { "epoch": 2.98467254993033, "grad_norm": 0.15872652564709347, "learning_rate": 2.581755593803787e-07, "loss": 0.258, "step": 3214 }, { "epoch": 2.9856014862981883, "grad_norm": 0.1830290046487256, "learning_rate": 2.409638554216868e-07, "loss": 0.2794, "step": 3215 }, { "epoch": 2.986530422666047, "grad_norm": 0.15745598014130835, "learning_rate": 2.2375215146299482e-07, "loss": 0.2645, "step": 3216 }, { "epoch": 2.987459359033906, "grad_norm": 0.16220353029392465, "learning_rate": 2.065404475043029e-07, "loss": 0.2789, "step": 3217 }, { "epoch": 2.988388295401765, "grad_norm": 0.16537567907705764, "learning_rate": 1.8932874354561104e-07, "loss": 0.2724, "step": 3218 }, { "epoch": 2.9893172317696237, "grad_norm": 0.15966135311906623, "learning_rate": 1.7211703958691912e-07, "loss": 0.2603, "step": 3219 }, { "epoch": 2.9902461681374826, "grad_norm": 0.16670677322460914, "learning_rate": 1.549053356282272e-07, "loss": 0.277, "step": 3220 }, { "epoch": 2.9911751045053414, "grad_norm": 0.16236390105210605, "learning_rate": 1.376936316695353e-07, "loss": 0.2778, "step": 3221 }, { "epoch": 2.9921040408732003, "grad_norm": 0.15740425470314215, "learning_rate": 1.204819277108434e-07, "loss": 0.263, "step": 3222 }, { "epoch": 2.993032977241059, "grad_norm": 0.16389828021365552, "learning_rate": 1.0327022375215145e-07, "loss": 0.276, "step": 3223 }, { "epoch": 2.993961913608918, "grad_norm": 0.1578402137230674, "learning_rate": 8.605851979345956e-08, "loss": 0.26, "step": 3224 }, { "epoch": 2.994890849976777, "grad_norm": 0.1705942119258943, "learning_rate": 6.884681583476764e-08, "loss": 0.2776, "step": 3225 }, { "epoch": 2.9958197863446356, "grad_norm": 0.16036831167171559, "learning_rate": 5.1635111876075726e-08, "loss": 0.2761, "step": 3226 }, { "epoch": 2.996748722712494, "grad_norm": 0.17533866907265988, "learning_rate": 3.442340791738382e-08, "loss": 0.2913, "step": 3227 }, { "epoch": 2.997677659080353, "grad_norm": 0.1660516196575767, "learning_rate": 1.721170395869191e-08, "loss": 0.2709, "step": 3228 }, { "epoch": 2.997677659080353, "step": 3228, "total_flos": 3.5889551683046343e+19, "train_loss": 0.40990160300847617, "train_runtime": 91578.7681, "train_samples_per_second": 0.564, "train_steps_per_second": 0.035 } ], "logging_steps": 1, "max_steps": 3228, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5889551683046343e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }