{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9972020145495244, "eval_steps": 500, "global_step": 891, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003357582540570789, "grad_norm": 15.05952844044008, "learning_rate": 0.0, "loss": 1.1094, "step": 1 }, { "epoch": 0.006715165081141578, "grad_norm": 15.55795952285201, "learning_rate": 1.1111111111111112e-07, "loss": 1.2127, "step": 2 }, { "epoch": 0.010072747621712367, "grad_norm": 16.539001809523764, "learning_rate": 2.2222222222222224e-07, "loss": 1.1671, "step": 3 }, { "epoch": 0.013430330162283156, "grad_norm": 16.368302779349722, "learning_rate": 3.3333333333333335e-07, "loss": 1.1785, "step": 4 }, { "epoch": 0.016787912702853944, "grad_norm": 15.588296216474147, "learning_rate": 4.444444444444445e-07, "loss": 1.1263, "step": 5 }, { "epoch": 0.020145495243424735, "grad_norm": 16.499164600464685, "learning_rate": 5.555555555555555e-07, "loss": 1.1748, "step": 6 }, { "epoch": 0.023503077783995522, "grad_norm": 14.083316686941275, "learning_rate": 6.666666666666667e-07, "loss": 1.1657, "step": 7 }, { "epoch": 0.026860660324566313, "grad_norm": 16.604129854168686, "learning_rate": 7.777777777777779e-07, "loss": 1.2313, "step": 8 }, { "epoch": 0.0302182428651371, "grad_norm": 14.603261078745698, "learning_rate": 8.88888888888889e-07, "loss": 1.1465, "step": 9 }, { "epoch": 0.03357582540570789, "grad_norm": 12.7832318620063, "learning_rate": 1.0000000000000002e-06, "loss": 1.1103, "step": 10 }, { "epoch": 0.03693340794627868, "grad_norm": 11.676821024645601, "learning_rate": 1.111111111111111e-06, "loss": 1.0456, "step": 11 }, { "epoch": 0.04029099048684947, "grad_norm": 10.276715553394455, "learning_rate": 1.2222222222222223e-06, "loss": 1.0994, "step": 12 }, { "epoch": 0.04364857302742026, "grad_norm": 7.580128095019628, "learning_rate": 1.3333333333333334e-06, "loss": 0.9192, "step": 13 }, { "epoch": 0.047006155567991044, "grad_norm": 4.579086500989691, "learning_rate": 1.4444444444444445e-06, "loss": 0.9019, "step": 14 }, { "epoch": 0.05036373810856184, "grad_norm": 4.462238466517284, "learning_rate": 1.5555555555555558e-06, "loss": 0.932, "step": 15 }, { "epoch": 0.053721320649132626, "grad_norm": 4.49504437288462, "learning_rate": 1.6666666666666667e-06, "loss": 0.9504, "step": 16 }, { "epoch": 0.05707890318970341, "grad_norm": 4.856137118720444, "learning_rate": 1.777777777777778e-06, "loss": 0.8997, "step": 17 }, { "epoch": 0.0604364857302742, "grad_norm": 5.458807952193987, "learning_rate": 1.888888888888889e-06, "loss": 0.9135, "step": 18 }, { "epoch": 0.063794068270845, "grad_norm": 4.914187499366455, "learning_rate": 2.0000000000000003e-06, "loss": 0.895, "step": 19 }, { "epoch": 0.06715165081141578, "grad_norm": 5.8484928291138685, "learning_rate": 2.1111111111111114e-06, "loss": 0.9178, "step": 20 }, { "epoch": 0.07050923335198657, "grad_norm": 6.405365379448204, "learning_rate": 2.222222222222222e-06, "loss": 0.9436, "step": 21 }, { "epoch": 0.07386681589255736, "grad_norm": 5.140279358058298, "learning_rate": 2.3333333333333336e-06, "loss": 0.8577, "step": 22 }, { "epoch": 0.07722439843312814, "grad_norm": 4.573855490751355, "learning_rate": 2.4444444444444447e-06, "loss": 0.849, "step": 23 }, { "epoch": 0.08058198097369894, "grad_norm": 3.4903819904679914, "learning_rate": 2.5555555555555557e-06, "loss": 0.8684, "step": 24 }, { "epoch": 0.08393956351426973, "grad_norm": 3.2866185409697413, "learning_rate": 2.666666666666667e-06, "loss": 0.9139, "step": 25 }, { "epoch": 0.08729714605484051, "grad_norm": 2.9343312861580553, "learning_rate": 2.7777777777777783e-06, "loss": 0.825, "step": 26 }, { "epoch": 0.09065472859541131, "grad_norm": 2.7441074948207635, "learning_rate": 2.888888888888889e-06, "loss": 0.7743, "step": 27 }, { "epoch": 0.09401231113598209, "grad_norm": 3.0852450829413036, "learning_rate": 3e-06, "loss": 0.8325, "step": 28 }, { "epoch": 0.09736989367655288, "grad_norm": 2.5969750970962107, "learning_rate": 3.1111111111111116e-06, "loss": 0.8181, "step": 29 }, { "epoch": 0.10072747621712368, "grad_norm": 3.0607284743078367, "learning_rate": 3.2222222222222227e-06, "loss": 0.8839, "step": 30 }, { "epoch": 0.10408505875769446, "grad_norm": 2.6971303460280813, "learning_rate": 3.3333333333333333e-06, "loss": 0.803, "step": 31 }, { "epoch": 0.10744264129826525, "grad_norm": 2.7795448200981054, "learning_rate": 3.444444444444445e-06, "loss": 0.7857, "step": 32 }, { "epoch": 0.11080022383883603, "grad_norm": 2.672977099956262, "learning_rate": 3.555555555555556e-06, "loss": 0.8166, "step": 33 }, { "epoch": 0.11415780637940683, "grad_norm": 2.433972508138389, "learning_rate": 3.6666666666666666e-06, "loss": 0.7488, "step": 34 }, { "epoch": 0.11751538891997762, "grad_norm": 2.4637559196857812, "learning_rate": 3.777777777777778e-06, "loss": 0.7965, "step": 35 }, { "epoch": 0.1208729714605484, "grad_norm": 2.456980144441013, "learning_rate": 3.88888888888889e-06, "loss": 0.8154, "step": 36 }, { "epoch": 0.1242305540011192, "grad_norm": 2.4869279368019273, "learning_rate": 4.000000000000001e-06, "loss": 0.8126, "step": 37 }, { "epoch": 0.12758813654169, "grad_norm": 2.467906619619702, "learning_rate": 4.111111111111111e-06, "loss": 0.8001, "step": 38 }, { "epoch": 0.13094571908226077, "grad_norm": 2.581215311622158, "learning_rate": 4.222222222222223e-06, "loss": 0.8286, "step": 39 }, { "epoch": 0.13430330162283155, "grad_norm": 2.501542706637013, "learning_rate": 4.333333333333334e-06, "loss": 0.8294, "step": 40 }, { "epoch": 0.13766088416340236, "grad_norm": 2.334359062766939, "learning_rate": 4.444444444444444e-06, "loss": 0.8057, "step": 41 }, { "epoch": 0.14101846670397314, "grad_norm": 2.512161617115715, "learning_rate": 4.555555555555556e-06, "loss": 0.7816, "step": 42 }, { "epoch": 0.14437604924454392, "grad_norm": 2.3512830431401186, "learning_rate": 4.666666666666667e-06, "loss": 0.7406, "step": 43 }, { "epoch": 0.14773363178511473, "grad_norm": 2.353319121545363, "learning_rate": 4.777777777777778e-06, "loss": 0.7836, "step": 44 }, { "epoch": 0.1510912143256855, "grad_norm": 2.3785414037245065, "learning_rate": 4.888888888888889e-06, "loss": 0.7905, "step": 45 }, { "epoch": 0.1544487968662563, "grad_norm": 2.489874176687805, "learning_rate": 5e-06, "loss": 0.77, "step": 46 }, { "epoch": 0.1578063794068271, "grad_norm": 2.640302288366859, "learning_rate": 5.1111111111111115e-06, "loss": 0.8096, "step": 47 }, { "epoch": 0.16116396194739788, "grad_norm": 2.6168372492389875, "learning_rate": 5.2222222222222226e-06, "loss": 0.8085, "step": 48 }, { "epoch": 0.16452154448796866, "grad_norm": 2.355298598185473, "learning_rate": 5.333333333333334e-06, "loss": 0.7634, "step": 49 }, { "epoch": 0.16787912702853947, "grad_norm": 2.4159132229794564, "learning_rate": 5.444444444444445e-06, "loss": 0.7764, "step": 50 }, { "epoch": 0.17123670956911025, "grad_norm": 2.4612658382916344, "learning_rate": 5.555555555555557e-06, "loss": 0.7637, "step": 51 }, { "epoch": 0.17459429210968103, "grad_norm": 2.454872028492967, "learning_rate": 5.666666666666667e-06, "loss": 0.8304, "step": 52 }, { "epoch": 0.1779518746502518, "grad_norm": 2.231797525957087, "learning_rate": 5.777777777777778e-06, "loss": 0.8054, "step": 53 }, { "epoch": 0.18130945719082261, "grad_norm": 2.5145359657110573, "learning_rate": 5.88888888888889e-06, "loss": 0.7767, "step": 54 }, { "epoch": 0.1846670397313934, "grad_norm": 2.2680418205268817, "learning_rate": 6e-06, "loss": 0.7446, "step": 55 }, { "epoch": 0.18802462227196418, "grad_norm": 2.5397886944135095, "learning_rate": 6.111111111111112e-06, "loss": 0.8216, "step": 56 }, { "epoch": 0.19138220481253498, "grad_norm": 2.5677000160418615, "learning_rate": 6.222222222222223e-06, "loss": 0.7516, "step": 57 }, { "epoch": 0.19473978735310576, "grad_norm": 2.464277933743688, "learning_rate": 6.333333333333333e-06, "loss": 0.7809, "step": 58 }, { "epoch": 0.19809736989367654, "grad_norm": 2.4742771459078763, "learning_rate": 6.444444444444445e-06, "loss": 0.7734, "step": 59 }, { "epoch": 0.20145495243424735, "grad_norm": 2.4284525343661794, "learning_rate": 6.555555555555556e-06, "loss": 0.7763, "step": 60 }, { "epoch": 0.20481253497481813, "grad_norm": 2.4953358610535985, "learning_rate": 6.666666666666667e-06, "loss": 0.7742, "step": 61 }, { "epoch": 0.2081701175153889, "grad_norm": 2.221862854493057, "learning_rate": 6.777777777777779e-06, "loss": 0.7388, "step": 62 }, { "epoch": 0.21152770005595972, "grad_norm": 2.4115958840359135, "learning_rate": 6.88888888888889e-06, "loss": 0.8117, "step": 63 }, { "epoch": 0.2148852825965305, "grad_norm": 2.247325502758182, "learning_rate": 7e-06, "loss": 0.874, "step": 64 }, { "epoch": 0.21824286513710128, "grad_norm": 2.459302207580601, "learning_rate": 7.111111111111112e-06, "loss": 0.7753, "step": 65 }, { "epoch": 0.22160044767767206, "grad_norm": 2.3615999287857194, "learning_rate": 7.222222222222223e-06, "loss": 0.7364, "step": 66 }, { "epoch": 0.22495803021824287, "grad_norm": 2.345803430121652, "learning_rate": 7.333333333333333e-06, "loss": 0.7828, "step": 67 }, { "epoch": 0.22831561275881365, "grad_norm": 2.3409970367083095, "learning_rate": 7.444444444444445e-06, "loss": 0.7061, "step": 68 }, { "epoch": 0.23167319529938443, "grad_norm": 2.4774854380802624, "learning_rate": 7.555555555555556e-06, "loss": 0.7717, "step": 69 }, { "epoch": 0.23503077783995524, "grad_norm": 2.5208849967610782, "learning_rate": 7.666666666666667e-06, "loss": 0.7771, "step": 70 }, { "epoch": 0.23838836038052602, "grad_norm": 2.6549693177981055, "learning_rate": 7.77777777777778e-06, "loss": 0.8112, "step": 71 }, { "epoch": 0.2417459429210968, "grad_norm": 2.832698323256436, "learning_rate": 7.88888888888889e-06, "loss": 0.7739, "step": 72 }, { "epoch": 0.2451035254616676, "grad_norm": 2.5304942003986453, "learning_rate": 8.000000000000001e-06, "loss": 0.8008, "step": 73 }, { "epoch": 0.2484611080022384, "grad_norm": 2.431052677971174, "learning_rate": 8.111111111111112e-06, "loss": 0.8011, "step": 74 }, { "epoch": 0.2518186905428092, "grad_norm": 2.2473360155050286, "learning_rate": 8.222222222222222e-06, "loss": 0.7438, "step": 75 }, { "epoch": 0.25517627308338, "grad_norm": 2.5408424868361017, "learning_rate": 8.333333333333334e-06, "loss": 0.8015, "step": 76 }, { "epoch": 0.25853385562395076, "grad_norm": 2.3659875362077996, "learning_rate": 8.444444444444446e-06, "loss": 0.7927, "step": 77 }, { "epoch": 0.26189143816452154, "grad_norm": 2.3184771169883636, "learning_rate": 8.555555555555556e-06, "loss": 0.7832, "step": 78 }, { "epoch": 0.2652490207050923, "grad_norm": 2.426200561656744, "learning_rate": 8.666666666666668e-06, "loss": 0.7475, "step": 79 }, { "epoch": 0.2686066032456631, "grad_norm": 2.39729048396846, "learning_rate": 8.777777777777778e-06, "loss": 0.8269, "step": 80 }, { "epoch": 0.27196418578623394, "grad_norm": 2.375526510228167, "learning_rate": 8.888888888888888e-06, "loss": 0.7924, "step": 81 }, { "epoch": 0.2753217683268047, "grad_norm": 2.2496572603077833, "learning_rate": 9e-06, "loss": 0.8096, "step": 82 }, { "epoch": 0.2786793508673755, "grad_norm": 2.3057931599473913, "learning_rate": 9.111111111111112e-06, "loss": 0.7586, "step": 83 }, { "epoch": 0.2820369334079463, "grad_norm": 2.3828592253623784, "learning_rate": 9.222222222222224e-06, "loss": 0.7741, "step": 84 }, { "epoch": 0.28539451594851706, "grad_norm": 2.479750143175691, "learning_rate": 9.333333333333334e-06, "loss": 0.7718, "step": 85 }, { "epoch": 0.28875209848908784, "grad_norm": 2.2810057071437466, "learning_rate": 9.444444444444445e-06, "loss": 0.7687, "step": 86 }, { "epoch": 0.2921096810296586, "grad_norm": 2.3537687144315655, "learning_rate": 9.555555555555556e-06, "loss": 0.7694, "step": 87 }, { "epoch": 0.29546726357022945, "grad_norm": 2.480085659080849, "learning_rate": 9.666666666666667e-06, "loss": 0.8077, "step": 88 }, { "epoch": 0.29882484611080024, "grad_norm": 2.3071631718134733, "learning_rate": 9.777777777777779e-06, "loss": 0.8218, "step": 89 }, { "epoch": 0.302182428651371, "grad_norm": 2.42645063403485, "learning_rate": 9.88888888888889e-06, "loss": 0.7613, "step": 90 }, { "epoch": 0.3055400111919418, "grad_norm": 2.2871396355674958, "learning_rate": 1e-05, "loss": 0.7597, "step": 91 }, { "epoch": 0.3088975937325126, "grad_norm": 2.550927280073565, "learning_rate": 9.999961543109546e-06, "loss": 0.7757, "step": 92 }, { "epoch": 0.31225517627308336, "grad_norm": 2.27888182793667, "learning_rate": 9.999846173029752e-06, "loss": 0.7865, "step": 93 }, { "epoch": 0.3156127588136542, "grad_norm": 2.1772873128455807, "learning_rate": 9.99965389153533e-06, "loss": 0.7405, "step": 94 }, { "epoch": 0.318970341354225, "grad_norm": 2.313731791988704, "learning_rate": 9.999384701584098e-06, "loss": 0.7605, "step": 95 }, { "epoch": 0.32232792389479575, "grad_norm": 2.66647271515439, "learning_rate": 9.999038607316942e-06, "loss": 0.8139, "step": 96 }, { "epoch": 0.32568550643536653, "grad_norm": 2.345938824538182, "learning_rate": 9.998615614057743e-06, "loss": 0.7782, "step": 97 }, { "epoch": 0.3290430889759373, "grad_norm": 2.445303562264931, "learning_rate": 9.998115728313305e-06, "loss": 0.7628, "step": 98 }, { "epoch": 0.3324006715165081, "grad_norm": 2.3792304151925974, "learning_rate": 9.997538957773248e-06, "loss": 0.773, "step": 99 }, { "epoch": 0.33575825405707893, "grad_norm": 2.3052323825950816, "learning_rate": 9.996885311309892e-06, "loss": 0.8015, "step": 100 }, { "epoch": 0.3391158365976497, "grad_norm": 2.2653536708074338, "learning_rate": 9.996154798978122e-06, "loss": 0.759, "step": 101 }, { "epoch": 0.3424734191382205, "grad_norm": 2.445188297437759, "learning_rate": 9.99534743201523e-06, "loss": 0.7828, "step": 102 }, { "epoch": 0.34583100167879127, "grad_norm": 2.1926261320850555, "learning_rate": 9.994463222840748e-06, "loss": 0.7518, "step": 103 }, { "epoch": 0.34918858421936205, "grad_norm": 2.3228023822506154, "learning_rate": 9.993502185056244e-06, "loss": 0.7541, "step": 104 }, { "epoch": 0.35254616675993283, "grad_norm": 2.361098770321157, "learning_rate": 9.992464333445134e-06, "loss": 0.6949, "step": 105 }, { "epoch": 0.3559037493005036, "grad_norm": 2.2629507834451146, "learning_rate": 9.991349683972435e-06, "loss": 0.7731, "step": 106 }, { "epoch": 0.35926133184107445, "grad_norm": 2.264400946281939, "learning_rate": 9.990158253784525e-06, "loss": 0.7746, "step": 107 }, { "epoch": 0.36261891438164523, "grad_norm": 2.536408740504836, "learning_rate": 9.988890061208889e-06, "loss": 0.757, "step": 108 }, { "epoch": 0.365976496922216, "grad_norm": 2.1481392669710657, "learning_rate": 9.987545125753818e-06, "loss": 0.8101, "step": 109 }, { "epoch": 0.3693340794627868, "grad_norm": 2.271583484189747, "learning_rate": 9.986123468108134e-06, "loss": 0.7716, "step": 110 }, { "epoch": 0.37269166200335757, "grad_norm": 2.301407093107629, "learning_rate": 9.984625110140844e-06, "loss": 0.7842, "step": 111 }, { "epoch": 0.37604924454392835, "grad_norm": 2.3089880928671045, "learning_rate": 9.983050074900824e-06, "loss": 0.7452, "step": 112 }, { "epoch": 0.3794068270844992, "grad_norm": 2.413625325654982, "learning_rate": 9.98139838661646e-06, "loss": 0.7502, "step": 113 }, { "epoch": 0.38276440962506997, "grad_norm": 2.289871304585377, "learning_rate": 9.979670070695265e-06, "loss": 0.7708, "step": 114 }, { "epoch": 0.38612199216564075, "grad_norm": 2.290612460056919, "learning_rate": 9.977865153723508e-06, "loss": 0.784, "step": 115 }, { "epoch": 0.38947957470621153, "grad_norm": 2.3027870660016725, "learning_rate": 9.97598366346578e-06, "loss": 0.7534, "step": 116 }, { "epoch": 0.3928371572467823, "grad_norm": 2.2479898856154015, "learning_rate": 9.974025628864592e-06, "loss": 0.7388, "step": 117 }, { "epoch": 0.3961947397873531, "grad_norm": 2.20503493895051, "learning_rate": 9.971991080039912e-06, "loss": 0.763, "step": 118 }, { "epoch": 0.39955232232792387, "grad_norm": 2.2879831426439807, "learning_rate": 9.969880048288704e-06, "loss": 0.8042, "step": 119 }, { "epoch": 0.4029099048684947, "grad_norm": 2.0895550497991815, "learning_rate": 9.96769256608446e-06, "loss": 0.7267, "step": 120 }, { "epoch": 0.4062674874090655, "grad_norm": 2.2118550621037047, "learning_rate": 9.965428667076687e-06, "loss": 0.7642, "step": 121 }, { "epoch": 0.40962506994963627, "grad_norm": 2.3589353890246416, "learning_rate": 9.963088386090386e-06, "loss": 0.7688, "step": 122 }, { "epoch": 0.41298265249020705, "grad_norm": 2.3245448693859805, "learning_rate": 9.960671759125529e-06, "loss": 0.7909, "step": 123 }, { "epoch": 0.4163402350307778, "grad_norm": 2.2267480190116893, "learning_rate": 9.958178823356503e-06, "loss": 0.7525, "step": 124 }, { "epoch": 0.4196978175713486, "grad_norm": 2.4295908603398364, "learning_rate": 9.95560961713153e-06, "loss": 0.8311, "step": 125 }, { "epoch": 0.42305540011191944, "grad_norm": 2.2939978579098987, "learning_rate": 9.95296417997208e-06, "loss": 0.7679, "step": 126 }, { "epoch": 0.4264129826524902, "grad_norm": 2.438507683294902, "learning_rate": 9.950242552572272e-06, "loss": 0.783, "step": 127 }, { "epoch": 0.429770565193061, "grad_norm": 2.121254822380545, "learning_rate": 9.947444776798235e-06, "loss": 0.7213, "step": 128 }, { "epoch": 0.4331281477336318, "grad_norm": 2.393441549390287, "learning_rate": 9.944570895687471e-06, "loss": 0.7832, "step": 129 }, { "epoch": 0.43648573027420257, "grad_norm": 2.354252110767644, "learning_rate": 9.941620953448195e-06, "loss": 0.7984, "step": 130 }, { "epoch": 0.43984331281477335, "grad_norm": 2.3944424356758143, "learning_rate": 9.938594995458644e-06, "loss": 0.7794, "step": 131 }, { "epoch": 0.4432008953553441, "grad_norm": 2.3095894964107093, "learning_rate": 9.935493068266396e-06, "loss": 0.7876, "step": 132 }, { "epoch": 0.44655847789591496, "grad_norm": 2.3120517699348273, "learning_rate": 9.932315219587641e-06, "loss": 0.7665, "step": 133 }, { "epoch": 0.44991606043648574, "grad_norm": 2.4246945894908554, "learning_rate": 9.929061498306448e-06, "loss": 0.7985, "step": 134 }, { "epoch": 0.4532736429770565, "grad_norm": 2.351327159028273, "learning_rate": 9.92573195447402e-06, "loss": 0.8119, "step": 135 }, { "epoch": 0.4566312255176273, "grad_norm": 2.1900583413629042, "learning_rate": 9.922326639307918e-06, "loss": 0.753, "step": 136 }, { "epoch": 0.4599888080581981, "grad_norm": 2.39557007686719, "learning_rate": 9.918845605191274e-06, "loss": 0.792, "step": 137 }, { "epoch": 0.46334639059876886, "grad_norm": 2.245343140864623, "learning_rate": 9.915288905671986e-06, "loss": 0.7924, "step": 138 }, { "epoch": 0.4667039731393397, "grad_norm": 2.095454810542127, "learning_rate": 9.911656595461899e-06, "loss": 0.7287, "step": 139 }, { "epoch": 0.4700615556799105, "grad_norm": 2.3661531662229445, "learning_rate": 9.90794873043595e-06, "loss": 0.783, "step": 140 }, { "epoch": 0.47341913822048126, "grad_norm": 2.218576799584266, "learning_rate": 9.904165367631329e-06, "loss": 0.7682, "step": 141 }, { "epoch": 0.47677672076105204, "grad_norm": 2.2752695851092777, "learning_rate": 9.900306565246579e-06, "loss": 0.757, "step": 142 }, { "epoch": 0.4801343033016228, "grad_norm": 2.201550744224962, "learning_rate": 9.896372382640718e-06, "loss": 0.7691, "step": 143 }, { "epoch": 0.4834918858421936, "grad_norm": 2.1043983734924185, "learning_rate": 9.892362880332316e-06, "loss": 0.7383, "step": 144 }, { "epoch": 0.4868494683827644, "grad_norm": 2.3833966355515153, "learning_rate": 9.888278119998573e-06, "loss": 0.807, "step": 145 }, { "epoch": 0.4902070509233352, "grad_norm": 2.0943822596303674, "learning_rate": 9.884118164474359e-06, "loss": 0.7899, "step": 146 }, { "epoch": 0.493564633463906, "grad_norm": 2.0793762196537524, "learning_rate": 9.879883077751255e-06, "loss": 0.7425, "step": 147 }, { "epoch": 0.4969222160044768, "grad_norm": 2.2384449240578377, "learning_rate": 9.875572924976568e-06, "loss": 0.7857, "step": 148 }, { "epoch": 0.5002797985450476, "grad_norm": 2.0645202722027642, "learning_rate": 9.871187772452327e-06, "loss": 0.7932, "step": 149 }, { "epoch": 0.5036373810856184, "grad_norm": 2.2655225270221377, "learning_rate": 9.866727687634266e-06, "loss": 0.7613, "step": 150 }, { "epoch": 0.5069949636261891, "grad_norm": 2.139175314801808, "learning_rate": 9.86219273913078e-06, "loss": 0.7728, "step": 151 }, { "epoch": 0.51035254616676, "grad_norm": 2.2148970746168932, "learning_rate": 9.857582996701878e-06, "loss": 0.7613, "step": 152 }, { "epoch": 0.5137101287073307, "grad_norm": 2.3638044057711065, "learning_rate": 9.852898531258102e-06, "loss": 0.7538, "step": 153 }, { "epoch": 0.5170677112479015, "grad_norm": 2.16926286809674, "learning_rate": 9.848139414859441e-06, "loss": 0.7518, "step": 154 }, { "epoch": 0.5204252937884724, "grad_norm": 2.264174308627129, "learning_rate": 9.843305720714227e-06, "loss": 0.758, "step": 155 }, { "epoch": 0.5237828763290431, "grad_norm": 2.1658356317455403, "learning_rate": 9.838397523177993e-06, "loss": 0.7508, "step": 156 }, { "epoch": 0.5271404588696139, "grad_norm": 2.2969058345093116, "learning_rate": 9.833414897752346e-06, "loss": 0.7595, "step": 157 }, { "epoch": 0.5304980414101846, "grad_norm": 2.180966154918318, "learning_rate": 9.828357921083803e-06, "loss": 0.7734, "step": 158 }, { "epoch": 0.5338556239507555, "grad_norm": 2.236205762743028, "learning_rate": 9.823226670962598e-06, "loss": 0.821, "step": 159 }, { "epoch": 0.5372132064913262, "grad_norm": 2.4457753187441837, "learning_rate": 9.818021226321502e-06, "loss": 0.8161, "step": 160 }, { "epoch": 0.540570789031897, "grad_norm": 2.1706687105611313, "learning_rate": 9.812741667234599e-06, "loss": 0.7693, "step": 161 }, { "epoch": 0.5439283715724679, "grad_norm": 2.1712893591002045, "learning_rate": 9.807388074916064e-06, "loss": 0.759, "step": 162 }, { "epoch": 0.5472859541130386, "grad_norm": 2.1454942490466675, "learning_rate": 9.801960531718898e-06, "loss": 0.7605, "step": 163 }, { "epoch": 0.5506435366536094, "grad_norm": 2.22853836765068, "learning_rate": 9.796459121133675e-06, "loss": 0.8167, "step": 164 }, { "epoch": 0.5540011191941802, "grad_norm": 2.147190276259434, "learning_rate": 9.790883927787254e-06, "loss": 0.7771, "step": 165 }, { "epoch": 0.557358701734751, "grad_norm": 2.177712396336002, "learning_rate": 9.785235037441473e-06, "loss": 0.7749, "step": 166 }, { "epoch": 0.5607162842753217, "grad_norm": 2.2764867484419353, "learning_rate": 9.779512536991839e-06, "loss": 0.7186, "step": 167 }, { "epoch": 0.5640738668158926, "grad_norm": 2.0316602958146177, "learning_rate": 9.773716514466179e-06, "loss": 0.7092, "step": 168 }, { "epoch": 0.5674314493564634, "grad_norm": 2.335399900268128, "learning_rate": 9.767847059023292e-06, "loss": 0.7561, "step": 169 }, { "epoch": 0.5707890318970341, "grad_norm": 2.1709764537143945, "learning_rate": 9.761904260951583e-06, "loss": 0.7802, "step": 170 }, { "epoch": 0.574146614437605, "grad_norm": 2.0593129146431512, "learning_rate": 9.755888211667663e-06, "loss": 0.7301, "step": 171 }, { "epoch": 0.5775041969781757, "grad_norm": 2.441441500782324, "learning_rate": 9.749799003714954e-06, "loss": 0.7799, "step": 172 }, { "epoch": 0.5808617795187465, "grad_norm": 2.2660299178551773, "learning_rate": 9.743636730762259e-06, "loss": 0.7827, "step": 173 }, { "epoch": 0.5842193620593172, "grad_norm": 2.1765013932906396, "learning_rate": 9.737401487602314e-06, "loss": 0.7267, "step": 174 }, { "epoch": 0.5875769445998881, "grad_norm": 2.3327142301922956, "learning_rate": 9.731093370150349e-06, "loss": 0.7456, "step": 175 }, { "epoch": 0.5909345271404589, "grad_norm": 2.1506465746934973, "learning_rate": 9.724712475442597e-06, "loss": 0.7703, "step": 176 }, { "epoch": 0.5942921096810296, "grad_norm": 2.356569475164915, "learning_rate": 9.718258901634802e-06, "loss": 0.7102, "step": 177 }, { "epoch": 0.5976496922216005, "grad_norm": 2.232872599999044, "learning_rate": 9.71173274800072e-06, "loss": 0.7432, "step": 178 }, { "epoch": 0.6010072747621712, "grad_norm": 2.2094954984314996, "learning_rate": 9.70513411493058e-06, "loss": 0.7298, "step": 179 }, { "epoch": 0.604364857302742, "grad_norm": 2.247936712152706, "learning_rate": 9.698463103929542e-06, "loss": 0.7627, "step": 180 }, { "epoch": 0.6077224398433129, "grad_norm": 2.178442031708872, "learning_rate": 9.691719817616148e-06, "loss": 0.747, "step": 181 }, { "epoch": 0.6110800223838836, "grad_norm": 1.963354472091314, "learning_rate": 9.684904359720724e-06, "loss": 0.7338, "step": 182 }, { "epoch": 0.6144376049244544, "grad_norm": 2.353759222193367, "learning_rate": 9.678016835083798e-06, "loss": 0.7535, "step": 183 }, { "epoch": 0.6177951874650252, "grad_norm": 2.1917086328236373, "learning_rate": 9.671057349654481e-06, "loss": 0.8249, "step": 184 }, { "epoch": 0.621152770005596, "grad_norm": 2.068590516399619, "learning_rate": 9.66402601048884e-06, "loss": 0.7565, "step": 185 }, { "epoch": 0.6245103525461667, "grad_norm": 2.233193447193882, "learning_rate": 9.656922925748254e-06, "loss": 0.779, "step": 186 }, { "epoch": 0.6278679350867375, "grad_norm": 1.9913300969947432, "learning_rate": 9.649748204697741e-06, "loss": 0.7111, "step": 187 }, { "epoch": 0.6312255176273084, "grad_norm": 2.0770537048721205, "learning_rate": 9.642501957704287e-06, "loss": 0.7737, "step": 188 }, { "epoch": 0.6345831001678791, "grad_norm": 2.026052626547883, "learning_rate": 9.63518429623514e-06, "loss": 0.7678, "step": 189 }, { "epoch": 0.63794068270845, "grad_norm": 2.004420216986479, "learning_rate": 9.627795332856107e-06, "loss": 0.7765, "step": 190 }, { "epoch": 0.6412982652490207, "grad_norm": 2.0936509457969374, "learning_rate": 9.620335181229805e-06, "loss": 0.7583, "step": 191 }, { "epoch": 0.6446558477895915, "grad_norm": 2.038743312113399, "learning_rate": 9.612803956113932e-06, "loss": 0.7755, "step": 192 }, { "epoch": 0.6480134303301622, "grad_norm": 2.1648346294989294, "learning_rate": 9.605201773359485e-06, "loss": 0.7125, "step": 193 }, { "epoch": 0.6513710128707331, "grad_norm": 2.023535905337207, "learning_rate": 9.59752874990899e-06, "loss": 0.72, "step": 194 }, { "epoch": 0.6547285954113039, "grad_norm": 2.0647928726363776, "learning_rate": 9.589785003794692e-06, "loss": 0.741, "step": 195 }, { "epoch": 0.6580861779518746, "grad_norm": 2.657611602927363, "learning_rate": 9.581970654136752e-06, "loss": 0.7723, "step": 196 }, { "epoch": 0.6614437604924455, "grad_norm": 2.211743855012671, "learning_rate": 9.574085821141406e-06, "loss": 0.754, "step": 197 }, { "epoch": 0.6648013430330162, "grad_norm": 2.131488016325981, "learning_rate": 9.566130626099118e-06, "loss": 0.7738, "step": 198 }, { "epoch": 0.668158925573587, "grad_norm": 2.1420712267681195, "learning_rate": 9.55810519138271e-06, "loss": 0.781, "step": 199 }, { "epoch": 0.6715165081141579, "grad_norm": 2.245825883364256, "learning_rate": 9.550009640445492e-06, "loss": 0.7606, "step": 200 }, { "epoch": 0.6748740906547286, "grad_norm": 2.027082764830745, "learning_rate": 9.541844097819347e-06, "loss": 0.7535, "step": 201 }, { "epoch": 0.6782316731952994, "grad_norm": 1.9877374251586413, "learning_rate": 9.533608689112827e-06, "loss": 0.7559, "step": 202 }, { "epoch": 0.6815892557358701, "grad_norm": 1.9927267340906514, "learning_rate": 9.525303541009218e-06, "loss": 0.6754, "step": 203 }, { "epoch": 0.684946838276441, "grad_norm": 2.092882170995953, "learning_rate": 9.516928781264588e-06, "loss": 0.7431, "step": 204 }, { "epoch": 0.6883044208170117, "grad_norm": 2.0525414950537972, "learning_rate": 9.508484538705823e-06, "loss": 0.7649, "step": 205 }, { "epoch": 0.6916620033575825, "grad_norm": 1.9668368596759938, "learning_rate": 9.499970943228646e-06, "loss": 0.7218, "step": 206 }, { "epoch": 0.6950195858981534, "grad_norm": 1.9745189748654561, "learning_rate": 9.491388125795623e-06, "loss": 0.7104, "step": 207 }, { "epoch": 0.6983771684387241, "grad_norm": 2.201042855482161, "learning_rate": 9.482736218434144e-06, "loss": 0.7477, "step": 208 }, { "epoch": 0.7017347509792949, "grad_norm": 2.0154595111164895, "learning_rate": 9.474015354234385e-06, "loss": 0.7587, "step": 209 }, { "epoch": 0.7050923335198657, "grad_norm": 2.313239702363474, "learning_rate": 9.465225667347275e-06, "loss": 0.7292, "step": 210 }, { "epoch": 0.7084499160604365, "grad_norm": 2.025272477121896, "learning_rate": 9.45636729298243e-06, "loss": 0.7241, "step": 211 }, { "epoch": 0.7118074986010072, "grad_norm": 2.0795424496465573, "learning_rate": 9.447440367406053e-06, "loss": 0.7458, "step": 212 }, { "epoch": 0.7151650811415781, "grad_norm": 2.320432869755114, "learning_rate": 9.438445027938873e-06, "loss": 0.76, "step": 213 }, { "epoch": 0.7185226636821489, "grad_norm": 2.1418660928160875, "learning_rate": 9.429381412954e-06, "loss": 0.7463, "step": 214 }, { "epoch": 0.7218802462227196, "grad_norm": 2.037263699160382, "learning_rate": 9.420249661874812e-06, "loss": 0.7562, "step": 215 }, { "epoch": 0.7252378287632905, "grad_norm": 2.0664873609895844, "learning_rate": 9.41104991517281e-06, "loss": 0.719, "step": 216 }, { "epoch": 0.7285954113038612, "grad_norm": 2.13157533259705, "learning_rate": 9.401782314365458e-06, "loss": 0.7611, "step": 217 }, { "epoch": 0.731952993844432, "grad_norm": 1.9677915863746518, "learning_rate": 9.392447002013996e-06, "loss": 0.7241, "step": 218 }, { "epoch": 0.7353105763850027, "grad_norm": 2.0599615713607466, "learning_rate": 9.383044121721257e-06, "loss": 0.7413, "step": 219 }, { "epoch": 0.7386681589255736, "grad_norm": 2.0298587251271107, "learning_rate": 9.37357381812946e-06, "loss": 0.7479, "step": 220 }, { "epoch": 0.7420257414661444, "grad_norm": 1.9944542234012115, "learning_rate": 9.364036236917972e-06, "loss": 0.6834, "step": 221 }, { "epoch": 0.7453833240067151, "grad_norm": 2.0288217614081265, "learning_rate": 9.354431524801082e-06, "loss": 0.7512, "step": 222 }, { "epoch": 0.748740906547286, "grad_norm": 2.0374680319858514, "learning_rate": 9.344759829525734e-06, "loss": 0.7138, "step": 223 }, { "epoch": 0.7520984890878567, "grad_norm": 1.911969221157047, "learning_rate": 9.335021299869256e-06, "loss": 0.7382, "step": 224 }, { "epoch": 0.7554560716284275, "grad_norm": 1.9835399597591075, "learning_rate": 9.32521608563708e-06, "loss": 0.7436, "step": 225 }, { "epoch": 0.7588136541689984, "grad_norm": 2.0847726685364045, "learning_rate": 9.315344337660422e-06, "loss": 0.7364, "step": 226 }, { "epoch": 0.7621712367095691, "grad_norm": 2.0415670551670426, "learning_rate": 9.305406207793974e-06, "loss": 0.7225, "step": 227 }, { "epoch": 0.7655288192501399, "grad_norm": 1.987105167453389, "learning_rate": 9.295401848913569e-06, "loss": 0.7458, "step": 228 }, { "epoch": 0.7688864017907107, "grad_norm": 1.9316452047409314, "learning_rate": 9.285331414913816e-06, "loss": 0.6967, "step": 229 }, { "epoch": 0.7722439843312815, "grad_norm": 1.9874288288055189, "learning_rate": 9.275195060705749e-06, "loss": 0.7501, "step": 230 }, { "epoch": 0.7756015668718522, "grad_norm": 2.0963531264104533, "learning_rate": 9.264992942214427e-06, "loss": 0.7236, "step": 231 }, { "epoch": 0.7789591494124231, "grad_norm": 2.0961978727374397, "learning_rate": 9.254725216376562e-06, "loss": 0.7666, "step": 232 }, { "epoch": 0.7823167319529939, "grad_norm": 1.981574993630262, "learning_rate": 9.244392041138068e-06, "loss": 0.7449, "step": 233 }, { "epoch": 0.7856743144935646, "grad_norm": 1.907742433400848, "learning_rate": 9.233993575451663e-06, "loss": 0.7052, "step": 234 }, { "epoch": 0.7890318970341355, "grad_norm": 2.0188171238038426, "learning_rate": 9.223529979274411e-06, "loss": 0.7166, "step": 235 }, { "epoch": 0.7923894795747062, "grad_norm": 2.047444273717583, "learning_rate": 9.213001413565259e-06, "loss": 0.7614, "step": 236 }, { "epoch": 0.795747062115277, "grad_norm": 2.0237204195979603, "learning_rate": 9.202408040282567e-06, "loss": 0.7407, "step": 237 }, { "epoch": 0.7991046446558477, "grad_norm": 2.132189489227427, "learning_rate": 9.191750022381613e-06, "loss": 0.76, "step": 238 }, { "epoch": 0.8024622271964186, "grad_norm": 1.9452576363735405, "learning_rate": 9.181027523812088e-06, "loss": 0.6906, "step": 239 }, { "epoch": 0.8058198097369894, "grad_norm": 2.0526086557604204, "learning_rate": 9.170240709515573e-06, "loss": 0.7492, "step": 240 }, { "epoch": 0.8091773922775601, "grad_norm": 2.127542984513071, "learning_rate": 9.159389745423003e-06, "loss": 0.753, "step": 241 }, { "epoch": 0.812534974818131, "grad_norm": 2.190600637823083, "learning_rate": 9.14847479845211e-06, "loss": 0.7687, "step": 242 }, { "epoch": 0.8158925573587017, "grad_norm": 1.976664413924339, "learning_rate": 9.137496036504868e-06, "loss": 0.7236, "step": 243 }, { "epoch": 0.8192501398992725, "grad_norm": 2.0073069621610813, "learning_rate": 9.126453628464889e-06, "loss": 0.7513, "step": 244 }, { "epoch": 0.8226077224398433, "grad_norm": 2.0275714232847366, "learning_rate": 9.115347744194844e-06, "loss": 0.7117, "step": 245 }, { "epoch": 0.8259653049804141, "grad_norm": 1.950051870826525, "learning_rate": 9.10417855453385e-06, "loss": 0.7421, "step": 246 }, { "epoch": 0.8293228875209849, "grad_norm": 1.9776365232195794, "learning_rate": 9.09294623129482e-06, "loss": 0.7683, "step": 247 }, { "epoch": 0.8326804700615557, "grad_norm": 1.9425183877927914, "learning_rate": 9.081650947261847e-06, "loss": 0.7454, "step": 248 }, { "epoch": 0.8360380526021265, "grad_norm": 2.0588781735343926, "learning_rate": 9.070292876187532e-06, "loss": 0.7511, "step": 249 }, { "epoch": 0.8393956351426972, "grad_norm": 2.1208079935750734, "learning_rate": 9.058872192790314e-06, "loss": 0.7594, "step": 250 }, { "epoch": 0.842753217683268, "grad_norm": 1.960725561854021, "learning_rate": 9.047389072751777e-06, "loss": 0.7164, "step": 251 }, { "epoch": 0.8461108002238389, "grad_norm": 2.073474220676331, "learning_rate": 9.035843692713961e-06, "loss": 0.7256, "step": 252 }, { "epoch": 0.8494683827644096, "grad_norm": 2.0957908157579137, "learning_rate": 9.02423623027663e-06, "loss": 0.7307, "step": 253 }, { "epoch": 0.8528259653049804, "grad_norm": 2.152126139365395, "learning_rate": 9.012566863994548e-06, "loss": 0.7434, "step": 254 }, { "epoch": 0.8561835478455512, "grad_norm": 2.024800247472397, "learning_rate": 9.000835773374733e-06, "loss": 0.7454, "step": 255 }, { "epoch": 0.859541130386122, "grad_norm": 1.9021347801188042, "learning_rate": 8.98904313887369e-06, "loss": 0.7057, "step": 256 }, { "epoch": 0.8628987129266927, "grad_norm": 2.1498786642675607, "learning_rate": 8.977189141894645e-06, "loss": 0.7711, "step": 257 }, { "epoch": 0.8662562954672636, "grad_norm": 1.92161399371383, "learning_rate": 8.965273964784735e-06, "loss": 0.6948, "step": 258 }, { "epoch": 0.8696138780078344, "grad_norm": 2.064988715065064, "learning_rate": 8.953297790832231e-06, "loss": 0.7545, "step": 259 }, { "epoch": 0.8729714605484051, "grad_norm": 2.209448102356102, "learning_rate": 8.941260804263697e-06, "loss": 0.7427, "step": 260 }, { "epoch": 0.876329043088976, "grad_norm": 2.0663786056657165, "learning_rate": 8.929163190241157e-06, "loss": 0.7129, "step": 261 }, { "epoch": 0.8796866256295467, "grad_norm": 1.9398344638285674, "learning_rate": 8.917005134859263e-06, "loss": 0.6766, "step": 262 }, { "epoch": 0.8830442081701175, "grad_norm": 2.1841210939318914, "learning_rate": 8.904786825142416e-06, "loss": 0.7312, "step": 263 }, { "epoch": 0.8864017907106883, "grad_norm": 2.077339438871376, "learning_rate": 8.892508449041893e-06, "loss": 0.752, "step": 264 }, { "epoch": 0.8897593732512591, "grad_norm": 2.0959324603771123, "learning_rate": 8.88017019543296e-06, "loss": 0.741, "step": 265 }, { "epoch": 0.8931169557918299, "grad_norm": 2.0026594953656427, "learning_rate": 8.867772254111966e-06, "loss": 0.7121, "step": 266 }, { "epoch": 0.8964745383324007, "grad_norm": 1.9765771238302214, "learning_rate": 8.85531481579342e-06, "loss": 0.7259, "step": 267 }, { "epoch": 0.8998321208729715, "grad_norm": 2.081056505352585, "learning_rate": 8.842798072107055e-06, "loss": 0.8211, "step": 268 }, { "epoch": 0.9031897034135422, "grad_norm": 2.0635888708323313, "learning_rate": 8.83022221559489e-06, "loss": 0.7545, "step": 269 }, { "epoch": 0.906547285954113, "grad_norm": 1.9645226085630179, "learning_rate": 8.81758743970826e-06, "loss": 0.7097, "step": 270 }, { "epoch": 0.9099048684946839, "grad_norm": 2.090055548313761, "learning_rate": 8.804893938804839e-06, "loss": 0.7085, "step": 271 }, { "epoch": 0.9132624510352546, "grad_norm": 1.9631224971590835, "learning_rate": 8.79214190814566e-06, "loss": 0.749, "step": 272 }, { "epoch": 0.9166200335758254, "grad_norm": 2.151696624831223, "learning_rate": 8.779331543892097e-06, "loss": 0.7437, "step": 273 }, { "epoch": 0.9199776161163962, "grad_norm": 2.0673655977509533, "learning_rate": 8.766463043102864e-06, "loss": 0.7405, "step": 274 }, { "epoch": 0.923335198656967, "grad_norm": 1.915638224161175, "learning_rate": 8.75353660373097e-06, "loss": 0.7099, "step": 275 }, { "epoch": 0.9266927811975377, "grad_norm": 2.143796983074377, "learning_rate": 8.740552424620679e-06, "loss": 0.6971, "step": 276 }, { "epoch": 0.9300503637381086, "grad_norm": 2.057152351652913, "learning_rate": 8.727510705504453e-06, "loss": 0.7293, "step": 277 }, { "epoch": 0.9334079462786794, "grad_norm": 2.1240051292960116, "learning_rate": 8.714411646999878e-06, "loss": 0.7741, "step": 278 }, { "epoch": 0.9367655288192501, "grad_norm": 2.0326991553451945, "learning_rate": 8.701255450606579e-06, "loss": 0.7643, "step": 279 }, { "epoch": 0.940123111359821, "grad_norm": 2.088415602611133, "learning_rate": 8.688042318703111e-06, "loss": 0.7464, "step": 280 }, { "epoch": 0.9434806939003917, "grad_norm": 2.1711465158973393, "learning_rate": 8.674772454543869e-06, "loss": 0.7103, "step": 281 }, { "epoch": 0.9468382764409625, "grad_norm": 1.9500315142474602, "learning_rate": 8.661446062255931e-06, "loss": 0.6947, "step": 282 }, { "epoch": 0.9501958589815332, "grad_norm": 1.846901032667399, "learning_rate": 8.648063346835943e-06, "loss": 0.726, "step": 283 }, { "epoch": 0.9535534415221041, "grad_norm": 2.078490317740966, "learning_rate": 8.634624514146954e-06, "loss": 0.7353, "step": 284 }, { "epoch": 0.9569110240626749, "grad_norm": 2.02753516538753, "learning_rate": 8.621129770915248e-06, "loss": 0.7712, "step": 285 }, { "epoch": 0.9602686066032456, "grad_norm": 1.9463320169189116, "learning_rate": 8.607579324727175e-06, "loss": 0.7472, "step": 286 }, { "epoch": 0.9636261891438165, "grad_norm": 1.9444246710199546, "learning_rate": 8.59397338402594e-06, "loss": 0.7502, "step": 287 }, { "epoch": 0.9669837716843872, "grad_norm": 1.9590062622910647, "learning_rate": 8.580312158108413e-06, "loss": 0.7464, "step": 288 }, { "epoch": 0.970341354224958, "grad_norm": 1.956148304515578, "learning_rate": 8.566595857121902e-06, "loss": 0.7099, "step": 289 }, { "epoch": 0.9736989367655288, "grad_norm": 1.8926398800628126, "learning_rate": 8.55282469206092e-06, "loss": 0.7299, "step": 290 }, { "epoch": 0.9770565193060996, "grad_norm": 2.0917641075340123, "learning_rate": 8.538998874763942e-06, "loss": 0.7639, "step": 291 }, { "epoch": 0.9804141018466704, "grad_norm": 1.9718732131481036, "learning_rate": 8.525118617910144e-06, "loss": 0.7547, "step": 292 }, { "epoch": 0.9837716843872412, "grad_norm": 1.9069388930319855, "learning_rate": 8.511184135016134e-06, "loss": 0.7309, "step": 293 }, { "epoch": 0.987129266927812, "grad_norm": 2.014949861976609, "learning_rate": 8.497195640432664e-06, "loss": 0.7261, "step": 294 }, { "epoch": 0.9904868494683827, "grad_norm": 1.9120513950060911, "learning_rate": 8.483153349341336e-06, "loss": 0.7166, "step": 295 }, { "epoch": 0.9938444320089536, "grad_norm": 1.8053701743199972, "learning_rate": 8.46905747775129e-06, "loss": 0.6816, "step": 296 }, { "epoch": 0.9972020145495244, "grad_norm": 2.003732103494112, "learning_rate": 8.45490824249588e-06, "loss": 0.7147, "step": 297 }, { "epoch": 1.0033575825405707, "grad_norm": 8.303350987198142, "learning_rate": 8.440705861229344e-06, "loss": 1.2836, "step": 298 }, { "epoch": 1.0067151650811417, "grad_norm": 2.2471820576545714, "learning_rate": 8.426450552423451e-06, "loss": 0.5247, "step": 299 }, { "epoch": 1.0100727476217124, "grad_norm": 2.151639519325533, "learning_rate": 8.412142535364139e-06, "loss": 0.5023, "step": 300 }, { "epoch": 1.0134303301622831, "grad_norm": 2.045570754833388, "learning_rate": 8.397782030148147e-06, "loss": 0.5212, "step": 301 }, { "epoch": 1.0167879127028538, "grad_norm": 2.157217329180836, "learning_rate": 8.383369257679625e-06, "loss": 0.5258, "step": 302 }, { "epoch": 1.0201454952434248, "grad_norm": 2.1393611084312547, "learning_rate": 8.368904439666739e-06, "loss": 0.4882, "step": 303 }, { "epoch": 1.0235030777839955, "grad_norm": 2.4819644900313738, "learning_rate": 8.354387798618254e-06, "loss": 0.5222, "step": 304 }, { "epoch": 1.0268606603245662, "grad_norm": 2.5039988079708904, "learning_rate": 8.339819557840124e-06, "loss": 0.4725, "step": 305 }, { "epoch": 1.0302182428651372, "grad_norm": 2.4034784239124063, "learning_rate": 8.32519994143204e-06, "loss": 0.5223, "step": 306 }, { "epoch": 1.033575825405708, "grad_norm": 2.388530410162058, "learning_rate": 8.310529174284004e-06, "loss": 0.5291, "step": 307 }, { "epoch": 1.0369334079462786, "grad_norm": 2.3271329489823613, "learning_rate": 8.295807482072842e-06, "loss": 0.5197, "step": 308 }, { "epoch": 1.0402909904868494, "grad_norm": 2.1361835792169432, "learning_rate": 8.281035091258762e-06, "loss": 0.4758, "step": 309 }, { "epoch": 1.0436485730274203, "grad_norm": 2.2920827998361784, "learning_rate": 8.266212229081846e-06, "loss": 0.4927, "step": 310 }, { "epoch": 1.047006155567991, "grad_norm": 2.296117198784697, "learning_rate": 8.251339123558573e-06, "loss": 0.4897, "step": 311 }, { "epoch": 1.0503637381085618, "grad_norm": 2.4100622360712793, "learning_rate": 8.236416003478295e-06, "loss": 0.4794, "step": 312 }, { "epoch": 1.0537213206491327, "grad_norm": 2.3131605280324865, "learning_rate": 8.221443098399733e-06, "loss": 0.4872, "step": 313 }, { "epoch": 1.0570789031897034, "grad_norm": 2.236522744276815, "learning_rate": 8.206420638647433e-06, "loss": 0.4945, "step": 314 }, { "epoch": 1.0604364857302742, "grad_norm": 2.2019268672438286, "learning_rate": 8.191348855308229e-06, "loss": 0.4766, "step": 315 }, { "epoch": 1.063794068270845, "grad_norm": 2.1588616911267002, "learning_rate": 8.176227980227693e-06, "loss": 0.4723, "step": 316 }, { "epoch": 1.0671516508114158, "grad_norm": 2.3900841792146577, "learning_rate": 8.161058246006558e-06, "loss": 0.5207, "step": 317 }, { "epoch": 1.0705092333519866, "grad_norm": 2.3583948513012394, "learning_rate": 8.145839885997146e-06, "loss": 0.4906, "step": 318 }, { "epoch": 1.0738668158925573, "grad_norm": 2.3746826909440064, "learning_rate": 8.130573134299782e-06, "loss": 0.4918, "step": 319 }, { "epoch": 1.0772243984331282, "grad_norm": 2.002312382387202, "learning_rate": 8.11525822575918e-06, "loss": 0.4515, "step": 320 }, { "epoch": 1.080581980973699, "grad_norm": 2.2773099709125497, "learning_rate": 8.099895395960847e-06, "loss": 0.5124, "step": 321 }, { "epoch": 1.0839395635142697, "grad_norm": 1.9626004596776638, "learning_rate": 8.084484881227449e-06, "loss": 0.4867, "step": 322 }, { "epoch": 1.0872971460548406, "grad_norm": 2.2075102999748317, "learning_rate": 8.069026918615173e-06, "loss": 0.4901, "step": 323 }, { "epoch": 1.0906547285954113, "grad_norm": 2.1111018515847793, "learning_rate": 8.05352174591009e-06, "loss": 0.5072, "step": 324 }, { "epoch": 1.094012311135982, "grad_norm": 2.343955652192936, "learning_rate": 8.037969601624495e-06, "loss": 0.5104, "step": 325 }, { "epoch": 1.0973698936765528, "grad_norm": 2.0222492524048734, "learning_rate": 8.022370724993229e-06, "loss": 0.4585, "step": 326 }, { "epoch": 1.1007274762171237, "grad_norm": 2.3170486820891316, "learning_rate": 8.006725355970008e-06, "loss": 0.4979, "step": 327 }, { "epoch": 1.1040850587576945, "grad_norm": 2.2109613576414646, "learning_rate": 7.99103373522373e-06, "loss": 0.4929, "step": 328 }, { "epoch": 1.1074426412982652, "grad_norm": 2.052769278396322, "learning_rate": 7.975296104134768e-06, "loss": 0.4891, "step": 329 }, { "epoch": 1.1108002238388361, "grad_norm": 2.197495831185824, "learning_rate": 7.959512704791269e-06, "loss": 0.4957, "step": 330 }, { "epoch": 1.1141578063794069, "grad_norm": 2.113288752249262, "learning_rate": 7.943683779985412e-06, "loss": 0.4891, "step": 331 }, { "epoch": 1.1175153889199776, "grad_norm": 2.3150223677191604, "learning_rate": 7.927809573209691e-06, "loss": 0.4667, "step": 332 }, { "epoch": 1.1208729714605483, "grad_norm": 2.066863441298375, "learning_rate": 7.911890328653156e-06, "loss": 0.4485, "step": 333 }, { "epoch": 1.1242305540011193, "grad_norm": 2.1565376459504977, "learning_rate": 7.895926291197667e-06, "loss": 0.4817, "step": 334 }, { "epoch": 1.12758813654169, "grad_norm": 2.3475799553693038, "learning_rate": 7.87991770641412e-06, "loss": 0.5305, "step": 335 }, { "epoch": 1.1309457190822607, "grad_norm": 2.1203359960845116, "learning_rate": 7.863864820558669e-06, "loss": 0.5083, "step": 336 }, { "epoch": 1.1343033016228317, "grad_norm": 2.081087965565155, "learning_rate": 7.847767880568944e-06, "loss": 0.4588, "step": 337 }, { "epoch": 1.1376608841634024, "grad_norm": 2.081318262101045, "learning_rate": 7.831627134060249e-06, "loss": 0.4846, "step": 338 }, { "epoch": 1.141018466703973, "grad_norm": 2.209767183684003, "learning_rate": 7.815442829321754e-06, "loss": 0.483, "step": 339 }, { "epoch": 1.1443760492445438, "grad_norm": 2.011880925223341, "learning_rate": 7.799215215312667e-06, "loss": 0.4579, "step": 340 }, { "epoch": 1.1477336317851148, "grad_norm": 2.0797725312115203, "learning_rate": 7.782944541658423e-06, "loss": 0.5117, "step": 341 }, { "epoch": 1.1510912143256855, "grad_norm": 1.9959817767175392, "learning_rate": 7.766631058646826e-06, "loss": 0.4622, "step": 342 }, { "epoch": 1.1544487968662562, "grad_norm": 2.2735006851582202, "learning_rate": 7.750275017224208e-06, "loss": 0.4724, "step": 343 }, { "epoch": 1.1578063794068272, "grad_norm": 2.0970076738982044, "learning_rate": 7.733876668991565e-06, "loss": 0.4924, "step": 344 }, { "epoch": 1.161163961947398, "grad_norm": 2.2352119607145307, "learning_rate": 7.71743626620069e-06, "loss": 0.517, "step": 345 }, { "epoch": 1.1645215444879686, "grad_norm": 2.302631355787852, "learning_rate": 7.700954061750295e-06, "loss": 0.487, "step": 346 }, { "epoch": 1.1678791270285394, "grad_norm": 2.1221219186208966, "learning_rate": 7.684430309182106e-06, "loss": 0.4709, "step": 347 }, { "epoch": 1.1712367095691103, "grad_norm": 2.101482815450197, "learning_rate": 7.667865262676981e-06, "loss": 0.489, "step": 348 }, { "epoch": 1.174594292109681, "grad_norm": 2.105161148484614, "learning_rate": 7.651259177050996e-06, "loss": 0.5033, "step": 349 }, { "epoch": 1.1779518746502518, "grad_norm": 1.957600802485906, "learning_rate": 7.634612307751513e-06, "loss": 0.45, "step": 350 }, { "epoch": 1.1813094571908227, "grad_norm": 2.2812310534697775, "learning_rate": 7.617924910853266e-06, "loss": 0.5108, "step": 351 }, { "epoch": 1.1846670397313934, "grad_norm": 2.2141528970819246, "learning_rate": 7.601197243054411e-06, "loss": 0.4998, "step": 352 }, { "epoch": 1.1880246222719641, "grad_norm": 2.1186842901832033, "learning_rate": 7.584429561672586e-06, "loss": 0.4822, "step": 353 }, { "epoch": 1.1913822048125349, "grad_norm": 2.2241188415508857, "learning_rate": 7.567622124640942e-06, "loss": 0.4824, "step": 354 }, { "epoch": 1.1947397873531058, "grad_norm": 2.2126573138667402, "learning_rate": 7.5507751905041885e-06, "loss": 0.5051, "step": 355 }, { "epoch": 1.1980973698936765, "grad_norm": 2.175213998538769, "learning_rate": 7.533889018414602e-06, "loss": 0.4909, "step": 356 }, { "epoch": 1.2014549524342473, "grad_norm": 2.1682912210779732, "learning_rate": 7.516963868128054e-06, "loss": 0.4975, "step": 357 }, { "epoch": 1.2048125349748182, "grad_norm": 2.183195934735365, "learning_rate": 7.500000000000001e-06, "loss": 0.5057, "step": 358 }, { "epoch": 1.208170117515389, "grad_norm": 2.1348501168594525, "learning_rate": 7.4829976749814935e-06, "loss": 0.4958, "step": 359 }, { "epoch": 1.2115277000559597, "grad_norm": 2.45122587491375, "learning_rate": 7.46595715461515e-06, "loss": 0.51, "step": 360 }, { "epoch": 1.2148852825965304, "grad_norm": 2.273341521941601, "learning_rate": 7.4488787010311425e-06, "loss": 0.4986, "step": 361 }, { "epoch": 1.2182428651371013, "grad_norm": 2.1265214648425808, "learning_rate": 7.431762576943157e-06, "loss": 0.5224, "step": 362 }, { "epoch": 1.221600447677672, "grad_norm": 2.207039996610266, "learning_rate": 7.414609045644356e-06, "loss": 0.5036, "step": 363 }, { "epoch": 1.2249580302182428, "grad_norm": 2.2511356341290045, "learning_rate": 7.3974183710033334e-06, "loss": 0.4985, "step": 364 }, { "epoch": 1.2283156127588137, "grad_norm": 2.2414220008243655, "learning_rate": 7.38019081746004e-06, "loss": 0.4826, "step": 365 }, { "epoch": 1.2316731952993845, "grad_norm": 2.2345724571038583, "learning_rate": 7.362926650021736e-06, "loss": 0.4734, "step": 366 }, { "epoch": 1.2350307778399552, "grad_norm": 2.020267975703017, "learning_rate": 7.345626134258897e-06, "loss": 0.4707, "step": 367 }, { "epoch": 1.238388360380526, "grad_norm": 2.017700711508486, "learning_rate": 7.3282895363011405e-06, "loss": 0.4429, "step": 368 }, { "epoch": 1.2417459429210969, "grad_norm": 2.4075032884901484, "learning_rate": 7.310917122833127e-06, "loss": 0.5123, "step": 369 }, { "epoch": 1.2451035254616676, "grad_norm": 2.0744065650842924, "learning_rate": 7.293509161090453e-06, "loss": 0.4868, "step": 370 }, { "epoch": 1.2484611080022383, "grad_norm": 2.1398898292409654, "learning_rate": 7.276065918855554e-06, "loss": 0.4917, "step": 371 }, { "epoch": 1.2518186905428093, "grad_norm": 2.1639714949391915, "learning_rate": 7.2585876644535705e-06, "loss": 0.4957, "step": 372 }, { "epoch": 1.25517627308338, "grad_norm": 2.3088349879154446, "learning_rate": 7.241074666748228e-06, "loss": 0.5311, "step": 373 }, { "epoch": 1.2585338556239507, "grad_norm": 2.20877128750944, "learning_rate": 7.2235271951377005e-06, "loss": 0.5217, "step": 374 }, { "epoch": 1.2618914381645214, "grad_norm": 2.079999331915844, "learning_rate": 7.205945519550467e-06, "loss": 0.4972, "step": 375 }, { "epoch": 1.2652490207050924, "grad_norm": 2.1653370386720643, "learning_rate": 7.188329910441154e-06, "loss": 0.4715, "step": 376 }, { "epoch": 1.268606603245663, "grad_norm": 2.2020260358728083, "learning_rate": 7.170680638786383e-06, "loss": 0.4841, "step": 377 }, { "epoch": 1.271964185786234, "grad_norm": 2.313938671583983, "learning_rate": 7.1529979760805946e-06, "loss": 0.5132, "step": 378 }, { "epoch": 1.2753217683268048, "grad_norm": 2.3187218424786433, "learning_rate": 7.135282194331881e-06, "loss": 0.4916, "step": 379 }, { "epoch": 1.2786793508673755, "grad_norm": 2.381365131626571, "learning_rate": 7.1175335660577906e-06, "loss": 0.4985, "step": 380 }, { "epoch": 1.2820369334079462, "grad_norm": 2.188895335954739, "learning_rate": 7.099752364281147e-06, "loss": 0.4985, "step": 381 }, { "epoch": 1.285394515948517, "grad_norm": 2.2491929362704863, "learning_rate": 7.0819388625258385e-06, "loss": 0.4648, "step": 382 }, { "epoch": 1.288752098489088, "grad_norm": 2.302063139422305, "learning_rate": 7.0640933348126235e-06, "loss": 0.5151, "step": 383 }, { "epoch": 1.2921096810296586, "grad_norm": 2.2033193942118423, "learning_rate": 7.046216055654902e-06, "loss": 0.4853, "step": 384 }, { "epoch": 1.2954672635702296, "grad_norm": 2.2781079428025364, "learning_rate": 7.028307300054499e-06, "loss": 0.5407, "step": 385 }, { "epoch": 1.2988248461108003, "grad_norm": 2.4113225121524366, "learning_rate": 7.0103673434974375e-06, "loss": 0.504, "step": 386 }, { "epoch": 1.302182428651371, "grad_norm": 2.1395395477301933, "learning_rate": 6.992396461949693e-06, "loss": 0.4724, "step": 387 }, { "epoch": 1.3055400111919417, "grad_norm": 2.1628168470264364, "learning_rate": 6.974394931852957e-06, "loss": 0.4901, "step": 388 }, { "epoch": 1.3088975937325125, "grad_norm": 2.0779952047374244, "learning_rate": 6.956363030120377e-06, "loss": 0.4779, "step": 389 }, { "epoch": 1.3122551762730834, "grad_norm": 2.026126139321993, "learning_rate": 6.9383010341323e-06, "loss": 0.4661, "step": 390 }, { "epoch": 1.3156127588136541, "grad_norm": 2.290741048224957, "learning_rate": 6.920209221732007e-06, "loss": 0.4814, "step": 391 }, { "epoch": 1.318970341354225, "grad_norm": 2.1365050290585423, "learning_rate": 6.902087871221439e-06, "loss": 0.498, "step": 392 }, { "epoch": 1.3223279238947958, "grad_norm": 2.2937163334382817, "learning_rate": 6.88393726135691e-06, "loss": 0.5079, "step": 393 }, { "epoch": 1.3256855064353665, "grad_norm": 2.072458351219792, "learning_rate": 6.865757671344827e-06, "loss": 0.4769, "step": 394 }, { "epoch": 1.3290430889759373, "grad_norm": 2.0989809200956544, "learning_rate": 6.8475493808373895e-06, "loss": 0.4766, "step": 395 }, { "epoch": 1.332400671516508, "grad_norm": 2.064563966968899, "learning_rate": 6.829312669928293e-06, "loss": 0.456, "step": 396 }, { "epoch": 1.335758254057079, "grad_norm": 2.1069716399584197, "learning_rate": 6.811047819148413e-06, "loss": 0.4984, "step": 397 }, { "epoch": 1.3391158365976497, "grad_norm": 2.213794419117928, "learning_rate": 6.792755109461498e-06, "loss": 0.4866, "step": 398 }, { "epoch": 1.3424734191382206, "grad_norm": 2.2174112182270522, "learning_rate": 6.7744348222598386e-06, "loss": 0.499, "step": 399 }, { "epoch": 1.3458310016787913, "grad_norm": 2.079145185573435, "learning_rate": 6.756087239359948e-06, "loss": 0.493, "step": 400 }, { "epoch": 1.349188584219362, "grad_norm": 2.1392680497000085, "learning_rate": 6.737712642998219e-06, "loss": 0.5378, "step": 401 }, { "epoch": 1.3525461667599328, "grad_norm": 2.0513971936886675, "learning_rate": 6.719311315826589e-06, "loss": 0.4714, "step": 402 }, { "epoch": 1.3559037493005035, "grad_norm": 2.2552876351142284, "learning_rate": 6.700883540908185e-06, "loss": 0.4872, "step": 403 }, { "epoch": 1.3592613318410744, "grad_norm": 2.0728376046951777, "learning_rate": 6.682429601712976e-06, "loss": 0.4799, "step": 404 }, { "epoch": 1.3626189143816452, "grad_norm": 2.0220373122992674, "learning_rate": 6.663949782113413e-06, "loss": 0.5166, "step": 405 }, { "epoch": 1.3659764969222161, "grad_norm": 2.220177493037904, "learning_rate": 6.64544436638005e-06, "loss": 0.4781, "step": 406 }, { "epoch": 1.3693340794627868, "grad_norm": 1.9793083289019477, "learning_rate": 6.626913639177189e-06, "loss": 0.4867, "step": 407 }, { "epoch": 1.3726916620033576, "grad_norm": 2.0274582078000822, "learning_rate": 6.608357885558485e-06, "loss": 0.4443, "step": 408 }, { "epoch": 1.3760492445439283, "grad_norm": 2.1847030624625883, "learning_rate": 6.589777390962575e-06, "loss": 0.5259, "step": 409 }, { "epoch": 1.3794068270844992, "grad_norm": 2.2214921474975395, "learning_rate": 6.571172441208678e-06, "loss": 0.4816, "step": 410 }, { "epoch": 1.38276440962507, "grad_norm": 2.349546029810803, "learning_rate": 6.552543322492195e-06, "loss": 0.5083, "step": 411 }, { "epoch": 1.3861219921656407, "grad_norm": 2.2544220841659315, "learning_rate": 6.53389032138032e-06, "loss": 0.5073, "step": 412 }, { "epoch": 1.3894795747062116, "grad_norm": 2.1899206248487135, "learning_rate": 6.515213724807621e-06, "loss": 0.473, "step": 413 }, { "epoch": 1.3928371572467824, "grad_norm": 2.144097777561319, "learning_rate": 6.49651382007163e-06, "loss": 0.4745, "step": 414 }, { "epoch": 1.396194739787353, "grad_norm": 2.249837918425837, "learning_rate": 6.477790894828422e-06, "loss": 0.5074, "step": 415 }, { "epoch": 1.3995523223279238, "grad_norm": 2.203568377218348, "learning_rate": 6.459045237088189e-06, "loss": 0.5182, "step": 416 }, { "epoch": 1.4029099048684948, "grad_norm": 2.207808012115493, "learning_rate": 6.440277135210815e-06, "loss": 0.4861, "step": 417 }, { "epoch": 1.4062674874090655, "grad_norm": 2.1441864682397886, "learning_rate": 6.421486877901436e-06, "loss": 0.4886, "step": 418 }, { "epoch": 1.4096250699496362, "grad_norm": 2.228627269764285, "learning_rate": 6.402674754205998e-06, "loss": 0.4773, "step": 419 }, { "epoch": 1.4129826524902072, "grad_norm": 2.1994987489882893, "learning_rate": 6.383841053506813e-06, "loss": 0.5075, "step": 420 }, { "epoch": 1.4163402350307779, "grad_norm": 2.183688476148547, "learning_rate": 6.364986065518106e-06, "loss": 0.4917, "step": 421 }, { "epoch": 1.4196978175713486, "grad_norm": 2.269619040413762, "learning_rate": 6.3461100802815625e-06, "loss": 0.4967, "step": 422 }, { "epoch": 1.4230554001119193, "grad_norm": 2.103981888475452, "learning_rate": 6.3272133881618596e-06, "loss": 0.4431, "step": 423 }, { "epoch": 1.4264129826524903, "grad_norm": 2.1258273496585156, "learning_rate": 6.308296279842204e-06, "loss": 0.5031, "step": 424 }, { "epoch": 1.429770565193061, "grad_norm": 2.196489174906513, "learning_rate": 6.289359046319862e-06, "loss": 0.4924, "step": 425 }, { "epoch": 1.4331281477336317, "grad_norm": 2.1269856468419963, "learning_rate": 6.270401978901678e-06, "loss": 0.4895, "step": 426 }, { "epoch": 1.4364857302742027, "grad_norm": 2.2126446207865587, "learning_rate": 6.2514253691996e-06, "loss": 0.5122, "step": 427 }, { "epoch": 1.4398433128147734, "grad_norm": 2.1913325147414726, "learning_rate": 6.2324295091261885e-06, "loss": 0.5283, "step": 428 }, { "epoch": 1.4432008953553441, "grad_norm": 2.091169249602068, "learning_rate": 6.213414690890125e-06, "loss": 0.4879, "step": 429 }, { "epoch": 1.4465584778959149, "grad_norm": 2.210503348980095, "learning_rate": 6.194381206991723e-06, "loss": 0.4887, "step": 430 }, { "epoch": 1.4499160604364858, "grad_norm": 2.164926411078867, "learning_rate": 6.175329350218426e-06, "loss": 0.4711, "step": 431 }, { "epoch": 1.4532736429770565, "grad_norm": 2.1482110012584203, "learning_rate": 6.156259413640302e-06, "loss": 0.462, "step": 432 }, { "epoch": 1.4566312255176272, "grad_norm": 2.294567867253737, "learning_rate": 6.1371716906055336e-06, "loss": 0.5164, "step": 433 }, { "epoch": 1.4599888080581982, "grad_norm": 2.1428555316811346, "learning_rate": 6.11806647473591e-06, "loss": 0.4756, "step": 434 }, { "epoch": 1.463346390598769, "grad_norm": 2.0267786414519313, "learning_rate": 6.098944059922311e-06, "loss": 0.477, "step": 435 }, { "epoch": 1.4667039731393396, "grad_norm": 2.14676631545224, "learning_rate": 6.079804740320181e-06, "loss": 0.4668, "step": 436 }, { "epoch": 1.4700615556799104, "grad_norm": 2.2538467654250267, "learning_rate": 6.060648810345006e-06, "loss": 0.495, "step": 437 }, { "epoch": 1.4734191382204813, "grad_norm": 2.1469963540401067, "learning_rate": 6.041476564667785e-06, "loss": 0.4824, "step": 438 }, { "epoch": 1.476776720761052, "grad_norm": 2.222380545278691, "learning_rate": 6.022288298210502e-06, "loss": 0.4753, "step": 439 }, { "epoch": 1.4801343033016228, "grad_norm": 2.1083124763573746, "learning_rate": 6.003084306141579e-06, "loss": 0.5052, "step": 440 }, { "epoch": 1.4834918858421937, "grad_norm": 2.1370436926979353, "learning_rate": 5.983864883871344e-06, "loss": 0.4789, "step": 441 }, { "epoch": 1.4868494683827644, "grad_norm": 2.3178640332647316, "learning_rate": 5.964630327047485e-06, "loss": 0.5193, "step": 442 }, { "epoch": 1.4902070509233352, "grad_norm": 2.1956778031643496, "learning_rate": 5.945380931550497e-06, "loss": 0.4849, "step": 443 }, { "epoch": 1.4935646334639059, "grad_norm": 2.1954225526435063, "learning_rate": 5.926116993489143e-06, "loss": 0.4852, "step": 444 }, { "epoch": 1.4969222160044768, "grad_norm": 2.0679488397891896, "learning_rate": 5.906838809195879e-06, "loss": 0.477, "step": 445 }, { "epoch": 1.5002797985450476, "grad_norm": 2.1914965672856375, "learning_rate": 5.887546675222319e-06, "loss": 0.4897, "step": 446 }, { "epoch": 1.5036373810856185, "grad_norm": 2.2169157191153603, "learning_rate": 5.8682408883346535e-06, "loss": 0.481, "step": 447 }, { "epoch": 1.5069949636261892, "grad_norm": 2.2332788961047636, "learning_rate": 5.848921745509094e-06, "loss": 0.5045, "step": 448 }, { "epoch": 1.51035254616676, "grad_norm": 2.116170949280568, "learning_rate": 5.829589543927305e-06, "loss": 0.4674, "step": 449 }, { "epoch": 1.5137101287073307, "grad_norm": 2.043440626400537, "learning_rate": 5.8102445809718325e-06, "loss": 0.4964, "step": 450 }, { "epoch": 1.5170677112479014, "grad_norm": 2.0944801885939204, "learning_rate": 5.790887154221521e-06, "loss": 0.4948, "step": 451 }, { "epoch": 1.5204252937884724, "grad_norm": 2.3084091670034623, "learning_rate": 5.771517561446949e-06, "loss": 0.5108, "step": 452 }, { "epoch": 1.523782876329043, "grad_norm": 2.3208214523309683, "learning_rate": 5.75213610060584e-06, "loss": 0.4914, "step": 453 }, { "epoch": 1.527140458869614, "grad_norm": 2.024408080024364, "learning_rate": 5.7327430698384775e-06, "loss": 0.4919, "step": 454 }, { "epoch": 1.5304980414101848, "grad_norm": 2.2191159501747073, "learning_rate": 5.713338767463129e-06, "loss": 0.5085, "step": 455 }, { "epoch": 1.5338556239507555, "grad_norm": 2.192013145259984, "learning_rate": 5.693923491971445e-06, "loss": 0.4936, "step": 456 }, { "epoch": 1.5372132064913262, "grad_norm": 2.137744041707972, "learning_rate": 5.674497542023875e-06, "loss": 0.5004, "step": 457 }, { "epoch": 1.540570789031897, "grad_norm": 2.032793978959609, "learning_rate": 5.65506121644507e-06, "loss": 0.491, "step": 458 }, { "epoch": 1.5439283715724679, "grad_norm": 2.2242785868303194, "learning_rate": 5.635614814219289e-06, "loss": 0.4974, "step": 459 }, { "epoch": 1.5472859541130386, "grad_norm": 2.2872129720575662, "learning_rate": 5.616158634485793e-06, "loss": 0.5045, "step": 460 }, { "epoch": 1.5506435366536095, "grad_norm": 2.5161657966811197, "learning_rate": 5.596692976534256e-06, "loss": 0.4776, "step": 461 }, { "epoch": 1.5540011191941803, "grad_norm": 2.1830947757285566, "learning_rate": 5.577218139800143e-06, "loss": 0.4779, "step": 462 }, { "epoch": 1.557358701734751, "grad_norm": 2.2723393111932286, "learning_rate": 5.557734423860122e-06, "loss": 0.4559, "step": 463 }, { "epoch": 1.5607162842753217, "grad_norm": 2.039582763815814, "learning_rate": 5.538242128427444e-06, "loss": 0.4967, "step": 464 }, { "epoch": 1.5640738668158924, "grad_norm": 2.1867791207126572, "learning_rate": 5.518741553347341e-06, "loss": 0.4793, "step": 465 }, { "epoch": 1.5674314493564634, "grad_norm": 2.184162576004028, "learning_rate": 5.499232998592399e-06, "loss": 0.4563, "step": 466 }, { "epoch": 1.5707890318970341, "grad_norm": 2.2279834676183077, "learning_rate": 5.479716764257961e-06, "loss": 0.4726, "step": 467 }, { "epoch": 1.574146614437605, "grad_norm": 2.146603101538437, "learning_rate": 5.4601931505575e-06, "loss": 0.4761, "step": 468 }, { "epoch": 1.5775041969781758, "grad_norm": 2.2103167022630386, "learning_rate": 5.44066245781801e-06, "loss": 0.4955, "step": 469 }, { "epoch": 1.5808617795187465, "grad_norm": 2.3722558539750156, "learning_rate": 5.421124986475371e-06, "loss": 0.5089, "step": 470 }, { "epoch": 1.5842193620593172, "grad_norm": 2.3496747654346697, "learning_rate": 5.4015810370697445e-06, "loss": 0.4878, "step": 471 }, { "epoch": 1.587576944599888, "grad_norm": 2.185007470426342, "learning_rate": 5.382030910240936e-06, "loss": 0.4713, "step": 472 }, { "epoch": 1.590934527140459, "grad_norm": 2.2103514110080593, "learning_rate": 5.362474906723781e-06, "loss": 0.5096, "step": 473 }, { "epoch": 1.5942921096810296, "grad_norm": 2.141248032889711, "learning_rate": 5.342913327343515e-06, "loss": 0.4891, "step": 474 }, { "epoch": 1.5976496922216006, "grad_norm": 2.172202613793392, "learning_rate": 5.3233464730111426e-06, "loss": 0.4929, "step": 475 }, { "epoch": 1.6010072747621713, "grad_norm": 2.238037583616825, "learning_rate": 5.303774644718813e-06, "loss": 0.4849, "step": 476 }, { "epoch": 1.604364857302742, "grad_norm": 2.0572504308978954, "learning_rate": 5.284198143535188e-06, "loss": 0.4946, "step": 477 }, { "epoch": 1.6077224398433128, "grad_norm": 2.0917913417158016, "learning_rate": 5.2646172706008154e-06, "loss": 0.4834, "step": 478 }, { "epoch": 1.6110800223838835, "grad_norm": 2.013696077036751, "learning_rate": 5.245032327123488e-06, "loss": 0.4564, "step": 479 }, { "epoch": 1.6144376049244544, "grad_norm": 2.073234984497162, "learning_rate": 5.225443614373614e-06, "loss": 0.4479, "step": 480 }, { "epoch": 1.6177951874650252, "grad_norm": 2.208574587163591, "learning_rate": 5.20585143367959e-06, "loss": 0.4761, "step": 481 }, { "epoch": 1.621152770005596, "grad_norm": 2.1530116735583316, "learning_rate": 5.186256086423148e-06, "loss": 0.4702, "step": 482 }, { "epoch": 1.6245103525461668, "grad_norm": 2.200115990394491, "learning_rate": 5.166657874034745e-06, "loss": 0.5088, "step": 483 }, { "epoch": 1.6278679350867375, "grad_norm": 2.1407757545469424, "learning_rate": 5.147057097988898e-06, "loss": 0.5084, "step": 484 }, { "epoch": 1.6312255176273083, "grad_norm": 2.124229197200729, "learning_rate": 5.127454059799567e-06, "loss": 0.4623, "step": 485 }, { "epoch": 1.634583100167879, "grad_norm": 2.217735463671646, "learning_rate": 5.1078490610155105e-06, "loss": 0.4946, "step": 486 }, { "epoch": 1.63794068270845, "grad_norm": 2.0764480657249758, "learning_rate": 5.088242403215644e-06, "loss": 0.5089, "step": 487 }, { "epoch": 1.6412982652490207, "grad_norm": 2.331029267583452, "learning_rate": 5.0686343880044044e-06, "loss": 0.473, "step": 488 }, { "epoch": 1.6446558477895916, "grad_norm": 2.1804145039026546, "learning_rate": 5.049025317007108e-06, "loss": 0.4934, "step": 489 }, { "epoch": 1.6480134303301623, "grad_norm": 1.9682449318933604, "learning_rate": 5.029415491865311e-06, "loss": 0.4616, "step": 490 }, { "epoch": 1.651371012870733, "grad_norm": 2.3938687293871133, "learning_rate": 5.009805214232177e-06, "loss": 0.5293, "step": 491 }, { "epoch": 1.6547285954113038, "grad_norm": 2.1248954294843094, "learning_rate": 4.990194785767824e-06, "loss": 0.4815, "step": 492 }, { "epoch": 1.6580861779518745, "grad_norm": 2.066522021485902, "learning_rate": 4.97058450813469e-06, "loss": 0.4782, "step": 493 }, { "epoch": 1.6614437604924455, "grad_norm": 2.132367969004758, "learning_rate": 4.950974682992894e-06, "loss": 0.4493, "step": 494 }, { "epoch": 1.6648013430330162, "grad_norm": 2.195382323978713, "learning_rate": 4.931365611995598e-06, "loss": 0.5095, "step": 495 }, { "epoch": 1.6681589255735871, "grad_norm": 2.0731108125500346, "learning_rate": 4.911757596784358e-06, "loss": 0.5098, "step": 496 }, { "epoch": 1.6715165081141579, "grad_norm": 2.2045936873389618, "learning_rate": 4.892150938984491e-06, "loss": 0.5034, "step": 497 }, { "epoch": 1.6748740906547286, "grad_norm": 2.119499431019322, "learning_rate": 4.872545940200435e-06, "loss": 0.4621, "step": 498 }, { "epoch": 1.6782316731952993, "grad_norm": 2.0571530774922717, "learning_rate": 4.8529429020111035e-06, "loss": 0.4324, "step": 499 }, { "epoch": 1.68158925573587, "grad_norm": 2.080333899217976, "learning_rate": 4.833342125965257e-06, "loss": 0.4786, "step": 500 }, { "epoch": 1.684946838276441, "grad_norm": 2.198178104289353, "learning_rate": 4.813743913576852e-06, "loss": 0.476, "step": 501 }, { "epoch": 1.6883044208170117, "grad_norm": 2.1587194322010514, "learning_rate": 4.794148566320412e-06, "loss": 0.463, "step": 502 }, { "epoch": 1.6916620033575827, "grad_norm": 2.3918082310854296, "learning_rate": 4.774556385626386e-06, "loss": 0.502, "step": 503 }, { "epoch": 1.6950195858981534, "grad_norm": 2.267854796695586, "learning_rate": 4.754967672876513e-06, "loss": 0.5066, "step": 504 }, { "epoch": 1.698377168438724, "grad_norm": 2.172897001954314, "learning_rate": 4.7353827293991845e-06, "loss": 0.4865, "step": 505 }, { "epoch": 1.7017347509792948, "grad_norm": 2.249989307898594, "learning_rate": 4.715801856464812e-06, "loss": 0.5135, "step": 506 }, { "epoch": 1.7050923335198656, "grad_norm": 2.182167380702455, "learning_rate": 4.6962253552811885e-06, "loss": 0.52, "step": 507 }, { "epoch": 1.7084499160604365, "grad_norm": 2.3182206016361384, "learning_rate": 4.676653526988858e-06, "loss": 0.4623, "step": 508 }, { "epoch": 1.7118074986010072, "grad_norm": 2.298712891082637, "learning_rate": 4.657086672656486e-06, "loss": 0.4734, "step": 509 }, { "epoch": 1.7151650811415782, "grad_norm": 2.1611212242763194, "learning_rate": 4.63752509327622e-06, "loss": 0.4484, "step": 510 }, { "epoch": 1.718522663682149, "grad_norm": 2.2744157038160497, "learning_rate": 4.617969089759066e-06, "loss": 0.5041, "step": 511 }, { "epoch": 1.7218802462227196, "grad_norm": 2.001332944054495, "learning_rate": 4.598418962930258e-06, "loss": 0.494, "step": 512 }, { "epoch": 1.7252378287632903, "grad_norm": 2.15178549984152, "learning_rate": 4.57887501352463e-06, "loss": 0.4906, "step": 513 }, { "epoch": 1.728595411303861, "grad_norm": 2.0567126022881403, "learning_rate": 4.559337542181993e-06, "loss": 0.4654, "step": 514 }, { "epoch": 1.731952993844432, "grad_norm": 2.3440444984090214, "learning_rate": 4.539806849442501e-06, "loss": 0.4806, "step": 515 }, { "epoch": 1.7353105763850027, "grad_norm": 2.2415234653831475, "learning_rate": 4.520283235742042e-06, "loss": 0.4623, "step": 516 }, { "epoch": 1.7386681589255737, "grad_norm": 2.1332817601414997, "learning_rate": 4.500767001407604e-06, "loss": 0.4522, "step": 517 }, { "epoch": 1.7420257414661444, "grad_norm": 2.116507838045294, "learning_rate": 4.481258446652662e-06, "loss": 0.4842, "step": 518 }, { "epoch": 1.7453833240067151, "grad_norm": 2.215105939137212, "learning_rate": 4.4617578715725565e-06, "loss": 0.4649, "step": 519 }, { "epoch": 1.7487409065472859, "grad_norm": 2.225499325958965, "learning_rate": 4.4422655761398785e-06, "loss": 0.4853, "step": 520 }, { "epoch": 1.7520984890878566, "grad_norm": 2.254229574580799, "learning_rate": 4.4227818601998575e-06, "loss": 0.4883, "step": 521 }, { "epoch": 1.7554560716284275, "grad_norm": 2.1468138229729443, "learning_rate": 4.403307023465746e-06, "loss": 0.4786, "step": 522 }, { "epoch": 1.7588136541689985, "grad_norm": 2.169367653787952, "learning_rate": 4.383841365514208e-06, "loss": 0.4933, "step": 523 }, { "epoch": 1.7621712367095692, "grad_norm": 2.2540586767928916, "learning_rate": 4.364385185780712e-06, "loss": 0.4423, "step": 524 }, { "epoch": 1.76552881925014, "grad_norm": 2.046616711801834, "learning_rate": 4.3449387835549305e-06, "loss": 0.4517, "step": 525 }, { "epoch": 1.7688864017907107, "grad_norm": 2.054096374050294, "learning_rate": 4.325502457976126e-06, "loss": 0.4562, "step": 526 }, { "epoch": 1.7722439843312814, "grad_norm": 2.1897562491663023, "learning_rate": 4.306076508028557e-06, "loss": 0.4872, "step": 527 }, { "epoch": 1.775601566871852, "grad_norm": 2.1685603692137985, "learning_rate": 4.286661232536873e-06, "loss": 0.4847, "step": 528 }, { "epoch": 1.778959149412423, "grad_norm": 2.1915605427774034, "learning_rate": 4.267256930161523e-06, "loss": 0.5192, "step": 529 }, { "epoch": 1.782316731952994, "grad_norm": 2.254879047746067, "learning_rate": 4.247863899394162e-06, "loss": 0.4687, "step": 530 }, { "epoch": 1.7856743144935647, "grad_norm": 2.1134520274196125, "learning_rate": 4.228482438553052e-06, "loss": 0.5262, "step": 531 }, { "epoch": 1.7890318970341355, "grad_norm": 2.068253922507518, "learning_rate": 4.209112845778481e-06, "loss": 0.4839, "step": 532 }, { "epoch": 1.7923894795747062, "grad_norm": 2.0887709442403346, "learning_rate": 4.189755419028169e-06, "loss": 0.4623, "step": 533 }, { "epoch": 1.795747062115277, "grad_norm": 2.300959929057977, "learning_rate": 4.1704104560726955e-06, "loss": 0.5047, "step": 534 }, { "epoch": 1.7991046446558476, "grad_norm": 2.156784226239325, "learning_rate": 4.151078254490908e-06, "loss": 0.4553, "step": 535 }, { "epoch": 1.8024622271964186, "grad_norm": 2.0472342846000893, "learning_rate": 4.131759111665349e-06, "loss": 0.444, "step": 536 }, { "epoch": 1.8058198097369895, "grad_norm": 2.15308812602405, "learning_rate": 4.112453324777683e-06, "loss": 0.4504, "step": 537 }, { "epoch": 1.8091773922775602, "grad_norm": 2.305556846294406, "learning_rate": 4.09316119080412e-06, "loss": 0.4561, "step": 538 }, { "epoch": 1.812534974818131, "grad_norm": 2.0629496902341304, "learning_rate": 4.073883006510858e-06, "loss": 0.4639, "step": 539 }, { "epoch": 1.8158925573587017, "grad_norm": 2.621774490091145, "learning_rate": 4.054619068449502e-06, "loss": 0.4988, "step": 540 }, { "epoch": 1.8192501398992724, "grad_norm": 2.249906503732158, "learning_rate": 4.035369672952516e-06, "loss": 0.4665, "step": 541 }, { "epoch": 1.8226077224398431, "grad_norm": 2.124308282019638, "learning_rate": 4.016135116128656e-06, "loss": 0.4837, "step": 542 }, { "epoch": 1.825965304980414, "grad_norm": 2.216203736606476, "learning_rate": 3.996915693858422e-06, "loss": 0.4599, "step": 543 }, { "epoch": 1.829322887520985, "grad_norm": 2.199055435229539, "learning_rate": 3.977711701789499e-06, "loss": 0.4996, "step": 544 }, { "epoch": 1.8326804700615558, "grad_norm": 2.1456836995965123, "learning_rate": 3.9585234353322155e-06, "loss": 0.474, "step": 545 }, { "epoch": 1.8360380526021265, "grad_norm": 2.088890168115453, "learning_rate": 3.939351189654996e-06, "loss": 0.4551, "step": 546 }, { "epoch": 1.8393956351426972, "grad_norm": 2.059736510246409, "learning_rate": 3.920195259679822e-06, "loss": 0.484, "step": 547 }, { "epoch": 1.842753217683268, "grad_norm": 2.4289464926505504, "learning_rate": 3.901055940077691e-06, "loss": 0.5043, "step": 548 }, { "epoch": 1.8461108002238389, "grad_norm": 2.0962090325586606, "learning_rate": 3.881933525264092e-06, "loss": 0.4398, "step": 549 }, { "epoch": 1.8494683827644096, "grad_norm": 2.1707115890324165, "learning_rate": 3.862828309394469e-06, "loss": 0.4925, "step": 550 }, { "epoch": 1.8528259653049806, "grad_norm": 2.015982277160347, "learning_rate": 3.843740586359701e-06, "loss": 0.4757, "step": 551 }, { "epoch": 1.8561835478455513, "grad_norm": 2.2304879540207203, "learning_rate": 3.824670649781576e-06, "loss": 0.4614, "step": 552 }, { "epoch": 1.859541130386122, "grad_norm": 2.091915679725523, "learning_rate": 3.805618793008279e-06, "loss": 0.4448, "step": 553 }, { "epoch": 1.8628987129266927, "grad_norm": 2.16946840752013, "learning_rate": 3.786585309109877e-06, "loss": 0.4649, "step": 554 }, { "epoch": 1.8662562954672635, "grad_norm": 2.128475934593608, "learning_rate": 3.7675704908738136e-06, "loss": 0.4802, "step": 555 }, { "epoch": 1.8696138780078344, "grad_norm": 2.165472295510831, "learning_rate": 3.7485746308004013e-06, "loss": 0.4977, "step": 556 }, { "epoch": 1.8729714605484051, "grad_norm": 2.212252433928935, "learning_rate": 3.7295980210983233e-06, "loss": 0.4935, "step": 557 }, { "epoch": 1.876329043088976, "grad_norm": 2.1928900923013956, "learning_rate": 3.71064095368014e-06, "loss": 0.4627, "step": 558 }, { "epoch": 1.8796866256295468, "grad_norm": 2.085149102799712, "learning_rate": 3.6917037201577977e-06, "loss": 0.4278, "step": 559 }, { "epoch": 1.8830442081701175, "grad_norm": 2.0816980346294307, "learning_rate": 3.672786611838142e-06, "loss": 0.4631, "step": 560 }, { "epoch": 1.8864017907106883, "grad_norm": 2.103474181111992, "learning_rate": 3.653889919718439e-06, "loss": 0.4511, "step": 561 }, { "epoch": 1.889759373251259, "grad_norm": 2.2744496879859275, "learning_rate": 3.635013934481895e-06, "loss": 0.4974, "step": 562 }, { "epoch": 1.89311695579183, "grad_norm": 2.2227249595968934, "learning_rate": 3.616158946493188e-06, "loss": 0.4769, "step": 563 }, { "epoch": 1.8964745383324007, "grad_norm": 2.152621448218455, "learning_rate": 3.5973252457940034e-06, "loss": 0.4994, "step": 564 }, { "epoch": 1.8998321208729716, "grad_norm": 2.070223009955467, "learning_rate": 3.578513122098566e-06, "loss": 0.5039, "step": 565 }, { "epoch": 1.9031897034135423, "grad_norm": 2.121366546607162, "learning_rate": 3.559722864789187e-06, "loss": 0.4789, "step": 566 }, { "epoch": 1.906547285954113, "grad_norm": 2.1017767501093823, "learning_rate": 3.5409547629118124e-06, "loss": 0.4562, "step": 567 }, { "epoch": 1.9099048684946838, "grad_norm": 2.274326927273865, "learning_rate": 3.5222091051715803e-06, "loss": 0.4623, "step": 568 }, { "epoch": 1.9132624510352545, "grad_norm": 1.9186891449000945, "learning_rate": 3.5034861799283713e-06, "loss": 0.5144, "step": 569 }, { "epoch": 1.9166200335758254, "grad_norm": 2.3556578562236252, "learning_rate": 3.48478627519238e-06, "loss": 0.4503, "step": 570 }, { "epoch": 1.9199776161163962, "grad_norm": 1.9892297433331403, "learning_rate": 3.466109678619681e-06, "loss": 0.4934, "step": 571 }, { "epoch": 1.9233351986569671, "grad_norm": 2.2314638850872144, "learning_rate": 3.4474566775078055e-06, "loss": 0.4934, "step": 572 }, { "epoch": 1.9266927811975378, "grad_norm": 2.2782528645464173, "learning_rate": 3.4288275587913235e-06, "loss": 0.4948, "step": 573 }, { "epoch": 1.9300503637381086, "grad_norm": 2.36739181255683, "learning_rate": 3.4102226090374246e-06, "loss": 0.4741, "step": 574 }, { "epoch": 1.9334079462786793, "grad_norm": 2.1683249260482444, "learning_rate": 3.3916421144415146e-06, "loss": 0.4732, "step": 575 }, { "epoch": 1.93676552881925, "grad_norm": 2.1221739672342497, "learning_rate": 3.3730863608228125e-06, "loss": 0.4274, "step": 576 }, { "epoch": 1.940123111359821, "grad_norm": 2.095387968090082, "learning_rate": 3.35455563361995e-06, "loss": 0.4649, "step": 577 }, { "epoch": 1.9434806939003917, "grad_norm": 2.1689072561371154, "learning_rate": 3.336050217886588e-06, "loss": 0.4986, "step": 578 }, { "epoch": 1.9468382764409626, "grad_norm": 2.2504194821367345, "learning_rate": 3.3175703982870232e-06, "loss": 0.4716, "step": 579 }, { "epoch": 1.9501958589815334, "grad_norm": 2.084871129563975, "learning_rate": 3.2991164590918162e-06, "loss": 0.4403, "step": 580 }, { "epoch": 1.953553441522104, "grad_norm": 2.0143154450944123, "learning_rate": 3.280688684173412e-06, "loss": 0.4452, "step": 581 }, { "epoch": 1.9569110240626748, "grad_norm": 2.1804050757472977, "learning_rate": 3.262287357001781e-06, "loss": 0.516, "step": 582 }, { "epoch": 1.9602686066032455, "grad_norm": 2.0645062573328232, "learning_rate": 3.2439127606400546e-06, "loss": 0.461, "step": 583 }, { "epoch": 1.9636261891438165, "grad_norm": 2.070842126513229, "learning_rate": 3.225565177740163e-06, "loss": 0.466, "step": 584 }, { "epoch": 1.9669837716843872, "grad_norm": 2.0600131128468595, "learning_rate": 3.2072448905385046e-06, "loss": 0.433, "step": 585 }, { "epoch": 1.9703413542249582, "grad_norm": 2.0764551843814107, "learning_rate": 3.1889521808515888e-06, "loss": 0.45, "step": 586 }, { "epoch": 1.9736989367655289, "grad_norm": 2.1493865400194103, "learning_rate": 3.1706873300717094e-06, "loss": 0.4903, "step": 587 }, { "epoch": 1.9770565193060996, "grad_norm": 2.1217797993065988, "learning_rate": 3.152450619162612e-06, "loss": 0.456, "step": 588 }, { "epoch": 1.9804141018466703, "grad_norm": 2.2674307611908273, "learning_rate": 3.1342423286551756e-06, "loss": 0.4758, "step": 589 }, { "epoch": 1.983771684387241, "grad_norm": 2.016774239865244, "learning_rate": 3.116062738643092e-06, "loss": 0.4871, "step": 590 }, { "epoch": 1.987129266927812, "grad_norm": 2.167349361097923, "learning_rate": 3.097912128778563e-06, "loss": 0.4621, "step": 591 }, { "epoch": 1.9904868494683827, "grad_norm": 2.1086622374082644, "learning_rate": 3.0797907782679944e-06, "loss": 0.462, "step": 592 }, { "epoch": 1.9938444320089537, "grad_norm": 2.142067661122772, "learning_rate": 3.061698965867701e-06, "loss": 0.4403, "step": 593 }, { "epoch": 1.9972020145495244, "grad_norm": 2.211038574058772, "learning_rate": 3.043636969879625e-06, "loss": 0.4748, "step": 594 }, { "epoch": 2.0033575825405707, "grad_norm": 8.113958823135038, "learning_rate": 3.0256050681470446e-06, "loss": 0.7156, "step": 595 }, { "epoch": 2.0067151650811414, "grad_norm": 2.568864765515772, "learning_rate": 3.007603538050309e-06, "loss": 0.2897, "step": 596 }, { "epoch": 2.010072747621712, "grad_norm": 2.313285944394538, "learning_rate": 2.989632656502564e-06, "loss": 0.2573, "step": 597 }, { "epoch": 2.0134303301622833, "grad_norm": 2.2688120461445687, "learning_rate": 2.971692699945502e-06, "loss": 0.2617, "step": 598 }, { "epoch": 2.016787912702854, "grad_norm": 2.2390498633994875, "learning_rate": 2.9537839443451e-06, "loss": 0.2628, "step": 599 }, { "epoch": 2.020145495243425, "grad_norm": 2.018211337998392, "learning_rate": 2.935906665187378e-06, "loss": 0.2577, "step": 600 }, { "epoch": 2.0235030777839955, "grad_norm": 2.0822307985642268, "learning_rate": 2.9180611374741623e-06, "loss": 0.2481, "step": 601 }, { "epoch": 2.0268606603245662, "grad_norm": 2.651846463170353, "learning_rate": 2.900247635718856e-06, "loss": 0.2961, "step": 602 }, { "epoch": 2.030218242865137, "grad_norm": 3.6928643005110513, "learning_rate": 2.8824664339422115e-06, "loss": 0.281, "step": 603 }, { "epoch": 2.0335758254057077, "grad_norm": 3.075668633649421, "learning_rate": 2.8647178056681197e-06, "loss": 0.2588, "step": 604 }, { "epoch": 2.036933407946279, "grad_norm": 2.6467867984028577, "learning_rate": 2.847002023919406e-06, "loss": 0.2678, "step": 605 }, { "epoch": 2.0402909904868496, "grad_norm": 2.724532376339797, "learning_rate": 2.8293193612136183e-06, "loss": 0.2405, "step": 606 }, { "epoch": 2.0436485730274203, "grad_norm": 2.4483147811459975, "learning_rate": 2.8116700895588473e-06, "loss": 0.241, "step": 607 }, { "epoch": 2.047006155567991, "grad_norm": 2.2477666437496566, "learning_rate": 2.7940544804495345e-06, "loss": 0.2513, "step": 608 }, { "epoch": 2.0503637381085618, "grad_norm": 2.154565484035418, "learning_rate": 2.7764728048623003e-06, "loss": 0.2506, "step": 609 }, { "epoch": 2.0537213206491325, "grad_norm": 2.17650555134875, "learning_rate": 2.7589253332517736e-06, "loss": 0.2387, "step": 610 }, { "epoch": 2.057078903189703, "grad_norm": 2.253001429434042, "learning_rate": 2.741412335546431e-06, "loss": 0.2491, "step": 611 }, { "epoch": 2.0604364857302744, "grad_norm": 2.0324493606743146, "learning_rate": 2.7239340811444476e-06, "loss": 0.2402, "step": 612 }, { "epoch": 2.063794068270845, "grad_norm": 2.2299955712377666, "learning_rate": 2.706490838909547e-06, "loss": 0.255, "step": 613 }, { "epoch": 2.067151650811416, "grad_norm": 2.0377107617066965, "learning_rate": 2.6890828771668742e-06, "loss": 0.2576, "step": 614 }, { "epoch": 2.0705092333519866, "grad_norm": 2.1876913026406037, "learning_rate": 2.671710463698859e-06, "loss": 0.2427, "step": 615 }, { "epoch": 2.0738668158925573, "grad_norm": 2.0004569183512233, "learning_rate": 2.6543738657411033e-06, "loss": 0.2305, "step": 616 }, { "epoch": 2.077224398433128, "grad_norm": 2.2750688222972695, "learning_rate": 2.6370733499782654e-06, "loss": 0.2398, "step": 617 }, { "epoch": 2.0805819809736987, "grad_norm": 2.196557061021631, "learning_rate": 2.6198091825399606e-06, "loss": 0.2659, "step": 618 }, { "epoch": 2.08393956351427, "grad_norm": 2.4966706499173306, "learning_rate": 2.6025816289966703e-06, "loss": 0.2528, "step": 619 }, { "epoch": 2.0872971460548406, "grad_norm": 1.9987480291512625, "learning_rate": 2.5853909543556444e-06, "loss": 0.2381, "step": 620 }, { "epoch": 2.0906547285954113, "grad_norm": 2.3473080858318793, "learning_rate": 2.568237423056844e-06, "loss": 0.2185, "step": 621 }, { "epoch": 2.094012311135982, "grad_norm": 2.2351448577994, "learning_rate": 2.5511212989688587e-06, "loss": 0.2492, "step": 622 }, { "epoch": 2.097369893676553, "grad_norm": 2.334819375193785, "learning_rate": 2.534042845384851e-06, "loss": 0.2264, "step": 623 }, { "epoch": 2.1007274762171235, "grad_norm": 2.1645622435514578, "learning_rate": 2.517002325018508e-06, "loss": 0.2433, "step": 624 }, { "epoch": 2.1040850587576942, "grad_norm": 2.4245838623271645, "learning_rate": 2.5000000000000015e-06, "loss": 0.2685, "step": 625 }, { "epoch": 2.1074426412982654, "grad_norm": 2.187347569512869, "learning_rate": 2.4830361318719493e-06, "loss": 0.2314, "step": 626 }, { "epoch": 2.110800223838836, "grad_norm": 2.125826756884641, "learning_rate": 2.4661109815854005e-06, "loss": 0.2601, "step": 627 }, { "epoch": 2.114157806379407, "grad_norm": 2.175643578704326, "learning_rate": 2.449224809495815e-06, "loss": 0.248, "step": 628 }, { "epoch": 2.1175153889199776, "grad_norm": 2.3549155345423842, "learning_rate": 2.4323778753590582e-06, "loss": 0.2289, "step": 629 }, { "epoch": 2.1208729714605483, "grad_norm": 2.2111119256937877, "learning_rate": 2.4155704383274154e-06, "loss": 0.2437, "step": 630 }, { "epoch": 2.124230554001119, "grad_norm": 2.3478963695246273, "learning_rate": 2.3988027569455895e-06, "loss": 0.2517, "step": 631 }, { "epoch": 2.12758813654169, "grad_norm": 2.2461086864800106, "learning_rate": 2.3820750891467355e-06, "loss": 0.2333, "step": 632 }, { "epoch": 2.130945719082261, "grad_norm": 2.2329675778706926, "learning_rate": 2.365387692248488e-06, "loss": 0.2359, "step": 633 }, { "epoch": 2.1343033016228317, "grad_norm": 2.3145727443697934, "learning_rate": 2.348740822949006e-06, "loss": 0.2477, "step": 634 }, { "epoch": 2.1376608841634024, "grad_norm": 2.1666894881929895, "learning_rate": 2.33213473732302e-06, "loss": 0.2442, "step": 635 }, { "epoch": 2.141018466703973, "grad_norm": 2.215179935351511, "learning_rate": 2.3155696908178974e-06, "loss": 0.2492, "step": 636 }, { "epoch": 2.144376049244544, "grad_norm": 2.0303810141253344, "learning_rate": 2.2990459382497086e-06, "loss": 0.2414, "step": 637 }, { "epoch": 2.1477336317851146, "grad_norm": 2.2599318995246636, "learning_rate": 2.2825637337993094e-06, "loss": 0.2542, "step": 638 }, { "epoch": 2.1510912143256853, "grad_norm": 2.229319114603247, "learning_rate": 2.266123331008436e-06, "loss": 0.2763, "step": 639 }, { "epoch": 2.1544487968662565, "grad_norm": 2.018769817603059, "learning_rate": 2.2497249827757933e-06, "loss": 0.2279, "step": 640 }, { "epoch": 2.157806379406827, "grad_norm": 2.1991246705651317, "learning_rate": 2.233368941353175e-06, "loss": 0.2415, "step": 641 }, { "epoch": 2.161163961947398, "grad_norm": 2.2319026864631875, "learning_rate": 2.2170554583415782e-06, "loss": 0.2207, "step": 642 }, { "epoch": 2.1645215444879686, "grad_norm": 2.1965612835784936, "learning_rate": 2.2007847846873342e-06, "loss": 0.2425, "step": 643 }, { "epoch": 2.1678791270285394, "grad_norm": 2.1567182556620774, "learning_rate": 2.1845571706782486e-06, "loss": 0.2303, "step": 644 }, { "epoch": 2.17123670956911, "grad_norm": 2.2622288541045683, "learning_rate": 2.1683728659397517e-06, "loss": 0.2429, "step": 645 }, { "epoch": 2.1745942921096812, "grad_norm": 2.2558926541854176, "learning_rate": 2.1522321194310577e-06, "loss": 0.2541, "step": 646 }, { "epoch": 2.177951874650252, "grad_norm": 2.2770654917423765, "learning_rate": 2.1361351794413334e-06, "loss": 0.2446, "step": 647 }, { "epoch": 2.1813094571908227, "grad_norm": 2.173609001184362, "learning_rate": 2.1200822935858807e-06, "loss": 0.251, "step": 648 }, { "epoch": 2.1846670397313934, "grad_norm": 2.2522011504738577, "learning_rate": 2.1040737088023323e-06, "loss": 0.2481, "step": 649 }, { "epoch": 2.188024622271964, "grad_norm": 2.1688069367996596, "learning_rate": 2.0881096713468435e-06, "loss": 0.2486, "step": 650 }, { "epoch": 2.191382204812535, "grad_norm": 2.3838615198983706, "learning_rate": 2.0721904267903097e-06, "loss": 0.2457, "step": 651 }, { "epoch": 2.1947397873531056, "grad_norm": 2.1625681906768346, "learning_rate": 2.056316220014588e-06, "loss": 0.2271, "step": 652 }, { "epoch": 2.1980973698936763, "grad_norm": 2.2745686739163014, "learning_rate": 2.040487295208732e-06, "loss": 0.2238, "step": 653 }, { "epoch": 2.2014549524342475, "grad_norm": 2.0740837502881235, "learning_rate": 2.024703895865232e-06, "loss": 0.2633, "step": 654 }, { "epoch": 2.204812534974818, "grad_norm": 2.348484545437271, "learning_rate": 2.0089662647762716e-06, "loss": 0.2502, "step": 655 }, { "epoch": 2.208170117515389, "grad_norm": 2.1411611952162346, "learning_rate": 1.9932746440299926e-06, "loss": 0.2352, "step": 656 }, { "epoch": 2.2115277000559597, "grad_norm": 2.1602600116386514, "learning_rate": 1.977629275006772e-06, "loss": 0.2214, "step": 657 }, { "epoch": 2.2148852825965304, "grad_norm": 2.1825649500433104, "learning_rate": 1.962030398375506e-06, "loss": 0.2217, "step": 658 }, { "epoch": 2.218242865137101, "grad_norm": 2.076444790094385, "learning_rate": 1.946478254089911e-06, "loss": 0.2327, "step": 659 }, { "epoch": 2.2216004476776723, "grad_norm": 2.242188296768225, "learning_rate": 1.9309730813848302e-06, "loss": 0.2341, "step": 660 }, { "epoch": 2.224958030218243, "grad_norm": 2.324463919600608, "learning_rate": 1.915515118772555e-06, "loss": 0.2367, "step": 661 }, { "epoch": 2.2283156127588137, "grad_norm": 2.0513327269909487, "learning_rate": 1.9001046040391558e-06, "loss": 0.242, "step": 662 }, { "epoch": 2.2316731952993845, "grad_norm": 2.344026468770851, "learning_rate": 1.884741774240823e-06, "loss": 0.2665, "step": 663 }, { "epoch": 2.235030777839955, "grad_norm": 2.2216964878287735, "learning_rate": 1.8694268657002197e-06, "loss": 0.2433, "step": 664 }, { "epoch": 2.238388360380526, "grad_norm": 2.1555814735411976, "learning_rate": 1.8541601140028542e-06, "loss": 0.2397, "step": 665 }, { "epoch": 2.2417459429210966, "grad_norm": 2.1482906635963253, "learning_rate": 1.8389417539934428e-06, "loss": 0.2216, "step": 666 }, { "epoch": 2.245103525461668, "grad_norm": 2.096021593139733, "learning_rate": 1.8237720197723075e-06, "loss": 0.2262, "step": 667 }, { "epoch": 2.2484611080022385, "grad_norm": 2.266711561199349, "learning_rate": 1.8086511446917715e-06, "loss": 0.2343, "step": 668 }, { "epoch": 2.2518186905428093, "grad_norm": 2.232410418505839, "learning_rate": 1.7935793613525693e-06, "loss": 0.2593, "step": 669 }, { "epoch": 2.25517627308338, "grad_norm": 2.3428925980088264, "learning_rate": 1.7785569016002686e-06, "loss": 0.2743, "step": 670 }, { "epoch": 2.2585338556239507, "grad_norm": 2.236303024740682, "learning_rate": 1.7635839965217055e-06, "loss": 0.2301, "step": 671 }, { "epoch": 2.2618914381645214, "grad_norm": 2.0637903925635177, "learning_rate": 1.748660876441428e-06, "loss": 0.2643, "step": 672 }, { "epoch": 2.265249020705092, "grad_norm": 2.2271012154627994, "learning_rate": 1.7337877709181527e-06, "loss": 0.2309, "step": 673 }, { "epoch": 2.2686066032456633, "grad_norm": 2.1077879509424005, "learning_rate": 1.7189649087412385e-06, "loss": 0.261, "step": 674 }, { "epoch": 2.271964185786234, "grad_norm": 2.2008818744280263, "learning_rate": 1.7041925179271584e-06, "loss": 0.2453, "step": 675 }, { "epoch": 2.2753217683268048, "grad_norm": 2.2427383093212394, "learning_rate": 1.689470825715998e-06, "loss": 0.2349, "step": 676 }, { "epoch": 2.2786793508673755, "grad_norm": 2.325006154923223, "learning_rate": 1.6748000585679602e-06, "loss": 0.2529, "step": 677 }, { "epoch": 2.282036933407946, "grad_norm": 2.30699822949776, "learning_rate": 1.6601804421598787e-06, "loss": 0.2558, "step": 678 }, { "epoch": 2.285394515948517, "grad_norm": 2.131117963004742, "learning_rate": 1.6456122013817477e-06, "loss": 0.2334, "step": 679 }, { "epoch": 2.2887520984890877, "grad_norm": 2.1171412775183582, "learning_rate": 1.631095560333264e-06, "loss": 0.2431, "step": 680 }, { "epoch": 2.2921096810296584, "grad_norm": 2.2433228629531774, "learning_rate": 1.6166307423203765e-06, "loss": 0.214, "step": 681 }, { "epoch": 2.2954672635702296, "grad_norm": 2.268872598922477, "learning_rate": 1.6022179698518525e-06, "loss": 0.2401, "step": 682 }, { "epoch": 2.2988248461108003, "grad_norm": 2.191932766219746, "learning_rate": 1.5878574646358608e-06, "loss": 0.2178, "step": 683 }, { "epoch": 2.302182428651371, "grad_norm": 2.2800614694305144, "learning_rate": 1.573549447576549e-06, "loss": 0.2335, "step": 684 }, { "epoch": 2.3055400111919417, "grad_norm": 2.3217546136753273, "learning_rate": 1.5592941387706562e-06, "loss": 0.2349, "step": 685 }, { "epoch": 2.3088975937325125, "grad_norm": 2.2055299968173, "learning_rate": 1.5450917575041209e-06, "loss": 0.2461, "step": 686 }, { "epoch": 2.312255176273083, "grad_norm": 2.1265013617268256, "learning_rate": 1.5309425222487119e-06, "loss": 0.2166, "step": 687 }, { "epoch": 2.3156127588136544, "grad_norm": 2.1979179058845695, "learning_rate": 1.5168466506586654e-06, "loss": 0.2196, "step": 688 }, { "epoch": 2.318970341354225, "grad_norm": 2.167123534236895, "learning_rate": 1.502804359567337e-06, "loss": 0.2427, "step": 689 }, { "epoch": 2.322327923894796, "grad_norm": 2.3539399012866418, "learning_rate": 1.4888158649838675e-06, "loss": 0.2386, "step": 690 }, { "epoch": 2.3256855064353665, "grad_norm": 2.1796719146281345, "learning_rate": 1.4748813820898554e-06, "loss": 0.236, "step": 691 }, { "epoch": 2.3290430889759373, "grad_norm": 2.2205243451241, "learning_rate": 1.4610011252360594e-06, "loss": 0.2229, "step": 692 }, { "epoch": 2.332400671516508, "grad_norm": 2.3094055557029494, "learning_rate": 1.4471753079390815e-06, "loss": 0.2396, "step": 693 }, { "epoch": 2.3357582540570787, "grad_norm": 2.1939883736480157, "learning_rate": 1.4334041428781003e-06, "loss": 0.231, "step": 694 }, { "epoch": 2.33911583659765, "grad_norm": 2.149486473333343, "learning_rate": 1.4196878418915894e-06, "loss": 0.2365, "step": 695 }, { "epoch": 2.3424734191382206, "grad_norm": 2.2453237673213255, "learning_rate": 1.4060266159740627e-06, "loss": 0.2388, "step": 696 }, { "epoch": 2.3458310016787913, "grad_norm": 2.423577584509045, "learning_rate": 1.3924206752728282e-06, "loss": 0.2401, "step": 697 }, { "epoch": 2.349188584219362, "grad_norm": 2.4806635634108187, "learning_rate": 1.3788702290847517e-06, "loss": 0.2429, "step": 698 }, { "epoch": 2.3525461667599328, "grad_norm": 2.380640333144661, "learning_rate": 1.3653754858530477e-06, "loss": 0.2258, "step": 699 }, { "epoch": 2.3559037493005035, "grad_norm": 2.2491745778254066, "learning_rate": 1.3519366531640589e-06, "loss": 0.2622, "step": 700 }, { "epoch": 2.3592613318410747, "grad_norm": 2.316018513747914, "learning_rate": 1.3385539377440709e-06, "loss": 0.248, "step": 701 }, { "epoch": 2.3626189143816454, "grad_norm": 2.314430211194231, "learning_rate": 1.3252275454561337e-06, "loss": 0.2536, "step": 702 }, { "epoch": 2.365976496922216, "grad_norm": 2.1763408291528674, "learning_rate": 1.3119576812968893e-06, "loss": 0.2403, "step": 703 }, { "epoch": 2.369334079462787, "grad_norm": 2.1416964805672283, "learning_rate": 1.2987445493934236e-06, "loss": 0.2273, "step": 704 }, { "epoch": 2.3726916620033576, "grad_norm": 2.1683455218648358, "learning_rate": 1.2855883530001228e-06, "loss": 0.2423, "step": 705 }, { "epoch": 2.3760492445439283, "grad_norm": 2.2242131906759597, "learning_rate": 1.272489294495548e-06, "loss": 0.2404, "step": 706 }, { "epoch": 2.379406827084499, "grad_norm": 2.1865211099221553, "learning_rate": 1.2594475753793211e-06, "loss": 0.2483, "step": 707 }, { "epoch": 2.3827644096250697, "grad_norm": 2.0946660616224815, "learning_rate": 1.2464633962690304e-06, "loss": 0.255, "step": 708 }, { "epoch": 2.386121992165641, "grad_norm": 2.2231751389825463, "learning_rate": 1.2335369568971362e-06, "loss": 0.2343, "step": 709 }, { "epoch": 2.3894795747062116, "grad_norm": 2.312671362781463, "learning_rate": 1.2206684561079035e-06, "loss": 0.2408, "step": 710 }, { "epoch": 2.3928371572467824, "grad_norm": 2.179660173524886, "learning_rate": 1.207858091854342e-06, "loss": 0.2383, "step": 711 }, { "epoch": 2.396194739787353, "grad_norm": 2.1293264857856555, "learning_rate": 1.1951060611951615e-06, "loss": 0.23, "step": 712 }, { "epoch": 2.399552322327924, "grad_norm": 2.1755461849480446, "learning_rate": 1.1824125602917414e-06, "loss": 0.2354, "step": 713 }, { "epoch": 2.4029099048684945, "grad_norm": 1.9840827894505013, "learning_rate": 1.1697777844051105e-06, "loss": 0.2284, "step": 714 }, { "epoch": 2.4062674874090657, "grad_norm": 2.0944031352778643, "learning_rate": 1.1572019278929457e-06, "loss": 0.2357, "step": 715 }, { "epoch": 2.4096250699496364, "grad_norm": 2.286049493049029, "learning_rate": 1.1446851842065804e-06, "loss": 0.2219, "step": 716 }, { "epoch": 2.412982652490207, "grad_norm": 2.195180056991891, "learning_rate": 1.1322277458880337e-06, "loss": 0.2443, "step": 717 }, { "epoch": 2.416340235030778, "grad_norm": 2.360744350880101, "learning_rate": 1.1198298045670402e-06, "loss": 0.2307, "step": 718 }, { "epoch": 2.4196978175713486, "grad_norm": 2.27588589395575, "learning_rate": 1.1074915509581086e-06, "loss": 0.2218, "step": 719 }, { "epoch": 2.4230554001119193, "grad_norm": 2.2915120318076894, "learning_rate": 1.0952131748575855e-06, "loss": 0.2348, "step": 720 }, { "epoch": 2.42641298265249, "grad_norm": 2.3740865208478428, "learning_rate": 1.0829948651407374e-06, "loss": 0.233, "step": 721 }, { "epoch": 2.429770565193061, "grad_norm": 2.309847767416398, "learning_rate": 1.0708368097588435e-06, "loss": 0.2411, "step": 722 }, { "epoch": 2.433128147733632, "grad_norm": 2.1585332492732703, "learning_rate": 1.0587391957363053e-06, "loss": 0.2689, "step": 723 }, { "epoch": 2.4364857302742027, "grad_norm": 2.2620198419443645, "learning_rate": 1.0467022091677692e-06, "loss": 0.2386, "step": 724 }, { "epoch": 2.4398433128147734, "grad_norm": 2.4789714742421998, "learning_rate": 1.0347260352152644e-06, "loss": 0.2542, "step": 725 }, { "epoch": 2.443200895355344, "grad_norm": 2.233580671739803, "learning_rate": 1.0228108581053565e-06, "loss": 0.2342, "step": 726 }, { "epoch": 2.446558477895915, "grad_norm": 2.1024662676881314, "learning_rate": 1.0109568611263094e-06, "loss": 0.222, "step": 727 }, { "epoch": 2.4499160604364856, "grad_norm": 2.276521915661851, "learning_rate": 9.991642266252672e-07, "loss": 0.2099, "step": 728 }, { "epoch": 2.4532736429770567, "grad_norm": 2.260563206399162, "learning_rate": 9.87433136005454e-07, "loss": 0.2548, "step": 729 }, { "epoch": 2.4566312255176275, "grad_norm": 2.156550439442849, "learning_rate": 9.757637697233723e-07, "loss": 0.2211, "step": 730 }, { "epoch": 2.459988808058198, "grad_norm": 2.2236291372113866, "learning_rate": 9.641563072860416e-07, "loss": 0.2258, "step": 731 }, { "epoch": 2.463346390598769, "grad_norm": 2.1205062426646437, "learning_rate": 9.526109272482237e-07, "loss": 0.2201, "step": 732 }, { "epoch": 2.4667039731393396, "grad_norm": 2.3294849171122567, "learning_rate": 9.41127807209688e-07, "loss": 0.2303, "step": 733 }, { "epoch": 2.4700615556799104, "grad_norm": 2.127623518916985, "learning_rate": 9.297071238124683e-07, "loss": 0.2374, "step": 734 }, { "epoch": 2.473419138220481, "grad_norm": 2.358371434685423, "learning_rate": 9.183490527381539e-07, "loss": 0.2415, "step": 735 }, { "epoch": 2.476776720761052, "grad_norm": 2.3348118958807014, "learning_rate": 9.070537687051817e-07, "loss": 0.2253, "step": 736 }, { "epoch": 2.480134303301623, "grad_norm": 2.254796435173114, "learning_rate": 8.958214454661529e-07, "loss": 0.2474, "step": 737 }, { "epoch": 2.4834918858421937, "grad_norm": 2.135376686047964, "learning_rate": 8.846522558051563e-07, "loss": 0.2193, "step": 738 }, { "epoch": 2.4868494683827644, "grad_norm": 2.4255834301642745, "learning_rate": 8.735463715351139e-07, "loss": 0.2569, "step": 739 }, { "epoch": 2.490207050923335, "grad_norm": 2.1786279330493694, "learning_rate": 8.625039634951354e-07, "loss": 0.2388, "step": 740 }, { "epoch": 2.493564633463906, "grad_norm": 2.3684247953727144, "learning_rate": 8.515252015478915e-07, "loss": 0.2432, "step": 741 }, { "epoch": 2.4969222160044766, "grad_norm": 2.1452983004533706, "learning_rate": 8.406102545769989e-07, "loss": 0.2361, "step": 742 }, { "epoch": 2.500279798545048, "grad_norm": 2.239393813510702, "learning_rate": 8.297592904844282e-07, "loss": 0.2169, "step": 743 }, { "epoch": 2.5036373810856185, "grad_norm": 2.1080838250603233, "learning_rate": 8.189724761879131e-07, "loss": 0.2402, "step": 744 }, { "epoch": 2.5069949636261892, "grad_norm": 2.205253345024292, "learning_rate": 8.082499776183883e-07, "loss": 0.2345, "step": 745 }, { "epoch": 2.51035254616676, "grad_norm": 2.236843913141464, "learning_rate": 7.975919597174342e-07, "loss": 0.2272, "step": 746 }, { "epoch": 2.5137101287073307, "grad_norm": 2.1687504666441257, "learning_rate": 7.869985864347424e-07, "loss": 0.2304, "step": 747 }, { "epoch": 2.5170677112479014, "grad_norm": 2.32345363923919, "learning_rate": 7.764700207255904e-07, "loss": 0.2409, "step": 748 }, { "epoch": 2.520425293788472, "grad_norm": 2.2137378690316605, "learning_rate": 7.660064245483384e-07, "loss": 0.2273, "step": 749 }, { "epoch": 2.523782876329043, "grad_norm": 2.261904827600831, "learning_rate": 7.556079588619341e-07, "loss": 0.2219, "step": 750 }, { "epoch": 2.527140458869614, "grad_norm": 2.2075096631540245, "learning_rate": 7.452747836234392e-07, "loss": 0.2234, "step": 751 }, { "epoch": 2.5304980414101848, "grad_norm": 2.2106003656998707, "learning_rate": 7.350070577855716e-07, "loss": 0.2485, "step": 752 }, { "epoch": 2.5338556239507555, "grad_norm": 2.262010781586963, "learning_rate": 7.24804939294253e-07, "loss": 0.2405, "step": 753 }, { "epoch": 2.537213206491326, "grad_norm": 2.1267348431476387, "learning_rate": 7.146685850861851e-07, "loss": 0.2394, "step": 754 }, { "epoch": 2.540570789031897, "grad_norm": 2.305882442336494, "learning_rate": 7.045981510864319e-07, "loss": 0.2528, "step": 755 }, { "epoch": 2.543928371572468, "grad_norm": 2.018265078425514, "learning_rate": 6.945937922060259e-07, "loss": 0.233, "step": 756 }, { "epoch": 2.547285954113039, "grad_norm": 2.2614071614786666, "learning_rate": 6.846556623395795e-07, "loss": 0.222, "step": 757 }, { "epoch": 2.5506435366536095, "grad_norm": 2.1351635366635158, "learning_rate": 6.74783914362922e-07, "loss": 0.2273, "step": 758 }, { "epoch": 2.5540011191941803, "grad_norm": 2.100876831365208, "learning_rate": 6.649787001307451e-07, "loss": 0.2072, "step": 759 }, { "epoch": 2.557358701734751, "grad_norm": 2.1077270882398524, "learning_rate": 6.552401704742678e-07, "loss": 0.2147, "step": 760 }, { "epoch": 2.5607162842753217, "grad_norm": 2.185148590555281, "learning_rate": 6.455684751989194e-07, "loss": 0.2387, "step": 761 }, { "epoch": 2.5640738668158924, "grad_norm": 2.4273332489758714, "learning_rate": 6.359637630820292e-07, "loss": 0.2187, "step": 762 }, { "epoch": 2.567431449356463, "grad_norm": 2.2410879666266945, "learning_rate": 6.26426181870542e-07, "loss": 0.2356, "step": 763 }, { "epoch": 2.570789031897034, "grad_norm": 2.0546535173663236, "learning_rate": 6.169558782787438e-07, "loss": 0.2134, "step": 764 }, { "epoch": 2.574146614437605, "grad_norm": 2.2712662103864667, "learning_rate": 6.075529979860068e-07, "loss": 0.2434, "step": 765 }, { "epoch": 2.577504196978176, "grad_norm": 2.495990931424611, "learning_rate": 5.982176856345445e-07, "loss": 0.2572, "step": 766 }, { "epoch": 2.5808617795187465, "grad_norm": 2.272797069007875, "learning_rate": 5.889500848271901e-07, "loss": 0.2365, "step": 767 }, { "epoch": 2.5842193620593172, "grad_norm": 2.2261054362951573, "learning_rate": 5.797503381251896e-07, "loss": 0.2345, "step": 768 }, { "epoch": 2.587576944599888, "grad_norm": 2.2214663136739072, "learning_rate": 5.706185870460018e-07, "loss": 0.2582, "step": 769 }, { "epoch": 2.590934527140459, "grad_norm": 2.2509221328745666, "learning_rate": 5.61554972061128e-07, "loss": 0.2405, "step": 770 }, { "epoch": 2.59429210968103, "grad_norm": 1.9882931292487553, "learning_rate": 5.525596325939469e-07, "loss": 0.2074, "step": 771 }, { "epoch": 2.5976496922216006, "grad_norm": 2.2641730214593667, "learning_rate": 5.436327070175729e-07, "loss": 0.2264, "step": 772 }, { "epoch": 2.6010072747621713, "grad_norm": 2.1593141034556607, "learning_rate": 5.347743326527255e-07, "loss": 0.2334, "step": 773 }, { "epoch": 2.604364857302742, "grad_norm": 2.160652045504084, "learning_rate": 5.25984645765617e-07, "loss": 0.2348, "step": 774 }, { "epoch": 2.6077224398433128, "grad_norm": 2.1624773063108873, "learning_rate": 5.172637815658583e-07, "loss": 0.2046, "step": 775 }, { "epoch": 2.6110800223838835, "grad_norm": 2.2467642799151624, "learning_rate": 5.086118742043761e-07, "loss": 0.2521, "step": 776 }, { "epoch": 2.614437604924454, "grad_norm": 2.1254445389657435, "learning_rate": 5.000290567713533e-07, "loss": 0.2209, "step": 777 }, { "epoch": 2.617795187465025, "grad_norm": 2.331029167817777, "learning_rate": 4.915154612941781e-07, "loss": 0.2461, "step": 778 }, { "epoch": 2.621152770005596, "grad_norm": 2.225940066081278, "learning_rate": 4.830712187354125e-07, "loss": 0.2521, "step": 779 }, { "epoch": 2.624510352546167, "grad_norm": 2.1960281391785768, "learning_rate": 4.7469645899078153e-07, "loss": 0.2081, "step": 780 }, { "epoch": 2.6278679350867375, "grad_norm": 2.202588232752528, "learning_rate": 4.663913108871726e-07, "loss": 0.2217, "step": 781 }, { "epoch": 2.6312255176273083, "grad_norm": 2.1413814255877788, "learning_rate": 4.581559021806542e-07, "loss": 0.2279, "step": 782 }, { "epoch": 2.634583100167879, "grad_norm": 2.2575040003842877, "learning_rate": 4.4999035955450964e-07, "loss": 0.2507, "step": 783 }, { "epoch": 2.63794068270845, "grad_norm": 2.29100443191833, "learning_rate": 4.4189480861729137e-07, "loss": 0.247, "step": 784 }, { "epoch": 2.641298265249021, "grad_norm": 2.0949207855222625, "learning_rate": 4.3386937390088366e-07, "loss": 0.205, "step": 785 }, { "epoch": 2.6446558477895916, "grad_norm": 2.3016984109745064, "learning_rate": 4.259141788585947e-07, "loss": 0.2436, "step": 786 }, { "epoch": 2.6480134303301623, "grad_norm": 2.2773955253769276, "learning_rate": 4.1802934586324897e-07, "loss": 0.2329, "step": 787 }, { "epoch": 2.651371012870733, "grad_norm": 2.2679994509516543, "learning_rate": 4.102149962053098e-07, "loss": 0.2416, "step": 788 }, { "epoch": 2.654728595411304, "grad_norm": 2.134297523257456, "learning_rate": 4.0247125009101275e-07, "loss": 0.2384, "step": 789 }, { "epoch": 2.6580861779518745, "grad_norm": 2.3727877129494774, "learning_rate": 3.947982266405159e-07, "loss": 0.2313, "step": 790 }, { "epoch": 2.6614437604924452, "grad_norm": 2.1765597243225288, "learning_rate": 3.871960438860689e-07, "loss": 0.2257, "step": 791 }, { "epoch": 2.664801343033016, "grad_norm": 2.313798954233314, "learning_rate": 3.796648187701957e-07, "loss": 0.2436, "step": 792 }, { "epoch": 2.668158925573587, "grad_norm": 2.3444816164677595, "learning_rate": 3.72204667143895e-07, "loss": 0.2534, "step": 793 }, { "epoch": 2.671516508114158, "grad_norm": 2.29388898580906, "learning_rate": 3.648157037648598e-07, "loss": 0.2159, "step": 794 }, { "epoch": 2.6748740906547286, "grad_norm": 2.184396762194593, "learning_rate": 3.574980422957147e-07, "loss": 0.2151, "step": 795 }, { "epoch": 2.6782316731952993, "grad_norm": 2.3518739979849683, "learning_rate": 3.5025179530225995e-07, "loss": 0.2236, "step": 796 }, { "epoch": 2.68158925573587, "grad_norm": 2.2064933728915213, "learning_rate": 3.43077074251747e-07, "loss": 0.2305, "step": 797 }, { "epoch": 2.684946838276441, "grad_norm": 2.340839131571328, "learning_rate": 3.359739895111602e-07, "loss": 0.2451, "step": 798 }, { "epoch": 2.688304420817012, "grad_norm": 2.2538444477416055, "learning_rate": 3.289426503455201e-07, "loss": 0.2234, "step": 799 }, { "epoch": 2.6916620033575827, "grad_norm": 2.244758740164714, "learning_rate": 3.2198316491620305e-07, "loss": 0.2294, "step": 800 }, { "epoch": 2.6950195858981534, "grad_norm": 2.149355042894359, "learning_rate": 3.150956402792765e-07, "loss": 0.2216, "step": 801 }, { "epoch": 2.698377168438724, "grad_norm": 2.259174848777998, "learning_rate": 3.082801823838527e-07, "loss": 0.2268, "step": 802 }, { "epoch": 2.701734750979295, "grad_norm": 2.2477921891179204, "learning_rate": 3.015368960704584e-07, "loss": 0.242, "step": 803 }, { "epoch": 2.7050923335198656, "grad_norm": 2.1272415575213643, "learning_rate": 2.9486588506942303e-07, "loss": 0.2342, "step": 804 }, { "epoch": 2.7084499160604363, "grad_norm": 2.334010741930345, "learning_rate": 2.882672519992824e-07, "loss": 0.2285, "step": 805 }, { "epoch": 2.711807498601007, "grad_norm": 2.427045189554706, "learning_rate": 2.817410983651997e-07, "loss": 0.2562, "step": 806 }, { "epoch": 2.715165081141578, "grad_norm": 2.062462864366435, "learning_rate": 2.7528752455740606e-07, "loss": 0.1984, "step": 807 }, { "epoch": 2.718522663682149, "grad_norm": 2.117107968846607, "learning_rate": 2.6890662984965234e-07, "loss": 0.2167, "step": 808 }, { "epoch": 2.7218802462227196, "grad_norm": 2.172976071364664, "learning_rate": 2.625985123976876e-07, "loss": 0.2312, "step": 809 }, { "epoch": 2.7252378287632903, "grad_norm": 2.1650501300436438, "learning_rate": 2.5636326923774325e-07, "loss": 0.2423, "step": 810 }, { "epoch": 2.728595411303861, "grad_norm": 2.212412244165142, "learning_rate": 2.5020099628504603e-07, "loss": 0.2185, "step": 811 }, { "epoch": 2.7319529938444322, "grad_norm": 2.1280889488629287, "learning_rate": 2.441117883323374e-07, "loss": 0.2413, "step": 812 }, { "epoch": 2.735310576385003, "grad_norm": 2.18787418811109, "learning_rate": 2.3809573904841844e-07, "loss": 0.233, "step": 813 }, { "epoch": 2.7386681589255737, "grad_norm": 2.2553778724383737, "learning_rate": 2.3215294097670927e-07, "loss": 0.2236, "step": 814 }, { "epoch": 2.7420257414661444, "grad_norm": 2.080459053836358, "learning_rate": 2.262834855338225e-07, "loss": 0.2376, "step": 815 }, { "epoch": 2.745383324006715, "grad_norm": 2.165809723126224, "learning_rate": 2.204874630081616e-07, "loss": 0.2225, "step": 816 }, { "epoch": 2.748740906547286, "grad_norm": 1.956393897313748, "learning_rate": 2.1476496255852685e-07, "loss": 0.233, "step": 817 }, { "epoch": 2.7520984890878566, "grad_norm": 2.101187708321008, "learning_rate": 2.091160722127472e-07, "loss": 0.2233, "step": 818 }, { "epoch": 2.7554560716284273, "grad_norm": 2.328065045528626, "learning_rate": 2.0354087886632623e-07, "loss": 0.2371, "step": 819 }, { "epoch": 2.7588136541689985, "grad_norm": 2.1295011524399086, "learning_rate": 1.9803946828110376e-07, "loss": 0.2408, "step": 820 }, { "epoch": 2.762171236709569, "grad_norm": 2.0812004019092822, "learning_rate": 1.9261192508393755e-07, "loss": 0.2211, "step": 821 }, { "epoch": 2.76552881925014, "grad_norm": 2.110225954008512, "learning_rate": 1.8725833276540095e-07, "loss": 0.2328, "step": 822 }, { "epoch": 2.7688864017907107, "grad_norm": 2.209831203361078, "learning_rate": 1.8197877367849948e-07, "loss": 0.2424, "step": 823 }, { "epoch": 2.7722439843312814, "grad_norm": 2.1406558326813103, "learning_rate": 1.7677332903740296e-07, "loss": 0.2293, "step": 824 }, { "epoch": 2.775601566871852, "grad_norm": 2.160818736774281, "learning_rate": 1.7164207891619823e-07, "loss": 0.2265, "step": 825 }, { "epoch": 2.7789591494124233, "grad_norm": 2.1902094381897177, "learning_rate": 1.6658510224765333e-07, "loss": 0.2253, "step": 826 }, { "epoch": 2.782316731952994, "grad_norm": 2.311103825756955, "learning_rate": 1.6160247682200813e-07, "loss": 0.2455, "step": 827 }, { "epoch": 2.7856743144935647, "grad_norm": 2.218083782097005, "learning_rate": 1.566942792857745e-07, "loss": 0.2233, "step": 828 }, { "epoch": 2.7890318970341355, "grad_norm": 2.2738455358176277, "learning_rate": 1.5186058514055912e-07, "loss": 0.2399, "step": 829 }, { "epoch": 2.792389479574706, "grad_norm": 2.193046078475537, "learning_rate": 1.471014687418998e-07, "loss": 0.219, "step": 830 }, { "epoch": 2.795747062115277, "grad_norm": 2.1748698706908773, "learning_rate": 1.4241700329812368e-07, "loss": 0.2208, "step": 831 }, { "epoch": 2.7991046446558476, "grad_norm": 2.25948599781455, "learning_rate": 1.3780726086922103e-07, "loss": 0.2205, "step": 832 }, { "epoch": 2.8024622271964184, "grad_norm": 2.1627090816106906, "learning_rate": 1.332723123657348e-07, "loss": 0.2155, "step": 833 }, { "epoch": 2.8058198097369895, "grad_norm": 2.1556941222951918, "learning_rate": 1.288122275476733e-07, "loss": 0.2209, "step": 834 }, { "epoch": 2.8091773922775602, "grad_norm": 2.199934634740392, "learning_rate": 1.244270750234333e-07, "loss": 0.2362, "step": 835 }, { "epoch": 2.812534974818131, "grad_norm": 2.265457178349347, "learning_rate": 1.201169222487464e-07, "loss": 0.2395, "step": 836 }, { "epoch": 2.8158925573587017, "grad_norm": 2.243639772812836, "learning_rate": 1.1588183552564247e-07, "loss": 0.2251, "step": 837 }, { "epoch": 2.8192501398992724, "grad_norm": 2.1527813152932174, "learning_rate": 1.1172188000142803e-07, "loss": 0.2434, "step": 838 }, { "epoch": 2.822607722439843, "grad_norm": 2.290823128697244, "learning_rate": 1.0763711966768453e-07, "loss": 0.2078, "step": 839 }, { "epoch": 2.8259653049804143, "grad_norm": 2.108924814493277, "learning_rate": 1.0362761735928372e-07, "loss": 0.2209, "step": 840 }, { "epoch": 2.829322887520985, "grad_norm": 2.2692353365060574, "learning_rate": 9.969343475342285e-08, "loss": 0.2413, "step": 841 }, { "epoch": 2.8326804700615558, "grad_norm": 2.320890260637866, "learning_rate": 9.583463236867318e-08, "loss": 0.2405, "step": 842 }, { "epoch": 2.8360380526021265, "grad_norm": 2.114878278170637, "learning_rate": 9.205126956405075e-08, "loss": 0.2122, "step": 843 }, { "epoch": 2.839395635142697, "grad_norm": 2.1364581985268356, "learning_rate": 8.834340453810375e-08, "loss": 0.2173, "step": 844 }, { "epoch": 2.842753217683268, "grad_norm": 2.327798293544225, "learning_rate": 8.471109432801494e-08, "loss": 0.2305, "step": 845 }, { "epoch": 2.8461108002238387, "grad_norm": 2.1784789640309614, "learning_rate": 8.11543948087279e-08, "loss": 0.2238, "step": 846 }, { "epoch": 2.8494683827644094, "grad_norm": 2.3102169432150563, "learning_rate": 7.76733606920832e-08, "loss": 0.2158, "step": 847 }, { "epoch": 2.8528259653049806, "grad_norm": 2.0676705717179877, "learning_rate": 7.426804552598088e-08, "loss": 0.2276, "step": 848 }, { "epoch": 2.8561835478455513, "grad_norm": 2.321852646027188, "learning_rate": 7.093850169355266e-08, "loss": 0.2412, "step": 849 }, { "epoch": 2.859541130386122, "grad_norm": 2.129502189630901, "learning_rate": 6.768478041236037e-08, "loss": 0.2102, "step": 850 }, { "epoch": 2.8628987129266927, "grad_norm": 2.432479560424763, "learning_rate": 6.450693173360445e-08, "loss": 0.219, "step": 851 }, { "epoch": 2.8662562954672635, "grad_norm": 2.246241784383759, "learning_rate": 6.140500454135668e-08, "loss": 0.2172, "step": 852 }, { "epoch": 2.8696138780078346, "grad_norm": 2.2178086164936777, "learning_rate": 5.8379046551807486e-08, "loss": 0.2355, "step": 853 }, { "epoch": 2.8729714605484054, "grad_norm": 2.079524766097902, "learning_rate": 5.542910431252935e-08, "loss": 0.2208, "step": 854 }, { "epoch": 2.876329043088976, "grad_norm": 2.230864146180223, "learning_rate": 5.255522320176565e-08, "loss": 0.2268, "step": 855 }, { "epoch": 2.879686625629547, "grad_norm": 2.1326498447596496, "learning_rate": 4.975744742772848e-08, "loss": 0.2182, "step": 856 }, { "epoch": 2.8830442081701175, "grad_norm": 2.1859329833247467, "learning_rate": 4.7035820027920284e-08, "loss": 0.2364, "step": 857 }, { "epoch": 2.8864017907106883, "grad_norm": 2.2460754412578505, "learning_rate": 4.439038286847164e-08, "loss": 0.2222, "step": 858 }, { "epoch": 2.889759373251259, "grad_norm": 2.0025699472964504, "learning_rate": 4.182117664349783e-08, "loss": 0.2378, "step": 859 }, { "epoch": 2.8931169557918297, "grad_norm": 2.3077570677713948, "learning_rate": 3.9328240874471624e-08, "loss": 0.2254, "step": 860 }, { "epoch": 2.8964745383324004, "grad_norm": 2.0838649182430946, "learning_rate": 3.6911613909616505e-08, "loss": 0.2131, "step": 861 }, { "epoch": 2.8998321208729716, "grad_norm": 2.2289951432899024, "learning_rate": 3.457133292331494e-08, "loss": 0.2288, "step": 862 }, { "epoch": 2.9031897034135423, "grad_norm": 2.1558465137002654, "learning_rate": 3.230743391553881e-08, "loss": 0.2181, "step": 863 }, { "epoch": 2.906547285954113, "grad_norm": 2.1450777488580774, "learning_rate": 3.011995171129545e-08, "loss": 0.2055, "step": 864 }, { "epoch": 2.9099048684946838, "grad_norm": 2.1709969058929484, "learning_rate": 2.8008919960090253e-08, "loss": 0.2475, "step": 865 }, { "epoch": 2.9132624510352545, "grad_norm": 1.9687943814735813, "learning_rate": 2.5974371135408792e-08, "loss": 0.2006, "step": 866 }, { "epoch": 2.9166200335758257, "grad_norm": 2.2277357096196204, "learning_rate": 2.401633653422053e-08, "loss": 0.2245, "step": 867 }, { "epoch": 2.9199776161163964, "grad_norm": 2.225179238729893, "learning_rate": 2.2134846276494205e-08, "loss": 0.2628, "step": 868 }, { "epoch": 2.923335198656967, "grad_norm": 2.233813263586234, "learning_rate": 2.032992930473543e-08, "loss": 0.2367, "step": 869 }, { "epoch": 2.926692781197538, "grad_norm": 2.192104481598423, "learning_rate": 1.860161338354205e-08, "loss": 0.221, "step": 870 }, { "epoch": 2.9300503637381086, "grad_norm": 2.2721116115550117, "learning_rate": 1.69499250991767e-08, "loss": 0.2229, "step": 871 }, { "epoch": 2.9334079462786793, "grad_norm": 2.069015317512474, "learning_rate": 1.5374889859157137e-08, "loss": 0.2026, "step": 872 }, { "epoch": 2.93676552881925, "grad_norm": 2.226727947199419, "learning_rate": 1.3876531891867106e-08, "loss": 0.2329, "step": 873 }, { "epoch": 2.9401231113598207, "grad_norm": 2.3274573956235862, "learning_rate": 1.2454874246181081e-08, "loss": 0.2307, "step": 874 }, { "epoch": 2.9434806939003915, "grad_norm": 2.2826046277606267, "learning_rate": 1.1109938791112328e-08, "loss": 0.2381, "step": 875 }, { "epoch": 2.9468382764409626, "grad_norm": 2.1682098280901574, "learning_rate": 9.841746215474845e-09, "loss": 0.2414, "step": 876 }, { "epoch": 2.9501958589815334, "grad_norm": 2.023824966326867, "learning_rate": 8.650316027566386e-09, "loss": 0.2139, "step": 877 }, { "epoch": 2.953553441522104, "grad_norm": 2.1379663757918106, "learning_rate": 7.535666554866483e-09, "loss": 0.2369, "step": 878 }, { "epoch": 2.956911024062675, "grad_norm": 2.338410010088983, "learning_rate": 6.497814943756675e-09, "loss": 0.2422, "step": 879 }, { "epoch": 2.9602686066032455, "grad_norm": 2.069543683427084, "learning_rate": 5.536777159254603e-09, "loss": 0.2214, "step": 880 }, { "epoch": 2.9636261891438167, "grad_norm": 2.3978229945182914, "learning_rate": 4.652567984770873e-09, "loss": 0.2285, "step": 881 }, { "epoch": 2.9669837716843874, "grad_norm": 2.177819105580462, "learning_rate": 3.845201021879241e-09, "loss": 0.2206, "step": 882 }, { "epoch": 2.970341354224958, "grad_norm": 2.232082224924113, "learning_rate": 3.1146886901090024e-09, "loss": 0.2267, "step": 883 }, { "epoch": 2.973698936765529, "grad_norm": 2.3050073009713854, "learning_rate": 2.461042226752919e-09, "loss": 0.2457, "step": 884 }, { "epoch": 2.9770565193060996, "grad_norm": 2.1979166006413515, "learning_rate": 1.8842716866956935e-09, "loss": 0.2412, "step": 885 }, { "epoch": 2.9804141018466703, "grad_norm": 2.255837191828035, "learning_rate": 1.3843859422574269e-09, "loss": 0.2331, "step": 886 }, { "epoch": 2.983771684387241, "grad_norm": 2.342133934859939, "learning_rate": 9.613926830587262e-10, "loss": 0.2382, "step": 887 }, { "epoch": 2.9871292669278118, "grad_norm": 2.280659517009034, "learning_rate": 6.152984159024655e-10, "loss": 0.2454, "step": 888 }, { "epoch": 2.9904868494683825, "grad_norm": 2.3892948295661443, "learning_rate": 3.4610846467109106e-10, "loss": 0.2257, "step": 889 }, { "epoch": 2.9938444320089537, "grad_norm": 2.2858765844038023, "learning_rate": 1.538269702494599e-10, "loss": 0.2244, "step": 890 }, { "epoch": 2.9972020145495244, "grad_norm": 2.298089033956775, "learning_rate": 3.8456890455451646e-11, "loss": 0.2323, "step": 891 }, { "epoch": 2.9972020145495244, "step": 891, "total_flos": 1.9585051653255987e+17, "train_loss": 0.5026469534026787, "train_runtime": 9007.4522, "train_samples_per_second": 4.76, "train_steps_per_second": 0.099 } ], "logging_steps": 1, "max_steps": 891, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9585051653255987e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }