{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9968, "eval_steps": 500, "global_step": 780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00256, "grad_norm": 9.365875273875238, "learning_rate": 1.25e-06, "loss": 1.3849, "step": 1 }, { "epoch": 0.00512, "grad_norm": 10.543859791920642, "learning_rate": 2.5e-06, "loss": 1.4809, "step": 2 }, { "epoch": 0.00768, "grad_norm": 8.02060315144952, "learning_rate": 3.7500000000000005e-06, "loss": 1.3488, "step": 3 }, { "epoch": 0.01024, "grad_norm": 6.20091121600503, "learning_rate": 5e-06, "loss": 1.1901, "step": 4 }, { "epoch": 0.0128, "grad_norm": 5.164630784639751, "learning_rate": 6.25e-06, "loss": 1.1539, "step": 5 }, { "epoch": 0.01536, "grad_norm": 4.321523771840541, "learning_rate": 7.500000000000001e-06, "loss": 1.0821, "step": 6 }, { "epoch": 0.01792, "grad_norm": 3.931935241507329, "learning_rate": 8.750000000000001e-06, "loss": 0.9753, "step": 7 }, { "epoch": 0.02048, "grad_norm": 3.5793289410771827, "learning_rate": 1e-05, "loss": 0.9717, "step": 8 }, { "epoch": 0.02304, "grad_norm": 4.256516218362947, "learning_rate": 1.125e-05, "loss": 0.9383, "step": 9 }, { "epoch": 0.0256, "grad_norm": 4.063010311058102, "learning_rate": 1.25e-05, "loss": 1.0224, "step": 10 }, { "epoch": 0.02816, "grad_norm": 4.272978299264522, "learning_rate": 1.375e-05, "loss": 0.8981, "step": 11 }, { "epoch": 0.03072, "grad_norm": 3.75622481926478, "learning_rate": 1.5000000000000002e-05, "loss": 0.9254, "step": 12 }, { "epoch": 0.03328, "grad_norm": 3.103766157100673, "learning_rate": 1.6250000000000002e-05, "loss": 0.839, "step": 13 }, { "epoch": 0.03584, "grad_norm": 3.0001871471139943, "learning_rate": 1.7500000000000002e-05, "loss": 0.8211, "step": 14 }, { "epoch": 0.0384, "grad_norm": 2.9538431919045105, "learning_rate": 1.8750000000000002e-05, "loss": 0.7587, "step": 15 }, { "epoch": 0.04096, "grad_norm": 3.1375029063705995, "learning_rate": 2e-05, "loss": 0.8149, "step": 16 }, { "epoch": 0.04352, "grad_norm": 3.086989542887256, "learning_rate": 1.9999915456072218e-05, "loss": 0.7836, "step": 17 }, { "epoch": 0.04608, "grad_norm": 3.0607437954707812, "learning_rate": 1.9999661825718408e-05, "loss": 0.8128, "step": 18 }, { "epoch": 0.04864, "grad_norm": 3.0400418762467196, "learning_rate": 1.9999239113227146e-05, "loss": 0.7667, "step": 19 }, { "epoch": 0.0512, "grad_norm": 2.82664678787635, "learning_rate": 1.9998647325745995e-05, "loss": 0.7523, "step": 20 }, { "epoch": 0.05376, "grad_norm": 2.8593372729121036, "learning_rate": 1.9997886473281355e-05, "loss": 0.7988, "step": 21 }, { "epoch": 0.05632, "grad_norm": 2.923977789288882, "learning_rate": 1.9996956568698325e-05, "loss": 0.7527, "step": 22 }, { "epoch": 0.05888, "grad_norm": 2.6178941828631337, "learning_rate": 1.9995857627720456e-05, "loss": 0.7438, "step": 23 }, { "epoch": 0.06144, "grad_norm": 2.5820149137237345, "learning_rate": 1.99945896689295e-05, "loss": 0.7248, "step": 24 }, { "epoch": 0.064, "grad_norm": 2.7301648737950224, "learning_rate": 1.9993152713765116e-05, "loss": 0.6776, "step": 25 }, { "epoch": 0.06656, "grad_norm": 2.5608433077601775, "learning_rate": 1.999154678652446e-05, "loss": 0.8218, "step": 26 }, { "epoch": 0.06912, "grad_norm": 3.0122091067319623, "learning_rate": 1.998977191436181e-05, "loss": 0.7429, "step": 27 }, { "epoch": 0.07168, "grad_norm": 2.5954170063820654, "learning_rate": 1.9987828127288105e-05, "loss": 0.6964, "step": 28 }, { "epoch": 0.07424, "grad_norm": 2.7446609447785004, "learning_rate": 1.998571545817042e-05, "loss": 0.7901, "step": 29 }, { "epoch": 0.0768, "grad_norm": 2.6357015498254794, "learning_rate": 1.9983433942731427e-05, "loss": 0.6802, "step": 30 }, { "epoch": 0.07936, "grad_norm": 3.0253927729854966, "learning_rate": 1.998098361954878e-05, "loss": 0.7918, "step": 31 }, { "epoch": 0.08192, "grad_norm": 2.729017093894504, "learning_rate": 1.9978364530054465e-05, "loss": 0.7565, "step": 32 }, { "epoch": 0.08448, "grad_norm": 2.551676529089632, "learning_rate": 1.9975576718534105e-05, "loss": 0.6984, "step": 33 }, { "epoch": 0.08704, "grad_norm": 2.375670501782899, "learning_rate": 1.9972620232126215e-05, "loss": 0.711, "step": 34 }, { "epoch": 0.0896, "grad_norm": 2.532366615420919, "learning_rate": 1.996949512082138e-05, "loss": 0.7398, "step": 35 }, { "epoch": 0.09216, "grad_norm": 2.531683530333962, "learning_rate": 1.996620143746144e-05, "loss": 0.7466, "step": 36 }, { "epoch": 0.09472, "grad_norm": 2.4816273494459598, "learning_rate": 1.9962739237738585e-05, "loss": 0.7701, "step": 37 }, { "epoch": 0.09728, "grad_norm": 2.5153024518723006, "learning_rate": 1.9959108580194403e-05, "loss": 0.765, "step": 38 }, { "epoch": 0.09984, "grad_norm": 2.2940292462059735, "learning_rate": 1.9955309526218903e-05, "loss": 0.7029, "step": 39 }, { "epoch": 0.1024, "grad_norm": 2.438121400214285, "learning_rate": 1.9951342140049483e-05, "loss": 0.7248, "step": 40 }, { "epoch": 0.10496, "grad_norm": 2.645240477077927, "learning_rate": 1.9947206488769812e-05, "loss": 0.7326, "step": 41 }, { "epoch": 0.10752, "grad_norm": 2.4038499208315445, "learning_rate": 1.9942902642308737e-05, "loss": 0.6941, "step": 42 }, { "epoch": 0.11008, "grad_norm": 2.5452147223708486, "learning_rate": 1.9938430673439075e-05, "loss": 0.7328, "step": 43 }, { "epoch": 0.11264, "grad_norm": 2.1971161649405815, "learning_rate": 1.993379065777639e-05, "loss": 0.6695, "step": 44 }, { "epoch": 0.1152, "grad_norm": 2.3455095751305945, "learning_rate": 1.9928982673777707e-05, "loss": 0.6808, "step": 45 }, { "epoch": 0.11776, "grad_norm": 2.205715323577842, "learning_rate": 1.9924006802740203e-05, "loss": 0.692, "step": 46 }, { "epoch": 0.12032, "grad_norm": 2.879359922240759, "learning_rate": 1.9918863128799805e-05, "loss": 0.7251, "step": 47 }, { "epoch": 0.12288, "grad_norm": 2.306572227959017, "learning_rate": 1.9913551738929803e-05, "loss": 0.734, "step": 48 }, { "epoch": 0.12544, "grad_norm": 3.010751503600904, "learning_rate": 1.9908072722939344e-05, "loss": 0.6667, "step": 49 }, { "epoch": 0.128, "grad_norm": 2.500047773905711, "learning_rate": 1.9902426173471933e-05, "loss": 0.7766, "step": 50 }, { "epoch": 0.13056, "grad_norm": 2.3211836211522945, "learning_rate": 1.9896612186003866e-05, "loss": 0.7355, "step": 51 }, { "epoch": 0.13312, "grad_norm": 2.1700803300583553, "learning_rate": 1.9890630858842614e-05, "loss": 0.6743, "step": 52 }, { "epoch": 0.13568, "grad_norm": 2.4572092973317416, "learning_rate": 1.988448229312515e-05, "loss": 0.6261, "step": 53 }, { "epoch": 0.13824, "grad_norm": 2.6489718279675167, "learning_rate": 1.9878166592816255e-05, "loss": 0.7783, "step": 54 }, { "epoch": 0.1408, "grad_norm": 1.9656310646192854, "learning_rate": 1.9871683864706752e-05, "loss": 0.6444, "step": 55 }, { "epoch": 0.14336, "grad_norm": 2.8772548921197374, "learning_rate": 1.9865034218411698e-05, "loss": 0.6984, "step": 56 }, { "epoch": 0.14592, "grad_norm": 2.2991328693562183, "learning_rate": 1.9858217766368538e-05, "loss": 0.7365, "step": 57 }, { "epoch": 0.14848, "grad_norm": 2.744151628806484, "learning_rate": 1.98512346238352e-05, "loss": 0.7668, "step": 58 }, { "epoch": 0.15104, "grad_norm": 2.126168797905977, "learning_rate": 1.984408490888814e-05, "loss": 0.6806, "step": 59 }, { "epoch": 0.1536, "grad_norm": 2.179164947113899, "learning_rate": 1.9836768742420355e-05, "loss": 0.7316, "step": 60 }, { "epoch": 0.15616, "grad_norm": 2.471943900346479, "learning_rate": 1.9829286248139334e-05, "loss": 0.8413, "step": 61 }, { "epoch": 0.15872, "grad_norm": 2.1155043013923898, "learning_rate": 1.9821637552564973e-05, "loss": 0.7118, "step": 62 }, { "epoch": 0.16128, "grad_norm": 2.511969558064074, "learning_rate": 1.9813822785027422e-05, "loss": 0.8131, "step": 63 }, { "epoch": 0.16384, "grad_norm": 2.967819167847234, "learning_rate": 1.9805842077664913e-05, "loss": 0.7301, "step": 64 }, { "epoch": 0.1664, "grad_norm": 2.8259470035066485, "learning_rate": 1.9797695565421507e-05, "loss": 0.7323, "step": 65 }, { "epoch": 0.16896, "grad_norm": 2.302591195035351, "learning_rate": 1.978938338604484e-05, "loss": 0.6715, "step": 66 }, { "epoch": 0.17152, "grad_norm": 2.3196544383121958, "learning_rate": 1.978090568008377e-05, "loss": 0.685, "step": 67 }, { "epoch": 0.17408, "grad_norm": 2.624524610568192, "learning_rate": 1.9772262590886006e-05, "loss": 0.6869, "step": 68 }, { "epoch": 0.17664, "grad_norm": 2.6543764757775006, "learning_rate": 1.9763454264595694e-05, "loss": 0.8062, "step": 69 }, { "epoch": 0.1792, "grad_norm": 2.434462493719978, "learning_rate": 1.975448085015093e-05, "loss": 0.727, "step": 70 }, { "epoch": 0.18176, "grad_norm": 4.222411149653797, "learning_rate": 1.9745342499281254e-05, "loss": 0.7087, "step": 71 }, { "epoch": 0.18432, "grad_norm": 2.1852577874422963, "learning_rate": 1.9736039366505087e-05, "loss": 0.6602, "step": 72 }, { "epoch": 0.18688, "grad_norm": 2.366328418247487, "learning_rate": 1.9726571609127097e-05, "loss": 0.7275, "step": 73 }, { "epoch": 0.18944, "grad_norm": 2.4876758296788193, "learning_rate": 1.9716939387235573e-05, "loss": 0.6976, "step": 74 }, { "epoch": 0.192, "grad_norm": 2.4805839086487382, "learning_rate": 1.9707142863699687e-05, "loss": 0.6665, "step": 75 }, { "epoch": 0.19456, "grad_norm": 2.427841997203471, "learning_rate": 1.969718220416675e-05, "loss": 0.6817, "step": 76 }, { "epoch": 0.19712, "grad_norm": 2.386419228571022, "learning_rate": 1.9687057577059422e-05, "loss": 0.7231, "step": 77 }, { "epoch": 0.19968, "grad_norm": 2.1832161777994394, "learning_rate": 1.9676769153572853e-05, "loss": 0.6697, "step": 78 }, { "epoch": 0.20224, "grad_norm": 1.913933923954127, "learning_rate": 1.966631710767178e-05, "loss": 0.6854, "step": 79 }, { "epoch": 0.2048, "grad_norm": 2.2129580021231066, "learning_rate": 1.965570161608762e-05, "loss": 0.6902, "step": 80 }, { "epoch": 0.20736, "grad_norm": 2.3913638942796283, "learning_rate": 1.9644922858315432e-05, "loss": 0.6949, "step": 81 }, { "epoch": 0.20992, "grad_norm": 2.523715832051759, "learning_rate": 1.9633981016610926e-05, "loss": 0.6668, "step": 82 }, { "epoch": 0.21248, "grad_norm": 2.6639369089204794, "learning_rate": 1.9622876275987355e-05, "loss": 0.7303, "step": 83 }, { "epoch": 0.21504, "grad_norm": 2.3755204875190827, "learning_rate": 1.9611608824212395e-05, "loss": 0.7113, "step": 84 }, { "epoch": 0.2176, "grad_norm": 2.252602926327337, "learning_rate": 1.9600178851804977e-05, "loss": 0.7382, "step": 85 }, { "epoch": 0.22016, "grad_norm": 2.2611177708517363, "learning_rate": 1.958858655203205e-05, "loss": 0.6784, "step": 86 }, { "epoch": 0.22272, "grad_norm": 2.3668773245432537, "learning_rate": 1.9576832120905323e-05, "loss": 0.7523, "step": 87 }, { "epoch": 0.22528, "grad_norm": 2.0990171366400414, "learning_rate": 1.9564915757177955e-05, "loss": 0.6191, "step": 88 }, { "epoch": 0.22784, "grad_norm": 2.1035732112703722, "learning_rate": 1.9552837662341182e-05, "loss": 0.6708, "step": 89 }, { "epoch": 0.2304, "grad_norm": 2.333412145423922, "learning_rate": 1.954059804062092e-05, "loss": 0.677, "step": 90 }, { "epoch": 0.23296, "grad_norm": 2.1825211355833405, "learning_rate": 1.952819709897431e-05, "loss": 0.6866, "step": 91 }, { "epoch": 0.23552, "grad_norm": 2.2533551325921866, "learning_rate": 1.951563504708622e-05, "loss": 0.5733, "step": 92 }, { "epoch": 0.23808, "grad_norm": 2.4138760679849174, "learning_rate": 1.9502912097365677e-05, "loss": 0.6838, "step": 93 }, { "epoch": 0.24064, "grad_norm": 2.7111028570055598, "learning_rate": 1.9490028464942322e-05, "loss": 0.6759, "step": 94 }, { "epoch": 0.2432, "grad_norm": 2.303650092378439, "learning_rate": 1.9476984367662724e-05, "loss": 0.6187, "step": 95 }, { "epoch": 0.24576, "grad_norm": 2.36652208711437, "learning_rate": 1.9463780026086735e-05, "loss": 0.7569, "step": 96 }, { "epoch": 0.24832, "grad_norm": 1.957291438435352, "learning_rate": 1.9450415663483734e-05, "loss": 0.6306, "step": 97 }, { "epoch": 0.25088, "grad_norm": 2.5876692343060443, "learning_rate": 1.9436891505828854e-05, "loss": 0.743, "step": 98 }, { "epoch": 0.25344, "grad_norm": 2.4337909184554642, "learning_rate": 1.9423207781799186e-05, "loss": 0.6156, "step": 99 }, { "epoch": 0.256, "grad_norm": 2.1443313808235698, "learning_rate": 1.9409364722769882e-05, "loss": 0.6749, "step": 100 }, { "epoch": 0.25856, "grad_norm": 2.542769952057182, "learning_rate": 1.939536256281026e-05, "loss": 0.7428, "step": 101 }, { "epoch": 0.26112, "grad_norm": 2.3379731947006865, "learning_rate": 1.938120153867983e-05, "loss": 0.7159, "step": 102 }, { "epoch": 0.26368, "grad_norm": 2.1808691357013954, "learning_rate": 1.936688188982433e-05, "loss": 0.7103, "step": 103 }, { "epoch": 0.26624, "grad_norm": 1.916729367685334, "learning_rate": 1.9352403858371618e-05, "loss": 0.6595, "step": 104 }, { "epoch": 0.2688, "grad_norm": 2.014231671417234, "learning_rate": 1.9337767689127628e-05, "loss": 0.6411, "step": 105 }, { "epoch": 0.27136, "grad_norm": 1.9129910147877291, "learning_rate": 1.9322973629572207e-05, "loss": 0.6819, "step": 106 }, { "epoch": 0.27392, "grad_norm": 2.077225498483606, "learning_rate": 1.9308021929854934e-05, "loss": 0.6587, "step": 107 }, { "epoch": 0.27648, "grad_norm": 2.0796212070981865, "learning_rate": 1.9292912842790893e-05, "loss": 0.7157, "step": 108 }, { "epoch": 0.27904, "grad_norm": 2.2161612611915147, "learning_rate": 1.92776466238564e-05, "loss": 0.6943, "step": 109 }, { "epoch": 0.2816, "grad_norm": 2.1549077684507396, "learning_rate": 1.9262223531184678e-05, "loss": 0.6259, "step": 110 }, { "epoch": 0.28416, "grad_norm": 2.0654699290261647, "learning_rate": 1.924664382556149e-05, "loss": 0.6057, "step": 111 }, { "epoch": 0.28672, "grad_norm": 2.142110152136122, "learning_rate": 1.9230907770420737e-05, "loss": 0.6625, "step": 112 }, { "epoch": 0.28928, "grad_norm": 1.9897544774269862, "learning_rate": 1.9215015631840005e-05, "loss": 0.6164, "step": 113 }, { "epoch": 0.29184, "grad_norm": 2.2587245433808594, "learning_rate": 1.9198967678536054e-05, "loss": 0.6616, "step": 114 }, { "epoch": 0.2944, "grad_norm": 2.342586021827016, "learning_rate": 1.918276418186028e-05, "loss": 0.6494, "step": 115 }, { "epoch": 0.29696, "grad_norm": 1.9302093197641312, "learning_rate": 1.916640541579415e-05, "loss": 0.65, "step": 116 }, { "epoch": 0.29952, "grad_norm": 2.115112025916886, "learning_rate": 1.9149891656944513e-05, "loss": 0.6374, "step": 117 }, { "epoch": 0.30208, "grad_norm": 2.1565182972269032, "learning_rate": 1.913322318453899e-05, "loss": 0.699, "step": 118 }, { "epoch": 0.30464, "grad_norm": 2.11350736146981, "learning_rate": 1.9116400280421196e-05, "loss": 0.6088, "step": 119 }, { "epoch": 0.3072, "grad_norm": 2.344994034519172, "learning_rate": 1.9099423229046015e-05, "loss": 0.752, "step": 120 }, { "epoch": 0.30976, "grad_norm": 2.093068646948046, "learning_rate": 1.9082292317474766e-05, "loss": 0.6185, "step": 121 }, { "epoch": 0.31232, "grad_norm": 2.316319871123848, "learning_rate": 1.9065007835370358e-05, "loss": 0.7374, "step": 122 }, { "epoch": 0.31488, "grad_norm": 2.096344829293569, "learning_rate": 1.90475700749924e-05, "loss": 0.672, "step": 123 }, { "epoch": 0.31744, "grad_norm": 2.5526545007602235, "learning_rate": 1.902997933119223e-05, "loss": 0.7168, "step": 124 }, { "epoch": 0.32, "grad_norm": 2.234512228398607, "learning_rate": 1.9012235901407976e-05, "loss": 0.6724, "step": 125 }, { "epoch": 0.32256, "grad_norm": 2.221990498534791, "learning_rate": 1.8994340085659474e-05, "loss": 0.6165, "step": 126 }, { "epoch": 0.32512, "grad_norm": 2.1511357630477526, "learning_rate": 1.897629218654325e-05, "loss": 0.6733, "step": 127 }, { "epoch": 0.32768, "grad_norm": 2.122487155014188, "learning_rate": 1.8958092509227347e-05, "loss": 0.6036, "step": 128 }, { "epoch": 0.33024, "grad_norm": 2.0229651456552924, "learning_rate": 1.8939741361446207e-05, "loss": 0.581, "step": 129 }, { "epoch": 0.3328, "grad_norm": 2.1911603701017226, "learning_rate": 1.8921239053495465e-05, "loss": 0.7016, "step": 130 }, { "epoch": 0.33536, "grad_norm": 2.0676857284142804, "learning_rate": 1.8902585898226663e-05, "loss": 0.6983, "step": 131 }, { "epoch": 0.33792, "grad_norm": 2.264571405087662, "learning_rate": 1.888378221104201e-05, "loss": 0.724, "step": 132 }, { "epoch": 0.34048, "grad_norm": 2.051720164060425, "learning_rate": 1.8864828309889022e-05, "loss": 0.6245, "step": 133 }, { "epoch": 0.34304, "grad_norm": 2.2288895604426116, "learning_rate": 1.8845724515255147e-05, "loss": 0.6611, "step": 134 }, { "epoch": 0.3456, "grad_norm": 2.008766119781776, "learning_rate": 1.8826471150162354e-05, "loss": 0.6202, "step": 135 }, { "epoch": 0.34816, "grad_norm": 2.4451017728151045, "learning_rate": 1.880706854016166e-05, "loss": 0.714, "step": 136 }, { "epoch": 0.35072, "grad_norm": 2.279883321646425, "learning_rate": 1.8787517013327642e-05, "loss": 0.6921, "step": 137 }, { "epoch": 0.35328, "grad_norm": 1.8819123586138218, "learning_rate": 1.876781690025287e-05, "loss": 0.6242, "step": 138 }, { "epoch": 0.35584, "grad_norm": 2.209480600689614, "learning_rate": 1.8747968534042333e-05, "loss": 0.685, "step": 139 }, { "epoch": 0.3584, "grad_norm": 2.25151039966994, "learning_rate": 1.87279722503078e-05, "loss": 0.6191, "step": 140 }, { "epoch": 0.36096, "grad_norm": 1.9964258482072739, "learning_rate": 1.8707828387162145e-05, "loss": 0.6209, "step": 141 }, { "epoch": 0.36352, "grad_norm": 2.06019337674458, "learning_rate": 1.8687537285213627e-05, "loss": 0.6007, "step": 142 }, { "epoch": 0.36608, "grad_norm": 2.2157618609318264, "learning_rate": 1.866709928756014e-05, "loss": 0.6939, "step": 143 }, { "epoch": 0.36864, "grad_norm": 2.0119018007709553, "learning_rate": 1.8646514739783404e-05, "loss": 0.6719, "step": 144 }, { "epoch": 0.3712, "grad_norm": 2.2377675426901287, "learning_rate": 1.8625783989943124e-05, "loss": 0.6735, "step": 145 }, { "epoch": 0.37376, "grad_norm": 1.8794121110318662, "learning_rate": 1.8604907388571097e-05, "loss": 0.6737, "step": 146 }, { "epoch": 0.37632, "grad_norm": 2.0370150523104704, "learning_rate": 1.8583885288665307e-05, "loss": 0.7013, "step": 147 }, { "epoch": 0.37888, "grad_norm": 2.0600475834352445, "learning_rate": 1.8562718045683933e-05, "loss": 0.6418, "step": 148 }, { "epoch": 0.38144, "grad_norm": 2.0885287172547096, "learning_rate": 1.854140601753934e-05, "loss": 0.6588, "step": 149 }, { "epoch": 0.384, "grad_norm": 2.1901572667391362, "learning_rate": 1.8519949564592047e-05, "loss": 0.6395, "step": 150 }, { "epoch": 0.38656, "grad_norm": 2.223773214792353, "learning_rate": 1.8498349049644614e-05, "loss": 0.7024, "step": 151 }, { "epoch": 0.38912, "grad_norm": 1.9602921943490395, "learning_rate": 1.8476604837935515e-05, "loss": 0.6104, "step": 152 }, { "epoch": 0.39168, "grad_norm": 1.7818478560580842, "learning_rate": 1.8454717297132966e-05, "loss": 0.6368, "step": 153 }, { "epoch": 0.39424, "grad_norm": 2.0816014287167466, "learning_rate": 1.8432686797328697e-05, "loss": 0.7267, "step": 154 }, { "epoch": 0.3968, "grad_norm": 2.0299850911079, "learning_rate": 1.8410513711031713e-05, "loss": 0.6158, "step": 155 }, { "epoch": 0.39936, "grad_norm": 1.7411644336765735, "learning_rate": 1.8388198413161962e-05, "loss": 0.5699, "step": 156 }, { "epoch": 0.40192, "grad_norm": 1.925361120184857, "learning_rate": 1.8365741281044046e-05, "loss": 0.6232, "step": 157 }, { "epoch": 0.40448, "grad_norm": 2.083962310552385, "learning_rate": 1.8343142694400784e-05, "loss": 0.6156, "step": 158 }, { "epoch": 0.40704, "grad_norm": 2.3609597131509945, "learning_rate": 1.8320403035346834e-05, "loss": 0.7054, "step": 159 }, { "epoch": 0.4096, "grad_norm": 2.060537565443389, "learning_rate": 1.829752268838222e-05, "loss": 0.6445, "step": 160 }, { "epoch": 0.41216, "grad_norm": 2.022152022387066, "learning_rate": 1.8274502040385814e-05, "loss": 0.5156, "step": 161 }, { "epoch": 0.41472, "grad_norm": 2.1533000714293022, "learning_rate": 1.8251341480608823e-05, "loss": 0.6786, "step": 162 }, { "epoch": 0.41728, "grad_norm": 2.221504339814079, "learning_rate": 1.8228041400668185e-05, "loss": 0.6856, "step": 163 }, { "epoch": 0.41984, "grad_norm": 2.123873387795693, "learning_rate": 1.8204602194539948e-05, "loss": 0.6883, "step": 164 }, { "epoch": 0.4224, "grad_norm": 1.9816040211257193, "learning_rate": 1.8181024258552633e-05, "loss": 0.651, "step": 165 }, { "epoch": 0.42496, "grad_norm": 1.9951719680446292, "learning_rate": 1.8157307991380496e-05, "loss": 0.63, "step": 166 }, { "epoch": 0.42752, "grad_norm": 1.8866496489390194, "learning_rate": 1.8133453794036816e-05, "loss": 0.5996, "step": 167 }, { "epoch": 0.43008, "grad_norm": 2.0782535399638924, "learning_rate": 1.81094620698671e-05, "loss": 0.68, "step": 168 }, { "epoch": 0.43264, "grad_norm": 2.015837845490803, "learning_rate": 1.8085333224542263e-05, "loss": 0.6587, "step": 169 }, { "epoch": 0.4352, "grad_norm": 1.9606661811078803, "learning_rate": 1.806106766605178e-05, "loss": 0.6543, "step": 170 }, { "epoch": 0.43776, "grad_norm": 2.0108460061760463, "learning_rate": 1.8036665804696777e-05, "loss": 0.7058, "step": 171 }, { "epoch": 0.44032, "grad_norm": 2.3760532048781773, "learning_rate": 1.8012128053083097e-05, "loss": 0.6896, "step": 172 }, { "epoch": 0.44288, "grad_norm": 2.068909616846273, "learning_rate": 1.798745482611431e-05, "loss": 0.6617, "step": 173 }, { "epoch": 0.44544, "grad_norm": 2.174520441881339, "learning_rate": 1.7962646540984733e-05, "loss": 0.6053, "step": 174 }, { "epoch": 0.448, "grad_norm": 1.8665524812908805, "learning_rate": 1.7937703617172326e-05, "loss": 0.5854, "step": 175 }, { "epoch": 0.45056, "grad_norm": 2.0307815923129144, "learning_rate": 1.7912626476431648e-05, "loss": 0.7031, "step": 176 }, { "epoch": 0.45312, "grad_norm": 2.3551765798842754, "learning_rate": 1.7887415542786694e-05, "loss": 0.6616, "step": 177 }, { "epoch": 0.45568, "grad_norm": 2.316754252058375, "learning_rate": 1.786207124252373e-05, "loss": 0.6539, "step": 178 }, { "epoch": 0.45824, "grad_norm": 1.8999783082061619, "learning_rate": 1.7836594004184097e-05, "loss": 0.6462, "step": 179 }, { "epoch": 0.4608, "grad_norm": 2.1452149051911142, "learning_rate": 1.7810984258556955e-05, "loss": 0.6678, "step": 180 }, { "epoch": 0.46336, "grad_norm": 1.8647430589477332, "learning_rate": 1.7785242438672002e-05, "loss": 0.654, "step": 181 }, { "epoch": 0.46592, "grad_norm": 2.1243120883953615, "learning_rate": 1.7759368979792145e-05, "loss": 0.6092, "step": 182 }, { "epoch": 0.46848, "grad_norm": 2.2473766249395015, "learning_rate": 1.773336431940616e-05, "loss": 0.6942, "step": 183 }, { "epoch": 0.47104, "grad_norm": 1.9868382907140005, "learning_rate": 1.770722889722126e-05, "loss": 0.6385, "step": 184 }, { "epoch": 0.4736, "grad_norm": 1.8495835605726167, "learning_rate": 1.7680963155155712e-05, "loss": 0.5967, "step": 185 }, { "epoch": 0.47616, "grad_norm": 2.012001328207229, "learning_rate": 1.76545675373313e-05, "loss": 0.5712, "step": 186 }, { "epoch": 0.47872, "grad_norm": 2.028400500138602, "learning_rate": 1.7628042490065877e-05, "loss": 0.6759, "step": 187 }, { "epoch": 0.48128, "grad_norm": 2.1686891556942474, "learning_rate": 1.760138846186577e-05, "loss": 0.6193, "step": 188 }, { "epoch": 0.48384, "grad_norm": 1.8958996376109047, "learning_rate": 1.7574605903418226e-05, "loss": 0.5843, "step": 189 }, { "epoch": 0.4864, "grad_norm": 1.8940614758684058, "learning_rate": 1.7547695267583794e-05, "loss": 0.5822, "step": 190 }, { "epoch": 0.48896, "grad_norm": 2.024651976413743, "learning_rate": 1.7520657009388634e-05, "loss": 0.6215, "step": 191 }, { "epoch": 0.49152, "grad_norm": 1.9418524552652976, "learning_rate": 1.749349158601686e-05, "loss": 0.5839, "step": 192 }, { "epoch": 0.49408, "grad_norm": 1.8295776930635907, "learning_rate": 1.7466199456802784e-05, "loss": 0.5904, "step": 193 }, { "epoch": 0.49664, "grad_norm": 1.797136193223393, "learning_rate": 1.743878108322318e-05, "loss": 0.526, "step": 194 }, { "epoch": 0.4992, "grad_norm": 2.2173879719468297, "learning_rate": 1.741123692888943e-05, "loss": 0.6384, "step": 195 }, { "epoch": 0.50176, "grad_norm": 1.9844216586253793, "learning_rate": 1.738356745953975e-05, "loss": 0.6999, "step": 196 }, { "epoch": 0.50432, "grad_norm": 2.0317553296200987, "learning_rate": 1.7355773143031247e-05, "loss": 0.6412, "step": 197 }, { "epoch": 0.50688, "grad_norm": 1.8020878757675494, "learning_rate": 1.7327854449332067e-05, "loss": 0.5577, "step": 198 }, { "epoch": 0.50944, "grad_norm": 2.1986751893306664, "learning_rate": 1.729981185051342e-05, "loss": 0.6713, "step": 199 }, { "epoch": 0.512, "grad_norm": 2.083058419584056, "learning_rate": 1.7271645820741586e-05, "loss": 0.6577, "step": 200 }, { "epoch": 0.51456, "grad_norm": 2.124603731918396, "learning_rate": 1.7243356836269928e-05, "loss": 0.6368, "step": 201 }, { "epoch": 0.51712, "grad_norm": 1.9258241573803725, "learning_rate": 1.7214945375430816e-05, "loss": 0.6209, "step": 202 }, { "epoch": 0.51968, "grad_norm": 2.005751012165097, "learning_rate": 1.718641191862755e-05, "loss": 0.6142, "step": 203 }, { "epoch": 0.52224, "grad_norm": 1.9212135746911165, "learning_rate": 1.715775694832623e-05, "loss": 0.5725, "step": 204 }, { "epoch": 0.5248, "grad_norm": 1.830132532496078, "learning_rate": 1.7128980949047607e-05, "loss": 0.5957, "step": 205 }, { "epoch": 0.52736, "grad_norm": 1.9564593565799755, "learning_rate": 1.7100084407358882e-05, "loss": 0.673, "step": 206 }, { "epoch": 0.52992, "grad_norm": 1.9307648613118853, "learning_rate": 1.7071067811865477e-05, "loss": 0.5983, "step": 207 }, { "epoch": 0.53248, "grad_norm": 2.2224928140736084, "learning_rate": 1.7041931653202788e-05, "loss": 0.6069, "step": 208 }, { "epoch": 0.53504, "grad_norm": 2.251661406658001, "learning_rate": 1.7012676424027873e-05, "loss": 0.6489, "step": 209 }, { "epoch": 0.5376, "grad_norm": 1.9038603600791324, "learning_rate": 1.6983302619011125e-05, "loss": 0.5801, "step": 210 }, { "epoch": 0.54016, "grad_norm": 1.8456987114605605, "learning_rate": 1.6953810734827917e-05, "loss": 0.5935, "step": 211 }, { "epoch": 0.54272, "grad_norm": 1.922915287325509, "learning_rate": 1.6924201270150194e-05, "loss": 0.5463, "step": 212 }, { "epoch": 0.54528, "grad_norm": 1.9097147283983678, "learning_rate": 1.6894474725638043e-05, "loss": 0.6044, "step": 213 }, { "epoch": 0.54784, "grad_norm": 2.102752688389468, "learning_rate": 1.686463160393123e-05, "loss": 0.67, "step": 214 }, { "epoch": 0.5504, "grad_norm": 1.9917213811536882, "learning_rate": 1.6834672409640705e-05, "loss": 0.6429, "step": 215 }, { "epoch": 0.55296, "grad_norm": 1.974969043991725, "learning_rate": 1.680459764934006e-05, "loss": 0.6335, "step": 216 }, { "epoch": 0.55552, "grad_norm": 2.196550392557438, "learning_rate": 1.677440783155696e-05, "loss": 0.6468, "step": 217 }, { "epoch": 0.55808, "grad_norm": 1.8154736707811576, "learning_rate": 1.6744103466764566e-05, "loss": 0.576, "step": 218 }, { "epoch": 0.56064, "grad_norm": 2.1050477732929633, "learning_rate": 1.671368506737288e-05, "loss": 0.6891, "step": 219 }, { "epoch": 0.5632, "grad_norm": 1.7992925292913355, "learning_rate": 1.6683153147720098e-05, "loss": 0.5747, "step": 220 }, { "epoch": 0.56576, "grad_norm": 1.8783520446755098, "learning_rate": 1.66525082240639e-05, "loss": 0.5986, "step": 221 }, { "epoch": 0.56832, "grad_norm": 1.9949767049473588, "learning_rate": 1.6621750814572728e-05, "loss": 0.5943, "step": 222 }, { "epoch": 0.57088, "grad_norm": 2.1034251822329537, "learning_rate": 1.6590881439317025e-05, "loss": 0.6973, "step": 223 }, { "epoch": 0.57344, "grad_norm": 1.9209578446003501, "learning_rate": 1.6559900620260435e-05, "loss": 0.6694, "step": 224 }, { "epoch": 0.576, "grad_norm": 1.9220786921566595, "learning_rate": 1.6528808881250986e-05, "loss": 0.6559, "step": 225 }, { "epoch": 0.57856, "grad_norm": 2.0292167831172225, "learning_rate": 1.6497606748012227e-05, "loss": 0.5997, "step": 226 }, { "epoch": 0.58112, "grad_norm": 2.113629211379267, "learning_rate": 1.646629474813433e-05, "loss": 0.6631, "step": 227 }, { "epoch": 0.58368, "grad_norm": 1.8495528915876476, "learning_rate": 1.64348734110652e-05, "loss": 0.6076, "step": 228 }, { "epoch": 0.58624, "grad_norm": 2.2273973566204304, "learning_rate": 1.6403343268101476e-05, "loss": 0.6892, "step": 229 }, { "epoch": 0.5888, "grad_norm": 2.18163885450235, "learning_rate": 1.6371704852379587e-05, "loss": 0.6187, "step": 230 }, { "epoch": 0.59136, "grad_norm": 1.7637267715802756, "learning_rate": 1.6339958698866716e-05, "loss": 0.5672, "step": 231 }, { "epoch": 0.59392, "grad_norm": 2.0590984564497234, "learning_rate": 1.6308105344351776e-05, "loss": 0.606, "step": 232 }, { "epoch": 0.59648, "grad_norm": 2.1250870681801692, "learning_rate": 1.6276145327436298e-05, "loss": 0.5996, "step": 233 }, { "epoch": 0.59904, "grad_norm": 1.8078047249763116, "learning_rate": 1.6244079188525358e-05, "loss": 0.5606, "step": 234 }, { "epoch": 0.6016, "grad_norm": 1.881044538945948, "learning_rate": 1.621190746981842e-05, "loss": 0.5877, "step": 235 }, { "epoch": 0.60416, "grad_norm": 1.92455732840866, "learning_rate": 1.617963071530018e-05, "loss": 0.5617, "step": 236 }, { "epoch": 0.60672, "grad_norm": 1.8832244455059177, "learning_rate": 1.6147249470731355e-05, "loss": 0.5666, "step": 237 }, { "epoch": 0.60928, "grad_norm": 2.082069857382887, "learning_rate": 1.6114764283639467e-05, "loss": 0.5915, "step": 238 }, { "epoch": 0.61184, "grad_norm": 2.001850994059788, "learning_rate": 1.608217570330958e-05, "loss": 0.6164, "step": 239 }, { "epoch": 0.6144, "grad_norm": 2.022238936162546, "learning_rate": 1.6049484280775012e-05, "loss": 0.5671, "step": 240 }, { "epoch": 0.61696, "grad_norm": 1.9972354860485817, "learning_rate": 1.601669056880801e-05, "loss": 0.5998, "step": 241 }, { "epoch": 0.61952, "grad_norm": 1.9148720082205921, "learning_rate": 1.598379512191042e-05, "loss": 0.5544, "step": 242 }, { "epoch": 0.62208, "grad_norm": 2.1123296116387644, "learning_rate": 1.5950798496304303e-05, "loss": 0.6324, "step": 243 }, { "epoch": 0.62464, "grad_norm": 2.093610037072655, "learning_rate": 1.591770124992252e-05, "loss": 0.6248, "step": 244 }, { "epoch": 0.6272, "grad_norm": 2.009063600874344, "learning_rate": 1.5884503942399314e-05, "loss": 0.537, "step": 245 }, { "epoch": 0.62976, "grad_norm": 2.0328862721575938, "learning_rate": 1.585120713506084e-05, "loss": 0.6311, "step": 246 }, { "epoch": 0.63232, "grad_norm": 1.797706266437, "learning_rate": 1.5817811390915676e-05, "loss": 0.6263, "step": 247 }, { "epoch": 0.63488, "grad_norm": 1.8701784871242504, "learning_rate": 1.5784317274645294e-05, "loss": 0.5819, "step": 248 }, { "epoch": 0.63744, "grad_norm": 1.8585542262348, "learning_rate": 1.575072535259452e-05, "loss": 0.5841, "step": 249 }, { "epoch": 0.64, "grad_norm": 1.9058528200620377, "learning_rate": 1.571703619276197e-05, "loss": 0.5926, "step": 250 }, { "epoch": 0.64256, "grad_norm": 1.8283834697734105, "learning_rate": 1.5683250364790415e-05, "loss": 0.6193, "step": 251 }, { "epoch": 0.64512, "grad_norm": 2.0581163017398225, "learning_rate": 1.5649368439957182e-05, "loss": 0.6108, "step": 252 }, { "epoch": 0.64768, "grad_norm": 1.8346847863840565, "learning_rate": 1.5615390991164465e-05, "loss": 0.5812, "step": 253 }, { "epoch": 0.65024, "grad_norm": 1.9374483799503257, "learning_rate": 1.5581318592929665e-05, "loss": 0.5771, "step": 254 }, { "epoch": 0.6528, "grad_norm": 1.8804501132111215, "learning_rate": 1.5547151821375654e-05, "loss": 0.5901, "step": 255 }, { "epoch": 0.65536, "grad_norm": 1.6759367916217955, "learning_rate": 1.5512891254221046e-05, "loss": 0.5727, "step": 256 }, { "epoch": 0.65792, "grad_norm": 1.877781648410285, "learning_rate": 1.5478537470770425e-05, "loss": 0.5859, "step": 257 }, { "epoch": 0.66048, "grad_norm": 2.0006799639933384, "learning_rate": 1.5444091051904545e-05, "loss": 0.6468, "step": 258 }, { "epoch": 0.66304, "grad_norm": 1.8416628315812464, "learning_rate": 1.540955258007052e-05, "loss": 0.5418, "step": 259 }, { "epoch": 0.6656, "grad_norm": 1.8634187238978763, "learning_rate": 1.537492263927196e-05, "loss": 0.5494, "step": 260 }, { "epoch": 0.66816, "grad_norm": 1.9439207615090717, "learning_rate": 1.5340201815059116e-05, "loss": 0.5393, "step": 261 }, { "epoch": 0.67072, "grad_norm": 1.8477228376076562, "learning_rate": 1.5305390694518953e-05, "loss": 0.5406, "step": 262 }, { "epoch": 0.67328, "grad_norm": 2.071111010740906, "learning_rate": 1.5270489866265236e-05, "loss": 0.612, "step": 263 }, { "epoch": 0.67584, "grad_norm": 1.9934090221262504, "learning_rate": 1.52354999204286e-05, "loss": 0.5988, "step": 264 }, { "epoch": 0.6784, "grad_norm": 2.028728814629187, "learning_rate": 1.5200421448646525e-05, "loss": 0.5714, "step": 265 }, { "epoch": 0.68096, "grad_norm": 2.030984918895053, "learning_rate": 1.5165255044053373e-05, "loss": 0.5977, "step": 266 }, { "epoch": 0.68352, "grad_norm": 1.8548332091193185, "learning_rate": 1.5130001301270334e-05, "loss": 0.5617, "step": 267 }, { "epoch": 0.68608, "grad_norm": 1.8656760202026166, "learning_rate": 1.509466081639539e-05, "loss": 0.578, "step": 268 }, { "epoch": 0.68864, "grad_norm": 2.065217551069724, "learning_rate": 1.5059234186993217e-05, "loss": 0.6552, "step": 269 }, { "epoch": 0.6912, "grad_norm": 1.9936665359457852, "learning_rate": 1.5023722012085098e-05, "loss": 0.6026, "step": 270 }, { "epoch": 0.69376, "grad_norm": 1.7581904286308514, "learning_rate": 1.4988124892138782e-05, "loss": 0.5201, "step": 271 }, { "epoch": 0.69632, "grad_norm": 2.0162120027330284, "learning_rate": 1.4952443429058334e-05, "loss": 0.5915, "step": 272 }, { "epoch": 0.69888, "grad_norm": 2.0537671862981695, "learning_rate": 1.4916678226173966e-05, "loss": 0.5461, "step": 273 }, { "epoch": 0.70144, "grad_norm": 2.0260603366887295, "learning_rate": 1.4880829888231818e-05, "loss": 0.6289, "step": 274 }, { "epoch": 0.704, "grad_norm": 1.840168296016021, "learning_rate": 1.4844899021383756e-05, "loss": 0.5839, "step": 275 }, { "epoch": 0.70656, "grad_norm": 1.9912121443060273, "learning_rate": 1.4808886233177096e-05, "loss": 0.608, "step": 276 }, { "epoch": 0.70912, "grad_norm": 1.8192068126971754, "learning_rate": 1.4772792132544354e-05, "loss": 0.5602, "step": 277 }, { "epoch": 0.71168, "grad_norm": 1.8029322129897425, "learning_rate": 1.4736617329792942e-05, "loss": 0.604, "step": 278 }, { "epoch": 0.71424, "grad_norm": 1.9524417007781032, "learning_rate": 1.4700362436594834e-05, "loss": 0.5981, "step": 279 }, { "epoch": 0.7168, "grad_norm": 1.8084386702168436, "learning_rate": 1.4664028065976245e-05, "loss": 0.5817, "step": 280 }, { "epoch": 0.71936, "grad_norm": 2.1802877987514604, "learning_rate": 1.4627614832307261e-05, "loss": 0.6396, "step": 281 }, { "epoch": 0.72192, "grad_norm": 2.01749908189055, "learning_rate": 1.459112335129144e-05, "loss": 0.6185, "step": 282 }, { "epoch": 0.72448, "grad_norm": 2.038378021785889, "learning_rate": 1.4554554239955412e-05, "loss": 0.5979, "step": 283 }, { "epoch": 0.72704, "grad_norm": 1.9263728755541718, "learning_rate": 1.4517908116638433e-05, "loss": 0.5926, "step": 284 }, { "epoch": 0.7296, "grad_norm": 1.9574220263738664, "learning_rate": 1.4481185600981945e-05, "loss": 0.5807, "step": 285 }, { "epoch": 0.73216, "grad_norm": 1.9996955352260337, "learning_rate": 1.4444387313919092e-05, "loss": 0.603, "step": 286 }, { "epoch": 0.73472, "grad_norm": 1.9024141327391195, "learning_rate": 1.440751387766422e-05, "loss": 0.5523, "step": 287 }, { "epoch": 0.73728, "grad_norm": 1.793371820272698, "learning_rate": 1.437056591570235e-05, "loss": 0.5853, "step": 288 }, { "epoch": 0.73984, "grad_norm": 1.9581066893720518, "learning_rate": 1.4333544052778655e-05, "loss": 0.6131, "step": 289 }, { "epoch": 0.7424, "grad_norm": 1.8602904498846513, "learning_rate": 1.4296448914887866e-05, "loss": 0.5976, "step": 290 }, { "epoch": 0.74496, "grad_norm": 1.75407998573317, "learning_rate": 1.4259281129263727e-05, "loss": 0.527, "step": 291 }, { "epoch": 0.74752, "grad_norm": 1.9323066549675147, "learning_rate": 1.4222041324368347e-05, "loss": 0.6473, "step": 292 }, { "epoch": 0.75008, "grad_norm": 1.6313629921698742, "learning_rate": 1.4184730129881601e-05, "loss": 0.4679, "step": 293 }, { "epoch": 0.75264, "grad_norm": 1.8821039452506023, "learning_rate": 1.4147348176690479e-05, "loss": 0.596, "step": 294 }, { "epoch": 0.7552, "grad_norm": 2.0756598406983855, "learning_rate": 1.4109896096878408e-05, "loss": 0.6384, "step": 295 }, { "epoch": 0.75776, "grad_norm": 1.945088077779729, "learning_rate": 1.4072374523714577e-05, "loss": 0.5608, "step": 296 }, { "epoch": 0.76032, "grad_norm": 1.935435206680739, "learning_rate": 1.4034784091643218e-05, "loss": 0.5793, "step": 297 }, { "epoch": 0.76288, "grad_norm": 2.0773604638736485, "learning_rate": 1.399712543627289e-05, "loss": 0.6529, "step": 298 }, { "epoch": 0.76544, "grad_norm": 1.9163069194223226, "learning_rate": 1.3959399194365712e-05, "loss": 0.6056, "step": 299 }, { "epoch": 0.768, "grad_norm": 1.837037316584707, "learning_rate": 1.392160600382663e-05, "loss": 0.5853, "step": 300 }, { "epoch": 0.77056, "grad_norm": 2.0141541605155395, "learning_rate": 1.3883746503692587e-05, "loss": 0.5898, "step": 301 }, { "epoch": 0.77312, "grad_norm": 1.8576928552652707, "learning_rate": 1.3845821334121763e-05, "loss": 0.5624, "step": 302 }, { "epoch": 0.77568, "grad_norm": 1.9366964632674861, "learning_rate": 1.3807831136382706e-05, "loss": 0.6462, "step": 303 }, { "epoch": 0.77824, "grad_norm": 1.9953305945547755, "learning_rate": 1.3769776552843532e-05, "loss": 0.6181, "step": 304 }, { "epoch": 0.7808, "grad_norm": 1.9990027115873188, "learning_rate": 1.3731658226961031e-05, "loss": 0.6303, "step": 305 }, { "epoch": 0.78336, "grad_norm": 1.9858501691321515, "learning_rate": 1.3693476803269799e-05, "loss": 0.5916, "step": 306 }, { "epoch": 0.78592, "grad_norm": 1.9457879785437557, "learning_rate": 1.3655232927371342e-05, "loss": 0.5691, "step": 307 }, { "epoch": 0.78848, "grad_norm": 1.9025068512726264, "learning_rate": 1.3616927245923157e-05, "loss": 0.5378, "step": 308 }, { "epoch": 0.79104, "grad_norm": 1.9938161594081456, "learning_rate": 1.3578560406627798e-05, "loss": 0.6176, "step": 309 }, { "epoch": 0.7936, "grad_norm": 1.959851063997509, "learning_rate": 1.3540133058221927e-05, "loss": 0.6209, "step": 310 }, { "epoch": 0.79616, "grad_norm": 1.8935432838789927, "learning_rate": 1.3501645850465327e-05, "loss": 0.632, "step": 311 }, { "epoch": 0.79872, "grad_norm": 1.7611365587951862, "learning_rate": 1.346309943412995e-05, "loss": 0.5552, "step": 312 }, { "epoch": 0.80128, "grad_norm": 1.8949244476360045, "learning_rate": 1.342449446098888e-05, "loss": 0.6063, "step": 313 }, { "epoch": 0.80384, "grad_norm": 2.0195750845328395, "learning_rate": 1.3385831583805329e-05, "loss": 0.5886, "step": 314 }, { "epoch": 0.8064, "grad_norm": 1.8175701233229387, "learning_rate": 1.33471114563216e-05, "loss": 0.5937, "step": 315 }, { "epoch": 0.80896, "grad_norm": 2.1857335001832303, "learning_rate": 1.3308334733248019e-05, "loss": 0.6594, "step": 316 }, { "epoch": 0.81152, "grad_norm": 1.842277672202166, "learning_rate": 1.3269502070251885e-05, "loss": 0.5555, "step": 317 }, { "epoch": 0.81408, "grad_norm": 1.8495246423693503, "learning_rate": 1.323061412394637e-05, "loss": 0.6004, "step": 318 }, { "epoch": 0.81664, "grad_norm": 1.899465186771449, "learning_rate": 1.3191671551879418e-05, "loss": 0.5188, "step": 319 }, { "epoch": 0.8192, "grad_norm": 2.0205565271224635, "learning_rate": 1.3152675012522629e-05, "loss": 0.6318, "step": 320 }, { "epoch": 0.82176, "grad_norm": 1.9988122791104623, "learning_rate": 1.311362516526012e-05, "loss": 0.6078, "step": 321 }, { "epoch": 0.82432, "grad_norm": 1.7875368384493597, "learning_rate": 1.3074522670377392e-05, "loss": 0.5636, "step": 322 }, { "epoch": 0.82688, "grad_norm": 2.1123854748425894, "learning_rate": 1.3035368189050142e-05, "loss": 0.6282, "step": 323 }, { "epoch": 0.82944, "grad_norm": 2.0127085167163066, "learning_rate": 1.2996162383333097e-05, "loss": 0.5353, "step": 324 }, { "epoch": 0.832, "grad_norm": 1.9207019986925034, "learning_rate": 1.2956905916148821e-05, "loss": 0.5553, "step": 325 }, { "epoch": 0.83456, "grad_norm": 1.8938216896130815, "learning_rate": 1.2917599451276498e-05, "loss": 0.5619, "step": 326 }, { "epoch": 0.83712, "grad_norm": 1.8339307990942622, "learning_rate": 1.2878243653340714e-05, "loss": 0.5301, "step": 327 }, { "epoch": 0.83968, "grad_norm": 2.020673528812742, "learning_rate": 1.2838839187800218e-05, "loss": 0.5634, "step": 328 }, { "epoch": 0.84224, "grad_norm": 2.0042232597725422, "learning_rate": 1.2799386720936663e-05, "loss": 0.565, "step": 329 }, { "epoch": 0.8448, "grad_norm": 1.6440410966587669, "learning_rate": 1.2759886919843354e-05, "loss": 0.5487, "step": 330 }, { "epoch": 0.84736, "grad_norm": 2.033498045933394, "learning_rate": 1.2720340452413962e-05, "loss": 0.5313, "step": 331 }, { "epoch": 0.84992, "grad_norm": 1.9130851998572171, "learning_rate": 1.2680747987331215e-05, "loss": 0.5445, "step": 332 }, { "epoch": 0.85248, "grad_norm": 2.025171351176087, "learning_rate": 1.264111019405562e-05, "loss": 0.5699, "step": 333 }, { "epoch": 0.85504, "grad_norm": 1.9582047276900996, "learning_rate": 1.2601427742814123e-05, "loss": 0.5473, "step": 334 }, { "epoch": 0.8576, "grad_norm": 1.924176492111395, "learning_rate": 1.2561701304588782e-05, "loss": 0.5896, "step": 335 }, { "epoch": 0.86016, "grad_norm": 2.0844112285904823, "learning_rate": 1.2521931551105427e-05, "loss": 0.5678, "step": 336 }, { "epoch": 0.86272, "grad_norm": 1.9210828492706074, "learning_rate": 1.248211915482228e-05, "loss": 0.5465, "step": 337 }, { "epoch": 0.86528, "grad_norm": 1.9083499459646247, "learning_rate": 1.244226478891862e-05, "loss": 0.5568, "step": 338 }, { "epoch": 0.86784, "grad_norm": 1.763077087198924, "learning_rate": 1.2402369127283374e-05, "loss": 0.5632, "step": 339 }, { "epoch": 0.8704, "grad_norm": 1.838786657423739, "learning_rate": 1.2362432844503725e-05, "loss": 0.5387, "step": 340 }, { "epoch": 0.87296, "grad_norm": 1.8884385940297985, "learning_rate": 1.2322456615853718e-05, "loss": 0.6314, "step": 341 }, { "epoch": 0.87552, "grad_norm": 1.8950854441907627, "learning_rate": 1.2282441117282831e-05, "loss": 0.54, "step": 342 }, { "epoch": 0.87808, "grad_norm": 1.9183342023884988, "learning_rate": 1.224238702540454e-05, "loss": 0.5748, "step": 343 }, { "epoch": 0.88064, "grad_norm": 1.9998947291027693, "learning_rate": 1.2202295017484911e-05, "loss": 0.595, "step": 344 }, { "epoch": 0.8832, "grad_norm": 1.7690361873293163, "learning_rate": 1.2162165771431094e-05, "loss": 0.4816, "step": 345 }, { "epoch": 0.88576, "grad_norm": 1.8887292208556585, "learning_rate": 1.212199996577991e-05, "loss": 0.5548, "step": 346 }, { "epoch": 0.88832, "grad_norm": 1.9015638197509928, "learning_rate": 1.2081798279686354e-05, "loss": 0.5399, "step": 347 }, { "epoch": 0.89088, "grad_norm": 1.8797520886634111, "learning_rate": 1.2041561392912118e-05, "loss": 0.5652, "step": 348 }, { "epoch": 0.89344, "grad_norm": 1.692840000131321, "learning_rate": 1.2001289985814088e-05, "loss": 0.5431, "step": 349 }, { "epoch": 0.896, "grad_norm": 1.9518924556661963, "learning_rate": 1.1960984739332851e-05, "loss": 0.5328, "step": 350 }, { "epoch": 0.89856, "grad_norm": 1.9176530907572014, "learning_rate": 1.1920646334981176e-05, "loss": 0.5948, "step": 351 }, { "epoch": 0.90112, "grad_norm": 1.8600950416768505, "learning_rate": 1.1880275454832493e-05, "loss": 0.5214, "step": 352 }, { "epoch": 0.90368, "grad_norm": 2.0987333730497033, "learning_rate": 1.1839872781509358e-05, "loss": 0.6008, "step": 353 }, { "epoch": 0.90624, "grad_norm": 2.0052047615781157, "learning_rate": 1.1799438998171909e-05, "loss": 0.5804, "step": 354 }, { "epoch": 0.9088, "grad_norm": 1.9543922891858452, "learning_rate": 1.175897478850632e-05, "loss": 0.5801, "step": 355 }, { "epoch": 0.91136, "grad_norm": 1.9278210545847223, "learning_rate": 1.1718480836713228e-05, "loss": 0.5786, "step": 356 }, { "epoch": 0.91392, "grad_norm": 1.9169280603449947, "learning_rate": 1.1677957827496191e-05, "loss": 0.5683, "step": 357 }, { "epoch": 0.91648, "grad_norm": 1.9206761554005032, "learning_rate": 1.1637406446050072e-05, "loss": 0.5628, "step": 358 }, { "epoch": 0.91904, "grad_norm": 2.0320383330546714, "learning_rate": 1.1596827378049491e-05, "loss": 0.5568, "step": 359 }, { "epoch": 0.9216, "grad_norm": 2.011012460142978, "learning_rate": 1.1556221309637204e-05, "loss": 0.5911, "step": 360 }, { "epoch": 0.92416, "grad_norm": 1.6733867449703013, "learning_rate": 1.1515588927412509e-05, "loss": 0.4909, "step": 361 }, { "epoch": 0.92672, "grad_norm": 1.7160108869926338, "learning_rate": 1.147493091841965e-05, "loss": 0.4918, "step": 362 }, { "epoch": 0.92928, "grad_norm": 1.8248678379826555, "learning_rate": 1.1434247970136188e-05, "loss": 0.5299, "step": 363 }, { "epoch": 0.93184, "grad_norm": 1.8911031735554016, "learning_rate": 1.1393540770461358e-05, "loss": 0.622, "step": 364 }, { "epoch": 0.9344, "grad_norm": 1.6917252673616938, "learning_rate": 1.1352810007704476e-05, "loss": 0.5672, "step": 365 }, { "epoch": 0.93696, "grad_norm": 1.7652120249882262, "learning_rate": 1.1312056370573277e-05, "loss": 0.4876, "step": 366 }, { "epoch": 0.93952, "grad_norm": 1.912188345028055, "learning_rate": 1.127128054816227e-05, "loss": 0.5229, "step": 367 }, { "epoch": 0.94208, "grad_norm": 1.9268235049476894, "learning_rate": 1.1230483229941092e-05, "loss": 0.4969, "step": 368 }, { "epoch": 0.94464, "grad_norm": 1.7124716981838979, "learning_rate": 1.1189665105742846e-05, "loss": 0.4973, "step": 369 }, { "epoch": 0.9472, "grad_norm": 1.7639128606217356, "learning_rate": 1.1148826865752445e-05, "loss": 0.55, "step": 370 }, { "epoch": 0.94976, "grad_norm": 1.9598599864278292, "learning_rate": 1.1107969200494928e-05, "loss": 0.5607, "step": 371 }, { "epoch": 0.95232, "grad_norm": 1.8988749717432336, "learning_rate": 1.1067092800823798e-05, "loss": 0.5147, "step": 372 }, { "epoch": 0.95488, "grad_norm": 1.8154562117513298, "learning_rate": 1.1026198357909327e-05, "loss": 0.5039, "step": 373 }, { "epoch": 0.95744, "grad_norm": 1.7241766515805417, "learning_rate": 1.0985286563226887e-05, "loss": 0.5053, "step": 374 }, { "epoch": 0.96, "grad_norm": 1.886751738203789, "learning_rate": 1.0944358108545236e-05, "loss": 0.5563, "step": 375 }, { "epoch": 0.96256, "grad_norm": 1.8887450413746818, "learning_rate": 1.0903413685914843e-05, "loss": 0.5866, "step": 376 }, { "epoch": 0.96512, "grad_norm": 1.8801523776322404, "learning_rate": 1.0862453987656162e-05, "loss": 0.573, "step": 377 }, { "epoch": 0.96768, "grad_norm": 1.8773940578406214, "learning_rate": 1.0821479706347953e-05, "loss": 0.4809, "step": 378 }, { "epoch": 0.97024, "grad_norm": 1.8407925735220598, "learning_rate": 1.0780491534815549e-05, "loss": 0.5471, "step": 379 }, { "epoch": 0.9728, "grad_norm": 2.144021305027399, "learning_rate": 1.0739490166119155e-05, "loss": 0.5732, "step": 380 }, { "epoch": 0.97536, "grad_norm": 1.8454174363471443, "learning_rate": 1.0698476293542124e-05, "loss": 0.5603, "step": 381 }, { "epoch": 0.97792, "grad_norm": 1.8272486716574172, "learning_rate": 1.0657450610579225e-05, "loss": 0.5493, "step": 382 }, { "epoch": 0.98048, "grad_norm": 1.8920692278670657, "learning_rate": 1.0616413810924937e-05, "loss": 0.5611, "step": 383 }, { "epoch": 0.98304, "grad_norm": 2.0131793545946626, "learning_rate": 1.057536658846171e-05, "loss": 0.5706, "step": 384 }, { "epoch": 0.9856, "grad_norm": 1.8744182336273665, "learning_rate": 1.053430963724822e-05, "loss": 0.5511, "step": 385 }, { "epoch": 0.98816, "grad_norm": 1.705985214182383, "learning_rate": 1.0493243651507654e-05, "loss": 0.4967, "step": 386 }, { "epoch": 0.99072, "grad_norm": 1.855409465319251, "learning_rate": 1.0452169325615956e-05, "loss": 0.5375, "step": 387 }, { "epoch": 0.99328, "grad_norm": 1.7606864394836728, "learning_rate": 1.04110873540901e-05, "loss": 0.52, "step": 388 }, { "epoch": 0.99584, "grad_norm": 1.8631155705703495, "learning_rate": 1.0369998431576328e-05, "loss": 0.6018, "step": 389 }, { "epoch": 0.9984, "grad_norm": 1.8541664181232975, "learning_rate": 1.0328903252838415e-05, "loss": 0.5396, "step": 390 }, { "epoch": 1.00096, "grad_norm": 1.5672351465494485, "learning_rate": 1.0287802512745935e-05, "loss": 0.3953, "step": 391 }, { "epoch": 1.00352, "grad_norm": 1.558195855755232, "learning_rate": 1.0246696906262484e-05, "loss": 0.3791, "step": 392 }, { "epoch": 1.00608, "grad_norm": 1.5431663246597491, "learning_rate": 1.0205587128433944e-05, "loss": 0.3558, "step": 393 }, { "epoch": 1.00864, "grad_norm": 1.445015635657179, "learning_rate": 1.016447387437674e-05, "loss": 0.3351, "step": 394 }, { "epoch": 1.0112, "grad_norm": 1.481573837422665, "learning_rate": 1.0123357839266066e-05, "loss": 0.3327, "step": 395 }, { "epoch": 1.01376, "grad_norm": 1.5392257435904235, "learning_rate": 1.0082239718324136e-05, "loss": 0.2687, "step": 396 }, { "epoch": 1.01632, "grad_norm": 1.4663853851401927, "learning_rate": 1.004112020680845e-05, "loss": 0.3138, "step": 397 }, { "epoch": 1.01888, "grad_norm": 1.650830842194171, "learning_rate": 1e-05, "loss": 0.3256, "step": 398 }, { "epoch": 1.02144, "grad_norm": 1.5814294192961373, "learning_rate": 9.958879793191553e-06, "loss": 0.3138, "step": 399 }, { "epoch": 1.024, "grad_norm": 1.6865522137000346, "learning_rate": 9.917760281675867e-06, "loss": 0.3153, "step": 400 }, { "epoch": 1.02656, "grad_norm": 1.7826957665836876, "learning_rate": 9.876642160733937e-06, "loss": 0.3152, "step": 401 }, { "epoch": 1.02912, "grad_norm": 1.7725840767006589, "learning_rate": 9.835526125623262e-06, "loss": 0.2928, "step": 402 }, { "epoch": 1.03168, "grad_norm": 1.8174709729145797, "learning_rate": 9.794412871566057e-06, "loss": 0.3079, "step": 403 }, { "epoch": 1.03424, "grad_norm": 2.005333790551581, "learning_rate": 9.753303093737518e-06, "loss": 0.3421, "step": 404 }, { "epoch": 1.0368, "grad_norm": 1.873929148570637, "learning_rate": 9.71219748725407e-06, "loss": 0.3364, "step": 405 }, { "epoch": 1.03936, "grad_norm": 1.5777934082692342, "learning_rate": 9.671096747161587e-06, "loss": 0.3168, "step": 406 }, { "epoch": 1.04192, "grad_norm": 1.6309016044222864, "learning_rate": 9.630001568423677e-06, "loss": 0.2704, "step": 407 }, { "epoch": 1.04448, "grad_norm": 1.7365095962411672, "learning_rate": 9.588912645909905e-06, "loss": 0.3153, "step": 408 }, { "epoch": 1.04704, "grad_norm": 1.7258342184967879, "learning_rate": 9.547830674384043e-06, "loss": 0.3018, "step": 409 }, { "epoch": 1.0496, "grad_norm": 1.6833772470314767, "learning_rate": 9.506756348492348e-06, "loss": 0.3026, "step": 410 }, { "epoch": 1.05216, "grad_norm": 1.813776064675088, "learning_rate": 9.465690362751781e-06, "loss": 0.3179, "step": 411 }, { "epoch": 1.05472, "grad_norm": 1.7975291601015548, "learning_rate": 9.424633411538289e-06, "loss": 0.3629, "step": 412 }, { "epoch": 1.05728, "grad_norm": 1.4794908417040535, "learning_rate": 9.383586189075065e-06, "loss": 0.3126, "step": 413 }, { "epoch": 1.05984, "grad_norm": 1.594883685807337, "learning_rate": 9.342549389420777e-06, "loss": 0.2997, "step": 414 }, { "epoch": 1.0624, "grad_norm": 1.4379937930102515, "learning_rate": 9.30152370645788e-06, "loss": 0.2796, "step": 415 }, { "epoch": 1.06496, "grad_norm": 1.772458848105549, "learning_rate": 9.260509833880848e-06, "loss": 0.3676, "step": 416 }, { "epoch": 1.06752, "grad_norm": 1.7331530871521117, "learning_rate": 9.21950846518445e-06, "loss": 0.2902, "step": 417 }, { "epoch": 1.07008, "grad_norm": 1.7622988238409705, "learning_rate": 9.17852029365205e-06, "loss": 0.2951, "step": 418 }, { "epoch": 1.07264, "grad_norm": 1.7162234337691245, "learning_rate": 9.13754601234384e-06, "loss": 0.3163, "step": 419 }, { "epoch": 1.0752, "grad_norm": 1.7337491899665072, "learning_rate": 9.096586314085162e-06, "loss": 0.3362, "step": 420 }, { "epoch": 1.07776, "grad_norm": 1.574010878864483, "learning_rate": 9.055641891454766e-06, "loss": 0.284, "step": 421 }, { "epoch": 1.08032, "grad_norm": 2.0248969916179123, "learning_rate": 9.014713436773114e-06, "loss": 0.3209, "step": 422 }, { "epoch": 1.08288, "grad_norm": 1.6901116986132412, "learning_rate": 8.973801642090674e-06, "loss": 0.3283, "step": 423 }, { "epoch": 1.08544, "grad_norm": 1.8279240135161061, "learning_rate": 8.932907199176206e-06, "loss": 0.2894, "step": 424 }, { "epoch": 1.088, "grad_norm": 1.6523113744718456, "learning_rate": 8.892030799505072e-06, "loss": 0.3071, "step": 425 }, { "epoch": 1.09056, "grad_norm": 1.8531686326526295, "learning_rate": 8.85117313424756e-06, "loss": 0.326, "step": 426 }, { "epoch": 1.09312, "grad_norm": 1.70280493987216, "learning_rate": 8.810334894257156e-06, "loss": 0.3218, "step": 427 }, { "epoch": 1.09568, "grad_norm": 1.5975578961167665, "learning_rate": 8.769516770058915e-06, "loss": 0.3014, "step": 428 }, { "epoch": 1.09824, "grad_norm": 1.5963594131551222, "learning_rate": 8.728719451837735e-06, "loss": 0.2813, "step": 429 }, { "epoch": 1.1008, "grad_norm": 1.5356693796819318, "learning_rate": 8.687943629426725e-06, "loss": 0.2765, "step": 430 }, { "epoch": 1.10336, "grad_norm": 1.9059233869160093, "learning_rate": 8.647189992295526e-06, "loss": 0.2862, "step": 431 }, { "epoch": 1.10592, "grad_norm": 1.915468592863442, "learning_rate": 8.606459229538645e-06, "loss": 0.3143, "step": 432 }, { "epoch": 1.10848, "grad_norm": 1.74246639812338, "learning_rate": 8.56575202986382e-06, "loss": 0.3091, "step": 433 }, { "epoch": 1.11104, "grad_norm": 1.8647458734292492, "learning_rate": 8.525069081580351e-06, "loss": 0.3317, "step": 434 }, { "epoch": 1.1136, "grad_norm": 1.715483977869925, "learning_rate": 8.484411072587491e-06, "loss": 0.3065, "step": 435 }, { "epoch": 1.11616, "grad_norm": 1.6696415990436007, "learning_rate": 8.443778690362801e-06, "loss": 0.2741, "step": 436 }, { "epoch": 1.11872, "grad_norm": 1.7084096219447564, "learning_rate": 8.403172621950512e-06, "loss": 0.3058, "step": 437 }, { "epoch": 1.12128, "grad_norm": 1.7929139885486827, "learning_rate": 8.362593553949926e-06, "loss": 0.2868, "step": 438 }, { "epoch": 1.12384, "grad_norm": 1.757789981994357, "learning_rate": 8.322042172503812e-06, "loss": 0.3129, "step": 439 }, { "epoch": 1.1264, "grad_norm": 1.6625503348814892, "learning_rate": 8.281519163286772e-06, "loss": 0.287, "step": 440 }, { "epoch": 1.12896, "grad_norm": 1.9281991436107038, "learning_rate": 8.241025211493684e-06, "loss": 0.3375, "step": 441 }, { "epoch": 1.13152, "grad_norm": 1.6723278237853747, "learning_rate": 8.200561001828093e-06, "loss": 0.2843, "step": 442 }, { "epoch": 1.13408, "grad_norm": 1.6487601783747716, "learning_rate": 8.160127218490643e-06, "loss": 0.3173, "step": 443 }, { "epoch": 1.13664, "grad_norm": 1.8497858729306897, "learning_rate": 8.11972454516751e-06, "loss": 0.3022, "step": 444 }, { "epoch": 1.1392, "grad_norm": 1.6639405806591592, "learning_rate": 8.079353665018827e-06, "loss": 0.3453, "step": 445 }, { "epoch": 1.14176, "grad_norm": 1.560079035489574, "learning_rate": 8.039015260667154e-06, "loss": 0.33, "step": 446 }, { "epoch": 1.14432, "grad_norm": 1.7303374905946536, "learning_rate": 7.998710014185916e-06, "loss": 0.311, "step": 447 }, { "epoch": 1.14688, "grad_norm": 1.7019483722377131, "learning_rate": 7.958438607087884e-06, "loss": 0.3124, "step": 448 }, { "epoch": 1.14944, "grad_norm": 1.7461543792169232, "learning_rate": 7.918201720313648e-06, "loss": 0.3132, "step": 449 }, { "epoch": 1.152, "grad_norm": 1.65701293316665, "learning_rate": 7.878000034220092e-06, "loss": 0.2898, "step": 450 }, { "epoch": 1.15456, "grad_norm": 1.793230509633861, "learning_rate": 7.837834228568911e-06, "loss": 0.3116, "step": 451 }, { "epoch": 1.15712, "grad_norm": 1.9106756293670617, "learning_rate": 7.797704982515094e-06, "loss": 0.3451, "step": 452 }, { "epoch": 1.15968, "grad_norm": 1.8497679963390545, "learning_rate": 7.75761297459546e-06, "loss": 0.2923, "step": 453 }, { "epoch": 1.16224, "grad_norm": 1.7480611256420666, "learning_rate": 7.717558882717175e-06, "loss": 0.3156, "step": 454 }, { "epoch": 1.1648, "grad_norm": 1.667882679284623, "learning_rate": 7.677543384146287e-06, "loss": 0.2967, "step": 455 }, { "epoch": 1.16736, "grad_norm": 1.6327780686406852, "learning_rate": 7.637567155496277e-06, "loss": 0.3298, "step": 456 }, { "epoch": 1.16992, "grad_norm": 1.9718387276246228, "learning_rate": 7.597630872716631e-06, "loss": 0.3067, "step": 457 }, { "epoch": 1.17248, "grad_norm": 1.8631792827164149, "learning_rate": 7.5577352110813825e-06, "loss": 0.3188, "step": 458 }, { "epoch": 1.17504, "grad_norm": 1.7515381932583938, "learning_rate": 7.517880845177725e-06, "loss": 0.3103, "step": 459 }, { "epoch": 1.1776, "grad_norm": 1.6929585603147987, "learning_rate": 7.478068448894577e-06, "loss": 0.2854, "step": 460 }, { "epoch": 1.1801599999999999, "grad_norm": 1.6653769100031355, "learning_rate": 7.438298695411218e-06, "loss": 0.2605, "step": 461 }, { "epoch": 1.18272, "grad_norm": 1.660565063408245, "learning_rate": 7.398572257185879e-06, "loss": 0.2627, "step": 462 }, { "epoch": 1.1852800000000001, "grad_norm": 1.6879487748419324, "learning_rate": 7.358889805944383e-06, "loss": 0.2577, "step": 463 }, { "epoch": 1.18784, "grad_norm": 1.7460066493392816, "learning_rate": 7.31925201266879e-06, "loss": 0.3328, "step": 464 }, { "epoch": 1.1904, "grad_norm": 1.6332719487911835, "learning_rate": 7.2796595475860425e-06, "loss": 0.2843, "step": 465 }, { "epoch": 1.19296, "grad_norm": 1.9280765743425918, "learning_rate": 7.240113080156646e-06, "loss": 0.3132, "step": 466 }, { "epoch": 1.19552, "grad_norm": 1.8041999203900638, "learning_rate": 7.200613279063341e-06, "loss": 0.284, "step": 467 }, { "epoch": 1.19808, "grad_norm": 1.927491301377088, "learning_rate": 7.161160812199785e-06, "loss": 0.3379, "step": 468 }, { "epoch": 1.20064, "grad_norm": 1.7700179930703523, "learning_rate": 7.121756346659292e-06, "loss": 0.3304, "step": 469 }, { "epoch": 1.2032, "grad_norm": 1.8239759132867097, "learning_rate": 7.082400548723505e-06, "loss": 0.2773, "step": 470 }, { "epoch": 1.20576, "grad_norm": 1.8558590715535617, "learning_rate": 7.043094083851181e-06, "loss": 0.3101, "step": 471 }, { "epoch": 1.20832, "grad_norm": 1.9466031042965117, "learning_rate": 7.003837616666906e-06, "loss": 0.3011, "step": 472 }, { "epoch": 1.21088, "grad_norm": 1.718281958362647, "learning_rate": 6.96463181094986e-06, "loss": 0.2914, "step": 473 }, { "epoch": 1.21344, "grad_norm": 1.6920570061933058, "learning_rate": 6.925477329622609e-06, "loss": 0.29, "step": 474 }, { "epoch": 1.216, "grad_norm": 1.5582172395472835, "learning_rate": 6.886374834739883e-06, "loss": 0.2565, "step": 475 }, { "epoch": 1.21856, "grad_norm": 1.9619111558727853, "learning_rate": 6.847324987477375e-06, "loss": 0.2811, "step": 476 }, { "epoch": 1.22112, "grad_norm": 1.7660903136072212, "learning_rate": 6.808328448120588e-06, "loss": 0.3277, "step": 477 }, { "epoch": 1.2236799999999999, "grad_norm": 1.6879368816014204, "learning_rate": 6.769385876053632e-06, "loss": 0.2918, "step": 478 }, { "epoch": 1.22624, "grad_norm": 1.8466731384715298, "learning_rate": 6.730497929748116e-06, "loss": 0.3103, "step": 479 }, { "epoch": 1.2288000000000001, "grad_norm": 1.8338896405236385, "learning_rate": 6.6916652667519855e-06, "loss": 0.3211, "step": 480 }, { "epoch": 1.23136, "grad_norm": 1.7548167253010734, "learning_rate": 6.652888543678404e-06, "loss": 0.3331, "step": 481 }, { "epoch": 1.23392, "grad_norm": 1.7191841522008924, "learning_rate": 6.614168416194674e-06, "loss": 0.2684, "step": 482 }, { "epoch": 1.23648, "grad_norm": 1.6620234347821567, "learning_rate": 6.575505539011123e-06, "loss": 0.273, "step": 483 }, { "epoch": 1.23904, "grad_norm": 1.8705146664514543, "learning_rate": 6.536900565870052e-06, "loss": 0.3324, "step": 484 }, { "epoch": 1.2416, "grad_norm": 1.6318180235961661, "learning_rate": 6.498354149534677e-06, "loss": 0.2891, "step": 485 }, { "epoch": 1.24416, "grad_norm": 1.7493455359816799, "learning_rate": 6.459866941778077e-06, "loss": 0.2847, "step": 486 }, { "epoch": 1.24672, "grad_norm": 1.6377143720858458, "learning_rate": 6.421439593372201e-06, "loss": 0.2841, "step": 487 }, { "epoch": 1.24928, "grad_norm": 1.7580656850520682, "learning_rate": 6.3830727540768445e-06, "loss": 0.3086, "step": 488 }, { "epoch": 1.25184, "grad_norm": 1.7537075156428699, "learning_rate": 6.344767072628659e-06, "loss": 0.3166, "step": 489 }, { "epoch": 1.2544, "grad_norm": 1.727275836520608, "learning_rate": 6.3065231967302055e-06, "loss": 0.279, "step": 490 }, { "epoch": 1.25696, "grad_norm": 1.8430842401562408, "learning_rate": 6.268341773038973e-06, "loss": 0.2752, "step": 491 }, { "epoch": 1.25952, "grad_norm": 1.7578451456561774, "learning_rate": 6.230223447156469e-06, "loss": 0.2958, "step": 492 }, { "epoch": 1.26208, "grad_norm": 1.6454134556997662, "learning_rate": 6.1921688636172964e-06, "loss": 0.274, "step": 493 }, { "epoch": 1.26464, "grad_norm": 1.6214753367677326, "learning_rate": 6.154178665878241e-06, "loss": 0.2761, "step": 494 }, { "epoch": 1.2671999999999999, "grad_norm": 1.8385311514429392, "learning_rate": 6.116253496307415e-06, "loss": 0.2883, "step": 495 }, { "epoch": 1.26976, "grad_norm": 1.7575162142152143, "learning_rate": 6.078393996173375e-06, "loss": 0.2789, "step": 496 }, { "epoch": 1.2723200000000001, "grad_norm": 1.7863119640113554, "learning_rate": 6.040600805634287e-06, "loss": 0.3174, "step": 497 }, { "epoch": 1.27488, "grad_norm": 1.802911376776798, "learning_rate": 6.002874563727116e-06, "loss": 0.2831, "step": 498 }, { "epoch": 1.27744, "grad_norm": 1.7950928461300708, "learning_rate": 5.965215908356783e-06, "loss": 0.3047, "step": 499 }, { "epoch": 1.28, "grad_norm": 1.8067717158566738, "learning_rate": 5.927625476285426e-06, "loss": 0.2981, "step": 500 }, { "epoch": 1.28256, "grad_norm": 1.759342634285478, "learning_rate": 5.890103903121593e-06, "loss": 0.3129, "step": 501 }, { "epoch": 1.28512, "grad_norm": 2.0897739052494293, "learning_rate": 5.852651823309521e-06, "loss": 0.2928, "step": 502 }, { "epoch": 1.28768, "grad_norm": 1.8220220157801934, "learning_rate": 5.815269870118403e-06, "loss": 0.2899, "step": 503 }, { "epoch": 1.29024, "grad_norm": 1.688333219221233, "learning_rate": 5.777958675631657e-06, "loss": 0.3004, "step": 504 }, { "epoch": 1.2928, "grad_norm": 1.6140942084232062, "learning_rate": 5.740718870736272e-06, "loss": 0.2953, "step": 505 }, { "epoch": 1.29536, "grad_norm": 1.8293432699580725, "learning_rate": 5.703551085112133e-06, "loss": 0.3021, "step": 506 }, { "epoch": 1.29792, "grad_norm": 1.8202872346713284, "learning_rate": 5.6664559472213495e-06, "loss": 0.2781, "step": 507 }, { "epoch": 1.30048, "grad_norm": 1.8156854105803961, "learning_rate": 5.629434084297654e-06, "loss": 0.3122, "step": 508 }, { "epoch": 1.30304, "grad_norm": 2.0300686868487556, "learning_rate": 5.592486122335784e-06, "loss": 0.3498, "step": 509 }, { "epoch": 1.3056, "grad_norm": 1.7909337703685713, "learning_rate": 5.555612686080909e-06, "loss": 0.3079, "step": 510 }, { "epoch": 1.30816, "grad_norm": 1.5882480407199409, "learning_rate": 5.518814399018058e-06, "loss": 0.2685, "step": 511 }, { "epoch": 1.3107199999999999, "grad_norm": 1.6893045166962126, "learning_rate": 5.482091883361571e-06, "loss": 0.2897, "step": 512 }, { "epoch": 1.31328, "grad_norm": 1.9623021679379937, "learning_rate": 5.445445760044594e-06, "loss": 0.2751, "step": 513 }, { "epoch": 1.3158400000000001, "grad_norm": 1.8542025764374261, "learning_rate": 5.408876648708561e-06, "loss": 0.2808, "step": 514 }, { "epoch": 1.3184, "grad_norm": 1.625965052985563, "learning_rate": 5.372385167692739e-06, "loss": 0.2751, "step": 515 }, { "epoch": 1.32096, "grad_norm": 1.8190194776544601, "learning_rate": 5.335971934023757e-06, "loss": 0.2917, "step": 516 }, { "epoch": 1.32352, "grad_norm": 1.92774093649498, "learning_rate": 5.299637563405169e-06, "loss": 0.2997, "step": 517 }, { "epoch": 1.32608, "grad_norm": 1.7214112637921481, "learning_rate": 5.263382670207063e-06, "loss": 0.2627, "step": 518 }, { "epoch": 1.32864, "grad_norm": 1.6131621215448115, "learning_rate": 5.227207867455648e-06, "loss": 0.2679, "step": 519 }, { "epoch": 1.3312, "grad_norm": 1.68659474905301, "learning_rate": 5.191113766822905e-06, "loss": 0.2732, "step": 520 }, { "epoch": 1.33376, "grad_norm": 1.8809611133528827, "learning_rate": 5.155100978616248e-06, "loss": 0.3065, "step": 521 }, { "epoch": 1.33632, "grad_norm": 1.8386742630648891, "learning_rate": 5.1191701117681815e-06, "loss": 0.3092, "step": 522 }, { "epoch": 1.33888, "grad_norm": 1.780053544517966, "learning_rate": 5.083321773826038e-06, "loss": 0.2825, "step": 523 }, { "epoch": 1.34144, "grad_norm": 1.6847775520894839, "learning_rate": 5.04755657094167e-06, "loss": 0.2928, "step": 524 }, { "epoch": 1.3439999999999999, "grad_norm": 1.5698053305448867, "learning_rate": 5.011875107861221e-06, "loss": 0.2725, "step": 525 }, { "epoch": 1.34656, "grad_norm": 1.796815658001955, "learning_rate": 4.976277987914905e-06, "loss": 0.3287, "step": 526 }, { "epoch": 1.34912, "grad_norm": 1.728558689136845, "learning_rate": 4.940765813006784e-06, "loss": 0.2839, "step": 527 }, { "epoch": 1.35168, "grad_norm": 1.8233040819705273, "learning_rate": 4.905339183604614e-06, "loss": 0.3033, "step": 528 }, { "epoch": 1.3542399999999999, "grad_norm": 1.772177612527984, "learning_rate": 4.86999869872967e-06, "loss": 0.301, "step": 529 }, { "epoch": 1.3568, "grad_norm": 1.6377253150219735, "learning_rate": 4.834744955946631e-06, "loss": 0.2911, "step": 530 }, { "epoch": 1.3593600000000001, "grad_norm": 1.7737384781484877, "learning_rate": 4.79957855135348e-06, "loss": 0.3027, "step": 531 }, { "epoch": 1.36192, "grad_norm": 1.9035288129722436, "learning_rate": 4.764500079571403e-06, "loss": 0.3231, "step": 532 }, { "epoch": 1.36448, "grad_norm": 1.837775666581152, "learning_rate": 4.729510133734766e-06, "loss": 0.2855, "step": 533 }, { "epoch": 1.36704, "grad_norm": 1.7295857177394576, "learning_rate": 4.694609305481055e-06, "loss": 0.2804, "step": 534 }, { "epoch": 1.3696, "grad_norm": 1.806102463932245, "learning_rate": 4.659798184940887e-06, "loss": 0.3093, "step": 535 }, { "epoch": 1.37216, "grad_norm": 1.5986303108578905, "learning_rate": 4.6250773607280375e-06, "loss": 0.2445, "step": 536 }, { "epoch": 1.37472, "grad_norm": 1.9007193691045698, "learning_rate": 4.590447419929481e-06, "loss": 0.28, "step": 537 }, { "epoch": 1.37728, "grad_norm": 1.5654130383489473, "learning_rate": 4.555908948095455e-06, "loss": 0.2487, "step": 538 }, { "epoch": 1.37984, "grad_norm": 1.807208592204752, "learning_rate": 4.521462529229579e-06, "loss": 0.2874, "step": 539 }, { "epoch": 1.3824, "grad_norm": 1.9059242633131657, "learning_rate": 4.487108745778958e-06, "loss": 0.2994, "step": 540 }, { "epoch": 1.38496, "grad_norm": 1.93530044589512, "learning_rate": 4.452848178624348e-06, "loss": 0.3276, "step": 541 }, { "epoch": 1.3875199999999999, "grad_norm": 1.6211554099082521, "learning_rate": 4.418681407070339e-06, "loss": 0.2484, "step": 542 }, { "epoch": 1.39008, "grad_norm": 1.8235255434109632, "learning_rate": 4.384609008835535e-06, "loss": 0.284, "step": 543 }, { "epoch": 1.39264, "grad_norm": 1.8642953553369541, "learning_rate": 4.350631560042821e-06, "loss": 0.3073, "step": 544 }, { "epoch": 1.3952, "grad_norm": 1.8052071453907574, "learning_rate": 4.3167496352095876e-06, "loss": 0.3176, "step": 545 }, { "epoch": 1.39776, "grad_norm": 2.05400626300769, "learning_rate": 4.282963807238032e-06, "loss": 0.2941, "step": 546 }, { "epoch": 1.40032, "grad_norm": 1.8143960130300665, "learning_rate": 4.2492746474054825e-06, "loss": 0.3092, "step": 547 }, { "epoch": 1.4028800000000001, "grad_norm": 1.8062220799473758, "learning_rate": 4.2156827253547095e-06, "loss": 0.2854, "step": 548 }, { "epoch": 1.40544, "grad_norm": 1.8654013394561701, "learning_rate": 4.182188609084328e-06, "loss": 0.327, "step": 549 }, { "epoch": 1.408, "grad_norm": 1.6776732336572873, "learning_rate": 4.148792864939164e-06, "loss": 0.2519, "step": 550 }, { "epoch": 1.41056, "grad_norm": 1.6871367738087453, "learning_rate": 4.115496057600689e-06, "loss": 0.2978, "step": 551 }, { "epoch": 1.41312, "grad_norm": 1.9401515888224816, "learning_rate": 4.082298750077485e-06, "loss": 0.3189, "step": 552 }, { "epoch": 1.41568, "grad_norm": 1.7127683807523446, "learning_rate": 4.0492015036957e-06, "loss": 0.2839, "step": 553 }, { "epoch": 1.41824, "grad_norm": 1.7006614590161078, "learning_rate": 4.016204878089579e-06, "loss": 0.265, "step": 554 }, { "epoch": 1.4208, "grad_norm": 1.6926095736078215, "learning_rate": 3.983309431191995e-06, "loss": 0.2566, "step": 555 }, { "epoch": 1.42336, "grad_norm": 1.7495489591645166, "learning_rate": 3.950515719224991e-06, "loss": 0.3097, "step": 556 }, { "epoch": 1.42592, "grad_norm": 1.685311274806112, "learning_rate": 3.9178242966904225e-06, "loss": 0.2629, "step": 557 }, { "epoch": 1.42848, "grad_norm": 1.868117738329845, "learning_rate": 3.885235716360534e-06, "loss": 0.3089, "step": 558 }, { "epoch": 1.4310399999999999, "grad_norm": 1.6105881067710597, "learning_rate": 3.852750529268645e-06, "loss": 0.2841, "step": 559 }, { "epoch": 1.4336, "grad_norm": 1.7563576736879873, "learning_rate": 3.820369284699823e-06, "loss": 0.3019, "step": 560 }, { "epoch": 1.43616, "grad_norm": 1.6236719779141782, "learning_rate": 3.788092530181583e-06, "loss": 0.2747, "step": 561 }, { "epoch": 1.43872, "grad_norm": 1.625891715697152, "learning_rate": 3.755920811474647e-06, "loss": 0.2652, "step": 562 }, { "epoch": 1.44128, "grad_norm": 1.6938038174994694, "learning_rate": 3.7238546725637046e-06, "loss": 0.2763, "step": 563 }, { "epoch": 1.44384, "grad_norm": 1.872174965035592, "learning_rate": 3.691894655648225e-06, "loss": 0.2877, "step": 564 }, { "epoch": 1.4464000000000001, "grad_norm": 1.714528697164392, "learning_rate": 3.6600413011332835e-06, "loss": 0.2975, "step": 565 }, { "epoch": 1.44896, "grad_norm": 1.7847360697911003, "learning_rate": 3.6282951476204177e-06, "loss": 0.3059, "step": 566 }, { "epoch": 1.45152, "grad_norm": 1.7410992194167252, "learning_rate": 3.5966567318985267e-06, "loss": 0.3158, "step": 567 }, { "epoch": 1.45408, "grad_norm": 1.6864063612802922, "learning_rate": 3.565126588934803e-06, "loss": 0.2836, "step": 568 }, { "epoch": 1.45664, "grad_norm": 1.714438302514508, "learning_rate": 3.533705251865668e-06, "loss": 0.2957, "step": 569 }, { "epoch": 1.4592, "grad_norm": 1.8173113954750335, "learning_rate": 3.502393251987776e-06, "loss": 0.3121, "step": 570 }, { "epoch": 1.46176, "grad_norm": 1.6936277347451338, "learning_rate": 3.4711911187490165e-06, "loss": 0.2687, "step": 571 }, { "epoch": 1.46432, "grad_norm": 1.9870635360197202, "learning_rate": 3.4400993797395664e-06, "loss": 0.3278, "step": 572 }, { "epoch": 1.46688, "grad_norm": 1.7897731687332026, "learning_rate": 3.4091185606829793e-06, "loss": 0.2655, "step": 573 }, { "epoch": 1.46944, "grad_norm": 1.70048947778315, "learning_rate": 3.3782491854272736e-06, "loss": 0.3024, "step": 574 }, { "epoch": 1.472, "grad_norm": 1.619029376588391, "learning_rate": 3.3474917759361036e-06, "loss": 0.2755, "step": 575 }, { "epoch": 1.4745599999999999, "grad_norm": 1.8451956935598997, "learning_rate": 3.316846852279907e-06, "loss": 0.2863, "step": 576 }, { "epoch": 1.47712, "grad_norm": 1.8145774126755378, "learning_rate": 3.2863149326271226e-06, "loss": 0.281, "step": 577 }, { "epoch": 1.47968, "grad_norm": 1.6672300035397345, "learning_rate": 3.255896533235439e-06, "loss": 0.27, "step": 578 }, { "epoch": 1.48224, "grad_norm": 1.668688823060323, "learning_rate": 3.2255921684430423e-06, "loss": 0.2756, "step": 579 }, { "epoch": 1.4848, "grad_norm": 1.7784555478973214, "learning_rate": 3.195402350659945e-06, "loss": 0.321, "step": 580 }, { "epoch": 1.48736, "grad_norm": 1.6455590944323666, "learning_rate": 3.165327590359295e-06, "loss": 0.2877, "step": 581 }, { "epoch": 1.4899200000000001, "grad_norm": 2.0743380157623124, "learning_rate": 3.135368396068771e-06, "loss": 0.3027, "step": 582 }, { "epoch": 1.49248, "grad_norm": 1.5664539608516517, "learning_rate": 3.1055252743619623e-06, "loss": 0.2573, "step": 583 }, { "epoch": 1.49504, "grad_norm": 1.7550490290565095, "learning_rate": 3.0757987298498106e-06, "loss": 0.2703, "step": 584 }, { "epoch": 1.4976, "grad_norm": 1.7076285620278457, "learning_rate": 3.046189265172085e-06, "loss": 0.2836, "step": 585 }, { "epoch": 1.5001600000000002, "grad_norm": 1.5213003659889548, "learning_rate": 3.0166973809888776e-06, "loss": 0.2958, "step": 586 }, { "epoch": 1.50272, "grad_norm": 1.6842118874383583, "learning_rate": 2.987323575972132e-06, "loss": 0.2819, "step": 587 }, { "epoch": 1.50528, "grad_norm": 1.7336219161047688, "learning_rate": 2.958068346797217e-06, "loss": 0.2939, "step": 588 }, { "epoch": 1.5078399999999998, "grad_norm": 1.8365763102322976, "learning_rate": 2.9289321881345257e-06, "loss": 0.2822, "step": 589 }, { "epoch": 1.5104, "grad_norm": 2.0201724941232273, "learning_rate": 2.8999155926411203e-06, "loss": 0.3133, "step": 590 }, { "epoch": 1.51296, "grad_norm": 1.7737234557135833, "learning_rate": 2.871019050952395e-06, "loss": 0.2718, "step": 591 }, { "epoch": 1.51552, "grad_norm": 1.7105337375961225, "learning_rate": 2.8422430516737733e-06, "loss": 0.2287, "step": 592 }, { "epoch": 1.5180799999999999, "grad_norm": 1.5532376533528256, "learning_rate": 2.813588081372456e-06, "loss": 0.2805, "step": 593 }, { "epoch": 1.52064, "grad_norm": 1.6488135407698572, "learning_rate": 2.7850546245691866e-06, "loss": 0.2783, "step": 594 }, { "epoch": 1.5232, "grad_norm": 1.8138759575713275, "learning_rate": 2.7566431637300738e-06, "loss": 0.2936, "step": 595 }, { "epoch": 1.52576, "grad_norm": 1.9339574210123396, "learning_rate": 2.7283541792584165e-06, "loss": 0.2858, "step": 596 }, { "epoch": 1.52832, "grad_norm": 1.6414720524358055, "learning_rate": 2.7001881494865845e-06, "loss": 0.2717, "step": 597 }, { "epoch": 1.53088, "grad_norm": 1.670022901559193, "learning_rate": 2.672145550667933e-06, "loss": 0.2761, "step": 598 }, { "epoch": 1.5334400000000001, "grad_norm": 1.651543474445551, "learning_rate": 2.6442268569687567e-06, "loss": 0.266, "step": 599 }, { "epoch": 1.536, "grad_norm": 1.7579445968272946, "learning_rate": 2.616432540460255e-06, "loss": 0.2839, "step": 600 }, { "epoch": 1.53856, "grad_norm": 1.7310903919014502, "learning_rate": 2.5887630711105705e-06, "loss": 0.2996, "step": 601 }, { "epoch": 1.54112, "grad_norm": 1.7899987929588956, "learning_rate": 2.561218916776823e-06, "loss": 0.2827, "step": 602 }, { "epoch": 1.5436800000000002, "grad_norm": 1.793887084940259, "learning_rate": 2.5338005431972144e-06, "loss": 0.2962, "step": 603 }, { "epoch": 1.54624, "grad_norm": 1.8405222153934413, "learning_rate": 2.5065084139831443e-06, "loss": 0.2769, "step": 604 }, { "epoch": 1.5488, "grad_norm": 1.6538653809897454, "learning_rate": 2.4793429906113676e-06, "loss": 0.2798, "step": 605 }, { "epoch": 1.5513599999999999, "grad_norm": 1.7853505122465314, "learning_rate": 2.4523047324162087e-06, "loss": 0.2837, "step": 606 }, { "epoch": 1.55392, "grad_norm": 1.705880111795306, "learning_rate": 2.4253940965817726e-06, "loss": 0.3107, "step": 607 }, { "epoch": 1.55648, "grad_norm": 2.053508927696654, "learning_rate": 2.3986115381342347e-06, "loss": 0.3172, "step": 608 }, { "epoch": 1.55904, "grad_norm": 1.6153752928179927, "learning_rate": 2.3719575099341298e-06, "loss": 0.2837, "step": 609 }, { "epoch": 1.5615999999999999, "grad_norm": 1.6914755812024604, "learning_rate": 2.345432462668702e-06, "loss": 0.2551, "step": 610 }, { "epoch": 1.56416, "grad_norm": 1.770027987354017, "learning_rate": 2.3190368448442936e-06, "loss": 0.2574, "step": 611 }, { "epoch": 1.5667200000000001, "grad_norm": 1.859972298306848, "learning_rate": 2.292771102778739e-06, "loss": 0.3037, "step": 612 }, { "epoch": 1.56928, "grad_norm": 1.897188441833873, "learning_rate": 2.266635680593845e-06, "loss": 0.3067, "step": 613 }, { "epoch": 1.57184, "grad_norm": 1.7143004820729382, "learning_rate": 2.2406310202078586e-06, "loss": 0.2807, "step": 614 }, { "epoch": 1.5744, "grad_norm": 1.8448535483722395, "learning_rate": 2.2147575613280013e-06, "loss": 0.2997, "step": 615 }, { "epoch": 1.5769600000000001, "grad_norm": 1.669565045060629, "learning_rate": 2.1890157414430448e-06, "loss": 0.2528, "step": 616 }, { "epoch": 1.57952, "grad_norm": 1.842829735431479, "learning_rate": 2.163405995815904e-06, "loss": 0.2771, "step": 617 }, { "epoch": 1.58208, "grad_norm": 1.8727513182856619, "learning_rate": 2.1379287574762717e-06, "loss": 0.3045, "step": 618 }, { "epoch": 1.58464, "grad_norm": 1.85842601413902, "learning_rate": 2.11258445721331e-06, "loss": 0.319, "step": 619 }, { "epoch": 1.5872000000000002, "grad_norm": 1.8680204100053788, "learning_rate": 2.0873735235683535e-06, "loss": 0.2799, "step": 620 }, { "epoch": 1.58976, "grad_norm": 1.730211436527152, "learning_rate": 2.0622963828276744e-06, "loss": 0.2626, "step": 621 }, { "epoch": 1.59232, "grad_norm": 1.6435312018877513, "learning_rate": 2.037353459015272e-06, "loss": 0.2744, "step": 622 }, { "epoch": 1.5948799999999999, "grad_norm": 1.6224922861504871, "learning_rate": 2.0125451738856903e-06, "loss": 0.2296, "step": 623 }, { "epoch": 1.59744, "grad_norm": 1.861965804154878, "learning_rate": 1.9878719469169104e-06, "loss": 0.2595, "step": 624 }, { "epoch": 1.6, "grad_norm": 1.6943488611068847, "learning_rate": 1.9633341953032246e-06, "loss": 0.2906, "step": 625 }, { "epoch": 1.60256, "grad_norm": 1.6320130474511332, "learning_rate": 1.9389323339482204e-06, "loss": 0.2756, "step": 626 }, { "epoch": 1.6051199999999999, "grad_norm": 1.6698686833486305, "learning_rate": 1.9146667754577408e-06, "loss": 0.3068, "step": 627 }, { "epoch": 1.60768, "grad_norm": 1.6880177914613448, "learning_rate": 1.890537930132903e-06, "loss": 0.2839, "step": 628 }, { "epoch": 1.6102400000000001, "grad_norm": 1.6773268308513856, "learning_rate": 1.8665462059631866e-06, "loss": 0.2491, "step": 629 }, { "epoch": 1.6128, "grad_norm": 1.8474070458904108, "learning_rate": 1.8426920086195065e-06, "loss": 0.2904, "step": 630 }, { "epoch": 1.61536, "grad_norm": 1.6218761795055971, "learning_rate": 1.8189757414473686e-06, "loss": 0.2441, "step": 631 }, { "epoch": 1.61792, "grad_norm": 1.8666720513899506, "learning_rate": 1.795397805460053e-06, "loss": 0.3003, "step": 632 }, { "epoch": 1.6204800000000001, "grad_norm": 1.685788691547833, "learning_rate": 1.7719585993318177e-06, "loss": 0.2896, "step": 633 }, { "epoch": 1.62304, "grad_norm": 1.7449552299776978, "learning_rate": 1.7486585193911787e-06, "loss": 0.2794, "step": 634 }, { "epoch": 1.6256, "grad_norm": 1.704848209821893, "learning_rate": 1.7254979596141886e-06, "loss": 0.2616, "step": 635 }, { "epoch": 1.62816, "grad_norm": 1.8450853254842057, "learning_rate": 1.7024773116177839e-06, "loss": 0.2912, "step": 636 }, { "epoch": 1.63072, "grad_norm": 1.7926242028598987, "learning_rate": 1.6795969646531685e-06, "loss": 0.2617, "step": 637 }, { "epoch": 1.63328, "grad_norm": 1.7925132560586257, "learning_rate": 1.6568573055992188e-06, "loss": 0.2784, "step": 638 }, { "epoch": 1.63584, "grad_norm": 1.7663875310273034, "learning_rate": 1.6342587189559577e-06, "loss": 0.2696, "step": 639 }, { "epoch": 1.6383999999999999, "grad_norm": 1.6485491658987015, "learning_rate": 1.6118015868380387e-06, "loss": 0.2386, "step": 640 }, { "epoch": 1.64096, "grad_norm": 1.7988576600138655, "learning_rate": 1.5894862889682906e-06, "loss": 0.2955, "step": 641 }, { "epoch": 1.64352, "grad_norm": 1.714936041943462, "learning_rate": 1.5673132026713046e-06, "loss": 0.2843, "step": 642 }, { "epoch": 1.64608, "grad_norm": 1.967807455369453, "learning_rate": 1.5452827028670358e-06, "loss": 0.3071, "step": 643 }, { "epoch": 1.6486399999999999, "grad_norm": 1.767360909454626, "learning_rate": 1.523395162064486e-06, "loss": 0.251, "step": 644 }, { "epoch": 1.6512, "grad_norm": 1.6381296026383634, "learning_rate": 1.50165095035539e-06, "loss": 0.2755, "step": 645 }, { "epoch": 1.6537600000000001, "grad_norm": 1.5619584246682827, "learning_rate": 1.480050435407957e-06, "loss": 0.2614, "step": 646 }, { "epoch": 1.65632, "grad_norm": 1.6327509469165924, "learning_rate": 1.4585939824606621e-06, "loss": 0.255, "step": 647 }, { "epoch": 1.65888, "grad_norm": 1.563354867722546, "learning_rate": 1.437281954316071e-06, "loss": 0.2807, "step": 648 }, { "epoch": 1.66144, "grad_norm": 1.7550766801341493, "learning_rate": 1.4161147113346917e-06, "loss": 0.2702, "step": 649 }, { "epoch": 1.6640000000000001, "grad_norm": 1.6865725533510028, "learning_rate": 1.395092611428902e-06, "loss": 0.251, "step": 650 }, { "epoch": 1.66656, "grad_norm": 1.8531854972293114, "learning_rate": 1.374216010056879e-06, "loss": 0.2985, "step": 651 }, { "epoch": 1.66912, "grad_norm": 1.9209594828542413, "learning_rate": 1.353485260216596e-06, "loss": 0.3067, "step": 652 }, { "epoch": 1.67168, "grad_norm": 1.616385407100715, "learning_rate": 1.3329007124398608e-06, "loss": 0.2331, "step": 653 }, { "epoch": 1.67424, "grad_norm": 1.8003991478619372, "learning_rate": 1.3124627147863733e-06, "loss": 0.2878, "step": 654 }, { "epoch": 1.6768, "grad_norm": 1.5521426456650567, "learning_rate": 1.2921716128378581e-06, "loss": 0.2522, "step": 655 }, { "epoch": 1.67936, "grad_norm": 1.7611518891884141, "learning_rate": 1.272027749692203e-06, "loss": 0.261, "step": 656 }, { "epoch": 1.6819199999999999, "grad_norm": 1.7446813440682067, "learning_rate": 1.2520314659576683e-06, "loss": 0.2708, "step": 657 }, { "epoch": 1.68448, "grad_norm": 1.8976906912021962, "learning_rate": 1.2321830997471329e-06, "loss": 0.3082, "step": 658 }, { "epoch": 1.68704, "grad_norm": 1.5745862289974457, "learning_rate": 1.212482986672361e-06, "loss": 0.2438, "step": 659 }, { "epoch": 1.6896, "grad_norm": 1.7666742654864107, "learning_rate": 1.1929314598383423e-06, "loss": 0.2664, "step": 660 }, { "epoch": 1.6921599999999999, "grad_norm": 1.7635144087752495, "learning_rate": 1.1735288498376495e-06, "loss": 0.2784, "step": 661 }, { "epoch": 1.69472, "grad_norm": 1.6871885012565122, "learning_rate": 1.1542754847448544e-06, "loss": 0.2585, "step": 662 }, { "epoch": 1.6972800000000001, "grad_norm": 1.763546085268813, "learning_rate": 1.13517169011098e-06, "loss": 0.2675, "step": 663 }, { "epoch": 1.69984, "grad_norm": 1.6833283054672803, "learning_rate": 1.1162177889579906e-06, "loss": 0.2456, "step": 664 }, { "epoch": 1.7024, "grad_norm": 1.4995582297938723, "learning_rate": 1.0974141017733386e-06, "loss": 0.2219, "step": 665 }, { "epoch": 1.70496, "grad_norm": 1.7935988415350803, "learning_rate": 1.078760946504539e-06, "loss": 0.2878, "step": 666 }, { "epoch": 1.7075200000000001, "grad_norm": 1.8128478947806876, "learning_rate": 1.0602586385537928e-06, "loss": 0.2581, "step": 667 }, { "epoch": 1.71008, "grad_norm": 1.63791230975168, "learning_rate": 1.041907490772658e-06, "loss": 0.2498, "step": 668 }, { "epoch": 1.71264, "grad_norm": 1.590925354056578, "learning_rate": 1.0237078134567535e-06, "loss": 0.2505, "step": 669 }, { "epoch": 1.7151999999999998, "grad_norm": 1.7695283124365373, "learning_rate": 1.0056599143405244e-06, "loss": 0.2754, "step": 670 }, { "epoch": 1.71776, "grad_norm": 1.7812963095403938, "learning_rate": 9.877640985920268e-07, "loss": 0.2798, "step": 671 }, { "epoch": 1.72032, "grad_norm": 1.6627323998398744, "learning_rate": 9.700206688077707e-07, "loss": 0.2298, "step": 672 }, { "epoch": 1.72288, "grad_norm": 1.9098592625989281, "learning_rate": 9.524299250076052e-07, "loss": 0.2805, "step": 673 }, { "epoch": 1.7254399999999999, "grad_norm": 1.6937003995052815, "learning_rate": 9.349921646296423e-07, "loss": 0.2548, "step": 674 }, { "epoch": 1.728, "grad_norm": 1.8012637933088234, "learning_rate": 9.177076825252351e-07, "loss": 0.2343, "step": 675 }, { "epoch": 1.73056, "grad_norm": 1.9146685623071344, "learning_rate": 9.00576770953987e-07, "loss": 0.2783, "step": 676 }, { "epoch": 1.73312, "grad_norm": 1.6706056064712593, "learning_rate": 8.835997195788071e-07, "loss": 0.2596, "step": 677 }, { "epoch": 1.73568, "grad_norm": 1.7350656477174768, "learning_rate": 8.667768154610124e-07, "loss": 0.2754, "step": 678 }, { "epoch": 1.73824, "grad_norm": 1.765131272482126, "learning_rate": 8.501083430554868e-07, "loss": 0.2655, "step": 679 }, { "epoch": 1.7408000000000001, "grad_norm": 1.7079209480392799, "learning_rate": 8.335945842058524e-07, "loss": 0.2853, "step": 680 }, { "epoch": 1.74336, "grad_norm": 1.659320265734026, "learning_rate": 8.172358181397178e-07, "loss": 0.2669, "step": 681 }, { "epoch": 1.74592, "grad_norm": 1.7216123667879766, "learning_rate": 8.010323214639492e-07, "loss": 0.2939, "step": 682 }, { "epoch": 1.74848, "grad_norm": 1.8167275931675924, "learning_rate": 7.849843681599978e-07, "loss": 0.2919, "step": 683 }, { "epoch": 1.7510400000000002, "grad_norm": 1.7516661490079315, "learning_rate": 7.690922295792647e-07, "loss": 0.2405, "step": 684 }, { "epoch": 1.7536, "grad_norm": 1.7608726427403516, "learning_rate": 7.53356174438512e-07, "loss": 0.2692, "step": 685 }, { "epoch": 1.75616, "grad_norm": 1.7716445711738182, "learning_rate": 7.377764688153244e-07, "loss": 0.2444, "step": 686 }, { "epoch": 1.7587199999999998, "grad_norm": 1.6553834628764956, "learning_rate": 7.223533761435986e-07, "loss": 0.2283, "step": 687 }, { "epoch": 1.76128, "grad_norm": 1.6208484626275992, "learning_rate": 7.070871572091076e-07, "loss": 0.2447, "step": 688 }, { "epoch": 1.76384, "grad_norm": 1.9469256466871052, "learning_rate": 6.919780701450684e-07, "loss": 0.3117, "step": 689 }, { "epoch": 1.7664, "grad_norm": 1.807807654678934, "learning_rate": 6.770263704277958e-07, "loss": 0.2489, "step": 690 }, { "epoch": 1.7689599999999999, "grad_norm": 1.7754566354998111, "learning_rate": 6.62232310872375e-07, "loss": 0.2933, "step": 691 }, { "epoch": 1.77152, "grad_norm": 2.142560102394874, "learning_rate": 6.475961416283838e-07, "loss": 0.2976, "step": 692 }, { "epoch": 1.77408, "grad_norm": 1.8425369199032686, "learning_rate": 6.331181101756733e-07, "loss": 0.2525, "step": 693 }, { "epoch": 1.77664, "grad_norm": 1.8067216131513493, "learning_rate": 6.187984613201703e-07, "loss": 0.2724, "step": 694 }, { "epoch": 1.7792, "grad_norm": 1.819627845132494, "learning_rate": 6.046374371897446e-07, "loss": 0.2676, "step": 695 }, { "epoch": 1.78176, "grad_norm": 1.6926155476047056, "learning_rate": 5.906352772301193e-07, "loss": 0.2734, "step": 696 }, { "epoch": 1.7843200000000001, "grad_norm": 1.7608510485979083, "learning_rate": 5.767922182008145e-07, "loss": 0.2753, "step": 697 }, { "epoch": 1.78688, "grad_norm": 1.715763266425071, "learning_rate": 5.631084941711473e-07, "loss": 0.2673, "step": 698 }, { "epoch": 1.78944, "grad_norm": 1.6687789738165386, "learning_rate": 5.495843365162701e-07, "loss": 0.2901, "step": 699 }, { "epoch": 1.792, "grad_norm": 1.8537817214382708, "learning_rate": 5.362199739132656e-07, "loss": 0.2747, "step": 700 }, { "epoch": 1.7945600000000002, "grad_norm": 1.6704242281507662, "learning_rate": 5.230156323372759e-07, "loss": 0.2524, "step": 701 }, { "epoch": 1.79712, "grad_norm": 1.825247698595894, "learning_rate": 5.099715350576817e-07, "loss": 0.2676, "step": 702 }, { "epoch": 1.79968, "grad_norm": 1.7300015240890014, "learning_rate": 4.970879026343256e-07, "loss": 0.2747, "step": 703 }, { "epoch": 1.8022399999999998, "grad_norm": 1.707816251456467, "learning_rate": 4.843649529137861e-07, "loss": 0.2708, "step": 704 }, { "epoch": 1.8048, "grad_norm": 1.8785886608496822, "learning_rate": 4.7180290102568973e-07, "loss": 0.3164, "step": 705 }, { "epoch": 1.80736, "grad_norm": 1.8644648318252912, "learning_rate": 4.594019593790799e-07, "loss": 0.2927, "step": 706 }, { "epoch": 1.80992, "grad_norm": 1.8493889520810634, "learning_rate": 4.471623376588197e-07, "loss": 0.2628, "step": 707 }, { "epoch": 1.8124799999999999, "grad_norm": 1.7845383273554698, "learning_rate": 4.35084242822047e-07, "loss": 0.2582, "step": 708 }, { "epoch": 1.81504, "grad_norm": 1.8293493217735093, "learning_rate": 4.2316787909467915e-07, "loss": 0.2753, "step": 709 }, { "epoch": 1.8176, "grad_norm": 1.6894890585749698, "learning_rate": 4.114134479679543e-07, "loss": 0.2678, "step": 710 }, { "epoch": 1.82016, "grad_norm": 1.7542138094486555, "learning_rate": 3.998211481950254e-07, "loss": 0.2892, "step": 711 }, { "epoch": 1.82272, "grad_norm": 1.8793095620901379, "learning_rate": 3.883911757876058e-07, "loss": 0.2728, "step": 712 }, { "epoch": 1.82528, "grad_norm": 1.7267463710184283, "learning_rate": 3.771237240126469e-07, "loss": 0.2735, "step": 713 }, { "epoch": 1.8278400000000001, "grad_norm": 1.7219666375218627, "learning_rate": 3.66018983389077e-07, "loss": 0.2597, "step": 714 }, { "epoch": 1.8304, "grad_norm": 1.7636190461227865, "learning_rate": 3.5507714168457e-07, "loss": 0.2665, "step": 715 }, { "epoch": 1.83296, "grad_norm": 1.7666528327974367, "learning_rate": 3.442983839123826e-07, "loss": 0.2805, "step": 716 }, { "epoch": 1.83552, "grad_norm": 1.6781083614930392, "learning_rate": 3.3368289232822094e-07, "loss": 0.246, "step": 717 }, { "epoch": 1.8380800000000002, "grad_norm": 1.7603755414192976, "learning_rate": 3.232308464271505e-07, "loss": 0.2947, "step": 718 }, { "epoch": 1.84064, "grad_norm": 1.7521780009843657, "learning_rate": 3.1294242294057974e-07, "loss": 0.3191, "step": 719 }, { "epoch": 1.8432, "grad_norm": 1.6098707300379458, "learning_rate": 3.028177958332512e-07, "loss": 0.2548, "step": 720 }, { "epoch": 1.8457599999999998, "grad_norm": 1.6267142406397461, "learning_rate": 2.928571363003152e-07, "loss": 0.3028, "step": 721 }, { "epoch": 1.84832, "grad_norm": 1.6960630177797111, "learning_rate": 2.8306061276442753e-07, "loss": 0.2631, "step": 722 }, { "epoch": 1.85088, "grad_norm": 1.7493636450152343, "learning_rate": 2.7342839087290183e-07, "loss": 0.2518, "step": 723 }, { "epoch": 1.85344, "grad_norm": 1.6385553511066044, "learning_rate": 2.639606334949163e-07, "loss": 0.2908, "step": 724 }, { "epoch": 1.8559999999999999, "grad_norm": 1.7931640851082686, "learning_rate": 2.5465750071874797e-07, "loss": 0.2649, "step": 725 }, { "epoch": 1.85856, "grad_norm": 1.5912117214737356, "learning_rate": 2.455191498490739e-07, "loss": 0.2664, "step": 726 }, { "epoch": 1.86112, "grad_norm": 1.6678147462368311, "learning_rate": 2.365457354043088e-07, "loss": 0.2172, "step": 727 }, { "epoch": 1.86368, "grad_norm": 1.6035873970448906, "learning_rate": 2.27737409113995e-07, "loss": 0.2504, "step": 728 }, { "epoch": 1.86624, "grad_norm": 1.7528209162894965, "learning_rate": 2.1909431991623097e-07, "loss": 0.2615, "step": 729 }, { "epoch": 1.8688, "grad_norm": 1.8639972671950014, "learning_rate": 2.106166139551602e-07, "loss": 0.2668, "step": 730 }, { "epoch": 1.8713600000000001, "grad_norm": 1.6865465188399893, "learning_rate": 2.0230443457849414e-07, "loss": 0.2797, "step": 731 }, { "epoch": 1.87392, "grad_norm": 1.5210672653088169, "learning_rate": 1.941579223350898e-07, "loss": 0.2304, "step": 732 }, { "epoch": 1.87648, "grad_norm": 1.7085604230659135, "learning_rate": 1.8617721497257823e-07, "loss": 0.2505, "step": 733 }, { "epoch": 1.87904, "grad_norm": 1.8288714006316353, "learning_rate": 1.7836244743502762e-07, "loss": 0.2364, "step": 734 }, { "epoch": 1.8816000000000002, "grad_norm": 1.7484101329116195, "learning_rate": 1.7071375186066607e-07, "loss": 0.2449, "step": 735 }, { "epoch": 1.88416, "grad_norm": 1.6200555865387618, "learning_rate": 1.6323125757964799e-07, "loss": 0.2692, "step": 736 }, { "epoch": 1.88672, "grad_norm": 1.9011939560507523, "learning_rate": 1.5591509111186342e-07, "loss": 0.2652, "step": 737 }, { "epoch": 1.8892799999999998, "grad_norm": 1.7822199769462572, "learning_rate": 1.4876537616480335e-07, "loss": 0.2881, "step": 738 }, { "epoch": 1.89184, "grad_norm": 1.750191048312678, "learning_rate": 1.4178223363146226e-07, "loss": 0.2622, "step": 739 }, { "epoch": 1.8944, "grad_norm": 1.7516915828032618, "learning_rate": 1.349657815883032e-07, "loss": 0.2961, "step": 740 }, { "epoch": 1.89696, "grad_norm": 1.645641455994093, "learning_rate": 1.283161352932505e-07, "loss": 0.2736, "step": 741 }, { "epoch": 1.8995199999999999, "grad_norm": 1.705553244598403, "learning_rate": 1.218334071837468e-07, "loss": 0.2583, "step": 742 }, { "epoch": 1.90208, "grad_norm": 1.7315966835392997, "learning_rate": 1.1551770687485142e-07, "loss": 0.2758, "step": 743 }, { "epoch": 1.90464, "grad_norm": 1.727806848265733, "learning_rate": 1.0936914115738717e-07, "loss": 0.2657, "step": 744 }, { "epoch": 1.9072, "grad_norm": 1.676827461659673, "learning_rate": 1.0338781399613307e-07, "loss": 0.2642, "step": 745 }, { "epoch": 1.90976, "grad_norm": 1.7115198632740858, "learning_rate": 9.757382652806791e-08, "loss": 0.2545, "step": 746 }, { "epoch": 1.91232, "grad_norm": 1.7354830976605933, "learning_rate": 9.192727706065829e-08, "loss": 0.2583, "step": 747 }, { "epoch": 1.9148800000000001, "grad_norm": 1.7582838528396394, "learning_rate": 8.644826107019888e-08, "loss": 0.2814, "step": 748 }, { "epoch": 1.91744, "grad_norm": 1.681917990191324, "learning_rate": 8.113687120019587e-08, "loss": 0.2601, "step": 749 }, { "epoch": 1.92, "grad_norm": 1.7668971882563782, "learning_rate": 7.599319725980047e-08, "loss": 0.2621, "step": 750 }, { "epoch": 1.92256, "grad_norm": 1.7599942208335475, "learning_rate": 7.101732622229462e-08, "loss": 0.2881, "step": 751 }, { "epoch": 1.9251200000000002, "grad_norm": 1.7230238393967752, "learning_rate": 6.62093422236132e-08, "loss": 0.3086, "step": 752 }, { "epoch": 1.92768, "grad_norm": 1.7803538404725028, "learning_rate": 6.15693265609274e-08, "loss": 0.2647, "step": 753 }, { "epoch": 1.93024, "grad_norm": 1.8054504698595564, "learning_rate": 5.709735769126479e-08, "loss": 0.2815, "step": 754 }, { "epoch": 1.9327999999999999, "grad_norm": 1.6120878259133444, "learning_rate": 5.279351123019028e-08, "loss": 0.2307, "step": 755 }, { "epoch": 1.93536, "grad_norm": 1.7147176143131828, "learning_rate": 4.8657859950520524e-08, "loss": 0.2741, "step": 756 }, { "epoch": 1.93792, "grad_norm": 1.826014851535064, "learning_rate": 4.469047378109603e-08, "loss": 0.2949, "step": 757 }, { "epoch": 1.94048, "grad_norm": 1.8033974744681112, "learning_rate": 4.0891419805597634e-08, "loss": 0.2564, "step": 758 }, { "epoch": 1.9430399999999999, "grad_norm": 1.7228110817797906, "learning_rate": 3.7260762261416287e-08, "loss": 0.2654, "step": 759 }, { "epoch": 1.9456, "grad_norm": 1.7114724793916518, "learning_rate": 3.379856253855951e-08, "loss": 0.2441, "step": 760 }, { "epoch": 1.9481600000000001, "grad_norm": 1.736060820260858, "learning_rate": 3.0504879178622214e-08, "loss": 0.2748, "step": 761 }, { "epoch": 1.95072, "grad_norm": 1.7505236918858846, "learning_rate": 2.73797678737886e-08, "loss": 0.2456, "step": 762 }, { "epoch": 1.95328, "grad_norm": 1.5859654595806796, "learning_rate": 2.442328146589512e-08, "loss": 0.2799, "step": 763 }, { "epoch": 1.95584, "grad_norm": 1.7039588055868866, "learning_rate": 2.163546994553789e-08, "loss": 0.2402, "step": 764 }, { "epoch": 1.9584000000000001, "grad_norm": 1.8531697022252631, "learning_rate": 1.9016380451223337e-08, "loss": 0.2944, "step": 765 }, { "epoch": 1.96096, "grad_norm": 1.7102421117876718, "learning_rate": 1.656605726857441e-08, "loss": 0.2704, "step": 766 }, { "epoch": 1.96352, "grad_norm": 2.0294788740008176, "learning_rate": 1.4284541829580056e-08, "loss": 0.3189, "step": 767 }, { "epoch": 1.96608, "grad_norm": 1.8662967648141273, "learning_rate": 1.2171872711895794e-08, "loss": 0.2729, "step": 768 }, { "epoch": 1.96864, "grad_norm": 1.6195984132177164, "learning_rate": 1.0228085638190887e-08, "loss": 0.2613, "step": 769 }, { "epoch": 1.9712, "grad_norm": 1.5450708746300255, "learning_rate": 8.453213475543287e-09, "loss": 0.2625, "step": 770 }, { "epoch": 1.97376, "grad_norm": 1.69409357688793, "learning_rate": 6.84728623488562e-09, "loss": 0.2557, "step": 771 }, { "epoch": 1.9763199999999999, "grad_norm": 1.812223784293616, "learning_rate": 5.410331070498931e-09, "loss": 0.2712, "step": 772 }, { "epoch": 1.97888, "grad_norm": 1.6091781900571478, "learning_rate": 4.142372279548612e-09, "loss": 0.2593, "step": 773 }, { "epoch": 1.98144, "grad_norm": 1.6908756673392302, "learning_rate": 3.043431301678057e-09, "loss": 0.2226, "step": 774 }, { "epoch": 1.984, "grad_norm": 1.7859820632732437, "learning_rate": 2.11352671864562e-09, "loss": 0.2807, "step": 775 }, { "epoch": 1.9865599999999999, "grad_norm": 1.7470027221404476, "learning_rate": 1.3526742540070913e-09, "loss": 0.2718, "step": 776 }, { "epoch": 1.98912, "grad_norm": 1.8210620903336188, "learning_rate": 7.608867728536862e-10, "loss": 0.2749, "step": 777 }, { "epoch": 1.9916800000000001, "grad_norm": 1.9294520890967608, "learning_rate": 3.381742815944389e-10, "loss": 0.2902, "step": 778 }, { "epoch": 1.99424, "grad_norm": 1.7057897433651361, "learning_rate": 8.454392778189935e-11, "loss": 0.2478, "step": 779 }, { "epoch": 1.9968, "grad_norm": 1.7268558478851974, "learning_rate": 0.0, "loss": 0.2642, "step": 780 }, { "epoch": 1.9968, "step": 780, "total_flos": 440534549233664.0, "train_loss": 0.46617485760496213, "train_runtime": 27454.835, "train_samples_per_second": 1.821, "train_steps_per_second": 0.028 } ], "logging_steps": 1.0, "max_steps": 780, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 440534549233664.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }