{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9991675915649276, "eval_steps": 500, "global_step": 3603, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000832408435072142, "grad_norm": 5.852675437927246, "learning_rate": 2.770083102493075e-08, "loss": 0.8812, "step": 1 }, { "epoch": 0.001664816870144284, "grad_norm": 5.757553577423096, "learning_rate": 5.54016620498615e-08, "loss": 0.8425, "step": 2 }, { "epoch": 0.0024972253052164264, "grad_norm": 5.973855972290039, "learning_rate": 8.310249307479226e-08, "loss": 0.8658, "step": 3 }, { "epoch": 0.003329633740288568, "grad_norm": 6.0041680335998535, "learning_rate": 1.10803324099723e-07, "loss": 0.8471, "step": 4 }, { "epoch": 0.004162042175360711, "grad_norm": 5.798811912536621, "learning_rate": 1.3850415512465375e-07, "loss": 0.8696, "step": 5 }, { "epoch": 0.004994450610432853, "grad_norm": 5.87969970703125, "learning_rate": 1.662049861495845e-07, "loss": 0.8461, "step": 6 }, { "epoch": 0.005826859045504994, "grad_norm": 5.775433540344238, "learning_rate": 1.9390581717451524e-07, "loss": 0.859, "step": 7 }, { "epoch": 0.006659267480577136, "grad_norm": 6.0282087326049805, "learning_rate": 2.21606648199446e-07, "loss": 0.8566, "step": 8 }, { "epoch": 0.007491675915649278, "grad_norm": 5.845994472503662, "learning_rate": 2.4930747922437677e-07, "loss": 0.8533, "step": 9 }, { "epoch": 0.008324084350721421, "grad_norm": 5.8350605964660645, "learning_rate": 2.770083102493075e-07, "loss": 0.8669, "step": 10 }, { "epoch": 0.009156492785793563, "grad_norm": 5.791550636291504, "learning_rate": 3.0470914127423823e-07, "loss": 0.8701, "step": 11 }, { "epoch": 0.009988901220865706, "grad_norm": 5.940352439880371, "learning_rate": 3.32409972299169e-07, "loss": 0.8909, "step": 12 }, { "epoch": 0.010821309655937847, "grad_norm": 5.590038299560547, "learning_rate": 3.601108033240998e-07, "loss": 0.8443, "step": 13 }, { "epoch": 0.011653718091009988, "grad_norm": 5.473329067230225, "learning_rate": 3.878116343490305e-07, "loss": 0.8345, "step": 14 }, { "epoch": 0.012486126526082131, "grad_norm": 5.688844680786133, "learning_rate": 4.155124653739612e-07, "loss": 0.8813, "step": 15 }, { "epoch": 0.013318534961154272, "grad_norm": 5.5658440589904785, "learning_rate": 4.43213296398892e-07, "loss": 0.8678, "step": 16 }, { "epoch": 0.014150943396226415, "grad_norm": 5.2306599617004395, "learning_rate": 4.7091412742382274e-07, "loss": 0.8587, "step": 17 }, { "epoch": 0.014983351831298557, "grad_norm": 5.296006202697754, "learning_rate": 4.986149584487535e-07, "loss": 0.8373, "step": 18 }, { "epoch": 0.015815760266370698, "grad_norm": 4.465157508850098, "learning_rate": 5.263157894736843e-07, "loss": 0.7817, "step": 19 }, { "epoch": 0.016648168701442843, "grad_norm": 4.338566780090332, "learning_rate": 5.54016620498615e-07, "loss": 0.7938, "step": 20 }, { "epoch": 0.017480577136514984, "grad_norm": 4.399113655090332, "learning_rate": 5.817174515235457e-07, "loss": 0.8391, "step": 21 }, { "epoch": 0.018312985571587125, "grad_norm": 4.371567726135254, "learning_rate": 6.094182825484765e-07, "loss": 0.8577, "step": 22 }, { "epoch": 0.019145394006659266, "grad_norm": 3.9851796627044678, "learning_rate": 6.371191135734073e-07, "loss": 0.7962, "step": 23 }, { "epoch": 0.01997780244173141, "grad_norm": 4.0287652015686035, "learning_rate": 6.64819944598338e-07, "loss": 0.7759, "step": 24 }, { "epoch": 0.020810210876803552, "grad_norm": 3.2178382873535156, "learning_rate": 6.925207756232688e-07, "loss": 0.7951, "step": 25 }, { "epoch": 0.021642619311875694, "grad_norm": 2.373326301574707, "learning_rate": 7.202216066481996e-07, "loss": 0.7714, "step": 26 }, { "epoch": 0.022475027746947835, "grad_norm": 2.38152813911438, "learning_rate": 7.479224376731302e-07, "loss": 0.7554, "step": 27 }, { "epoch": 0.023307436182019976, "grad_norm": 2.2056140899658203, "learning_rate": 7.75623268698061e-07, "loss": 0.7232, "step": 28 }, { "epoch": 0.02413984461709212, "grad_norm": 2.3134679794311523, "learning_rate": 8.033240997229917e-07, "loss": 0.7865, "step": 29 }, { "epoch": 0.024972253052164262, "grad_norm": 2.2102293968200684, "learning_rate": 8.310249307479224e-07, "loss": 0.7796, "step": 30 }, { "epoch": 0.025804661487236404, "grad_norm": 1.9953457117080688, "learning_rate": 8.587257617728533e-07, "loss": 0.7473, "step": 31 }, { "epoch": 0.026637069922308545, "grad_norm": 1.9345048666000366, "learning_rate": 8.86426592797784e-07, "loss": 0.7113, "step": 32 }, { "epoch": 0.02746947835738069, "grad_norm": 1.8699673414230347, "learning_rate": 9.141274238227148e-07, "loss": 0.7649, "step": 33 }, { "epoch": 0.02830188679245283, "grad_norm": 1.523240566253662, "learning_rate": 9.418282548476455e-07, "loss": 0.7485, "step": 34 }, { "epoch": 0.029134295227524972, "grad_norm": 1.7488731145858765, "learning_rate": 9.695290858725762e-07, "loss": 0.7514, "step": 35 }, { "epoch": 0.029966703662597113, "grad_norm": 1.9269617795944214, "learning_rate": 9.97229916897507e-07, "loss": 0.7005, "step": 36 }, { "epoch": 0.030799112097669258, "grad_norm": 2.1249892711639404, "learning_rate": 1.024930747922438e-06, "loss": 0.7309, "step": 37 }, { "epoch": 0.031631520532741396, "grad_norm": 2.0195670127868652, "learning_rate": 1.0526315789473685e-06, "loss": 0.6937, "step": 38 }, { "epoch": 0.03246392896781354, "grad_norm": 2.0689985752105713, "learning_rate": 1.0803324099722992e-06, "loss": 0.7177, "step": 39 }, { "epoch": 0.033296337402885685, "grad_norm": 2.0408408641815186, "learning_rate": 1.10803324099723e-06, "loss": 0.7019, "step": 40 }, { "epoch": 0.03412874583795782, "grad_norm": 1.8314168453216553, "learning_rate": 1.1357340720221608e-06, "loss": 0.6972, "step": 41 }, { "epoch": 0.03496115427302997, "grad_norm": 1.612196922302246, "learning_rate": 1.1634349030470915e-06, "loss": 0.6842, "step": 42 }, { "epoch": 0.035793562708102106, "grad_norm": 1.4777947664260864, "learning_rate": 1.1911357340720223e-06, "loss": 0.6992, "step": 43 }, { "epoch": 0.03662597114317425, "grad_norm": 1.207147479057312, "learning_rate": 1.218836565096953e-06, "loss": 0.7215, "step": 44 }, { "epoch": 0.037458379578246395, "grad_norm": 0.8992202877998352, "learning_rate": 1.2465373961218838e-06, "loss": 0.663, "step": 45 }, { "epoch": 0.03829078801331853, "grad_norm": 0.8715615272521973, "learning_rate": 1.2742382271468146e-06, "loss": 0.696, "step": 46 }, { "epoch": 0.03912319644839068, "grad_norm": 0.9824283719062805, "learning_rate": 1.3019390581717452e-06, "loss": 0.6709, "step": 47 }, { "epoch": 0.03995560488346282, "grad_norm": 1.060660481452942, "learning_rate": 1.329639889196676e-06, "loss": 0.6799, "step": 48 }, { "epoch": 0.04078801331853496, "grad_norm": 1.121728777885437, "learning_rate": 1.357340720221607e-06, "loss": 0.6862, "step": 49 }, { "epoch": 0.041620421753607105, "grad_norm": 0.9326874017715454, "learning_rate": 1.3850415512465375e-06, "loss": 0.6305, "step": 50 }, { "epoch": 0.04245283018867924, "grad_norm": 0.9469634294509888, "learning_rate": 1.4127423822714684e-06, "loss": 0.6881, "step": 51 }, { "epoch": 0.04328523862375139, "grad_norm": 0.790377140045166, "learning_rate": 1.4404432132963992e-06, "loss": 0.6712, "step": 52 }, { "epoch": 0.04411764705882353, "grad_norm": 0.8012718558311462, "learning_rate": 1.4681440443213299e-06, "loss": 0.6611, "step": 53 }, { "epoch": 0.04495005549389567, "grad_norm": 0.7442087531089783, "learning_rate": 1.4958448753462605e-06, "loss": 0.6363, "step": 54 }, { "epoch": 0.045782463928967815, "grad_norm": 0.6595564484596252, "learning_rate": 1.5235457063711911e-06, "loss": 0.6289, "step": 55 }, { "epoch": 0.04661487236403995, "grad_norm": 0.7440547943115234, "learning_rate": 1.551246537396122e-06, "loss": 0.6602, "step": 56 }, { "epoch": 0.0474472807991121, "grad_norm": 0.6962677240371704, "learning_rate": 1.5789473684210526e-06, "loss": 0.6092, "step": 57 }, { "epoch": 0.04827968923418424, "grad_norm": 0.5988077521324158, "learning_rate": 1.6066481994459834e-06, "loss": 0.6156, "step": 58 }, { "epoch": 0.04911209766925638, "grad_norm": 0.579781711101532, "learning_rate": 1.6343490304709143e-06, "loss": 0.5902, "step": 59 }, { "epoch": 0.049944506104328525, "grad_norm": 0.658952534198761, "learning_rate": 1.6620498614958449e-06, "loss": 0.6049, "step": 60 }, { "epoch": 0.05077691453940067, "grad_norm": 0.7643985748291016, "learning_rate": 1.6897506925207757e-06, "loss": 0.6195, "step": 61 }, { "epoch": 0.05160932297447281, "grad_norm": 0.6020765900611877, "learning_rate": 1.7174515235457066e-06, "loss": 0.6154, "step": 62 }, { "epoch": 0.05244173140954495, "grad_norm": 0.5499704480171204, "learning_rate": 1.7451523545706372e-06, "loss": 0.6152, "step": 63 }, { "epoch": 0.05327413984461709, "grad_norm": 0.5000544190406799, "learning_rate": 1.772853185595568e-06, "loss": 0.6228, "step": 64 }, { "epoch": 0.054106548279689234, "grad_norm": 0.591923713684082, "learning_rate": 1.8005540166204989e-06, "loss": 0.6174, "step": 65 }, { "epoch": 0.05493895671476138, "grad_norm": 0.6395013332366943, "learning_rate": 1.8282548476454295e-06, "loss": 0.6169, "step": 66 }, { "epoch": 0.05577136514983352, "grad_norm": 0.5679120421409607, "learning_rate": 1.8559556786703603e-06, "loss": 0.6119, "step": 67 }, { "epoch": 0.05660377358490566, "grad_norm": 0.5052242279052734, "learning_rate": 1.883656509695291e-06, "loss": 0.6275, "step": 68 }, { "epoch": 0.0574361820199778, "grad_norm": 0.5107065439224243, "learning_rate": 1.911357340720222e-06, "loss": 0.6044, "step": 69 }, { "epoch": 0.058268590455049944, "grad_norm": 0.5278398990631104, "learning_rate": 1.9390581717451524e-06, "loss": 0.6005, "step": 70 }, { "epoch": 0.05910099889012209, "grad_norm": 0.5229373574256897, "learning_rate": 1.9667590027700835e-06, "loss": 0.605, "step": 71 }, { "epoch": 0.05993340732519423, "grad_norm": 0.45075175166130066, "learning_rate": 1.994459833795014e-06, "loss": 0.5785, "step": 72 }, { "epoch": 0.06076581576026637, "grad_norm": 0.44926148653030396, "learning_rate": 2.0221606648199448e-06, "loss": 0.584, "step": 73 }, { "epoch": 0.061598224195338516, "grad_norm": 0.49701979756355286, "learning_rate": 2.049861495844876e-06, "loss": 0.5674, "step": 74 }, { "epoch": 0.062430632630410654, "grad_norm": 0.4435327351093292, "learning_rate": 2.077562326869806e-06, "loss": 0.6039, "step": 75 }, { "epoch": 0.06326304106548279, "grad_norm": 0.5081650614738464, "learning_rate": 2.105263157894737e-06, "loss": 0.6047, "step": 76 }, { "epoch": 0.06409544950055494, "grad_norm": 0.47305235266685486, "learning_rate": 2.1329639889196677e-06, "loss": 0.6117, "step": 77 }, { "epoch": 0.06492785793562708, "grad_norm": 0.501122772693634, "learning_rate": 2.1606648199445983e-06, "loss": 0.5932, "step": 78 }, { "epoch": 0.06576026637069922, "grad_norm": 0.4220408797264099, "learning_rate": 2.1883656509695294e-06, "loss": 0.5489, "step": 79 }, { "epoch": 0.06659267480577137, "grad_norm": 0.45031723380088806, "learning_rate": 2.21606648199446e-06, "loss": 0.5997, "step": 80 }, { "epoch": 0.06742508324084351, "grad_norm": 0.4954591691493988, "learning_rate": 2.2437673130193906e-06, "loss": 0.6027, "step": 81 }, { "epoch": 0.06825749167591565, "grad_norm": 0.471713662147522, "learning_rate": 2.2714681440443217e-06, "loss": 0.5924, "step": 82 }, { "epoch": 0.0690899001109878, "grad_norm": 0.41615861654281616, "learning_rate": 2.2991689750692523e-06, "loss": 0.5723, "step": 83 }, { "epoch": 0.06992230854605994, "grad_norm": 0.3945973515510559, "learning_rate": 2.326869806094183e-06, "loss": 0.5716, "step": 84 }, { "epoch": 0.07075471698113207, "grad_norm": 0.4024456739425659, "learning_rate": 2.3545706371191136e-06, "loss": 0.5668, "step": 85 }, { "epoch": 0.07158712541620421, "grad_norm": 0.4162571132183075, "learning_rate": 2.3822714681440446e-06, "loss": 0.5706, "step": 86 }, { "epoch": 0.07241953385127636, "grad_norm": 0.41097456216812134, "learning_rate": 2.4099722991689752e-06, "loss": 0.6016, "step": 87 }, { "epoch": 0.0732519422863485, "grad_norm": 0.4599824845790863, "learning_rate": 2.437673130193906e-06, "loss": 0.6005, "step": 88 }, { "epoch": 0.07408435072142064, "grad_norm": 0.3909335136413574, "learning_rate": 2.465373961218837e-06, "loss": 0.5682, "step": 89 }, { "epoch": 0.07491675915649279, "grad_norm": 0.4187878370285034, "learning_rate": 2.4930747922437675e-06, "loss": 0.5648, "step": 90 }, { "epoch": 0.07574916759156493, "grad_norm": 0.41880685091018677, "learning_rate": 2.520775623268698e-06, "loss": 0.5944, "step": 91 }, { "epoch": 0.07658157602663707, "grad_norm": 0.4465304911136627, "learning_rate": 2.5484764542936292e-06, "loss": 0.6203, "step": 92 }, { "epoch": 0.07741398446170922, "grad_norm": 0.43194279074668884, "learning_rate": 2.5761772853185594e-06, "loss": 0.5817, "step": 93 }, { "epoch": 0.07824639289678136, "grad_norm": 0.41561976075172424, "learning_rate": 2.6038781163434905e-06, "loss": 0.5741, "step": 94 }, { "epoch": 0.0790788013318535, "grad_norm": 0.4317433536052704, "learning_rate": 2.631578947368421e-06, "loss": 0.5727, "step": 95 }, { "epoch": 0.07991120976692564, "grad_norm": 0.43518805503845215, "learning_rate": 2.659279778393352e-06, "loss": 0.5826, "step": 96 }, { "epoch": 0.08074361820199778, "grad_norm": 0.42625486850738525, "learning_rate": 2.686980609418283e-06, "loss": 0.5984, "step": 97 }, { "epoch": 0.08157602663706992, "grad_norm": 0.41016989946365356, "learning_rate": 2.714681440443214e-06, "loss": 0.5642, "step": 98 }, { "epoch": 0.08240843507214206, "grad_norm": 0.48712158203125, "learning_rate": 2.742382271468144e-06, "loss": 0.5968, "step": 99 }, { "epoch": 0.08324084350721421, "grad_norm": 0.4600513279438019, "learning_rate": 2.770083102493075e-06, "loss": 0.5559, "step": 100 }, { "epoch": 0.08407325194228635, "grad_norm": 0.4251794219017029, "learning_rate": 2.7977839335180057e-06, "loss": 0.5698, "step": 101 }, { "epoch": 0.08490566037735849, "grad_norm": 0.4187605082988739, "learning_rate": 2.8254847645429368e-06, "loss": 0.5463, "step": 102 }, { "epoch": 0.08573806881243064, "grad_norm": 0.45978236198425293, "learning_rate": 2.8531855955678674e-06, "loss": 0.5605, "step": 103 }, { "epoch": 0.08657047724750278, "grad_norm": 0.4345948398113251, "learning_rate": 2.8808864265927985e-06, "loss": 0.548, "step": 104 }, { "epoch": 0.08740288568257491, "grad_norm": 0.4089731276035309, "learning_rate": 2.9085872576177287e-06, "loss": 0.5333, "step": 105 }, { "epoch": 0.08823529411764706, "grad_norm": 0.4466627240180969, "learning_rate": 2.9362880886426597e-06, "loss": 0.5653, "step": 106 }, { "epoch": 0.0890677025527192, "grad_norm": 0.4312840700149536, "learning_rate": 2.9639889196675903e-06, "loss": 0.544, "step": 107 }, { "epoch": 0.08990011098779134, "grad_norm": 0.3958422541618347, "learning_rate": 2.991689750692521e-06, "loss": 0.5314, "step": 108 }, { "epoch": 0.09073251942286349, "grad_norm": 0.4052140712738037, "learning_rate": 3.0193905817174516e-06, "loss": 0.5724, "step": 109 }, { "epoch": 0.09156492785793563, "grad_norm": 0.4097498059272766, "learning_rate": 3.0470914127423822e-06, "loss": 0.5559, "step": 110 }, { "epoch": 0.09239733629300777, "grad_norm": 0.43090808391571045, "learning_rate": 3.0747922437673133e-06, "loss": 0.5456, "step": 111 }, { "epoch": 0.0932297447280799, "grad_norm": 0.43639659881591797, "learning_rate": 3.102493074792244e-06, "loss": 0.5572, "step": 112 }, { "epoch": 0.09406215316315206, "grad_norm": 0.39143499732017517, "learning_rate": 3.130193905817175e-06, "loss": 0.5627, "step": 113 }, { "epoch": 0.0948945615982242, "grad_norm": 0.4211403727531433, "learning_rate": 3.157894736842105e-06, "loss": 0.5701, "step": 114 }, { "epoch": 0.09572697003329633, "grad_norm": 0.42813047766685486, "learning_rate": 3.1855955678670362e-06, "loss": 0.5594, "step": 115 }, { "epoch": 0.09655937846836848, "grad_norm": 0.4047672152519226, "learning_rate": 3.213296398891967e-06, "loss": 0.5467, "step": 116 }, { "epoch": 0.09739178690344062, "grad_norm": 0.4235449731349945, "learning_rate": 3.240997229916898e-06, "loss": 0.5596, "step": 117 }, { "epoch": 0.09822419533851276, "grad_norm": 0.5001515746116638, "learning_rate": 3.2686980609418285e-06, "loss": 0.55, "step": 118 }, { "epoch": 0.09905660377358491, "grad_norm": 0.4500423073768616, "learning_rate": 3.2963988919667596e-06, "loss": 0.5578, "step": 119 }, { "epoch": 0.09988901220865705, "grad_norm": 0.43707484006881714, "learning_rate": 3.3240997229916898e-06, "loss": 0.544, "step": 120 }, { "epoch": 0.10072142064372919, "grad_norm": 0.4369399845600128, "learning_rate": 3.351800554016621e-06, "loss": 0.5551, "step": 121 }, { "epoch": 0.10155382907880134, "grad_norm": 0.4763941764831543, "learning_rate": 3.3795013850415515e-06, "loss": 0.5628, "step": 122 }, { "epoch": 0.10238623751387348, "grad_norm": 0.4882802963256836, "learning_rate": 3.4072022160664825e-06, "loss": 0.5585, "step": 123 }, { "epoch": 0.10321864594894561, "grad_norm": 0.441522479057312, "learning_rate": 3.434903047091413e-06, "loss": 0.5494, "step": 124 }, { "epoch": 0.10405105438401775, "grad_norm": 0.4602923095226288, "learning_rate": 3.462603878116344e-06, "loss": 0.5152, "step": 125 }, { "epoch": 0.1048834628190899, "grad_norm": 0.5620714426040649, "learning_rate": 3.4903047091412744e-06, "loss": 0.5424, "step": 126 }, { "epoch": 0.10571587125416204, "grad_norm": 0.5059617161750793, "learning_rate": 3.5180055401662054e-06, "loss": 0.5758, "step": 127 }, { "epoch": 0.10654827968923418, "grad_norm": 0.44737622141838074, "learning_rate": 3.545706371191136e-06, "loss": 0.53, "step": 128 }, { "epoch": 0.10738068812430633, "grad_norm": 0.5812498331069946, "learning_rate": 3.5734072022160667e-06, "loss": 0.5437, "step": 129 }, { "epoch": 0.10821309655937847, "grad_norm": 0.4357758164405823, "learning_rate": 3.6011080332409978e-06, "loss": 0.5482, "step": 130 }, { "epoch": 0.1090455049944506, "grad_norm": 0.46321603655815125, "learning_rate": 3.628808864265928e-06, "loss": 0.5671, "step": 131 }, { "epoch": 0.10987791342952276, "grad_norm": 0.4342377185821533, "learning_rate": 3.656509695290859e-06, "loss": 0.5532, "step": 132 }, { "epoch": 0.1107103218645949, "grad_norm": 0.4561794102191925, "learning_rate": 3.6842105263157896e-06, "loss": 0.5395, "step": 133 }, { "epoch": 0.11154273029966703, "grad_norm": 0.40543898940086365, "learning_rate": 3.7119113573407207e-06, "loss": 0.5023, "step": 134 }, { "epoch": 0.11237513873473919, "grad_norm": 0.4146581292152405, "learning_rate": 3.739612188365651e-06, "loss": 0.5455, "step": 135 }, { "epoch": 0.11320754716981132, "grad_norm": 0.3970547914505005, "learning_rate": 3.767313019390582e-06, "loss": 0.5341, "step": 136 }, { "epoch": 0.11403995560488346, "grad_norm": 0.45214366912841797, "learning_rate": 3.7950138504155126e-06, "loss": 0.5509, "step": 137 }, { "epoch": 0.1148723640399556, "grad_norm": 0.4272449314594269, "learning_rate": 3.822714681440444e-06, "loss": 0.503, "step": 138 }, { "epoch": 0.11570477247502775, "grad_norm": 0.4434300661087036, "learning_rate": 3.850415512465374e-06, "loss": 0.5212, "step": 139 }, { "epoch": 0.11653718091009989, "grad_norm": 0.4421811103820801, "learning_rate": 3.878116343490305e-06, "loss": 0.5389, "step": 140 }, { "epoch": 0.11736958934517203, "grad_norm": 0.42131364345550537, "learning_rate": 3.9058171745152355e-06, "loss": 0.5443, "step": 141 }, { "epoch": 0.11820199778024418, "grad_norm": 0.47368770837783813, "learning_rate": 3.933518005540167e-06, "loss": 0.5534, "step": 142 }, { "epoch": 0.11903440621531632, "grad_norm": 0.4222336709499359, "learning_rate": 3.961218836565098e-06, "loss": 0.5484, "step": 143 }, { "epoch": 0.11986681465038845, "grad_norm": 0.4217516779899597, "learning_rate": 3.988919667590028e-06, "loss": 0.5359, "step": 144 }, { "epoch": 0.1206992230854606, "grad_norm": 0.46646854281425476, "learning_rate": 4.016620498614959e-06, "loss": 0.5608, "step": 145 }, { "epoch": 0.12153163152053274, "grad_norm": 0.4836723506450653, "learning_rate": 4.0443213296398895e-06, "loss": 0.5471, "step": 146 }, { "epoch": 0.12236403995560488, "grad_norm": 0.42951828241348267, "learning_rate": 4.07202216066482e-06, "loss": 0.5387, "step": 147 }, { "epoch": 0.12319644839067703, "grad_norm": 0.4030103385448456, "learning_rate": 4.099722991689752e-06, "loss": 0.5426, "step": 148 }, { "epoch": 0.12402885682574917, "grad_norm": 0.4542948305606842, "learning_rate": 4.127423822714681e-06, "loss": 0.5331, "step": 149 }, { "epoch": 0.12486126526082131, "grad_norm": 0.481719046831131, "learning_rate": 4.155124653739612e-06, "loss": 0.5384, "step": 150 }, { "epoch": 0.12569367369589346, "grad_norm": 0.4303570091724396, "learning_rate": 4.1828254847645435e-06, "loss": 0.5398, "step": 151 }, { "epoch": 0.12652608213096558, "grad_norm": 0.4159233272075653, "learning_rate": 4.210526315789474e-06, "loss": 0.5373, "step": 152 }, { "epoch": 0.12735849056603774, "grad_norm": 0.37868037819862366, "learning_rate": 4.238227146814405e-06, "loss": 0.5276, "step": 153 }, { "epoch": 0.1281908990011099, "grad_norm": 0.4570467472076416, "learning_rate": 4.265927977839335e-06, "loss": 0.5514, "step": 154 }, { "epoch": 0.129023307436182, "grad_norm": 0.39249682426452637, "learning_rate": 4.293628808864266e-06, "loss": 0.5566, "step": 155 }, { "epoch": 0.12985571587125416, "grad_norm": 0.37955617904663086, "learning_rate": 4.321329639889197e-06, "loss": 0.5411, "step": 156 }, { "epoch": 0.13068812430632631, "grad_norm": 0.42655855417251587, "learning_rate": 4.349030470914128e-06, "loss": 0.538, "step": 157 }, { "epoch": 0.13152053274139844, "grad_norm": 0.41612133383750916, "learning_rate": 4.376731301939059e-06, "loss": 0.5628, "step": 158 }, { "epoch": 0.1323529411764706, "grad_norm": 0.4542618989944458, "learning_rate": 4.404432132963989e-06, "loss": 0.5541, "step": 159 }, { "epoch": 0.13318534961154274, "grad_norm": 0.4357217252254486, "learning_rate": 4.43213296398892e-06, "loss": 0.536, "step": 160 }, { "epoch": 0.13401775804661487, "grad_norm": 0.4199989140033722, "learning_rate": 4.459833795013851e-06, "loss": 0.5207, "step": 161 }, { "epoch": 0.13485016648168702, "grad_norm": 0.43055838346481323, "learning_rate": 4.487534626038781e-06, "loss": 0.5649, "step": 162 }, { "epoch": 0.13568257491675917, "grad_norm": 0.4102376103401184, "learning_rate": 4.515235457063713e-06, "loss": 0.5379, "step": 163 }, { "epoch": 0.1365149833518313, "grad_norm": 0.40180259943008423, "learning_rate": 4.542936288088643e-06, "loss": 0.5461, "step": 164 }, { "epoch": 0.13734739178690344, "grad_norm": 0.4794843792915344, "learning_rate": 4.570637119113574e-06, "loss": 0.5279, "step": 165 }, { "epoch": 0.1381798002219756, "grad_norm": 0.5056953430175781, "learning_rate": 4.598337950138505e-06, "loss": 0.5348, "step": 166 }, { "epoch": 0.13901220865704772, "grad_norm": 0.45556968450546265, "learning_rate": 4.626038781163435e-06, "loss": 0.4995, "step": 167 }, { "epoch": 0.13984461709211987, "grad_norm": 0.40867018699645996, "learning_rate": 4.653739612188366e-06, "loss": 0.5301, "step": 168 }, { "epoch": 0.140677025527192, "grad_norm": 0.44523248076438904, "learning_rate": 4.681440443213297e-06, "loss": 0.5097, "step": 169 }, { "epoch": 0.14150943396226415, "grad_norm": 0.44204166531562805, "learning_rate": 4.709141274238227e-06, "loss": 0.5348, "step": 170 }, { "epoch": 0.1423418423973363, "grad_norm": 0.39262545108795166, "learning_rate": 4.736842105263158e-06, "loss": 0.5127, "step": 171 }, { "epoch": 0.14317425083240842, "grad_norm": 0.4378338158130646, "learning_rate": 4.764542936288089e-06, "loss": 0.498, "step": 172 }, { "epoch": 0.14400665926748057, "grad_norm": 0.4008118510246277, "learning_rate": 4.79224376731302e-06, "loss": 0.5389, "step": 173 }, { "epoch": 0.14483906770255273, "grad_norm": 0.41733667254447937, "learning_rate": 4.8199445983379505e-06, "loss": 0.5216, "step": 174 }, { "epoch": 0.14567147613762485, "grad_norm": 0.4296148121356964, "learning_rate": 4.847645429362881e-06, "loss": 0.5328, "step": 175 }, { "epoch": 0.146503884572697, "grad_norm": 0.4301013648509979, "learning_rate": 4.875346260387812e-06, "loss": 0.5292, "step": 176 }, { "epoch": 0.14733629300776915, "grad_norm": 0.46143457293510437, "learning_rate": 4.903047091412742e-06, "loss": 0.5336, "step": 177 }, { "epoch": 0.14816870144284128, "grad_norm": 0.4127751886844635, "learning_rate": 4.930747922437674e-06, "loss": 0.5322, "step": 178 }, { "epoch": 0.14900110987791343, "grad_norm": 0.4373955726623535, "learning_rate": 4.9584487534626045e-06, "loss": 0.5486, "step": 179 }, { "epoch": 0.14983351831298558, "grad_norm": 0.4892722964286804, "learning_rate": 4.986149584487535e-06, "loss": 0.5416, "step": 180 }, { "epoch": 0.1506659267480577, "grad_norm": 0.40266159176826477, "learning_rate": 5.013850415512466e-06, "loss": 0.5149, "step": 181 }, { "epoch": 0.15149833518312986, "grad_norm": 0.4328942894935608, "learning_rate": 5.041551246537396e-06, "loss": 0.5364, "step": 182 }, { "epoch": 0.152330743618202, "grad_norm": 0.46720901131629944, "learning_rate": 5.069252077562328e-06, "loss": 0.5481, "step": 183 }, { "epoch": 0.15316315205327413, "grad_norm": 0.5006843209266663, "learning_rate": 5.0969529085872585e-06, "loss": 0.5599, "step": 184 }, { "epoch": 0.15399556048834628, "grad_norm": 0.4212501347064972, "learning_rate": 5.124653739612189e-06, "loss": 0.5316, "step": 185 }, { "epoch": 0.15482796892341844, "grad_norm": 0.4488828778266907, "learning_rate": 5.152354570637119e-06, "loss": 0.5163, "step": 186 }, { "epoch": 0.15566037735849056, "grad_norm": 0.40712884068489075, "learning_rate": 5.180055401662051e-06, "loss": 0.529, "step": 187 }, { "epoch": 0.1564927857935627, "grad_norm": 0.4387264847755432, "learning_rate": 5.207756232686981e-06, "loss": 0.5097, "step": 188 }, { "epoch": 0.15732519422863486, "grad_norm": 0.4043707549571991, "learning_rate": 5.235457063711912e-06, "loss": 0.4989, "step": 189 }, { "epoch": 0.158157602663707, "grad_norm": 0.4202433228492737, "learning_rate": 5.263157894736842e-06, "loss": 0.5577, "step": 190 }, { "epoch": 0.15899001109877914, "grad_norm": 0.4169784188270569, "learning_rate": 5.290858725761774e-06, "loss": 0.5331, "step": 191 }, { "epoch": 0.1598224195338513, "grad_norm": 0.42586076259613037, "learning_rate": 5.318559556786704e-06, "loss": 0.5323, "step": 192 }, { "epoch": 0.1606548279689234, "grad_norm": 0.4811337888240814, "learning_rate": 5.346260387811635e-06, "loss": 0.5305, "step": 193 }, { "epoch": 0.16148723640399557, "grad_norm": 0.4163426458835602, "learning_rate": 5.373961218836566e-06, "loss": 0.5038, "step": 194 }, { "epoch": 0.1623196448390677, "grad_norm": 0.4817521572113037, "learning_rate": 5.401662049861495e-06, "loss": 0.5423, "step": 195 }, { "epoch": 0.16315205327413984, "grad_norm": 0.49575290083885193, "learning_rate": 5.429362880886428e-06, "loss": 0.5233, "step": 196 }, { "epoch": 0.163984461709212, "grad_norm": 0.4486011266708374, "learning_rate": 5.4570637119113575e-06, "loss": 0.5004, "step": 197 }, { "epoch": 0.16481687014428412, "grad_norm": 0.4150060713291168, "learning_rate": 5.484764542936288e-06, "loss": 0.52, "step": 198 }, { "epoch": 0.16564927857935627, "grad_norm": 0.4181516468524933, "learning_rate": 5.512465373961219e-06, "loss": 0.498, "step": 199 }, { "epoch": 0.16648168701442842, "grad_norm": 0.5380068421363831, "learning_rate": 5.54016620498615e-06, "loss": 0.5458, "step": 200 }, { "epoch": 0.16731409544950054, "grad_norm": 0.4043119549751282, "learning_rate": 5.567867036011081e-06, "loss": 0.5297, "step": 201 }, { "epoch": 0.1681465038845727, "grad_norm": 0.481564462184906, "learning_rate": 5.5955678670360115e-06, "loss": 0.5275, "step": 202 }, { "epoch": 0.16897891231964485, "grad_norm": 0.4668130874633789, "learning_rate": 5.623268698060942e-06, "loss": 0.5195, "step": 203 }, { "epoch": 0.16981132075471697, "grad_norm": 0.4354064166545868, "learning_rate": 5.6509695290858736e-06, "loss": 0.5147, "step": 204 }, { "epoch": 0.17064372918978912, "grad_norm": 0.45728379487991333, "learning_rate": 5.678670360110804e-06, "loss": 0.5487, "step": 205 }, { "epoch": 0.17147613762486127, "grad_norm": 0.4586714804172516, "learning_rate": 5.706371191135735e-06, "loss": 0.5398, "step": 206 }, { "epoch": 0.1723085460599334, "grad_norm": 0.4355411231517792, "learning_rate": 5.734072022160665e-06, "loss": 0.5143, "step": 207 }, { "epoch": 0.17314095449500555, "grad_norm": 0.4318200647830963, "learning_rate": 5.761772853185597e-06, "loss": 0.5153, "step": 208 }, { "epoch": 0.1739733629300777, "grad_norm": 0.45262500643730164, "learning_rate": 5.789473684210527e-06, "loss": 0.5367, "step": 209 }, { "epoch": 0.17480577136514983, "grad_norm": 0.41200655698776245, "learning_rate": 5.817174515235457e-06, "loss": 0.4899, "step": 210 }, { "epoch": 0.17563817980022198, "grad_norm": 0.4550832211971283, "learning_rate": 5.844875346260388e-06, "loss": 0.5312, "step": 211 }, { "epoch": 0.17647058823529413, "grad_norm": 0.48159360885620117, "learning_rate": 5.8725761772853194e-06, "loss": 0.5105, "step": 212 }, { "epoch": 0.17730299667036625, "grad_norm": 0.4670851230621338, "learning_rate": 5.90027700831025e-06, "loss": 0.5193, "step": 213 }, { "epoch": 0.1781354051054384, "grad_norm": 0.4833730161190033, "learning_rate": 5.927977839335181e-06, "loss": 0.5374, "step": 214 }, { "epoch": 0.17896781354051056, "grad_norm": 0.41837170720100403, "learning_rate": 5.955678670360111e-06, "loss": 0.549, "step": 215 }, { "epoch": 0.17980022197558268, "grad_norm": 0.45265597105026245, "learning_rate": 5.983379501385042e-06, "loss": 0.5009, "step": 216 }, { "epoch": 0.18063263041065483, "grad_norm": 0.4560681879520416, "learning_rate": 6.011080332409973e-06, "loss": 0.5201, "step": 217 }, { "epoch": 0.18146503884572698, "grad_norm": 0.41413623094558716, "learning_rate": 6.038781163434903e-06, "loss": 0.5098, "step": 218 }, { "epoch": 0.1822974472807991, "grad_norm": 0.45979252457618713, "learning_rate": 6.066481994459834e-06, "loss": 0.5127, "step": 219 }, { "epoch": 0.18312985571587126, "grad_norm": 0.42055100202560425, "learning_rate": 6.0941828254847645e-06, "loss": 0.4953, "step": 220 }, { "epoch": 0.18396226415094338, "grad_norm": 0.4109812378883362, "learning_rate": 6.121883656509696e-06, "loss": 0.5282, "step": 221 }, { "epoch": 0.18479467258601553, "grad_norm": 0.405984103679657, "learning_rate": 6.1495844875346266e-06, "loss": 0.5089, "step": 222 }, { "epoch": 0.1856270810210877, "grad_norm": 0.388094037771225, "learning_rate": 6.177285318559557e-06, "loss": 0.5212, "step": 223 }, { "epoch": 0.1864594894561598, "grad_norm": 0.4275857210159302, "learning_rate": 6.204986149584488e-06, "loss": 0.5238, "step": 224 }, { "epoch": 0.18729189789123196, "grad_norm": 0.4158404469490051, "learning_rate": 6.232686980609419e-06, "loss": 0.5246, "step": 225 }, { "epoch": 0.1881243063263041, "grad_norm": 0.44389262795448303, "learning_rate": 6.26038781163435e-06, "loss": 0.5089, "step": 226 }, { "epoch": 0.18895671476137624, "grad_norm": 0.4087418019771576, "learning_rate": 6.2880886426592805e-06, "loss": 0.5089, "step": 227 }, { "epoch": 0.1897891231964484, "grad_norm": 0.5286365747451782, "learning_rate": 6.31578947368421e-06, "loss": 0.5551, "step": 228 }, { "epoch": 0.19062153163152054, "grad_norm": 0.4365133047103882, "learning_rate": 6.343490304709143e-06, "loss": 0.511, "step": 229 }, { "epoch": 0.19145394006659266, "grad_norm": 0.44955843687057495, "learning_rate": 6.3711911357340724e-06, "loss": 0.4946, "step": 230 }, { "epoch": 0.19228634850166482, "grad_norm": 0.41679856181144714, "learning_rate": 6.398891966759003e-06, "loss": 0.5041, "step": 231 }, { "epoch": 0.19311875693673697, "grad_norm": 0.4691244959831238, "learning_rate": 6.426592797783934e-06, "loss": 0.4901, "step": 232 }, { "epoch": 0.1939511653718091, "grad_norm": 0.49247369170188904, "learning_rate": 6.454293628808865e-06, "loss": 0.5176, "step": 233 }, { "epoch": 0.19478357380688124, "grad_norm": 0.4147561192512512, "learning_rate": 6.481994459833796e-06, "loss": 0.4995, "step": 234 }, { "epoch": 0.1956159822419534, "grad_norm": 0.4889633059501648, "learning_rate": 6.509695290858726e-06, "loss": 0.5404, "step": 235 }, { "epoch": 0.19644839067702552, "grad_norm": 0.5010858774185181, "learning_rate": 6.537396121883657e-06, "loss": 0.5358, "step": 236 }, { "epoch": 0.19728079911209767, "grad_norm": 0.38324692845344543, "learning_rate": 6.565096952908588e-06, "loss": 0.4914, "step": 237 }, { "epoch": 0.19811320754716982, "grad_norm": 0.489378958940506, "learning_rate": 6.592797783933519e-06, "loss": 0.4954, "step": 238 }, { "epoch": 0.19894561598224195, "grad_norm": 0.4189784526824951, "learning_rate": 6.62049861495845e-06, "loss": 0.5062, "step": 239 }, { "epoch": 0.1997780244173141, "grad_norm": 0.42447060346603394, "learning_rate": 6.6481994459833796e-06, "loss": 0.5128, "step": 240 }, { "epoch": 0.20061043285238625, "grad_norm": 0.4346916079521179, "learning_rate": 6.67590027700831e-06, "loss": 0.5257, "step": 241 }, { "epoch": 0.20144284128745837, "grad_norm": 0.4251374304294586, "learning_rate": 6.703601108033242e-06, "loss": 0.5267, "step": 242 }, { "epoch": 0.20227524972253053, "grad_norm": 0.40433380007743835, "learning_rate": 6.731301939058172e-06, "loss": 0.4952, "step": 243 }, { "epoch": 0.20310765815760268, "grad_norm": 0.44367408752441406, "learning_rate": 6.759002770083103e-06, "loss": 0.514, "step": 244 }, { "epoch": 0.2039400665926748, "grad_norm": 0.45639634132385254, "learning_rate": 6.7867036011080335e-06, "loss": 0.5206, "step": 245 }, { "epoch": 0.20477247502774695, "grad_norm": 0.4419868290424347, "learning_rate": 6.814404432132965e-06, "loss": 0.5412, "step": 246 }, { "epoch": 0.20560488346281908, "grad_norm": 0.4839082658290863, "learning_rate": 6.842105263157896e-06, "loss": 0.5237, "step": 247 }, { "epoch": 0.20643729189789123, "grad_norm": 0.42248091101646423, "learning_rate": 6.869806094182826e-06, "loss": 0.5242, "step": 248 }, { "epoch": 0.20726970033296338, "grad_norm": 0.46019700169563293, "learning_rate": 6.897506925207756e-06, "loss": 0.4951, "step": 249 }, { "epoch": 0.2081021087680355, "grad_norm": 0.5231823325157166, "learning_rate": 6.925207756232688e-06, "loss": 0.5459, "step": 250 }, { "epoch": 0.20893451720310766, "grad_norm": 0.45339393615722656, "learning_rate": 6.952908587257618e-06, "loss": 0.5099, "step": 251 }, { "epoch": 0.2097669256381798, "grad_norm": 0.4783773124217987, "learning_rate": 6.980609418282549e-06, "loss": 0.4946, "step": 252 }, { "epoch": 0.21059933407325193, "grad_norm": 0.46553727984428406, "learning_rate": 7.008310249307479e-06, "loss": 0.482, "step": 253 }, { "epoch": 0.21143174250832408, "grad_norm": 0.4713238775730133, "learning_rate": 7.036011080332411e-06, "loss": 0.4964, "step": 254 }, { "epoch": 0.21226415094339623, "grad_norm": 0.48873159289360046, "learning_rate": 7.0637119113573415e-06, "loss": 0.5337, "step": 255 }, { "epoch": 0.21309655937846836, "grad_norm": 0.45275193452835083, "learning_rate": 7.091412742382272e-06, "loss": 0.5168, "step": 256 }, { "epoch": 0.2139289678135405, "grad_norm": 0.45386070013046265, "learning_rate": 7.119113573407203e-06, "loss": 0.5093, "step": 257 }, { "epoch": 0.21476137624861266, "grad_norm": 0.46968305110931396, "learning_rate": 7.146814404432133e-06, "loss": 0.5128, "step": 258 }, { "epoch": 0.21559378468368479, "grad_norm": 0.4741690754890442, "learning_rate": 7.174515235457065e-06, "loss": 0.5301, "step": 259 }, { "epoch": 0.21642619311875694, "grad_norm": 0.48019328713417053, "learning_rate": 7.2022160664819955e-06, "loss": 0.4837, "step": 260 }, { "epoch": 0.2172586015538291, "grad_norm": 0.424640029668808, "learning_rate": 7.229916897506925e-06, "loss": 0.4917, "step": 261 }, { "epoch": 0.2180910099889012, "grad_norm": 0.5357044339179993, "learning_rate": 7.257617728531856e-06, "loss": 0.5159, "step": 262 }, { "epoch": 0.21892341842397336, "grad_norm": 0.46231383085250854, "learning_rate": 7.285318559556787e-06, "loss": 0.5235, "step": 263 }, { "epoch": 0.21975582685904552, "grad_norm": 0.4579526484012604, "learning_rate": 7.313019390581718e-06, "loss": 0.4905, "step": 264 }, { "epoch": 0.22058823529411764, "grad_norm": 0.459525465965271, "learning_rate": 7.340720221606649e-06, "loss": 0.525, "step": 265 }, { "epoch": 0.2214206437291898, "grad_norm": 0.5286341309547424, "learning_rate": 7.368421052631579e-06, "loss": 0.5093, "step": 266 }, { "epoch": 0.22225305216426194, "grad_norm": 0.4147150218486786, "learning_rate": 7.396121883656511e-06, "loss": 0.4996, "step": 267 }, { "epoch": 0.22308546059933407, "grad_norm": 0.4209958612918854, "learning_rate": 7.423822714681441e-06, "loss": 0.5076, "step": 268 }, { "epoch": 0.22391786903440622, "grad_norm": 0.41367340087890625, "learning_rate": 7.451523545706372e-06, "loss": 0.5261, "step": 269 }, { "epoch": 0.22475027746947837, "grad_norm": 0.4930833578109741, "learning_rate": 7.479224376731302e-06, "loss": 0.4987, "step": 270 }, { "epoch": 0.2255826859045505, "grad_norm": 0.45146897435188293, "learning_rate": 7.506925207756234e-06, "loss": 0.4857, "step": 271 }, { "epoch": 0.22641509433962265, "grad_norm": 0.4591994881629944, "learning_rate": 7.534626038781164e-06, "loss": 0.5007, "step": 272 }, { "epoch": 0.22724750277469477, "grad_norm": 0.5006039142608643, "learning_rate": 7.5623268698060945e-06, "loss": 0.5174, "step": 273 }, { "epoch": 0.22807991120976692, "grad_norm": 0.4668283462524414, "learning_rate": 7.590027700831025e-06, "loss": 0.5019, "step": 274 }, { "epoch": 0.22891231964483907, "grad_norm": 0.5253505110740662, "learning_rate": 7.617728531855957e-06, "loss": 0.4974, "step": 275 }, { "epoch": 0.2297447280799112, "grad_norm": 0.5322619676589966, "learning_rate": 7.645429362880887e-06, "loss": 0.5147, "step": 276 }, { "epoch": 0.23057713651498335, "grad_norm": 0.44390517473220825, "learning_rate": 7.673130193905818e-06, "loss": 0.5246, "step": 277 }, { "epoch": 0.2314095449500555, "grad_norm": 0.5119628310203552, "learning_rate": 7.700831024930749e-06, "loss": 0.4879, "step": 278 }, { "epoch": 0.23224195338512763, "grad_norm": 0.4466327726840973, "learning_rate": 7.728531855955679e-06, "loss": 0.4999, "step": 279 }, { "epoch": 0.23307436182019978, "grad_norm": 0.4270954728126526, "learning_rate": 7.75623268698061e-06, "loss": 0.4955, "step": 280 }, { "epoch": 0.23390677025527193, "grad_norm": 0.5279788970947266, "learning_rate": 7.78393351800554e-06, "loss": 0.5138, "step": 281 }, { "epoch": 0.23473917869034405, "grad_norm": 0.4631377160549164, "learning_rate": 7.811634349030471e-06, "loss": 0.4915, "step": 282 }, { "epoch": 0.2355715871254162, "grad_norm": 0.509636402130127, "learning_rate": 7.839335180055402e-06, "loss": 0.5205, "step": 283 }, { "epoch": 0.23640399556048836, "grad_norm": 0.42661571502685547, "learning_rate": 7.867036011080334e-06, "loss": 0.4931, "step": 284 }, { "epoch": 0.23723640399556048, "grad_norm": 0.4809859097003937, "learning_rate": 7.894736842105265e-06, "loss": 0.514, "step": 285 }, { "epoch": 0.23806881243063263, "grad_norm": 0.4813212752342224, "learning_rate": 7.922437673130195e-06, "loss": 0.5209, "step": 286 }, { "epoch": 0.23890122086570478, "grad_norm": 0.4047999083995819, "learning_rate": 7.950138504155124e-06, "loss": 0.4689, "step": 287 }, { "epoch": 0.2397336293007769, "grad_norm": 0.48839592933654785, "learning_rate": 7.977839335180056e-06, "loss": 0.495, "step": 288 }, { "epoch": 0.24056603773584906, "grad_norm": 0.4675354063510895, "learning_rate": 8.005540166204987e-06, "loss": 0.5214, "step": 289 }, { "epoch": 0.2413984461709212, "grad_norm": 0.5134713053703308, "learning_rate": 8.033240997229918e-06, "loss": 0.4755, "step": 290 }, { "epoch": 0.24223085460599333, "grad_norm": 0.46296653151512146, "learning_rate": 8.060941828254848e-06, "loss": 0.5137, "step": 291 }, { "epoch": 0.24306326304106549, "grad_norm": 0.43857628107070923, "learning_rate": 8.088642659279779e-06, "loss": 0.5045, "step": 292 }, { "epoch": 0.24389567147613764, "grad_norm": 0.4630931615829468, "learning_rate": 8.11634349030471e-06, "loss": 0.5113, "step": 293 }, { "epoch": 0.24472807991120976, "grad_norm": 0.4841103255748749, "learning_rate": 8.14404432132964e-06, "loss": 0.519, "step": 294 }, { "epoch": 0.2455604883462819, "grad_norm": 0.4541115462779999, "learning_rate": 8.171745152354571e-06, "loss": 0.5125, "step": 295 }, { "epoch": 0.24639289678135406, "grad_norm": 0.4210924208164215, "learning_rate": 8.199445983379503e-06, "loss": 0.5035, "step": 296 }, { "epoch": 0.2472253052164262, "grad_norm": 0.41758114099502563, "learning_rate": 8.227146814404434e-06, "loss": 0.4851, "step": 297 }, { "epoch": 0.24805771365149834, "grad_norm": 0.5241228938102722, "learning_rate": 8.254847645429363e-06, "loss": 0.5046, "step": 298 }, { "epoch": 0.24889012208657046, "grad_norm": 0.5144554972648621, "learning_rate": 8.282548476454293e-06, "loss": 0.54, "step": 299 }, { "epoch": 0.24972253052164262, "grad_norm": 0.5133737921714783, "learning_rate": 8.310249307479224e-06, "loss": 0.499, "step": 300 }, { "epoch": 0.25055493895671477, "grad_norm": 0.5024670362472534, "learning_rate": 8.337950138504156e-06, "loss": 0.5042, "step": 301 }, { "epoch": 0.2513873473917869, "grad_norm": 0.5267788767814636, "learning_rate": 8.365650969529087e-06, "loss": 0.5023, "step": 302 }, { "epoch": 0.25221975582685907, "grad_norm": 0.43696895241737366, "learning_rate": 8.393351800554018e-06, "loss": 0.4858, "step": 303 }, { "epoch": 0.25305216426193117, "grad_norm": 0.5760444402694702, "learning_rate": 8.421052631578948e-06, "loss": 0.5106, "step": 304 }, { "epoch": 0.2538845726970033, "grad_norm": 0.4616737961769104, "learning_rate": 8.448753462603879e-06, "loss": 0.4843, "step": 305 }, { "epoch": 0.25471698113207547, "grad_norm": 0.5310185551643372, "learning_rate": 8.47645429362881e-06, "loss": 0.5263, "step": 306 }, { "epoch": 0.2555493895671476, "grad_norm": 0.4836473762989044, "learning_rate": 8.50415512465374e-06, "loss": 0.4889, "step": 307 }, { "epoch": 0.2563817980022198, "grad_norm": 0.5024266839027405, "learning_rate": 8.53185595567867e-06, "loss": 0.491, "step": 308 }, { "epoch": 0.25721420643729187, "grad_norm": 0.47080838680267334, "learning_rate": 8.559556786703603e-06, "loss": 0.467, "step": 309 }, { "epoch": 0.258046614872364, "grad_norm": 0.44176098704338074, "learning_rate": 8.587257617728532e-06, "loss": 0.4699, "step": 310 }, { "epoch": 0.2588790233074362, "grad_norm": 0.43928641080856323, "learning_rate": 8.614958448753463e-06, "loss": 0.5042, "step": 311 }, { "epoch": 0.2597114317425083, "grad_norm": 0.4728699326515198, "learning_rate": 8.642659279778393e-06, "loss": 0.4923, "step": 312 }, { "epoch": 0.2605438401775805, "grad_norm": 0.45453718304634094, "learning_rate": 8.670360110803326e-06, "loss": 0.4983, "step": 313 }, { "epoch": 0.26137624861265263, "grad_norm": 0.46343910694122314, "learning_rate": 8.698060941828256e-06, "loss": 0.4985, "step": 314 }, { "epoch": 0.2622086570477247, "grad_norm": 0.5328369140625, "learning_rate": 8.725761772853187e-06, "loss": 0.5047, "step": 315 }, { "epoch": 0.2630410654827969, "grad_norm": 0.4438888728618622, "learning_rate": 8.753462603878117e-06, "loss": 0.5047, "step": 316 }, { "epoch": 0.26387347391786903, "grad_norm": 0.5838459730148315, "learning_rate": 8.781163434903048e-06, "loss": 0.5239, "step": 317 }, { "epoch": 0.2647058823529412, "grad_norm": 0.48084893822669983, "learning_rate": 8.808864265927979e-06, "loss": 0.5101, "step": 318 }, { "epoch": 0.26553829078801333, "grad_norm": 0.3967612385749817, "learning_rate": 8.83656509695291e-06, "loss": 0.474, "step": 319 }, { "epoch": 0.2663706992230855, "grad_norm": 0.49214819073677063, "learning_rate": 8.86426592797784e-06, "loss": 0.5064, "step": 320 }, { "epoch": 0.2672031076581576, "grad_norm": 0.46390125155448914, "learning_rate": 8.89196675900277e-06, "loss": 0.4726, "step": 321 }, { "epoch": 0.26803551609322973, "grad_norm": 0.43828120827674866, "learning_rate": 8.919667590027701e-06, "loss": 0.508, "step": 322 }, { "epoch": 0.2688679245283019, "grad_norm": 0.4651317596435547, "learning_rate": 8.947368421052632e-06, "loss": 0.4844, "step": 323 }, { "epoch": 0.26970033296337403, "grad_norm": 0.45498353242874146, "learning_rate": 8.975069252077562e-06, "loss": 0.4959, "step": 324 }, { "epoch": 0.2705327413984462, "grad_norm": 0.5337496995925903, "learning_rate": 9.002770083102493e-06, "loss": 0.4892, "step": 325 }, { "epoch": 0.27136514983351834, "grad_norm": 0.4983648657798767, "learning_rate": 9.030470914127425e-06, "loss": 0.5105, "step": 326 }, { "epoch": 0.27219755826859043, "grad_norm": 0.4203820824623108, "learning_rate": 9.058171745152356e-06, "loss": 0.4674, "step": 327 }, { "epoch": 0.2730299667036626, "grad_norm": 0.4564470052719116, "learning_rate": 9.085872576177287e-06, "loss": 0.4779, "step": 328 }, { "epoch": 0.27386237513873474, "grad_norm": 0.48127999901771545, "learning_rate": 9.113573407202216e-06, "loss": 0.504, "step": 329 }, { "epoch": 0.2746947835738069, "grad_norm": 0.41335824131965637, "learning_rate": 9.141274238227148e-06, "loss": 0.452, "step": 330 }, { "epoch": 0.27552719200887904, "grad_norm": 0.48663002252578735, "learning_rate": 9.168975069252079e-06, "loss": 0.497, "step": 331 }, { "epoch": 0.2763596004439512, "grad_norm": 0.469848096370697, "learning_rate": 9.19667590027701e-06, "loss": 0.4722, "step": 332 }, { "epoch": 0.2771920088790233, "grad_norm": 0.5372227430343628, "learning_rate": 9.22437673130194e-06, "loss": 0.5136, "step": 333 }, { "epoch": 0.27802441731409544, "grad_norm": 0.4874361753463745, "learning_rate": 9.25207756232687e-06, "loss": 0.508, "step": 334 }, { "epoch": 0.2788568257491676, "grad_norm": 0.42719605565071106, "learning_rate": 9.279778393351801e-06, "loss": 0.4939, "step": 335 }, { "epoch": 0.27968923418423974, "grad_norm": 0.4991985261440277, "learning_rate": 9.307479224376732e-06, "loss": 0.5072, "step": 336 }, { "epoch": 0.2805216426193119, "grad_norm": 0.5013337135314941, "learning_rate": 9.335180055401662e-06, "loss": 0.4705, "step": 337 }, { "epoch": 0.281354051054384, "grad_norm": 0.40710508823394775, "learning_rate": 9.362880886426595e-06, "loss": 0.4713, "step": 338 }, { "epoch": 0.28218645948945614, "grad_norm": 0.4505155682563782, "learning_rate": 9.390581717451525e-06, "loss": 0.4912, "step": 339 }, { "epoch": 0.2830188679245283, "grad_norm": 0.4827728569507599, "learning_rate": 9.418282548476454e-06, "loss": 0.5223, "step": 340 }, { "epoch": 0.28385127635960045, "grad_norm": 0.5375002026557922, "learning_rate": 9.445983379501385e-06, "loss": 0.5245, "step": 341 }, { "epoch": 0.2846836847946726, "grad_norm": 0.41850459575653076, "learning_rate": 9.473684210526315e-06, "loss": 0.5029, "step": 342 }, { "epoch": 0.28551609322974475, "grad_norm": 0.508848249912262, "learning_rate": 9.501385041551248e-06, "loss": 0.4864, "step": 343 }, { "epoch": 0.28634850166481685, "grad_norm": 0.5495002865791321, "learning_rate": 9.529085872576178e-06, "loss": 0.5147, "step": 344 }, { "epoch": 0.287180910099889, "grad_norm": 0.49234721064567566, "learning_rate": 9.556786703601109e-06, "loss": 0.5126, "step": 345 }, { "epoch": 0.28801331853496115, "grad_norm": 0.4997720718383789, "learning_rate": 9.58448753462604e-06, "loss": 0.5154, "step": 346 }, { "epoch": 0.2888457269700333, "grad_norm": 0.5731498599052429, "learning_rate": 9.61218836565097e-06, "loss": 0.5084, "step": 347 }, { "epoch": 0.28967813540510545, "grad_norm": 0.5639533400535583, "learning_rate": 9.639889196675901e-06, "loss": 0.458, "step": 348 }, { "epoch": 0.2905105438401776, "grad_norm": 0.5300630927085876, "learning_rate": 9.667590027700832e-06, "loss": 0.499, "step": 349 }, { "epoch": 0.2913429522752497, "grad_norm": 0.5944730639457703, "learning_rate": 9.695290858725762e-06, "loss": 0.5156, "step": 350 }, { "epoch": 0.29217536071032185, "grad_norm": 0.4813040494918823, "learning_rate": 9.722991689750695e-06, "loss": 0.492, "step": 351 }, { "epoch": 0.293007769145394, "grad_norm": 0.4312509000301361, "learning_rate": 9.750692520775623e-06, "loss": 0.4801, "step": 352 }, { "epoch": 0.29384017758046616, "grad_norm": 0.474255234003067, "learning_rate": 9.778393351800554e-06, "loss": 0.4755, "step": 353 }, { "epoch": 0.2946725860155383, "grad_norm": 0.48329421877861023, "learning_rate": 9.806094182825485e-06, "loss": 0.4974, "step": 354 }, { "epoch": 0.29550499445061046, "grad_norm": 0.47372984886169434, "learning_rate": 9.833795013850417e-06, "loss": 0.5232, "step": 355 }, { "epoch": 0.29633740288568255, "grad_norm": 0.4526323080062866, "learning_rate": 9.861495844875348e-06, "loss": 0.4792, "step": 356 }, { "epoch": 0.2971698113207547, "grad_norm": 0.5251845121383667, "learning_rate": 9.889196675900278e-06, "loss": 0.4866, "step": 357 }, { "epoch": 0.29800221975582686, "grad_norm": 0.4455892741680145, "learning_rate": 9.916897506925209e-06, "loss": 0.4904, "step": 358 }, { "epoch": 0.298834628190899, "grad_norm": 0.46031251549720764, "learning_rate": 9.94459833795014e-06, "loss": 0.4744, "step": 359 }, { "epoch": 0.29966703662597116, "grad_norm": 0.4524519443511963, "learning_rate": 9.97229916897507e-06, "loss": 0.4958, "step": 360 }, { "epoch": 0.30049944506104326, "grad_norm": 0.4535054862499237, "learning_rate": 1e-05, "loss": 0.4906, "step": 361 }, { "epoch": 0.3013318534961154, "grad_norm": 0.4776564836502075, "learning_rate": 9.999997652456228e-06, "loss": 0.5017, "step": 362 }, { "epoch": 0.30216426193118756, "grad_norm": 0.5730841159820557, "learning_rate": 9.999990609827113e-06, "loss": 0.4679, "step": 363 }, { "epoch": 0.3029966703662597, "grad_norm": 0.4576025903224945, "learning_rate": 9.999978872119267e-06, "loss": 0.4968, "step": 364 }, { "epoch": 0.30382907880133186, "grad_norm": 0.6305115818977356, "learning_rate": 9.999962439343715e-06, "loss": 0.4939, "step": 365 }, { "epoch": 0.304661487236404, "grad_norm": 0.4445456564426422, "learning_rate": 9.999941311515888e-06, "loss": 0.4895, "step": 366 }, { "epoch": 0.3054938956714761, "grad_norm": 0.535879909992218, "learning_rate": 9.999915488655623e-06, "loss": 0.4841, "step": 367 }, { "epoch": 0.30632630410654826, "grad_norm": 0.5554977059364319, "learning_rate": 9.999884970787168e-06, "loss": 0.5098, "step": 368 }, { "epoch": 0.3071587125416204, "grad_norm": 0.5069095492362976, "learning_rate": 9.999849757939182e-06, "loss": 0.4879, "step": 369 }, { "epoch": 0.30799112097669257, "grad_norm": 0.48924490809440613, "learning_rate": 9.99980985014473e-06, "loss": 0.489, "step": 370 }, { "epoch": 0.3088235294117647, "grad_norm": 0.5114119052886963, "learning_rate": 9.999765247441285e-06, "loss": 0.4927, "step": 371 }, { "epoch": 0.30965593784683687, "grad_norm": 0.6173185706138611, "learning_rate": 9.999715949870729e-06, "loss": 0.5053, "step": 372 }, { "epoch": 0.31048834628190897, "grad_norm": 0.4988461136817932, "learning_rate": 9.999661957479354e-06, "loss": 0.4579, "step": 373 }, { "epoch": 0.3113207547169811, "grad_norm": 0.5641622543334961, "learning_rate": 9.999603270317863e-06, "loss": 0.5126, "step": 374 }, { "epoch": 0.31215316315205327, "grad_norm": 0.5810229778289795, "learning_rate": 9.99953988844136e-06, "loss": 0.4952, "step": 375 }, { "epoch": 0.3129855715871254, "grad_norm": 0.6129553914070129, "learning_rate": 9.999471811909363e-06, "loss": 0.5012, "step": 376 }, { "epoch": 0.3138179800221976, "grad_norm": 0.5977503061294556, "learning_rate": 9.999399040785797e-06, "loss": 0.492, "step": 377 }, { "epoch": 0.3146503884572697, "grad_norm": 0.6007142663002014, "learning_rate": 9.999321575138997e-06, "loss": 0.501, "step": 378 }, { "epoch": 0.3154827968923418, "grad_norm": 0.6089385747909546, "learning_rate": 9.999239415041701e-06, "loss": 0.4894, "step": 379 }, { "epoch": 0.316315205327414, "grad_norm": 0.6288129091262817, "learning_rate": 9.999152560571064e-06, "loss": 0.4886, "step": 380 }, { "epoch": 0.3171476137624861, "grad_norm": 0.6314293146133423, "learning_rate": 9.99906101180864e-06, "loss": 0.5023, "step": 381 }, { "epoch": 0.3179800221975583, "grad_norm": 0.6294896006584167, "learning_rate": 9.998964768840393e-06, "loss": 0.495, "step": 382 }, { "epoch": 0.31881243063263043, "grad_norm": 0.5049759745597839, "learning_rate": 9.998863831756702e-06, "loss": 0.5074, "step": 383 }, { "epoch": 0.3196448390677026, "grad_norm": 0.5267007350921631, "learning_rate": 9.998758200652346e-06, "loss": 0.4751, "step": 384 }, { "epoch": 0.3204772475027747, "grad_norm": 0.5030413866043091, "learning_rate": 9.998647875626514e-06, "loss": 0.5, "step": 385 }, { "epoch": 0.3213096559378468, "grad_norm": 0.49688029289245605, "learning_rate": 9.998532856782805e-06, "loss": 0.4716, "step": 386 }, { "epoch": 0.322142064372919, "grad_norm": 0.4237273335456848, "learning_rate": 9.998413144229224e-06, "loss": 0.4842, "step": 387 }, { "epoch": 0.32297447280799113, "grad_norm": 0.5279334783554077, "learning_rate": 9.998288738078179e-06, "loss": 0.4842, "step": 388 }, { "epoch": 0.3238068812430633, "grad_norm": 0.43567541241645813, "learning_rate": 9.998159638446495e-06, "loss": 0.4772, "step": 389 }, { "epoch": 0.3246392896781354, "grad_norm": 0.4781498610973358, "learning_rate": 9.998025845455394e-06, "loss": 0.4918, "step": 390 }, { "epoch": 0.32547169811320753, "grad_norm": 0.5459445118904114, "learning_rate": 9.99788735923051e-06, "loss": 0.4865, "step": 391 }, { "epoch": 0.3263041065482797, "grad_norm": 0.4947822391986847, "learning_rate": 9.997744179901891e-06, "loss": 0.4784, "step": 392 }, { "epoch": 0.32713651498335183, "grad_norm": 0.5778188109397888, "learning_rate": 9.997596307603979e-06, "loss": 0.5046, "step": 393 }, { "epoch": 0.327968923418424, "grad_norm": 0.46349483728408813, "learning_rate": 9.997443742475628e-06, "loss": 0.5002, "step": 394 }, { "epoch": 0.32880133185349614, "grad_norm": 0.41382893919944763, "learning_rate": 9.997286484660101e-06, "loss": 0.5169, "step": 395 }, { "epoch": 0.32963374028856823, "grad_norm": 0.5306146740913391, "learning_rate": 9.997124534305065e-06, "loss": 0.5053, "step": 396 }, { "epoch": 0.3304661487236404, "grad_norm": 0.4569046199321747, "learning_rate": 9.996957891562598e-06, "loss": 0.5099, "step": 397 }, { "epoch": 0.33129855715871254, "grad_norm": 0.440790057182312, "learning_rate": 9.996786556589175e-06, "loss": 0.5094, "step": 398 }, { "epoch": 0.3321309655937847, "grad_norm": 0.5211203694343567, "learning_rate": 9.996610529545685e-06, "loss": 0.4936, "step": 399 }, { "epoch": 0.33296337402885684, "grad_norm": 0.4785847067832947, "learning_rate": 9.996429810597421e-06, "loss": 0.4808, "step": 400 }, { "epoch": 0.333795782463929, "grad_norm": 0.5769135355949402, "learning_rate": 9.996244399914083e-06, "loss": 0.5027, "step": 401 }, { "epoch": 0.3346281908990011, "grad_norm": 0.49235013127326965, "learning_rate": 9.99605429766977e-06, "loss": 0.4918, "step": 402 }, { "epoch": 0.33546059933407324, "grad_norm": 0.559281587600708, "learning_rate": 9.995859504042994e-06, "loss": 0.5126, "step": 403 }, { "epoch": 0.3362930077691454, "grad_norm": 0.4233754873275757, "learning_rate": 9.99566001921667e-06, "loss": 0.4942, "step": 404 }, { "epoch": 0.33712541620421754, "grad_norm": 0.5626436471939087, "learning_rate": 9.995455843378118e-06, "loss": 0.4992, "step": 405 }, { "epoch": 0.3379578246392897, "grad_norm": 0.4755416810512543, "learning_rate": 9.995246976719063e-06, "loss": 0.497, "step": 406 }, { "epoch": 0.33879023307436185, "grad_norm": 0.5172478556632996, "learning_rate": 9.995033419435632e-06, "loss": 0.4947, "step": 407 }, { "epoch": 0.33962264150943394, "grad_norm": 0.5225632786750793, "learning_rate": 9.994815171728362e-06, "loss": 0.5041, "step": 408 }, { "epoch": 0.3404550499445061, "grad_norm": 0.5508219003677368, "learning_rate": 9.994592233802189e-06, "loss": 0.4589, "step": 409 }, { "epoch": 0.34128745837957825, "grad_norm": 0.5616341233253479, "learning_rate": 9.994364605866455e-06, "loss": 0.4893, "step": 410 }, { "epoch": 0.3421198668146504, "grad_norm": 0.5048859119415283, "learning_rate": 9.99413228813491e-06, "loss": 0.4887, "step": 411 }, { "epoch": 0.34295227524972255, "grad_norm": 0.5626086592674255, "learning_rate": 9.993895280825702e-06, "loss": 0.4853, "step": 412 }, { "epoch": 0.34378468368479465, "grad_norm": 0.46970364451408386, "learning_rate": 9.993653584161387e-06, "loss": 0.4965, "step": 413 }, { "epoch": 0.3446170921198668, "grad_norm": 0.6829771995544434, "learning_rate": 9.993407198368918e-06, "loss": 0.4942, "step": 414 }, { "epoch": 0.34544950055493895, "grad_norm": 0.5054067373275757, "learning_rate": 9.993156123679662e-06, "loss": 0.4761, "step": 415 }, { "epoch": 0.3462819089900111, "grad_norm": 0.6721978783607483, "learning_rate": 9.992900360329376e-06, "loss": 0.4938, "step": 416 }, { "epoch": 0.34711431742508325, "grad_norm": 0.4733940362930298, "learning_rate": 9.992639908558232e-06, "loss": 0.4801, "step": 417 }, { "epoch": 0.3479467258601554, "grad_norm": 0.5857818722724915, "learning_rate": 9.992374768610795e-06, "loss": 0.4631, "step": 418 }, { "epoch": 0.3487791342952275, "grad_norm": 0.4833131432533264, "learning_rate": 9.992104940736038e-06, "loss": 0.469, "step": 419 }, { "epoch": 0.34961154273029965, "grad_norm": 0.5365222692489624, "learning_rate": 9.991830425187333e-06, "loss": 0.4771, "step": 420 }, { "epoch": 0.3504439511653718, "grad_norm": 0.4678042531013489, "learning_rate": 9.991551222222455e-06, "loss": 0.4632, "step": 421 }, { "epoch": 0.35127635960044395, "grad_norm": 0.4540461599826813, "learning_rate": 9.99126733210358e-06, "loss": 0.4764, "step": 422 }, { "epoch": 0.3521087680355161, "grad_norm": 0.47178414463996887, "learning_rate": 9.990978755097287e-06, "loss": 0.4711, "step": 423 }, { "epoch": 0.35294117647058826, "grad_norm": 0.4962081015110016, "learning_rate": 9.990685491474555e-06, "loss": 0.5143, "step": 424 }, { "epoch": 0.35377358490566035, "grad_norm": 0.42502203583717346, "learning_rate": 9.990387541510761e-06, "loss": 0.4839, "step": 425 }, { "epoch": 0.3546059933407325, "grad_norm": 0.4943620264530182, "learning_rate": 9.990084905485689e-06, "loss": 0.4881, "step": 426 }, { "epoch": 0.35543840177580466, "grad_norm": 0.4736219644546509, "learning_rate": 9.989777583683517e-06, "loss": 0.4774, "step": 427 }, { "epoch": 0.3562708102108768, "grad_norm": 0.5000523924827576, "learning_rate": 9.989465576392828e-06, "loss": 0.5023, "step": 428 }, { "epoch": 0.35710321864594896, "grad_norm": 0.5680121183395386, "learning_rate": 9.989148883906599e-06, "loss": 0.4816, "step": 429 }, { "epoch": 0.3579356270810211, "grad_norm": 0.4986157715320587, "learning_rate": 9.988827506522211e-06, "loss": 0.4624, "step": 430 }, { "epoch": 0.3587680355160932, "grad_norm": 0.48457589745521545, "learning_rate": 9.988501444541445e-06, "loss": 0.4946, "step": 431 }, { "epoch": 0.35960044395116536, "grad_norm": 0.5403779149055481, "learning_rate": 9.988170698270477e-06, "loss": 0.4924, "step": 432 }, { "epoch": 0.3604328523862375, "grad_norm": 0.388967901468277, "learning_rate": 9.987835268019883e-06, "loss": 0.4922, "step": 433 }, { "epoch": 0.36126526082130966, "grad_norm": 0.459904283285141, "learning_rate": 9.98749515410464e-06, "loss": 0.4948, "step": 434 }, { "epoch": 0.3620976692563818, "grad_norm": 0.46121054887771606, "learning_rate": 9.987150356844118e-06, "loss": 0.4833, "step": 435 }, { "epoch": 0.36293007769145397, "grad_norm": 0.46013152599334717, "learning_rate": 9.98680087656209e-06, "loss": 0.4854, "step": 436 }, { "epoch": 0.36376248612652606, "grad_norm": 0.5025128126144409, "learning_rate": 9.986446713586724e-06, "loss": 0.4917, "step": 437 }, { "epoch": 0.3645948945615982, "grad_norm": 0.4966227412223816, "learning_rate": 9.986087868250584e-06, "loss": 0.4776, "step": 438 }, { "epoch": 0.36542730299667037, "grad_norm": 0.45798709988594055, "learning_rate": 9.985724340890633e-06, "loss": 0.4892, "step": 439 }, { "epoch": 0.3662597114317425, "grad_norm": 0.4821065068244934, "learning_rate": 9.98535613184823e-06, "loss": 0.4889, "step": 440 }, { "epoch": 0.36709211986681467, "grad_norm": 0.4548647999763489, "learning_rate": 9.984983241469129e-06, "loss": 0.4701, "step": 441 }, { "epoch": 0.36792452830188677, "grad_norm": 0.43688225746154785, "learning_rate": 9.984605670103478e-06, "loss": 0.5215, "step": 442 }, { "epoch": 0.3687569367369589, "grad_norm": 0.549662709236145, "learning_rate": 9.98422341810583e-06, "loss": 0.4871, "step": 443 }, { "epoch": 0.36958934517203107, "grad_norm": 0.40335264801979065, "learning_rate": 9.98383648583512e-06, "loss": 0.4678, "step": 444 }, { "epoch": 0.3704217536071032, "grad_norm": 0.5139138102531433, "learning_rate": 9.983444873654683e-06, "loss": 0.4765, "step": 445 }, { "epoch": 0.3712541620421754, "grad_norm": 0.5220227837562561, "learning_rate": 9.983048581932257e-06, "loss": 0.4893, "step": 446 }, { "epoch": 0.3720865704772475, "grad_norm": 0.47806721925735474, "learning_rate": 9.982647611039961e-06, "loss": 0.4884, "step": 447 }, { "epoch": 0.3729189789123196, "grad_norm": 0.49555501341819763, "learning_rate": 9.982241961354317e-06, "loss": 0.4835, "step": 448 }, { "epoch": 0.3737513873473918, "grad_norm": 0.4484332203865051, "learning_rate": 9.981831633256236e-06, "loss": 0.4983, "step": 449 }, { "epoch": 0.3745837957824639, "grad_norm": 0.41702842712402344, "learning_rate": 9.981416627131022e-06, "loss": 0.5161, "step": 450 }, { "epoch": 0.3754162042175361, "grad_norm": 0.43207862973213196, "learning_rate": 9.980996943368373e-06, "loss": 0.5019, "step": 451 }, { "epoch": 0.3762486126526082, "grad_norm": 0.460387647151947, "learning_rate": 9.98057258236238e-06, "loss": 0.4698, "step": 452 }, { "epoch": 0.3770810210876804, "grad_norm": 0.4922404885292053, "learning_rate": 9.980143544511527e-06, "loss": 0.4838, "step": 453 }, { "epoch": 0.3779134295227525, "grad_norm": 0.4728861153125763, "learning_rate": 9.979709830218688e-06, "loss": 0.4578, "step": 454 }, { "epoch": 0.3787458379578246, "grad_norm": 0.5862690210342407, "learning_rate": 9.979271439891125e-06, "loss": 0.4802, "step": 455 }, { "epoch": 0.3795782463928968, "grad_norm": 0.4514050781726837, "learning_rate": 9.978828373940498e-06, "loss": 0.4834, "step": 456 }, { "epoch": 0.38041065482796893, "grad_norm": 0.5906931757926941, "learning_rate": 9.97838063278285e-06, "loss": 0.4621, "step": 457 }, { "epoch": 0.3812430632630411, "grad_norm": 0.4813970625400543, "learning_rate": 9.977928216838622e-06, "loss": 0.4894, "step": 458 }, { "epoch": 0.38207547169811323, "grad_norm": 0.5167549848556519, "learning_rate": 9.977471126532636e-06, "loss": 0.5006, "step": 459 }, { "epoch": 0.38290788013318533, "grad_norm": 0.4821680188179016, "learning_rate": 9.97700936229411e-06, "loss": 0.4993, "step": 460 }, { "epoch": 0.3837402885682575, "grad_norm": 0.4512309432029724, "learning_rate": 9.976542924556652e-06, "loss": 0.487, "step": 461 }, { "epoch": 0.38457269700332963, "grad_norm": 0.4451935887336731, "learning_rate": 9.976071813758249e-06, "loss": 0.4546, "step": 462 }, { "epoch": 0.3854051054384018, "grad_norm": 0.4715465307235718, "learning_rate": 9.975596030341287e-06, "loss": 0.4891, "step": 463 }, { "epoch": 0.38623751387347394, "grad_norm": 0.39830783009529114, "learning_rate": 9.975115574752532e-06, "loss": 0.4996, "step": 464 }, { "epoch": 0.38706992230854603, "grad_norm": 0.41202181577682495, "learning_rate": 9.974630447443142e-06, "loss": 0.4827, "step": 465 }, { "epoch": 0.3879023307436182, "grad_norm": 0.42218315601348877, "learning_rate": 9.974140648868659e-06, "loss": 0.4635, "step": 466 }, { "epoch": 0.38873473917869034, "grad_norm": 0.4501170516014099, "learning_rate": 9.973646179489014e-06, "loss": 0.4941, "step": 467 }, { "epoch": 0.3895671476137625, "grad_norm": 0.44711339473724365, "learning_rate": 9.97314703976852e-06, "loss": 0.4581, "step": 468 }, { "epoch": 0.39039955604883464, "grad_norm": 0.486054927110672, "learning_rate": 9.97264323017588e-06, "loss": 0.4863, "step": 469 }, { "epoch": 0.3912319644839068, "grad_norm": 0.4822620153427124, "learning_rate": 9.97213475118418e-06, "loss": 0.5206, "step": 470 }, { "epoch": 0.3920643729189789, "grad_norm": 0.40667545795440674, "learning_rate": 9.971621603270887e-06, "loss": 0.5174, "step": 471 }, { "epoch": 0.39289678135405104, "grad_norm": 0.46628639101982117, "learning_rate": 9.971103786917862e-06, "loss": 0.4741, "step": 472 }, { "epoch": 0.3937291897891232, "grad_norm": 0.40411314368247986, "learning_rate": 9.97058130261134e-06, "loss": 0.4768, "step": 473 }, { "epoch": 0.39456159822419534, "grad_norm": 0.4988131523132324, "learning_rate": 9.970054150841942e-06, "loss": 0.4741, "step": 474 }, { "epoch": 0.3953940066592675, "grad_norm": 0.42282071709632874, "learning_rate": 9.969522332104675e-06, "loss": 0.4761, "step": 475 }, { "epoch": 0.39622641509433965, "grad_norm": 0.4693518877029419, "learning_rate": 9.968985846898924e-06, "loss": 0.4808, "step": 476 }, { "epoch": 0.39705882352941174, "grad_norm": 0.5171592235565186, "learning_rate": 9.968444695728461e-06, "loss": 0.4797, "step": 477 }, { "epoch": 0.3978912319644839, "grad_norm": 0.4162120819091797, "learning_rate": 9.967898879101434e-06, "loss": 0.5033, "step": 478 }, { "epoch": 0.39872364039955605, "grad_norm": 0.5173420906066895, "learning_rate": 9.967348397530373e-06, "loss": 0.4704, "step": 479 }, { "epoch": 0.3995560488346282, "grad_norm": 0.48679623007774353, "learning_rate": 9.966793251532197e-06, "loss": 0.4607, "step": 480 }, { "epoch": 0.40038845726970035, "grad_norm": 0.4463588297367096, "learning_rate": 9.966233441628188e-06, "loss": 0.4836, "step": 481 }, { "epoch": 0.4012208657047725, "grad_norm": 0.5191836357116699, "learning_rate": 9.965668968344023e-06, "loss": 0.4966, "step": 482 }, { "epoch": 0.4020532741398446, "grad_norm": 0.4401721656322479, "learning_rate": 9.965099832209753e-06, "loss": 0.4864, "step": 483 }, { "epoch": 0.40288568257491675, "grad_norm": 0.5379999876022339, "learning_rate": 9.964526033759803e-06, "loss": 0.4431, "step": 484 }, { "epoch": 0.4037180910099889, "grad_norm": 0.46185803413391113, "learning_rate": 9.963947573532983e-06, "loss": 0.4894, "step": 485 }, { "epoch": 0.40455049944506105, "grad_norm": 0.45974308252334595, "learning_rate": 9.963364452072475e-06, "loss": 0.4609, "step": 486 }, { "epoch": 0.4053829078801332, "grad_norm": 0.46747586131095886, "learning_rate": 9.962776669925842e-06, "loss": 0.4949, "step": 487 }, { "epoch": 0.40621531631520535, "grad_norm": 0.45227017998695374, "learning_rate": 9.962184227645021e-06, "loss": 0.5051, "step": 488 }, { "epoch": 0.40704772475027745, "grad_norm": 0.46203479170799255, "learning_rate": 9.961587125786328e-06, "loss": 0.476, "step": 489 }, { "epoch": 0.4078801331853496, "grad_norm": 0.4355171024799347, "learning_rate": 9.960985364910448e-06, "loss": 0.4809, "step": 490 }, { "epoch": 0.40871254162042175, "grad_norm": 0.47708234190940857, "learning_rate": 9.960378945582446e-06, "loss": 0.4802, "step": 491 }, { "epoch": 0.4095449500554939, "grad_norm": 0.46055299043655396, "learning_rate": 9.959767868371761e-06, "loss": 0.479, "step": 492 }, { "epoch": 0.41037735849056606, "grad_norm": 0.502031147480011, "learning_rate": 9.959152133852209e-06, "loss": 0.4968, "step": 493 }, { "epoch": 0.41120976692563815, "grad_norm": 0.48754680156707764, "learning_rate": 9.958531742601968e-06, "loss": 0.4791, "step": 494 }, { "epoch": 0.4120421753607103, "grad_norm": 0.4217134416103363, "learning_rate": 9.9579066952036e-06, "loss": 0.4927, "step": 495 }, { "epoch": 0.41287458379578246, "grad_norm": 0.474809467792511, "learning_rate": 9.957276992244039e-06, "loss": 0.491, "step": 496 }, { "epoch": 0.4137069922308546, "grad_norm": 0.48426496982574463, "learning_rate": 9.956642634314582e-06, "loss": 0.4944, "step": 497 }, { "epoch": 0.41453940066592676, "grad_norm": 0.4359665811061859, "learning_rate": 9.956003622010904e-06, "loss": 0.4695, "step": 498 }, { "epoch": 0.4153718091009989, "grad_norm": 0.4425244927406311, "learning_rate": 9.955359955933048e-06, "loss": 0.49, "step": 499 }, { "epoch": 0.416204217536071, "grad_norm": 0.5776862502098083, "learning_rate": 9.95471163668543e-06, "loss": 0.5233, "step": 500 }, { "epoch": 0.41703662597114316, "grad_norm": 0.46279191970825195, "learning_rate": 9.954058664876832e-06, "loss": 0.4912, "step": 501 }, { "epoch": 0.4178690344062153, "grad_norm": 0.5074960589408875, "learning_rate": 9.953401041120403e-06, "loss": 0.4979, "step": 502 }, { "epoch": 0.41870144284128746, "grad_norm": 0.51701819896698, "learning_rate": 9.952738766033668e-06, "loss": 0.4649, "step": 503 }, { "epoch": 0.4195338512763596, "grad_norm": 0.424464613199234, "learning_rate": 9.952071840238511e-06, "loss": 0.4885, "step": 504 }, { "epoch": 0.42036625971143177, "grad_norm": 0.45139414072036743, "learning_rate": 9.951400264361188e-06, "loss": 0.4702, "step": 505 }, { "epoch": 0.42119866814650386, "grad_norm": 0.46182501316070557, "learning_rate": 9.950724039032324e-06, "loss": 0.4643, "step": 506 }, { "epoch": 0.422031076581576, "grad_norm": 0.43030858039855957, "learning_rate": 9.950043164886902e-06, "loss": 0.4903, "step": 507 }, { "epoch": 0.42286348501664817, "grad_norm": 0.4888497292995453, "learning_rate": 9.949357642564275e-06, "loss": 0.5164, "step": 508 }, { "epoch": 0.4236958934517203, "grad_norm": 0.48107990622520447, "learning_rate": 9.948667472708163e-06, "loss": 0.4811, "step": 509 }, { "epoch": 0.42452830188679247, "grad_norm": 0.4666735827922821, "learning_rate": 9.947972655966647e-06, "loss": 0.4719, "step": 510 }, { "epoch": 0.4253607103218646, "grad_norm": 0.4709594249725342, "learning_rate": 9.947273192992171e-06, "loss": 0.5038, "step": 511 }, { "epoch": 0.4261931187569367, "grad_norm": 0.48332247138023376, "learning_rate": 9.946569084441542e-06, "loss": 0.4892, "step": 512 }, { "epoch": 0.42702552719200887, "grad_norm": 0.507439911365509, "learning_rate": 9.945860330975933e-06, "loss": 0.482, "step": 513 }, { "epoch": 0.427857935627081, "grad_norm": 0.4573519825935364, "learning_rate": 9.945146933260876e-06, "loss": 0.4808, "step": 514 }, { "epoch": 0.4286903440621532, "grad_norm": 0.5414947867393494, "learning_rate": 9.94442889196626e-06, "loss": 0.4983, "step": 515 }, { "epoch": 0.4295227524972253, "grad_norm": 0.43500539660453796, "learning_rate": 9.94370620776634e-06, "loss": 0.5014, "step": 516 }, { "epoch": 0.4303551609322974, "grad_norm": 0.5725677609443665, "learning_rate": 9.942978881339732e-06, "loss": 0.487, "step": 517 }, { "epoch": 0.43118756936736957, "grad_norm": 0.4238481819629669, "learning_rate": 9.942246913369409e-06, "loss": 0.4854, "step": 518 }, { "epoch": 0.4320199778024417, "grad_norm": 0.44186079502105713, "learning_rate": 9.941510304542695e-06, "loss": 0.5183, "step": 519 }, { "epoch": 0.4328523862375139, "grad_norm": 0.45779183506965637, "learning_rate": 9.940769055551284e-06, "loss": 0.4754, "step": 520 }, { "epoch": 0.433684794672586, "grad_norm": 0.4756607115268707, "learning_rate": 9.940023167091219e-06, "loss": 0.4907, "step": 521 }, { "epoch": 0.4345172031076582, "grad_norm": 0.47078803181648254, "learning_rate": 9.939272639862905e-06, "loss": 0.4845, "step": 522 }, { "epoch": 0.4353496115427303, "grad_norm": 0.48096713423728943, "learning_rate": 9.9385174745711e-06, "loss": 0.4778, "step": 523 }, { "epoch": 0.4361820199778024, "grad_norm": 0.48447316884994507, "learning_rate": 9.937757671924915e-06, "loss": 0.4883, "step": 524 }, { "epoch": 0.4370144284128746, "grad_norm": 0.5201837420463562, "learning_rate": 9.936993232637818e-06, "loss": 0.5062, "step": 525 }, { "epoch": 0.43784683684794673, "grad_norm": 0.5595635175704956, "learning_rate": 9.936224157427635e-06, "loss": 0.4843, "step": 526 }, { "epoch": 0.4386792452830189, "grad_norm": 0.4055692255496979, "learning_rate": 9.935450447016535e-06, "loss": 0.4659, "step": 527 }, { "epoch": 0.43951165371809103, "grad_norm": 0.5074310302734375, "learning_rate": 9.934672102131052e-06, "loss": 0.4681, "step": 528 }, { "epoch": 0.44034406215316313, "grad_norm": 0.4834982454776764, "learning_rate": 9.933889123502059e-06, "loss": 0.5108, "step": 529 }, { "epoch": 0.4411764705882353, "grad_norm": 0.48070859909057617, "learning_rate": 9.933101511864793e-06, "loss": 0.4833, "step": 530 }, { "epoch": 0.44200887902330743, "grad_norm": 0.4138585031032562, "learning_rate": 9.93230926795883e-06, "loss": 0.4869, "step": 531 }, { "epoch": 0.4428412874583796, "grad_norm": 0.5085632801055908, "learning_rate": 9.931512392528104e-06, "loss": 0.4832, "step": 532 }, { "epoch": 0.44367369589345174, "grad_norm": 0.5004951357841492, "learning_rate": 9.930710886320895e-06, "loss": 0.4908, "step": 533 }, { "epoch": 0.4445061043285239, "grad_norm": 0.498270720243454, "learning_rate": 9.929904750089829e-06, "loss": 0.4734, "step": 534 }, { "epoch": 0.445338512763596, "grad_norm": 0.4868960976600647, "learning_rate": 9.929093984591884e-06, "loss": 0.4477, "step": 535 }, { "epoch": 0.44617092119866814, "grad_norm": 0.5053864121437073, "learning_rate": 9.928278590588382e-06, "loss": 0.464, "step": 536 }, { "epoch": 0.4470033296337403, "grad_norm": 0.47487616539001465, "learning_rate": 9.927458568844994e-06, "loss": 0.471, "step": 537 }, { "epoch": 0.44783573806881244, "grad_norm": 0.4798586368560791, "learning_rate": 9.926633920131732e-06, "loss": 0.4628, "step": 538 }, { "epoch": 0.4486681465038846, "grad_norm": 0.44665879011154175, "learning_rate": 9.925804645222957e-06, "loss": 0.464, "step": 539 }, { "epoch": 0.44950055493895674, "grad_norm": 0.5611540675163269, "learning_rate": 9.924970744897373e-06, "loss": 0.4741, "step": 540 }, { "epoch": 0.45033296337402884, "grad_norm": 0.4786832332611084, "learning_rate": 9.924132219938027e-06, "loss": 0.4732, "step": 541 }, { "epoch": 0.451165371809101, "grad_norm": 0.4666735529899597, "learning_rate": 9.923289071132308e-06, "loss": 0.502, "step": 542 }, { "epoch": 0.45199778024417314, "grad_norm": 0.5262482166290283, "learning_rate": 9.922441299271948e-06, "loss": 0.4744, "step": 543 }, { "epoch": 0.4528301886792453, "grad_norm": 0.5326946377754211, "learning_rate": 9.92158890515302e-06, "loss": 0.4882, "step": 544 }, { "epoch": 0.45366259711431745, "grad_norm": 0.4670204222202301, "learning_rate": 9.920731889575935e-06, "loss": 0.5037, "step": 545 }, { "epoch": 0.45449500554938954, "grad_norm": 0.5759830474853516, "learning_rate": 9.919870253345446e-06, "loss": 0.4822, "step": 546 }, { "epoch": 0.4553274139844617, "grad_norm": 0.42398568987846375, "learning_rate": 9.919003997270648e-06, "loss": 0.4776, "step": 547 }, { "epoch": 0.45615982241953384, "grad_norm": 0.5048391819000244, "learning_rate": 9.918133122164968e-06, "loss": 0.4621, "step": 548 }, { "epoch": 0.456992230854606, "grad_norm": 0.5019201636314392, "learning_rate": 9.917257628846172e-06, "loss": 0.4885, "step": 549 }, { "epoch": 0.45782463928967815, "grad_norm": 0.43558669090270996, "learning_rate": 9.916377518136367e-06, "loss": 0.4843, "step": 550 }, { "epoch": 0.4586570477247503, "grad_norm": 0.526262104511261, "learning_rate": 9.915492790861986e-06, "loss": 0.4854, "step": 551 }, { "epoch": 0.4594894561598224, "grad_norm": 0.5632872581481934, "learning_rate": 9.91460344785381e-06, "loss": 0.4879, "step": 552 }, { "epoch": 0.46032186459489455, "grad_norm": 0.5479772686958313, "learning_rate": 9.913709489946946e-06, "loss": 0.5225, "step": 553 }, { "epoch": 0.4611542730299667, "grad_norm": 0.5169483423233032, "learning_rate": 9.912810917980834e-06, "loss": 0.4791, "step": 554 }, { "epoch": 0.46198668146503885, "grad_norm": 0.5335115194320679, "learning_rate": 9.911907732799251e-06, "loss": 0.4974, "step": 555 }, { "epoch": 0.462819089900111, "grad_norm": 0.49141526222229004, "learning_rate": 9.910999935250302e-06, "loss": 0.4835, "step": 556 }, { "epoch": 0.46365149833518315, "grad_norm": 0.7199199795722961, "learning_rate": 9.910087526186424e-06, "loss": 0.502, "step": 557 }, { "epoch": 0.46448390677025525, "grad_norm": 7.59892463684082, "learning_rate": 9.909170506464389e-06, "loss": 0.453, "step": 558 }, { "epoch": 0.4653163152053274, "grad_norm": 4702.46728515625, "learning_rate": 9.908248876945291e-06, "loss": 1.725, "step": 559 }, { "epoch": 0.46614872364039955, "grad_norm": 13.055675506591797, "learning_rate": 9.907322638494558e-06, "loss": 0.492, "step": 560 }, { "epoch": 0.4669811320754717, "grad_norm": 2.1198976039886475, "learning_rate": 9.906391791981944e-06, "loss": 0.504, "step": 561 }, { "epoch": 0.46781354051054386, "grad_norm": 0.6735315322875977, "learning_rate": 9.90545633828153e-06, "loss": 0.4779, "step": 562 }, { "epoch": 0.468645948945616, "grad_norm": 0.7947428226470947, "learning_rate": 9.904516278271721e-06, "loss": 0.5156, "step": 563 }, { "epoch": 0.4694783573806881, "grad_norm": 0.8350366950035095, "learning_rate": 9.903571612835254e-06, "loss": 0.4695, "step": 564 }, { "epoch": 0.47031076581576026, "grad_norm": 0.6595231294631958, "learning_rate": 9.902622342859183e-06, "loss": 0.5045, "step": 565 }, { "epoch": 0.4711431742508324, "grad_norm": 0.7960402369499207, "learning_rate": 9.901668469234892e-06, "loss": 0.4872, "step": 566 }, { "epoch": 0.47197558268590456, "grad_norm": 0.6408143639564514, "learning_rate": 9.900709992858083e-06, "loss": 0.4483, "step": 567 }, { "epoch": 0.4728079911209767, "grad_norm": 0.8016465306282043, "learning_rate": 9.899746914628782e-06, "loss": 0.4663, "step": 568 }, { "epoch": 0.4736403995560488, "grad_norm": 0.5370069146156311, "learning_rate": 9.898779235451337e-06, "loss": 0.4861, "step": 569 }, { "epoch": 0.47447280799112096, "grad_norm": 0.6317716836929321, "learning_rate": 9.897806956234417e-06, "loss": 0.4831, "step": 570 }, { "epoch": 0.4753052164261931, "grad_norm": 0.649116575717926, "learning_rate": 9.896830077891007e-06, "loss": 0.484, "step": 571 }, { "epoch": 0.47613762486126526, "grad_norm": 0.5388279557228088, "learning_rate": 9.895848601338414e-06, "loss": 0.5064, "step": 572 }, { "epoch": 0.4769700332963374, "grad_norm": 0.6505693793296814, "learning_rate": 9.894862527498259e-06, "loss": 0.4757, "step": 573 }, { "epoch": 0.47780244173140957, "grad_norm": 0.503017783164978, "learning_rate": 9.893871857296487e-06, "loss": 0.5006, "step": 574 }, { "epoch": 0.47863485016648166, "grad_norm": 0.7261723875999451, "learning_rate": 9.892876591663355e-06, "loss": 0.4688, "step": 575 }, { "epoch": 0.4794672586015538, "grad_norm": 0.49724769592285156, "learning_rate": 9.891876731533429e-06, "loss": 0.4621, "step": 576 }, { "epoch": 0.48029966703662597, "grad_norm": 0.6609309315681458, "learning_rate": 9.8908722778456e-06, "loss": 0.4631, "step": 577 }, { "epoch": 0.4811320754716981, "grad_norm": 0.5452806353569031, "learning_rate": 9.889863231543065e-06, "loss": 0.4881, "step": 578 }, { "epoch": 0.48196448390677027, "grad_norm": 0.675403892993927, "learning_rate": 9.888849593573339e-06, "loss": 0.4709, "step": 579 }, { "epoch": 0.4827968923418424, "grad_norm": 0.6482904553413391, "learning_rate": 9.887831364888243e-06, "loss": 0.4907, "step": 580 }, { "epoch": 0.4836293007769145, "grad_norm": 0.6234882473945618, "learning_rate": 9.886808546443914e-06, "loss": 0.4801, "step": 581 }, { "epoch": 0.48446170921198667, "grad_norm": 0.60924232006073, "learning_rate": 9.885781139200794e-06, "loss": 0.4668, "step": 582 }, { "epoch": 0.4852941176470588, "grad_norm": 0.5140445828437805, "learning_rate": 9.88474914412364e-06, "loss": 0.471, "step": 583 }, { "epoch": 0.48612652608213097, "grad_norm": 0.6285945773124695, "learning_rate": 9.88371256218151e-06, "loss": 0.469, "step": 584 }, { "epoch": 0.4869589345172031, "grad_norm": 0.45175155997276306, "learning_rate": 9.882671394347771e-06, "loss": 0.4646, "step": 585 }, { "epoch": 0.4877913429522753, "grad_norm": 0.4605112373828888, "learning_rate": 9.881625641600104e-06, "loss": 0.4653, "step": 586 }, { "epoch": 0.48862375138734737, "grad_norm": 0.5589239597320557, "learning_rate": 9.880575304920484e-06, "loss": 0.4641, "step": 587 }, { "epoch": 0.4894561598224195, "grad_norm": 0.38805943727493286, "learning_rate": 9.879520385295197e-06, "loss": 0.4584, "step": 588 }, { "epoch": 0.4902885682574917, "grad_norm": 0.5301197171211243, "learning_rate": 9.878460883714831e-06, "loss": 0.5203, "step": 589 }, { "epoch": 0.4911209766925638, "grad_norm": 0.47221046686172485, "learning_rate": 9.877396801174277e-06, "loss": 0.489, "step": 590 }, { "epoch": 0.491953385127636, "grad_norm": 0.4699765145778656, "learning_rate": 9.876328138672726e-06, "loss": 0.4739, "step": 591 }, { "epoch": 0.49278579356270813, "grad_norm": 0.4534553587436676, "learning_rate": 9.875254897213674e-06, "loss": 0.4624, "step": 592 }, { "epoch": 0.4936182019977802, "grad_norm": 0.39965343475341797, "learning_rate": 9.87417707780491e-06, "loss": 0.4865, "step": 593 }, { "epoch": 0.4944506104328524, "grad_norm": 0.43681901693344116, "learning_rate": 9.873094681458525e-06, "loss": 0.4793, "step": 594 }, { "epoch": 0.49528301886792453, "grad_norm": 0.4199521839618683, "learning_rate": 9.87200770919091e-06, "loss": 0.4561, "step": 595 }, { "epoch": 0.4961154273029967, "grad_norm": 0.42258721590042114, "learning_rate": 9.870916162022752e-06, "loss": 0.4763, "step": 596 }, { "epoch": 0.49694783573806883, "grad_norm": 0.4213769733905792, "learning_rate": 9.86982004097903e-06, "loss": 0.4815, "step": 597 }, { "epoch": 0.49778024417314093, "grad_norm": 0.41223376989364624, "learning_rate": 9.868719347089024e-06, "loss": 0.4688, "step": 598 }, { "epoch": 0.4986126526082131, "grad_norm": 0.49295932054519653, "learning_rate": 9.867614081386302e-06, "loss": 0.5049, "step": 599 }, { "epoch": 0.49944506104328523, "grad_norm": 0.3732836842536926, "learning_rate": 9.866504244908728e-06, "loss": 0.5007, "step": 600 }, { "epoch": 0.5002774694783574, "grad_norm": 0.39977091550827026, "learning_rate": 9.86538983869846e-06, "loss": 0.4807, "step": 601 }, { "epoch": 0.5011098779134295, "grad_norm": 0.4643203318119049, "learning_rate": 9.864270863801944e-06, "loss": 0.4807, "step": 602 }, { "epoch": 0.5019422863485017, "grad_norm": 0.391868531703949, "learning_rate": 9.863147321269918e-06, "loss": 0.4536, "step": 603 }, { "epoch": 0.5027746947835738, "grad_norm": 0.4176725149154663, "learning_rate": 9.862019212157406e-06, "loss": 0.4658, "step": 604 }, { "epoch": 0.503607103218646, "grad_norm": 0.4765459895133972, "learning_rate": 9.860886537523721e-06, "loss": 0.4891, "step": 605 }, { "epoch": 0.5044395116537181, "grad_norm": 0.41668006777763367, "learning_rate": 9.859749298432468e-06, "loss": 0.4859, "step": 606 }, { "epoch": 0.5052719200887902, "grad_norm": 0.45202815532684326, "learning_rate": 9.858607495951534e-06, "loss": 0.4732, "step": 607 }, { "epoch": 0.5061043285238623, "grad_norm": 0.41504961252212524, "learning_rate": 9.857461131153089e-06, "loss": 0.4529, "step": 608 }, { "epoch": 0.5069367369589345, "grad_norm": 0.4873095452785492, "learning_rate": 9.856310205113594e-06, "loss": 0.4693, "step": 609 }, { "epoch": 0.5077691453940066, "grad_norm": 0.42755189538002014, "learning_rate": 9.855154718913782e-06, "loss": 0.4502, "step": 610 }, { "epoch": 0.5086015538290788, "grad_norm": 0.40592634677886963, "learning_rate": 9.853994673638679e-06, "loss": 0.469, "step": 611 }, { "epoch": 0.5094339622641509, "grad_norm": 0.4807058274745941, "learning_rate": 9.852830070377588e-06, "loss": 0.489, "step": 612 }, { "epoch": 0.5102663706992231, "grad_norm": 0.4219500720500946, "learning_rate": 9.851660910224092e-06, "loss": 0.4927, "step": 613 }, { "epoch": 0.5110987791342952, "grad_norm": 0.45034486055374146, "learning_rate": 9.85048719427605e-06, "loss": 0.4877, "step": 614 }, { "epoch": 0.5119311875693674, "grad_norm": 0.39540550112724304, "learning_rate": 9.849308923635606e-06, "loss": 0.4769, "step": 615 }, { "epoch": 0.5127635960044395, "grad_norm": 0.40875479578971863, "learning_rate": 9.848126099409175e-06, "loss": 0.4732, "step": 616 }, { "epoch": 0.5135960044395117, "grad_norm": 0.4183914363384247, "learning_rate": 9.846938722707446e-06, "loss": 0.4671, "step": 617 }, { "epoch": 0.5144284128745837, "grad_norm": 0.3762715458869934, "learning_rate": 9.845746794645393e-06, "loss": 0.4749, "step": 618 }, { "epoch": 0.5152608213096559, "grad_norm": 0.4459075927734375, "learning_rate": 9.844550316342252e-06, "loss": 0.5014, "step": 619 }, { "epoch": 0.516093229744728, "grad_norm": 0.4192522466182709, "learning_rate": 9.843349288921543e-06, "loss": 0.4872, "step": 620 }, { "epoch": 0.5169256381798002, "grad_norm": 0.40821659564971924, "learning_rate": 9.842143713511044e-06, "loss": 0.479, "step": 621 }, { "epoch": 0.5177580466148723, "grad_norm": 0.45011478662490845, "learning_rate": 9.840933591242817e-06, "loss": 0.4743, "step": 622 }, { "epoch": 0.5185904550499445, "grad_norm": 0.45597994327545166, "learning_rate": 9.839718923253186e-06, "loss": 0.4667, "step": 623 }, { "epoch": 0.5194228634850167, "grad_norm": 0.4411117136478424, "learning_rate": 9.838499710682745e-06, "loss": 0.4849, "step": 624 }, { "epoch": 0.5202552719200888, "grad_norm": 0.46066632866859436, "learning_rate": 9.837275954676357e-06, "loss": 0.4375, "step": 625 }, { "epoch": 0.521087680355161, "grad_norm": 0.42149725556373596, "learning_rate": 9.836047656383152e-06, "loss": 0.4505, "step": 626 }, { "epoch": 0.5219200887902331, "grad_norm": 0.4638105630874634, "learning_rate": 9.834814816956521e-06, "loss": 0.4442, "step": 627 }, { "epoch": 0.5227524972253053, "grad_norm": 0.5468646883964539, "learning_rate": 9.833577437554121e-06, "loss": 0.4775, "step": 628 }, { "epoch": 0.5235849056603774, "grad_norm": 0.44078686833381653, "learning_rate": 9.832335519337877e-06, "loss": 0.4493, "step": 629 }, { "epoch": 0.5244173140954494, "grad_norm": 0.5681177973747253, "learning_rate": 9.831089063473967e-06, "loss": 0.4996, "step": 630 }, { "epoch": 0.5252497225305216, "grad_norm": 0.4630327522754669, "learning_rate": 9.82983807113284e-06, "loss": 0.4714, "step": 631 }, { "epoch": 0.5260821309655938, "grad_norm": 0.47248589992523193, "learning_rate": 9.828582543489194e-06, "loss": 0.4838, "step": 632 }, { "epoch": 0.5269145394006659, "grad_norm": 0.5343927145004272, "learning_rate": 9.827322481721998e-06, "loss": 0.4678, "step": 633 }, { "epoch": 0.5277469478357381, "grad_norm": 0.48827970027923584, "learning_rate": 9.826057887014466e-06, "loss": 0.4841, "step": 634 }, { "epoch": 0.5285793562708102, "grad_norm": 0.42138999700546265, "learning_rate": 9.824788760554078e-06, "loss": 0.4821, "step": 635 }, { "epoch": 0.5294117647058824, "grad_norm": 0.46162131428718567, "learning_rate": 9.823515103532564e-06, "loss": 0.463, "step": 636 }, { "epoch": 0.5302441731409545, "grad_norm": 0.41701412200927734, "learning_rate": 9.822236917145914e-06, "loss": 0.5037, "step": 637 }, { "epoch": 0.5310765815760267, "grad_norm": 0.43682774901390076, "learning_rate": 9.820954202594362e-06, "loss": 0.468, "step": 638 }, { "epoch": 0.5319089900110988, "grad_norm": 0.5141327977180481, "learning_rate": 9.819666961082402e-06, "loss": 0.5035, "step": 639 }, { "epoch": 0.532741398446171, "grad_norm": 0.4251098036766052, "learning_rate": 9.81837519381878e-06, "loss": 0.4749, "step": 640 }, { "epoch": 0.5335738068812431, "grad_norm": 0.4675353765487671, "learning_rate": 9.817078902016481e-06, "loss": 0.4837, "step": 641 }, { "epoch": 0.5344062153163152, "grad_norm": 0.44004005193710327, "learning_rate": 9.81577808689275e-06, "loss": 0.4729, "step": 642 }, { "epoch": 0.5352386237513873, "grad_norm": 0.45024535059928894, "learning_rate": 9.814472749669076e-06, "loss": 0.4783, "step": 643 }, { "epoch": 0.5360710321864595, "grad_norm": 0.3834257125854492, "learning_rate": 9.813162891571189e-06, "loss": 0.4503, "step": 644 }, { "epoch": 0.5369034406215316, "grad_norm": 0.47850501537323, "learning_rate": 9.811848513829074e-06, "loss": 0.466, "step": 645 }, { "epoch": 0.5377358490566038, "grad_norm": 0.397339403629303, "learning_rate": 9.810529617676952e-06, "loss": 0.4804, "step": 646 }, { "epoch": 0.5385682574916759, "grad_norm": 0.45498281717300415, "learning_rate": 9.809206204353289e-06, "loss": 0.4587, "step": 647 }, { "epoch": 0.5394006659267481, "grad_norm": 0.46825650334358215, "learning_rate": 9.807878275100795e-06, "loss": 0.4772, "step": 648 }, { "epoch": 0.5402330743618202, "grad_norm": 0.40103840827941895, "learning_rate": 9.806545831166417e-06, "loss": 0.4725, "step": 649 }, { "epoch": 0.5410654827968924, "grad_norm": 0.4803685247898102, "learning_rate": 9.805208873801346e-06, "loss": 0.4799, "step": 650 }, { "epoch": 0.5418978912319645, "grad_norm": 0.4705343544483185, "learning_rate": 9.803867404261005e-06, "loss": 0.4769, "step": 651 }, { "epoch": 0.5427302996670367, "grad_norm": 0.4226051867008209, "learning_rate": 9.80252142380506e-06, "loss": 0.4734, "step": 652 }, { "epoch": 0.5435627081021087, "grad_norm": 0.42430418729782104, "learning_rate": 9.80117093369741e-06, "loss": 0.474, "step": 653 }, { "epoch": 0.5443951165371809, "grad_norm": 0.4154928922653198, "learning_rate": 9.799815935206187e-06, "loss": 0.4819, "step": 654 }, { "epoch": 0.545227524972253, "grad_norm": 0.359573096036911, "learning_rate": 9.798456429603758e-06, "loss": 0.4564, "step": 655 }, { "epoch": 0.5460599334073252, "grad_norm": 0.5043289661407471, "learning_rate": 9.797092418166725e-06, "loss": 0.4784, "step": 656 }, { "epoch": 0.5468923418423973, "grad_norm": 0.3750476539134979, "learning_rate": 9.795723902175918e-06, "loss": 0.4542, "step": 657 }, { "epoch": 0.5477247502774695, "grad_norm": 0.4907470643520355, "learning_rate": 9.794350882916397e-06, "loss": 0.4908, "step": 658 }, { "epoch": 0.5485571587125416, "grad_norm": 0.457126259803772, "learning_rate": 9.79297336167745e-06, "loss": 0.4323, "step": 659 }, { "epoch": 0.5493895671476138, "grad_norm": 0.42247408628463745, "learning_rate": 9.791591339752596e-06, "loss": 0.4752, "step": 660 }, { "epoch": 0.5502219755826859, "grad_norm": 0.49953287839889526, "learning_rate": 9.790204818439576e-06, "loss": 0.4852, "step": 661 }, { "epoch": 0.5510543840177581, "grad_norm": 0.466348260641098, "learning_rate": 9.788813799040358e-06, "loss": 0.4654, "step": 662 }, { "epoch": 0.5518867924528302, "grad_norm": 0.47069528698921204, "learning_rate": 9.787418282861135e-06, "loss": 0.4919, "step": 663 }, { "epoch": 0.5527192008879024, "grad_norm": 0.3939701020717621, "learning_rate": 9.786018271212318e-06, "loss": 0.4712, "step": 664 }, { "epoch": 0.5535516093229744, "grad_norm": 0.4161626994609833, "learning_rate": 9.784613765408546e-06, "loss": 0.4791, "step": 665 }, { "epoch": 0.5543840177580466, "grad_norm": 0.39920780062675476, "learning_rate": 9.783204766768672e-06, "loss": 0.4669, "step": 666 }, { "epoch": 0.5552164261931187, "grad_norm": 0.47431665658950806, "learning_rate": 9.781791276615774e-06, "loss": 0.4623, "step": 667 }, { "epoch": 0.5560488346281909, "grad_norm": 0.4187391400337219, "learning_rate": 9.780373296277137e-06, "loss": 0.4766, "step": 668 }, { "epoch": 0.556881243063263, "grad_norm": 0.39394712448120117, "learning_rate": 9.778950827084277e-06, "loss": 0.4857, "step": 669 }, { "epoch": 0.5577136514983352, "grad_norm": 0.4316924512386322, "learning_rate": 9.777523870372913e-06, "loss": 0.472, "step": 670 }, { "epoch": 0.5585460599334073, "grad_norm": 0.54462730884552, "learning_rate": 9.776092427482984e-06, "loss": 0.4716, "step": 671 }, { "epoch": 0.5593784683684795, "grad_norm": 0.40116629004478455, "learning_rate": 9.774656499758639e-06, "loss": 0.4498, "step": 672 }, { "epoch": 0.5602108768035516, "grad_norm": 0.5211840271949768, "learning_rate": 9.77321608854824e-06, "loss": 0.4702, "step": 673 }, { "epoch": 0.5610432852386238, "grad_norm": 0.5067088603973389, "learning_rate": 9.771771195204358e-06, "loss": 0.4796, "step": 674 }, { "epoch": 0.5618756936736959, "grad_norm": 0.5115644335746765, "learning_rate": 9.770321821083774e-06, "loss": 0.4809, "step": 675 }, { "epoch": 0.562708102108768, "grad_norm": 0.4665434956550598, "learning_rate": 9.768867967547472e-06, "loss": 0.4649, "step": 676 }, { "epoch": 0.5635405105438401, "grad_norm": 0.5498418211936951, "learning_rate": 9.767409635960653e-06, "loss": 0.4799, "step": 677 }, { "epoch": 0.5643729189789123, "grad_norm": 0.41861289739608765, "learning_rate": 9.76594682769271e-06, "loss": 0.4655, "step": 678 }, { "epoch": 0.5652053274139844, "grad_norm": 0.4563583731651306, "learning_rate": 9.764479544117247e-06, "loss": 0.4608, "step": 679 }, { "epoch": 0.5660377358490566, "grad_norm": 0.45897558331489563, "learning_rate": 9.76300778661207e-06, "loss": 0.4476, "step": 680 }, { "epoch": 0.5668701442841287, "grad_norm": 0.42618608474731445, "learning_rate": 9.761531556559183e-06, "loss": 0.4758, "step": 681 }, { "epoch": 0.5677025527192009, "grad_norm": 0.3950604796409607, "learning_rate": 9.760050855344795e-06, "loss": 0.4566, "step": 682 }, { "epoch": 0.568534961154273, "grad_norm": 0.46975719928741455, "learning_rate": 9.758565684359307e-06, "loss": 0.4612, "step": 683 }, { "epoch": 0.5693673695893452, "grad_norm": 0.4233880639076233, "learning_rate": 9.757076044997324e-06, "loss": 0.4591, "step": 684 }, { "epoch": 0.5701997780244173, "grad_norm": 0.42603635787963867, "learning_rate": 9.75558193865764e-06, "loss": 0.4841, "step": 685 }, { "epoch": 0.5710321864594895, "grad_norm": 0.4705389738082886, "learning_rate": 9.754083366743249e-06, "loss": 0.4501, "step": 686 }, { "epoch": 0.5718645948945617, "grad_norm": 0.4174215793609619, "learning_rate": 9.752580330661336e-06, "loss": 0.4804, "step": 687 }, { "epoch": 0.5726970033296337, "grad_norm": 0.47212427854537964, "learning_rate": 9.751072831823279e-06, "loss": 0.4987, "step": 688 }, { "epoch": 0.5735294117647058, "grad_norm": 0.453812837600708, "learning_rate": 9.749560871644643e-06, "loss": 0.4655, "step": 689 }, { "epoch": 0.574361820199778, "grad_norm": 0.44600024819374084, "learning_rate": 9.748044451545188e-06, "loss": 0.4926, "step": 690 }, { "epoch": 0.5751942286348501, "grad_norm": 0.5240420699119568, "learning_rate": 9.746523572948857e-06, "loss": 0.4919, "step": 691 }, { "epoch": 0.5760266370699223, "grad_norm": 0.415798544883728, "learning_rate": 9.744998237283785e-06, "loss": 0.4447, "step": 692 }, { "epoch": 0.5768590455049944, "grad_norm": 0.39677396416664124, "learning_rate": 9.743468445982284e-06, "loss": 0.4626, "step": 693 }, { "epoch": 0.5776914539400666, "grad_norm": 0.4481954276561737, "learning_rate": 9.741934200480857e-06, "loss": 0.4845, "step": 694 }, { "epoch": 0.5785238623751388, "grad_norm": 0.4083997309207916, "learning_rate": 9.740395502220192e-06, "loss": 0.4673, "step": 695 }, { "epoch": 0.5793562708102109, "grad_norm": 0.4454471170902252, "learning_rate": 9.738852352645145e-06, "loss": 0.4606, "step": 696 }, { "epoch": 0.5801886792452831, "grad_norm": 0.45555463433265686, "learning_rate": 9.737304753204767e-06, "loss": 0.482, "step": 697 }, { "epoch": 0.5810210876803552, "grad_norm": 0.5052851438522339, "learning_rate": 9.735752705352278e-06, "loss": 0.4909, "step": 698 }, { "epoch": 0.5818534961154272, "grad_norm": 0.40319934487342834, "learning_rate": 9.734196210545079e-06, "loss": 0.4804, "step": 699 }, { "epoch": 0.5826859045504994, "grad_norm": 0.4926762878894806, "learning_rate": 9.732635270244745e-06, "loss": 0.455, "step": 700 }, { "epoch": 0.5835183129855716, "grad_norm": 0.41983771324157715, "learning_rate": 9.731069885917029e-06, "loss": 0.4748, "step": 701 }, { "epoch": 0.5843507214206437, "grad_norm": 0.4828875958919525, "learning_rate": 9.729500059031851e-06, "loss": 0.4664, "step": 702 }, { "epoch": 0.5851831298557159, "grad_norm": 0.4084136188030243, "learning_rate": 9.727925791063306e-06, "loss": 0.4576, "step": 703 }, { "epoch": 0.586015538290788, "grad_norm": 0.5071380138397217, "learning_rate": 9.726347083489661e-06, "loss": 0.4702, "step": 704 }, { "epoch": 0.5868479467258602, "grad_norm": 0.4290992021560669, "learning_rate": 9.724763937793352e-06, "loss": 0.4669, "step": 705 }, { "epoch": 0.5876803551609323, "grad_norm": 0.46184346079826355, "learning_rate": 9.723176355460978e-06, "loss": 0.4622, "step": 706 }, { "epoch": 0.5885127635960045, "grad_norm": 0.5367664694786072, "learning_rate": 9.721584337983303e-06, "loss": 0.5039, "step": 707 }, { "epoch": 0.5893451720310766, "grad_norm": 0.47268781065940857, "learning_rate": 9.719987886855264e-06, "loss": 0.4887, "step": 708 }, { "epoch": 0.5901775804661488, "grad_norm": 0.5064681768417358, "learning_rate": 9.718387003575957e-06, "loss": 0.4529, "step": 709 }, { "epoch": 0.5910099889012209, "grad_norm": 0.4127980172634125, "learning_rate": 9.716781689648638e-06, "loss": 0.4661, "step": 710 }, { "epoch": 0.591842397336293, "grad_norm": 0.525039553642273, "learning_rate": 9.715171946580724e-06, "loss": 0.4509, "step": 711 }, { "epoch": 0.5926748057713651, "grad_norm": 0.5468900799751282, "learning_rate": 9.713557775883793e-06, "loss": 0.4854, "step": 712 }, { "epoch": 0.5935072142064373, "grad_norm": 0.4610214829444885, "learning_rate": 9.71193917907358e-06, "loss": 0.4783, "step": 713 }, { "epoch": 0.5943396226415094, "grad_norm": 0.5212530493736267, "learning_rate": 9.710316157669972e-06, "loss": 0.4461, "step": 714 }, { "epoch": 0.5951720310765816, "grad_norm": 0.4037787616252899, "learning_rate": 9.708688713197021e-06, "loss": 0.4931, "step": 715 }, { "epoch": 0.5960044395116537, "grad_norm": 0.4619676172733307, "learning_rate": 9.707056847182921e-06, "loss": 0.4709, "step": 716 }, { "epoch": 0.5968368479467259, "grad_norm": 0.4411291778087616, "learning_rate": 9.705420561160024e-06, "loss": 0.4923, "step": 717 }, { "epoch": 0.597669256381798, "grad_norm": 0.45065465569496155, "learning_rate": 9.703779856664833e-06, "loss": 0.457, "step": 718 }, { "epoch": 0.5985016648168702, "grad_norm": 0.4192379415035248, "learning_rate": 9.702134735237994e-06, "loss": 0.4815, "step": 719 }, { "epoch": 0.5993340732519423, "grad_norm": 0.5066999793052673, "learning_rate": 9.700485198424307e-06, "loss": 0.4763, "step": 720 }, { "epoch": 0.6001664816870145, "grad_norm": 0.4297652840614319, "learning_rate": 9.69883124777272e-06, "loss": 0.4545, "step": 721 }, { "epoch": 0.6009988901220865, "grad_norm": 0.4559658169746399, "learning_rate": 9.697172884836315e-06, "loss": 0.4581, "step": 722 }, { "epoch": 0.6018312985571587, "grad_norm": 0.41913849115371704, "learning_rate": 9.695510111172329e-06, "loss": 0.443, "step": 723 }, { "epoch": 0.6026637069922308, "grad_norm": 0.40670254826545715, "learning_rate": 9.693842928342132e-06, "loss": 0.4689, "step": 724 }, { "epoch": 0.603496115427303, "grad_norm": 0.43783038854599, "learning_rate": 9.69217133791124e-06, "loss": 0.4633, "step": 725 }, { "epoch": 0.6043285238623751, "grad_norm": 0.4118957221508026, "learning_rate": 9.690495341449304e-06, "loss": 0.4812, "step": 726 }, { "epoch": 0.6051609322974473, "grad_norm": 0.41395294666290283, "learning_rate": 9.688814940530115e-06, "loss": 0.4975, "step": 727 }, { "epoch": 0.6059933407325194, "grad_norm": 0.485917866230011, "learning_rate": 9.6871301367316e-06, "loss": 0.5011, "step": 728 }, { "epoch": 0.6068257491675916, "grad_norm": 0.39293450117111206, "learning_rate": 9.68544093163582e-06, "loss": 0.4541, "step": 729 }, { "epoch": 0.6076581576026637, "grad_norm": 0.47208675742149353, "learning_rate": 9.683747326828962e-06, "loss": 0.4707, "step": 730 }, { "epoch": 0.6084905660377359, "grad_norm": 0.41672036051750183, "learning_rate": 9.682049323901358e-06, "loss": 0.4689, "step": 731 }, { "epoch": 0.609322974472808, "grad_norm": 0.45332202315330505, "learning_rate": 9.680346924447458e-06, "loss": 0.5, "step": 732 }, { "epoch": 0.6101553829078802, "grad_norm": 0.4699179530143738, "learning_rate": 9.678640130065846e-06, "loss": 0.4914, "step": 733 }, { "epoch": 0.6109877913429522, "grad_norm": 0.42764753103256226, "learning_rate": 9.676928942359233e-06, "loss": 0.4713, "step": 734 }, { "epoch": 0.6118201997780244, "grad_norm": 0.4510103464126587, "learning_rate": 9.675213362934454e-06, "loss": 0.4422, "step": 735 }, { "epoch": 0.6126526082130965, "grad_norm": 0.44094187021255493, "learning_rate": 9.673493393402466e-06, "loss": 0.478, "step": 736 }, { "epoch": 0.6134850166481687, "grad_norm": 0.4860018491744995, "learning_rate": 9.671769035378352e-06, "loss": 0.4597, "step": 737 }, { "epoch": 0.6143174250832408, "grad_norm": 0.4602179229259491, "learning_rate": 9.670040290481315e-06, "loss": 0.477, "step": 738 }, { "epoch": 0.615149833518313, "grad_norm": 0.4050056040287018, "learning_rate": 9.668307160334676e-06, "loss": 0.4669, "step": 739 }, { "epoch": 0.6159822419533851, "grad_norm": 0.49610546231269836, "learning_rate": 9.666569646565875e-06, "loss": 0.4783, "step": 740 }, { "epoch": 0.6168146503884573, "grad_norm": 0.37179750204086304, "learning_rate": 9.664827750806465e-06, "loss": 0.4661, "step": 741 }, { "epoch": 0.6176470588235294, "grad_norm": 0.42984944581985474, "learning_rate": 9.663081474692123e-06, "loss": 0.4516, "step": 742 }, { "epoch": 0.6184794672586016, "grad_norm": 0.4149248003959656, "learning_rate": 9.661330819862626e-06, "loss": 0.4644, "step": 743 }, { "epoch": 0.6193118756936737, "grad_norm": 0.4002021551132202, "learning_rate": 9.659575787961872e-06, "loss": 0.4571, "step": 744 }, { "epoch": 0.6201442841287459, "grad_norm": 0.3851153552532196, "learning_rate": 9.657816380637868e-06, "loss": 0.4761, "step": 745 }, { "epoch": 0.6209766925638179, "grad_norm": 0.4163179397583008, "learning_rate": 9.656052599542728e-06, "loss": 0.4712, "step": 746 }, { "epoch": 0.6218091009988901, "grad_norm": 0.41883882880210876, "learning_rate": 9.654284446332673e-06, "loss": 0.4481, "step": 747 }, { "epoch": 0.6226415094339622, "grad_norm": 0.35763731598854065, "learning_rate": 9.652511922668029e-06, "loss": 0.4512, "step": 748 }, { "epoch": 0.6234739178690344, "grad_norm": 0.4117819368839264, "learning_rate": 9.650735030213228e-06, "loss": 0.483, "step": 749 }, { "epoch": 0.6243063263041065, "grad_norm": 0.4240819811820984, "learning_rate": 9.648953770636801e-06, "loss": 0.476, "step": 750 }, { "epoch": 0.6251387347391787, "grad_norm": 0.3860422372817993, "learning_rate": 9.647168145611385e-06, "loss": 0.4808, "step": 751 }, { "epoch": 0.6259711431742508, "grad_norm": 0.39864394068717957, "learning_rate": 9.645378156813709e-06, "loss": 0.4881, "step": 752 }, { "epoch": 0.626803551609323, "grad_norm": 0.4221175014972687, "learning_rate": 9.643583805924608e-06, "loss": 0.4726, "step": 753 }, { "epoch": 0.6276359600443951, "grad_norm": 0.40564703941345215, "learning_rate": 9.641785094629008e-06, "loss": 0.4507, "step": 754 }, { "epoch": 0.6284683684794673, "grad_norm": 0.44357532262802124, "learning_rate": 9.639982024615928e-06, "loss": 0.4721, "step": 755 }, { "epoch": 0.6293007769145395, "grad_norm": 0.4528225064277649, "learning_rate": 9.638174597578486e-06, "loss": 0.4774, "step": 756 }, { "epoch": 0.6301331853496115, "grad_norm": 0.41333818435668945, "learning_rate": 9.636362815213884e-06, "loss": 0.4824, "step": 757 }, { "epoch": 0.6309655937846836, "grad_norm": 0.41459783911705017, "learning_rate": 9.63454667922342e-06, "loss": 0.4808, "step": 758 }, { "epoch": 0.6317980022197558, "grad_norm": 0.4365558624267578, "learning_rate": 9.632726191312475e-06, "loss": 0.4583, "step": 759 }, { "epoch": 0.632630410654828, "grad_norm": 0.4275365471839905, "learning_rate": 9.630901353190522e-06, "loss": 0.4966, "step": 760 }, { "epoch": 0.6334628190899001, "grad_norm": 0.4615970253944397, "learning_rate": 9.629072166571114e-06, "loss": 0.4945, "step": 761 }, { "epoch": 0.6342952275249722, "grad_norm": 0.43543118238449097, "learning_rate": 9.627238633171889e-06, "loss": 0.4534, "step": 762 }, { "epoch": 0.6351276359600444, "grad_norm": 0.37240052223205566, "learning_rate": 9.625400754714568e-06, "loss": 0.443, "step": 763 }, { "epoch": 0.6359600443951166, "grad_norm": 0.47188568115234375, "learning_rate": 9.623558532924952e-06, "loss": 0.4559, "step": 764 }, { "epoch": 0.6367924528301887, "grad_norm": 0.38750484585762024, "learning_rate": 9.621711969532917e-06, "loss": 0.4805, "step": 765 }, { "epoch": 0.6376248612652609, "grad_norm": 0.40850406885147095, "learning_rate": 9.61986106627242e-06, "loss": 0.4768, "step": 766 }, { "epoch": 0.638457269700333, "grad_norm": 0.38356107473373413, "learning_rate": 9.618005824881491e-06, "loss": 0.4671, "step": 767 }, { "epoch": 0.6392896781354052, "grad_norm": 0.4014689028263092, "learning_rate": 9.616146247102233e-06, "loss": 0.4405, "step": 768 }, { "epoch": 0.6401220865704772, "grad_norm": 0.3854086995124817, "learning_rate": 9.614282334680827e-06, "loss": 0.4562, "step": 769 }, { "epoch": 0.6409544950055494, "grad_norm": 0.4454103708267212, "learning_rate": 9.612414089367512e-06, "loss": 0.4494, "step": 770 }, { "epoch": 0.6417869034406215, "grad_norm": 0.4228101968765259, "learning_rate": 9.61054151291661e-06, "loss": 0.4675, "step": 771 }, { "epoch": 0.6426193118756937, "grad_norm": 0.4053763151168823, "learning_rate": 9.608664607086497e-06, "loss": 0.4618, "step": 772 }, { "epoch": 0.6434517203107658, "grad_norm": 0.4087989926338196, "learning_rate": 9.606783373639626e-06, "loss": 0.4618, "step": 773 }, { "epoch": 0.644284128745838, "grad_norm": 0.4213400185108185, "learning_rate": 9.604897814342504e-06, "loss": 0.4909, "step": 774 }, { "epoch": 0.6451165371809101, "grad_norm": 0.4600432217121124, "learning_rate": 9.603007930965706e-06, "loss": 0.5026, "step": 775 }, { "epoch": 0.6459489456159823, "grad_norm": 0.3727787435054779, "learning_rate": 9.601113725283864e-06, "loss": 0.4797, "step": 776 }, { "epoch": 0.6467813540510544, "grad_norm": 0.4441661834716797, "learning_rate": 9.599215199075674e-06, "loss": 0.4748, "step": 777 }, { "epoch": 0.6476137624861266, "grad_norm": 0.386455237865448, "learning_rate": 9.597312354123882e-06, "loss": 0.4657, "step": 778 }, { "epoch": 0.6484461709211987, "grad_norm": 0.4307407736778259, "learning_rate": 9.595405192215293e-06, "loss": 0.4416, "step": 779 }, { "epoch": 0.6492785793562708, "grad_norm": 0.3854341208934784, "learning_rate": 9.593493715140767e-06, "loss": 0.4849, "step": 780 }, { "epoch": 0.6501109877913429, "grad_norm": 0.38782092928886414, "learning_rate": 9.591577924695213e-06, "loss": 0.4506, "step": 781 }, { "epoch": 0.6509433962264151, "grad_norm": 0.4059392809867859, "learning_rate": 9.589657822677592e-06, "loss": 0.4686, "step": 782 }, { "epoch": 0.6517758046614872, "grad_norm": 0.39608749747276306, "learning_rate": 9.587733410890916e-06, "loss": 0.4789, "step": 783 }, { "epoch": 0.6526082130965594, "grad_norm": 0.40410831570625305, "learning_rate": 9.585804691142237e-06, "loss": 0.4462, "step": 784 }, { "epoch": 0.6534406215316315, "grad_norm": 0.40202003717422485, "learning_rate": 9.583871665242659e-06, "loss": 0.4329, "step": 785 }, { "epoch": 0.6542730299667037, "grad_norm": 0.42759740352630615, "learning_rate": 9.581934335007326e-06, "loss": 0.4415, "step": 786 }, { "epoch": 0.6551054384017758, "grad_norm": 0.44305333495140076, "learning_rate": 9.579992702255428e-06, "loss": 0.4938, "step": 787 }, { "epoch": 0.655937846836848, "grad_norm": 0.427234947681427, "learning_rate": 9.57804676881019e-06, "loss": 0.4579, "step": 788 }, { "epoch": 0.6567702552719201, "grad_norm": 0.45137763023376465, "learning_rate": 9.576096536498875e-06, "loss": 0.493, "step": 789 }, { "epoch": 0.6576026637069923, "grad_norm": 0.47418224811553955, "learning_rate": 9.574142007152789e-06, "loss": 0.4789, "step": 790 }, { "epoch": 0.6584350721420644, "grad_norm": 0.4159716069698334, "learning_rate": 9.572183182607269e-06, "loss": 0.4555, "step": 791 }, { "epoch": 0.6592674805771365, "grad_norm": 0.40891221165657043, "learning_rate": 9.570220064701686e-06, "loss": 0.4561, "step": 792 }, { "epoch": 0.6600998890122086, "grad_norm": 0.4291469156742096, "learning_rate": 9.568252655279438e-06, "loss": 0.4844, "step": 793 }, { "epoch": 0.6609322974472808, "grad_norm": 0.41984859108924866, "learning_rate": 9.566280956187961e-06, "loss": 0.4797, "step": 794 }, { "epoch": 0.6617647058823529, "grad_norm": 0.43832656741142273, "learning_rate": 9.564304969278714e-06, "loss": 0.4559, "step": 795 }, { "epoch": 0.6625971143174251, "grad_norm": 0.42272278666496277, "learning_rate": 9.562324696407181e-06, "loss": 0.4788, "step": 796 }, { "epoch": 0.6634295227524972, "grad_norm": 0.4183189868927002, "learning_rate": 9.560340139432877e-06, "loss": 0.4894, "step": 797 }, { "epoch": 0.6642619311875694, "grad_norm": 0.3984436094760895, "learning_rate": 9.558351300219335e-06, "loss": 0.4917, "step": 798 }, { "epoch": 0.6650943396226415, "grad_norm": 0.37821829319000244, "learning_rate": 9.556358180634105e-06, "loss": 0.4486, "step": 799 }, { "epoch": 0.6659267480577137, "grad_norm": 0.40212714672088623, "learning_rate": 9.554360782548766e-06, "loss": 0.4569, "step": 800 }, { "epoch": 0.6667591564927858, "grad_norm": 0.421878457069397, "learning_rate": 9.55235910783891e-06, "loss": 0.4667, "step": 801 }, { "epoch": 0.667591564927858, "grad_norm": 0.4031965732574463, "learning_rate": 9.550353158384142e-06, "loss": 0.4908, "step": 802 }, { "epoch": 0.66842397336293, "grad_norm": 0.3970634937286377, "learning_rate": 9.548342936068085e-06, "loss": 0.4644, "step": 803 }, { "epoch": 0.6692563817980022, "grad_norm": 0.437465637922287, "learning_rate": 9.54632844277837e-06, "loss": 0.4628, "step": 804 }, { "epoch": 0.6700887902330743, "grad_norm": 0.3883856236934662, "learning_rate": 9.544309680406648e-06, "loss": 0.4712, "step": 805 }, { "epoch": 0.6709211986681465, "grad_norm": 0.40419673919677734, "learning_rate": 9.542286650848567e-06, "loss": 0.4819, "step": 806 }, { "epoch": 0.6717536071032186, "grad_norm": 0.4148818254470825, "learning_rate": 9.540259356003787e-06, "loss": 0.4714, "step": 807 }, { "epoch": 0.6725860155382908, "grad_norm": 0.4469767212867737, "learning_rate": 9.538227797775976e-06, "loss": 0.4601, "step": 808 }, { "epoch": 0.6734184239733629, "grad_norm": 0.37725868821144104, "learning_rate": 9.536191978072802e-06, "loss": 0.4671, "step": 809 }, { "epoch": 0.6742508324084351, "grad_norm": 0.4010414183139801, "learning_rate": 9.534151898805934e-06, "loss": 0.4641, "step": 810 }, { "epoch": 0.6750832408435072, "grad_norm": 0.37603193521499634, "learning_rate": 9.532107561891044e-06, "loss": 0.4483, "step": 811 }, { "epoch": 0.6759156492785794, "grad_norm": 0.4010283350944519, "learning_rate": 9.5300589692478e-06, "loss": 0.4779, "step": 812 }, { "epoch": 0.6767480577136515, "grad_norm": 0.3795294463634491, "learning_rate": 9.528006122799864e-06, "loss": 0.4757, "step": 813 }, { "epoch": 0.6775804661487237, "grad_norm": 0.42904961109161377, "learning_rate": 9.525949024474897e-06, "loss": 0.4592, "step": 814 }, { "epoch": 0.6784128745837957, "grad_norm": 0.392860472202301, "learning_rate": 9.52388767620455e-06, "loss": 0.477, "step": 815 }, { "epoch": 0.6792452830188679, "grad_norm": 0.4124400019645691, "learning_rate": 9.521822079924465e-06, "loss": 0.4737, "step": 816 }, { "epoch": 0.68007769145394, "grad_norm": 0.4388231039047241, "learning_rate": 9.519752237574273e-06, "loss": 0.4866, "step": 817 }, { "epoch": 0.6809100998890122, "grad_norm": 0.372005820274353, "learning_rate": 9.517678151097591e-06, "loss": 0.4705, "step": 818 }, { "epoch": 0.6817425083240843, "grad_norm": 0.4569007158279419, "learning_rate": 9.515599822442025e-06, "loss": 0.4756, "step": 819 }, { "epoch": 0.6825749167591565, "grad_norm": 0.4299950897693634, "learning_rate": 9.51351725355916e-06, "loss": 0.4807, "step": 820 }, { "epoch": 0.6834073251942286, "grad_norm": 0.3495566248893738, "learning_rate": 9.511430446404566e-06, "loss": 0.4593, "step": 821 }, { "epoch": 0.6842397336293008, "grad_norm": 0.43988698720932007, "learning_rate": 9.50933940293779e-06, "loss": 0.4946, "step": 822 }, { "epoch": 0.685072142064373, "grad_norm": 0.4119141399860382, "learning_rate": 9.507244125122358e-06, "loss": 0.4565, "step": 823 }, { "epoch": 0.6859045504994451, "grad_norm": 0.40885767340660095, "learning_rate": 9.505144614925776e-06, "loss": 0.4624, "step": 824 }, { "epoch": 0.6867369589345172, "grad_norm": 0.4196932017803192, "learning_rate": 9.503040874319519e-06, "loss": 0.4623, "step": 825 }, { "epoch": 0.6875693673695893, "grad_norm": 0.40707823634147644, "learning_rate": 9.500932905279034e-06, "loss": 0.4807, "step": 826 }, { "epoch": 0.6884017758046614, "grad_norm": 0.43840739130973816, "learning_rate": 9.498820709783743e-06, "loss": 0.4698, "step": 827 }, { "epoch": 0.6892341842397336, "grad_norm": 0.3804190158843994, "learning_rate": 9.496704289817035e-06, "loss": 0.4404, "step": 828 }, { "epoch": 0.6900665926748057, "grad_norm": 0.44267645478248596, "learning_rate": 9.494583647366264e-06, "loss": 0.4905, "step": 829 }, { "epoch": 0.6908990011098779, "grad_norm": 0.41382384300231934, "learning_rate": 9.492458784422751e-06, "loss": 0.4689, "step": 830 }, { "epoch": 0.69173140954495, "grad_norm": 0.42739763855934143, "learning_rate": 9.49032970298178e-06, "loss": 0.4782, "step": 831 }, { "epoch": 0.6925638179800222, "grad_norm": 0.4234493672847748, "learning_rate": 9.488196405042596e-06, "loss": 0.4639, "step": 832 }, { "epoch": 0.6933962264150944, "grad_norm": 0.3597807288169861, "learning_rate": 9.486058892608401e-06, "loss": 0.4541, "step": 833 }, { "epoch": 0.6942286348501665, "grad_norm": 0.38074445724487305, "learning_rate": 9.483917167686358e-06, "loss": 0.4801, "step": 834 }, { "epoch": 0.6950610432852387, "grad_norm": 0.38729429244995117, "learning_rate": 9.481771232287585e-06, "loss": 0.482, "step": 835 }, { "epoch": 0.6958934517203108, "grad_norm": 0.45910367369651794, "learning_rate": 9.479621088427152e-06, "loss": 0.4814, "step": 836 }, { "epoch": 0.696725860155383, "grad_norm": 0.4074380397796631, "learning_rate": 9.47746673812408e-06, "loss": 0.4817, "step": 837 }, { "epoch": 0.697558268590455, "grad_norm": 0.4241724908351898, "learning_rate": 9.475308183401347e-06, "loss": 0.4727, "step": 838 }, { "epoch": 0.6983906770255272, "grad_norm": 0.38416868448257446, "learning_rate": 9.473145426285869e-06, "loss": 0.456, "step": 839 }, { "epoch": 0.6992230854605993, "grad_norm": 0.3833445608615875, "learning_rate": 9.470978468808514e-06, "loss": 0.4428, "step": 840 }, { "epoch": 0.7000554938956715, "grad_norm": 0.39522784948349, "learning_rate": 9.46880731300409e-06, "loss": 0.435, "step": 841 }, { "epoch": 0.7008879023307436, "grad_norm": 0.37949830293655396, "learning_rate": 9.466631960911358e-06, "loss": 0.4574, "step": 842 }, { "epoch": 0.7017203107658158, "grad_norm": 0.3668852746486664, "learning_rate": 9.464452414573004e-06, "loss": 0.4629, "step": 843 }, { "epoch": 0.7025527192008879, "grad_norm": 0.4042564332485199, "learning_rate": 9.462268676035664e-06, "loss": 0.4774, "step": 844 }, { "epoch": 0.7033851276359601, "grad_norm": 0.4077381491661072, "learning_rate": 9.460080747349907e-06, "loss": 0.4776, "step": 845 }, { "epoch": 0.7042175360710322, "grad_norm": 0.38546618819236755, "learning_rate": 9.457888630570234e-06, "loss": 0.4547, "step": 846 }, { "epoch": 0.7050499445061044, "grad_norm": 0.3926650583744049, "learning_rate": 9.455692327755082e-06, "loss": 0.4741, "step": 847 }, { "epoch": 0.7058823529411765, "grad_norm": 0.4238012135028839, "learning_rate": 9.45349184096682e-06, "loss": 0.4618, "step": 848 }, { "epoch": 0.7067147613762487, "grad_norm": 0.42878350615501404, "learning_rate": 9.451287172271741e-06, "loss": 0.4404, "step": 849 }, { "epoch": 0.7075471698113207, "grad_norm": 0.4180905222892761, "learning_rate": 9.449078323740066e-06, "loss": 0.4795, "step": 850 }, { "epoch": 0.7083795782463929, "grad_norm": 0.40476009249687195, "learning_rate": 9.446865297445947e-06, "loss": 0.459, "step": 851 }, { "epoch": 0.709211986681465, "grad_norm": 0.3711640238761902, "learning_rate": 9.444648095467453e-06, "loss": 0.4731, "step": 852 }, { "epoch": 0.7100443951165372, "grad_norm": 0.3749478757381439, "learning_rate": 9.442426719886572e-06, "loss": 0.4579, "step": 853 }, { "epoch": 0.7108768035516093, "grad_norm": 0.381773442029953, "learning_rate": 9.440201172789218e-06, "loss": 0.4455, "step": 854 }, { "epoch": 0.7117092119866815, "grad_norm": 0.4290565848350525, "learning_rate": 9.437971456265218e-06, "loss": 0.4741, "step": 855 }, { "epoch": 0.7125416204217536, "grad_norm": 0.3964015245437622, "learning_rate": 9.435737572408316e-06, "loss": 0.4771, "step": 856 }, { "epoch": 0.7133740288568258, "grad_norm": 0.39722374081611633, "learning_rate": 9.433499523316165e-06, "loss": 0.4639, "step": 857 }, { "epoch": 0.7142064372918979, "grad_norm": 0.39079853892326355, "learning_rate": 9.431257311090336e-06, "loss": 0.438, "step": 858 }, { "epoch": 0.7150388457269701, "grad_norm": 0.3988889157772064, "learning_rate": 9.429010937836302e-06, "loss": 0.4471, "step": 859 }, { "epoch": 0.7158712541620422, "grad_norm": 0.39545944333076477, "learning_rate": 9.426760405663448e-06, "loss": 0.4542, "step": 860 }, { "epoch": 0.7167036625971143, "grad_norm": 0.39783865213394165, "learning_rate": 9.424505716685064e-06, "loss": 0.4667, "step": 861 }, { "epoch": 0.7175360710321864, "grad_norm": 0.38103538751602173, "learning_rate": 9.422246873018343e-06, "loss": 0.4689, "step": 862 }, { "epoch": 0.7183684794672586, "grad_norm": 0.3842844069004059, "learning_rate": 9.419983876784378e-06, "loss": 0.4659, "step": 863 }, { "epoch": 0.7192008879023307, "grad_norm": 0.36254891753196716, "learning_rate": 9.41771673010816e-06, "loss": 0.437, "step": 864 }, { "epoch": 0.7200332963374029, "grad_norm": 0.411409467458725, "learning_rate": 9.415445435118581e-06, "loss": 0.4671, "step": 865 }, { "epoch": 0.720865704772475, "grad_norm": 0.4189411401748657, "learning_rate": 9.41316999394843e-06, "loss": 0.507, "step": 866 }, { "epoch": 0.7216981132075472, "grad_norm": 0.448301762342453, "learning_rate": 9.410890408734381e-06, "loss": 0.4789, "step": 867 }, { "epoch": 0.7225305216426193, "grad_norm": 0.39381828904151917, "learning_rate": 9.408606681617006e-06, "loss": 0.4514, "step": 868 }, { "epoch": 0.7233629300776915, "grad_norm": 0.42399051785469055, "learning_rate": 9.406318814740767e-06, "loss": 0.4513, "step": 869 }, { "epoch": 0.7241953385127636, "grad_norm": 0.39758798480033875, "learning_rate": 9.404026810254007e-06, "loss": 0.4539, "step": 870 }, { "epoch": 0.7250277469478358, "grad_norm": 0.4223097562789917, "learning_rate": 9.401730670308963e-06, "loss": 0.4824, "step": 871 }, { "epoch": 0.7258601553829079, "grad_norm": 0.42844802141189575, "learning_rate": 9.399430397061746e-06, "loss": 0.4759, "step": 872 }, { "epoch": 0.72669256381798, "grad_norm": 0.4464619755744934, "learning_rate": 9.397125992672358e-06, "loss": 0.4437, "step": 873 }, { "epoch": 0.7275249722530521, "grad_norm": 0.3978061079978943, "learning_rate": 9.394817459304671e-06, "loss": 0.4828, "step": 874 }, { "epoch": 0.7283573806881243, "grad_norm": 0.4394519329071045, "learning_rate": 9.392504799126439e-06, "loss": 0.4746, "step": 875 }, { "epoch": 0.7291897891231964, "grad_norm": 0.40214264392852783, "learning_rate": 9.39018801430929e-06, "loss": 0.4673, "step": 876 }, { "epoch": 0.7300221975582686, "grad_norm": 0.40048447251319885, "learning_rate": 9.387867107028727e-06, "loss": 0.4793, "step": 877 }, { "epoch": 0.7308546059933407, "grad_norm": 0.4119110703468323, "learning_rate": 9.385542079464123e-06, "loss": 0.4615, "step": 878 }, { "epoch": 0.7316870144284129, "grad_norm": 0.39967912435531616, "learning_rate": 9.383212933798718e-06, "loss": 0.4664, "step": 879 }, { "epoch": 0.732519422863485, "grad_norm": 0.39798152446746826, "learning_rate": 9.38087967221962e-06, "loss": 0.4575, "step": 880 }, { "epoch": 0.7333518312985572, "grad_norm": 0.38400161266326904, "learning_rate": 9.378542296917804e-06, "loss": 0.456, "step": 881 }, { "epoch": 0.7341842397336293, "grad_norm": 0.4055246412754059, "learning_rate": 9.376200810088108e-06, "loss": 0.4613, "step": 882 }, { "epoch": 0.7350166481687015, "grad_norm": 0.3930290639400482, "learning_rate": 9.373855213929227e-06, "loss": 0.4699, "step": 883 }, { "epoch": 0.7358490566037735, "grad_norm": 0.39786094427108765, "learning_rate": 9.371505510643714e-06, "loss": 0.483, "step": 884 }, { "epoch": 0.7366814650388457, "grad_norm": 0.3793211579322815, "learning_rate": 9.369151702437987e-06, "loss": 0.4762, "step": 885 }, { "epoch": 0.7375138734739178, "grad_norm": 0.38085439801216125, "learning_rate": 9.366793791522308e-06, "loss": 0.4535, "step": 886 }, { "epoch": 0.73834628190899, "grad_norm": 0.371055543422699, "learning_rate": 9.364431780110801e-06, "loss": 0.4722, "step": 887 }, { "epoch": 0.7391786903440621, "grad_norm": 0.3615517318248749, "learning_rate": 9.362065670421434e-06, "loss": 0.4177, "step": 888 }, { "epoch": 0.7400110987791343, "grad_norm": 0.3889777660369873, "learning_rate": 9.359695464676025e-06, "loss": 0.455, "step": 889 }, { "epoch": 0.7408435072142064, "grad_norm": 0.4410359561443329, "learning_rate": 9.35732116510024e-06, "loss": 0.4966, "step": 890 }, { "epoch": 0.7416759156492786, "grad_norm": 0.42662596702575684, "learning_rate": 9.354942773923588e-06, "loss": 0.46, "step": 891 }, { "epoch": 0.7425083240843507, "grad_norm": 0.394815593957901, "learning_rate": 9.352560293379417e-06, "loss": 0.4762, "step": 892 }, { "epoch": 0.7433407325194229, "grad_norm": 0.39383867383003235, "learning_rate": 9.350173725704922e-06, "loss": 0.4519, "step": 893 }, { "epoch": 0.744173140954495, "grad_norm": 0.40083327889442444, "learning_rate": 9.34778307314113e-06, "loss": 0.4345, "step": 894 }, { "epoch": 0.7450055493895672, "grad_norm": 0.386772096157074, "learning_rate": 9.345388337932906e-06, "loss": 0.4519, "step": 895 }, { "epoch": 0.7458379578246392, "grad_norm": 0.4634164273738861, "learning_rate": 9.342989522328947e-06, "loss": 0.4256, "step": 896 }, { "epoch": 0.7466703662597114, "grad_norm": 0.4075722396373749, "learning_rate": 9.340586628581783e-06, "loss": 0.4548, "step": 897 }, { "epoch": 0.7475027746947835, "grad_norm": 0.4286724925041199, "learning_rate": 9.338179658947774e-06, "loss": 0.4737, "step": 898 }, { "epoch": 0.7483351831298557, "grad_norm": 0.43397256731987, "learning_rate": 9.335768615687108e-06, "loss": 0.4543, "step": 899 }, { "epoch": 0.7491675915649278, "grad_norm": 0.40175217390060425, "learning_rate": 9.333353501063796e-06, "loss": 0.4702, "step": 900 }, { "epoch": 0.75, "grad_norm": 0.3902556300163269, "learning_rate": 9.330934317345673e-06, "loss": 0.4734, "step": 901 }, { "epoch": 0.7508324084350722, "grad_norm": 0.40193432569503784, "learning_rate": 9.328511066804391e-06, "loss": 0.4382, "step": 902 }, { "epoch": 0.7516648168701443, "grad_norm": 0.4484616219997406, "learning_rate": 9.32608375171543e-06, "loss": 0.4704, "step": 903 }, { "epoch": 0.7524972253052165, "grad_norm": 0.39739924669265747, "learning_rate": 9.32365237435808e-06, "loss": 0.4643, "step": 904 }, { "epoch": 0.7533296337402886, "grad_norm": 0.42331647872924805, "learning_rate": 9.321216937015446e-06, "loss": 0.4584, "step": 905 }, { "epoch": 0.7541620421753608, "grad_norm": 0.3581715524196625, "learning_rate": 9.318777441974446e-06, "loss": 0.467, "step": 906 }, { "epoch": 0.7549944506104328, "grad_norm": 0.40118566155433655, "learning_rate": 9.316333891525809e-06, "loss": 0.443, "step": 907 }, { "epoch": 0.755826859045505, "grad_norm": 0.4858897626399994, "learning_rate": 9.313886287964072e-06, "loss": 0.4666, "step": 908 }, { "epoch": 0.7566592674805771, "grad_norm": 0.43074485659599304, "learning_rate": 9.311434633587577e-06, "loss": 0.4605, "step": 909 }, { "epoch": 0.7574916759156493, "grad_norm": 0.4585159718990326, "learning_rate": 9.308978930698472e-06, "loss": 0.4605, "step": 910 }, { "epoch": 0.7583240843507214, "grad_norm": 0.433880478143692, "learning_rate": 9.306519181602704e-06, "loss": 0.4644, "step": 911 }, { "epoch": 0.7591564927857936, "grad_norm": 0.38200658559799194, "learning_rate": 9.304055388610019e-06, "loss": 0.4427, "step": 912 }, { "epoch": 0.7599889012208657, "grad_norm": 0.49386703968048096, "learning_rate": 9.301587554033965e-06, "loss": 0.4637, "step": 913 }, { "epoch": 0.7608213096559379, "grad_norm": 0.4780905544757843, "learning_rate": 9.299115680191876e-06, "loss": 0.4648, "step": 914 }, { "epoch": 0.76165371809101, "grad_norm": 0.4210283160209656, "learning_rate": 9.296639769404892e-06, "loss": 0.4691, "step": 915 }, { "epoch": 0.7624861265260822, "grad_norm": 0.4278091788291931, "learning_rate": 9.294159823997933e-06, "loss": 0.4551, "step": 916 }, { "epoch": 0.7633185349611543, "grad_norm": 0.39724352955818176, "learning_rate": 9.291675846299711e-06, "loss": 0.4963, "step": 917 }, { "epoch": 0.7641509433962265, "grad_norm": 0.4462328851222992, "learning_rate": 9.289187838642724e-06, "loss": 0.4781, "step": 918 }, { "epoch": 0.7649833518312985, "grad_norm": 0.3846145570278168, "learning_rate": 9.286695803363257e-06, "loss": 0.442, "step": 919 }, { "epoch": 0.7658157602663707, "grad_norm": 0.3947022557258606, "learning_rate": 9.284199742801373e-06, "loss": 0.4804, "step": 920 }, { "epoch": 0.7666481687014428, "grad_norm": 0.4000520408153534, "learning_rate": 9.281699659300917e-06, "loss": 0.5051, "step": 921 }, { "epoch": 0.767480577136515, "grad_norm": 0.4070630669593811, "learning_rate": 9.279195555209513e-06, "loss": 0.4547, "step": 922 }, { "epoch": 0.7683129855715871, "grad_norm": 0.4421361982822418, "learning_rate": 9.276687432878554e-06, "loss": 0.4619, "step": 923 }, { "epoch": 0.7691453940066593, "grad_norm": 0.404674232006073, "learning_rate": 9.274175294663215e-06, "loss": 0.462, "step": 924 }, { "epoch": 0.7699778024417314, "grad_norm": 0.4667954444885254, "learning_rate": 9.271659142922438e-06, "loss": 0.4739, "step": 925 }, { "epoch": 0.7708102108768036, "grad_norm": 0.3848412036895752, "learning_rate": 9.26913898001893e-06, "loss": 0.489, "step": 926 }, { "epoch": 0.7716426193118757, "grad_norm": 0.4155692458152771, "learning_rate": 9.26661480831917e-06, "loss": 0.4522, "step": 927 }, { "epoch": 0.7724750277469479, "grad_norm": 0.4222828447818756, "learning_rate": 9.2640866301934e-06, "loss": 0.4756, "step": 928 }, { "epoch": 0.77330743618202, "grad_norm": 0.3686828911304474, "learning_rate": 9.261554448015625e-06, "loss": 0.4513, "step": 929 }, { "epoch": 0.7741398446170921, "grad_norm": 0.4023358225822449, "learning_rate": 9.259018264163604e-06, "loss": 0.4447, "step": 930 }, { "epoch": 0.7749722530521642, "grad_norm": 0.46268701553344727, "learning_rate": 9.25647808101886e-06, "loss": 0.4696, "step": 931 }, { "epoch": 0.7758046614872364, "grad_norm": 0.39935389161109924, "learning_rate": 9.253933900966672e-06, "loss": 0.4549, "step": 932 }, { "epoch": 0.7766370699223085, "grad_norm": 0.41673651337623596, "learning_rate": 9.251385726396065e-06, "loss": 0.4756, "step": 933 }, { "epoch": 0.7774694783573807, "grad_norm": 0.47298237681388855, "learning_rate": 9.248833559699824e-06, "loss": 0.4617, "step": 934 }, { "epoch": 0.7783018867924528, "grad_norm": 0.3737104833126068, "learning_rate": 9.246277403274475e-06, "loss": 0.437, "step": 935 }, { "epoch": 0.779134295227525, "grad_norm": 0.47945085167884827, "learning_rate": 9.243717259520296e-06, "loss": 0.4657, "step": 936 }, { "epoch": 0.7799667036625971, "grad_norm": 0.3635734021663666, "learning_rate": 9.241153130841305e-06, "loss": 0.4205, "step": 937 }, { "epoch": 0.7807991120976693, "grad_norm": 0.4091576933860779, "learning_rate": 9.238585019645265e-06, "loss": 0.4579, "step": 938 }, { "epoch": 0.7816315205327414, "grad_norm": 0.4312977194786072, "learning_rate": 9.236012928343676e-06, "loss": 0.4557, "step": 939 }, { "epoch": 0.7824639289678136, "grad_norm": 0.44428691267967224, "learning_rate": 9.233436859351778e-06, "loss": 0.4538, "step": 940 }, { "epoch": 0.7832963374028857, "grad_norm": 0.43025487661361694, "learning_rate": 9.230856815088546e-06, "loss": 0.4668, "step": 941 }, { "epoch": 0.7841287458379578, "grad_norm": 0.42932865023612976, "learning_rate": 9.228272797976685e-06, "loss": 0.4588, "step": 942 }, { "epoch": 0.7849611542730299, "grad_norm": 0.45379704236984253, "learning_rate": 9.22568481044263e-06, "loss": 0.4248, "step": 943 }, { "epoch": 0.7857935627081021, "grad_norm": 0.5083206295967102, "learning_rate": 9.223092854916552e-06, "loss": 0.4797, "step": 944 }, { "epoch": 0.7866259711431742, "grad_norm": 0.47013911604881287, "learning_rate": 9.220496933832338e-06, "loss": 0.4839, "step": 945 }, { "epoch": 0.7874583795782464, "grad_norm": 0.38621267676353455, "learning_rate": 9.217897049627605e-06, "loss": 0.4352, "step": 946 }, { "epoch": 0.7882907880133185, "grad_norm": 0.48977726697921753, "learning_rate": 9.21529320474369e-06, "loss": 0.4596, "step": 947 }, { "epoch": 0.7891231964483907, "grad_norm": 0.4230138063430786, "learning_rate": 9.212685401625649e-06, "loss": 0.4623, "step": 948 }, { "epoch": 0.7899556048834628, "grad_norm": 0.41531112790107727, "learning_rate": 9.210073642722256e-06, "loss": 0.4596, "step": 949 }, { "epoch": 0.790788013318535, "grad_norm": 0.4705289602279663, "learning_rate": 9.207457930485996e-06, "loss": 0.4578, "step": 950 }, { "epoch": 0.7916204217536071, "grad_norm": 0.4125988781452179, "learning_rate": 9.20483826737307e-06, "loss": 0.4498, "step": 951 }, { "epoch": 0.7924528301886793, "grad_norm": 0.42193499207496643, "learning_rate": 9.202214655843386e-06, "loss": 0.447, "step": 952 }, { "epoch": 0.7932852386237513, "grad_norm": 0.3790290057659149, "learning_rate": 9.199587098360563e-06, "loss": 0.4602, "step": 953 }, { "epoch": 0.7941176470588235, "grad_norm": 0.37495550513267517, "learning_rate": 9.196955597391923e-06, "loss": 0.4458, "step": 954 }, { "epoch": 0.7949500554938956, "grad_norm": 0.4026005268096924, "learning_rate": 9.19432015540849e-06, "loss": 0.4445, "step": 955 }, { "epoch": 0.7957824639289678, "grad_norm": 0.40387800335884094, "learning_rate": 9.191680774884992e-06, "loss": 0.4688, "step": 956 }, { "epoch": 0.7966148723640399, "grad_norm": 0.3699960708618164, "learning_rate": 9.189037458299854e-06, "loss": 0.4725, "step": 957 }, { "epoch": 0.7974472807991121, "grad_norm": 0.41380584239959717, "learning_rate": 9.186390208135194e-06, "loss": 0.4589, "step": 958 }, { "epoch": 0.7982796892341842, "grad_norm": 0.41292324662208557, "learning_rate": 9.18373902687683e-06, "loss": 0.4501, "step": 959 }, { "epoch": 0.7991120976692564, "grad_norm": 0.3478350341320038, "learning_rate": 9.181083917014262e-06, "loss": 0.4391, "step": 960 }, { "epoch": 0.7999445061043285, "grad_norm": 0.468945175409317, "learning_rate": 9.17842488104069e-06, "loss": 0.475, "step": 961 }, { "epoch": 0.8007769145394007, "grad_norm": 0.40263715386390686, "learning_rate": 9.175761921452992e-06, "loss": 0.4416, "step": 962 }, { "epoch": 0.8016093229744728, "grad_norm": 0.4122201204299927, "learning_rate": 9.173095040751738e-06, "loss": 0.4474, "step": 963 }, { "epoch": 0.802441731409545, "grad_norm": 0.3991578221321106, "learning_rate": 9.17042424144117e-06, "loss": 0.4571, "step": 964 }, { "epoch": 0.803274139844617, "grad_norm": 0.39560526609420776, "learning_rate": 9.16774952602922e-06, "loss": 0.4917, "step": 965 }, { "epoch": 0.8041065482796892, "grad_norm": 0.40626710653305054, "learning_rate": 9.165070897027487e-06, "loss": 0.4676, "step": 966 }, { "epoch": 0.8049389567147613, "grad_norm": 0.4202491343021393, "learning_rate": 9.162388356951257e-06, "loss": 0.454, "step": 967 }, { "epoch": 0.8057713651498335, "grad_norm": 0.35746338963508606, "learning_rate": 9.15970190831948e-06, "loss": 0.4639, "step": 968 }, { "epoch": 0.8066037735849056, "grad_norm": 0.3897421956062317, "learning_rate": 9.157011553654776e-06, "loss": 0.4548, "step": 969 }, { "epoch": 0.8074361820199778, "grad_norm": 0.3858961760997772, "learning_rate": 9.154317295483437e-06, "loss": 0.4629, "step": 970 }, { "epoch": 0.80826859045505, "grad_norm": 0.4071405827999115, "learning_rate": 9.151619136335419e-06, "loss": 0.4685, "step": 971 }, { "epoch": 0.8091009988901221, "grad_norm": 0.41344863176345825, "learning_rate": 9.14891707874434e-06, "loss": 0.4563, "step": 972 }, { "epoch": 0.8099334073251943, "grad_norm": 0.41553568840026855, "learning_rate": 9.146211125247478e-06, "loss": 0.4347, "step": 973 }, { "epoch": 0.8107658157602664, "grad_norm": 0.44281312823295593, "learning_rate": 9.143501278385773e-06, "loss": 0.4563, "step": 974 }, { "epoch": 0.8115982241953386, "grad_norm": 0.45483851432800293, "learning_rate": 9.140787540703817e-06, "loss": 0.4658, "step": 975 }, { "epoch": 0.8124306326304107, "grad_norm": 0.382942795753479, "learning_rate": 9.138069914749859e-06, "loss": 0.4486, "step": 976 }, { "epoch": 0.8132630410654828, "grad_norm": 0.42402827739715576, "learning_rate": 9.135348403075795e-06, "loss": 0.4853, "step": 977 }, { "epoch": 0.8140954495005549, "grad_norm": 0.3964520990848541, "learning_rate": 9.132623008237174e-06, "loss": 0.4955, "step": 978 }, { "epoch": 0.814927857935627, "grad_norm": 0.5112728476524353, "learning_rate": 9.12989373279319e-06, "loss": 0.4493, "step": 979 }, { "epoch": 0.8157602663706992, "grad_norm": 0.4075373113155365, "learning_rate": 9.127160579306678e-06, "loss": 0.4551, "step": 980 }, { "epoch": 0.8165926748057714, "grad_norm": 0.4562476873397827, "learning_rate": 9.124423550344118e-06, "loss": 0.4479, "step": 981 }, { "epoch": 0.8174250832408435, "grad_norm": 0.4151056706905365, "learning_rate": 9.12168264847563e-06, "loss": 0.4721, "step": 982 }, { "epoch": 0.8182574916759157, "grad_norm": 0.3973326086997986, "learning_rate": 9.118937876274965e-06, "loss": 0.4544, "step": 983 }, { "epoch": 0.8190899001109878, "grad_norm": 0.4236065745353699, "learning_rate": 9.116189236319515e-06, "loss": 0.4596, "step": 984 }, { "epoch": 0.81992230854606, "grad_norm": 0.3826847970485687, "learning_rate": 9.113436731190302e-06, "loss": 0.4686, "step": 985 }, { "epoch": 0.8207547169811321, "grad_norm": 0.34965115785598755, "learning_rate": 9.110680363471973e-06, "loss": 0.4528, "step": 986 }, { "epoch": 0.8215871254162043, "grad_norm": 0.40207812190055847, "learning_rate": 9.10792013575281e-06, "loss": 0.4559, "step": 987 }, { "epoch": 0.8224195338512763, "grad_norm": 0.4016001522541046, "learning_rate": 9.10515605062471e-06, "loss": 0.4553, "step": 988 }, { "epoch": 0.8232519422863485, "grad_norm": 0.44705718755722046, "learning_rate": 9.102388110683201e-06, "loss": 0.4915, "step": 989 }, { "epoch": 0.8240843507214206, "grad_norm": 0.43472737073898315, "learning_rate": 9.099616318527426e-06, "loss": 0.4828, "step": 990 }, { "epoch": 0.8249167591564928, "grad_norm": 0.42259252071380615, "learning_rate": 9.096840676760146e-06, "loss": 0.4427, "step": 991 }, { "epoch": 0.8257491675915649, "grad_norm": 0.44350579380989075, "learning_rate": 9.09406118798774e-06, "loss": 0.4859, "step": 992 }, { "epoch": 0.8265815760266371, "grad_norm": 0.39259374141693115, "learning_rate": 9.091277854820191e-06, "loss": 0.4398, "step": 993 }, { "epoch": 0.8274139844617092, "grad_norm": 0.3800427317619324, "learning_rate": 9.088490679871102e-06, "loss": 0.4363, "step": 994 }, { "epoch": 0.8282463928967814, "grad_norm": 0.4360864758491516, "learning_rate": 9.085699665757679e-06, "loss": 0.4651, "step": 995 }, { "epoch": 0.8290788013318535, "grad_norm": 0.39429229497909546, "learning_rate": 9.082904815100732e-06, "loss": 0.4669, "step": 996 }, { "epoch": 0.8299112097669257, "grad_norm": 0.4399055242538452, "learning_rate": 9.080106130524675e-06, "loss": 0.4396, "step": 997 }, { "epoch": 0.8307436182019978, "grad_norm": 0.4866039454936981, "learning_rate": 9.07730361465752e-06, "loss": 0.4543, "step": 998 }, { "epoch": 0.83157602663707, "grad_norm": 0.4812076687812805, "learning_rate": 9.07449727013088e-06, "loss": 0.4731, "step": 999 }, { "epoch": 0.832408435072142, "grad_norm": 0.43150269985198975, "learning_rate": 9.071687099579962e-06, "loss": 0.4774, "step": 1000 }, { "epoch": 0.8332408435072142, "grad_norm": 0.3944185674190521, "learning_rate": 9.068873105643565e-06, "loss": 0.4399, "step": 1001 }, { "epoch": 0.8340732519422863, "grad_norm": 0.5351603031158447, "learning_rate": 9.066055290964079e-06, "loss": 0.4518, "step": 1002 }, { "epoch": 0.8349056603773585, "grad_norm": 0.38343068957328796, "learning_rate": 9.063233658187482e-06, "loss": 0.4843, "step": 1003 }, { "epoch": 0.8357380688124306, "grad_norm": 0.4362078607082367, "learning_rate": 9.060408209963334e-06, "loss": 0.4642, "step": 1004 }, { "epoch": 0.8365704772475028, "grad_norm": 0.4815027117729187, "learning_rate": 9.057578948944783e-06, "loss": 0.4497, "step": 1005 }, { "epoch": 0.8374028856825749, "grad_norm": 0.4579710066318512, "learning_rate": 9.054745877788554e-06, "loss": 0.4475, "step": 1006 }, { "epoch": 0.8382352941176471, "grad_norm": 0.49727219343185425, "learning_rate": 9.051908999154948e-06, "loss": 0.4707, "step": 1007 }, { "epoch": 0.8390677025527192, "grad_norm": 0.43605858087539673, "learning_rate": 9.049068315707847e-06, "loss": 0.4365, "step": 1008 }, { "epoch": 0.8399001109877914, "grad_norm": 0.5100065469741821, "learning_rate": 9.0462238301147e-06, "loss": 0.4332, "step": 1009 }, { "epoch": 0.8407325194228635, "grad_norm": 0.38812533020973206, "learning_rate": 9.04337554504653e-06, "loss": 0.4384, "step": 1010 }, { "epoch": 0.8415649278579356, "grad_norm": 0.384883314371109, "learning_rate": 9.040523463177928e-06, "loss": 0.4663, "step": 1011 }, { "epoch": 0.8423973362930077, "grad_norm": 0.44887587428092957, "learning_rate": 9.037667587187045e-06, "loss": 0.486, "step": 1012 }, { "epoch": 0.8432297447280799, "grad_norm": 0.46977436542510986, "learning_rate": 9.034807919755602e-06, "loss": 0.4575, "step": 1013 }, { "epoch": 0.844062153163152, "grad_norm": 0.3983987867832184, "learning_rate": 9.031944463568877e-06, "loss": 0.4532, "step": 1014 }, { "epoch": 0.8448945615982242, "grad_norm": 0.4556075632572174, "learning_rate": 9.029077221315703e-06, "loss": 0.4685, "step": 1015 }, { "epoch": 0.8457269700332963, "grad_norm": 0.38745394349098206, "learning_rate": 9.026206195688472e-06, "loss": 0.4608, "step": 1016 }, { "epoch": 0.8465593784683685, "grad_norm": 0.4005940556526184, "learning_rate": 9.023331389383126e-06, "loss": 0.4628, "step": 1017 }, { "epoch": 0.8473917869034406, "grad_norm": 0.39503729343414307, "learning_rate": 9.02045280509916e-06, "loss": 0.4688, "step": 1018 }, { "epoch": 0.8482241953385128, "grad_norm": 0.4228137731552124, "learning_rate": 9.017570445539616e-06, "loss": 0.4594, "step": 1019 }, { "epoch": 0.8490566037735849, "grad_norm": 0.36277511715888977, "learning_rate": 9.014684313411077e-06, "loss": 0.4274, "step": 1020 }, { "epoch": 0.8498890122086571, "grad_norm": 0.3664073050022125, "learning_rate": 9.011794411423675e-06, "loss": 0.4592, "step": 1021 }, { "epoch": 0.8507214206437292, "grad_norm": 0.3959209620952606, "learning_rate": 9.008900742291075e-06, "loss": 0.4583, "step": 1022 }, { "epoch": 0.8515538290788013, "grad_norm": 0.3981209099292755, "learning_rate": 9.006003308730487e-06, "loss": 0.4543, "step": 1023 }, { "epoch": 0.8523862375138734, "grad_norm": 0.33361658453941345, "learning_rate": 9.003102113462647e-06, "loss": 0.4526, "step": 1024 }, { "epoch": 0.8532186459489456, "grad_norm": 0.37563925981521606, "learning_rate": 9.000197159211834e-06, "loss": 0.4657, "step": 1025 }, { "epoch": 0.8540510543840177, "grad_norm": 0.36745545268058777, "learning_rate": 8.997288448705846e-06, "loss": 0.4451, "step": 1026 }, { "epoch": 0.8548834628190899, "grad_norm": 0.3501058518886566, "learning_rate": 8.994375984676014e-06, "loss": 0.4413, "step": 1027 }, { "epoch": 0.855715871254162, "grad_norm": 0.38974809646606445, "learning_rate": 8.991459769857195e-06, "loss": 0.4492, "step": 1028 }, { "epoch": 0.8565482796892342, "grad_norm": 0.3752514719963074, "learning_rate": 8.988539806987764e-06, "loss": 0.4533, "step": 1029 }, { "epoch": 0.8573806881243063, "grad_norm": 0.3805655241012573, "learning_rate": 8.985616098809618e-06, "loss": 0.4733, "step": 1030 }, { "epoch": 0.8582130965593785, "grad_norm": 0.46047642827033997, "learning_rate": 8.982688648068169e-06, "loss": 0.4593, "step": 1031 }, { "epoch": 0.8590455049944506, "grad_norm": 0.39984360337257385, "learning_rate": 8.979757457512347e-06, "loss": 0.4749, "step": 1032 }, { "epoch": 0.8598779134295228, "grad_norm": 0.3868159353733063, "learning_rate": 8.976822529894588e-06, "loss": 0.4577, "step": 1033 }, { "epoch": 0.8607103218645948, "grad_norm": 0.41874581575393677, "learning_rate": 8.973883867970844e-06, "loss": 0.4784, "step": 1034 }, { "epoch": 0.861542730299667, "grad_norm": 0.3759986162185669, "learning_rate": 8.970941474500565e-06, "loss": 0.4554, "step": 1035 }, { "epoch": 0.8623751387347391, "grad_norm": 0.4303062856197357, "learning_rate": 8.967995352246714e-06, "loss": 0.4724, "step": 1036 }, { "epoch": 0.8632075471698113, "grad_norm": 0.420803040266037, "learning_rate": 8.965045503975752e-06, "loss": 0.486, "step": 1037 }, { "epoch": 0.8640399556048834, "grad_norm": 0.41059285402297974, "learning_rate": 8.962091932457635e-06, "loss": 0.4549, "step": 1038 }, { "epoch": 0.8648723640399556, "grad_norm": 0.3770391047000885, "learning_rate": 8.959134640465821e-06, "loss": 0.4783, "step": 1039 }, { "epoch": 0.8657047724750278, "grad_norm": 0.5017075538635254, "learning_rate": 8.956173630777255e-06, "loss": 0.4476, "step": 1040 }, { "epoch": 0.8665371809100999, "grad_norm": 0.33851176500320435, "learning_rate": 8.953208906172384e-06, "loss": 0.4677, "step": 1041 }, { "epoch": 0.867369589345172, "grad_norm": 0.40573734045028687, "learning_rate": 8.95024046943513e-06, "loss": 0.4607, "step": 1042 }, { "epoch": 0.8682019977802442, "grad_norm": 0.40829798579216003, "learning_rate": 8.947268323352909e-06, "loss": 0.4613, "step": 1043 }, { "epoch": 0.8690344062153164, "grad_norm": 0.40306246280670166, "learning_rate": 8.944292470716617e-06, "loss": 0.487, "step": 1044 }, { "epoch": 0.8698668146503885, "grad_norm": 0.4743926227092743, "learning_rate": 8.941312914320636e-06, "loss": 0.4623, "step": 1045 }, { "epoch": 0.8706992230854605, "grad_norm": 0.388696551322937, "learning_rate": 8.938329656962818e-06, "loss": 0.4678, "step": 1046 }, { "epoch": 0.8715316315205327, "grad_norm": 0.3969860076904297, "learning_rate": 8.935342701444495e-06, "loss": 0.437, "step": 1047 }, { "epoch": 0.8723640399556049, "grad_norm": 0.3880145251750946, "learning_rate": 8.932352050570467e-06, "loss": 0.4702, "step": 1048 }, { "epoch": 0.873196448390677, "grad_norm": 0.41687145829200745, "learning_rate": 8.929357707149014e-06, "loss": 0.4662, "step": 1049 }, { "epoch": 0.8740288568257492, "grad_norm": 0.3698790967464447, "learning_rate": 8.926359673991874e-06, "loss": 0.4579, "step": 1050 }, { "epoch": 0.8748612652608213, "grad_norm": 0.3788878917694092, "learning_rate": 8.92335795391425e-06, "loss": 0.4726, "step": 1051 }, { "epoch": 0.8756936736958935, "grad_norm": 0.43458738923072815, "learning_rate": 8.920352549734812e-06, "loss": 0.4771, "step": 1052 }, { "epoch": 0.8765260821309656, "grad_norm": 0.3463033139705658, "learning_rate": 8.91734346427569e-06, "loss": 0.4503, "step": 1053 }, { "epoch": 0.8773584905660378, "grad_norm": 0.3873491585254669, "learning_rate": 8.914330700362461e-06, "loss": 0.4513, "step": 1054 }, { "epoch": 0.8781908990011099, "grad_norm": 0.36356833577156067, "learning_rate": 8.91131426082417e-06, "loss": 0.4362, "step": 1055 }, { "epoch": 0.8790233074361821, "grad_norm": 0.3933038115501404, "learning_rate": 8.908294148493303e-06, "loss": 0.4602, "step": 1056 }, { "epoch": 0.8798557158712541, "grad_norm": 0.3321017026901245, "learning_rate": 8.905270366205798e-06, "loss": 0.4457, "step": 1057 }, { "epoch": 0.8806881243063263, "grad_norm": 0.42186474800109863, "learning_rate": 8.902242916801043e-06, "loss": 0.4743, "step": 1058 }, { "epoch": 0.8815205327413984, "grad_norm": 0.3869630694389343, "learning_rate": 8.899211803121861e-06, "loss": 0.432, "step": 1059 }, { "epoch": 0.8823529411764706, "grad_norm": 0.3925645351409912, "learning_rate": 8.896177028014524e-06, "loss": 0.481, "step": 1060 }, { "epoch": 0.8831853496115427, "grad_norm": 0.35652878880500793, "learning_rate": 8.893138594328738e-06, "loss": 0.4576, "step": 1061 }, { "epoch": 0.8840177580466149, "grad_norm": 0.3799140453338623, "learning_rate": 8.890096504917647e-06, "loss": 0.4318, "step": 1062 }, { "epoch": 0.884850166481687, "grad_norm": 0.43312981724739075, "learning_rate": 8.887050762637825e-06, "loss": 0.4647, "step": 1063 }, { "epoch": 0.8856825749167592, "grad_norm": 0.3973630666732788, "learning_rate": 8.884001370349275e-06, "loss": 0.4598, "step": 1064 }, { "epoch": 0.8865149833518313, "grad_norm": 0.3882988393306732, "learning_rate": 8.880948330915435e-06, "loss": 0.4827, "step": 1065 }, { "epoch": 0.8873473917869035, "grad_norm": 0.3990275263786316, "learning_rate": 8.877891647203157e-06, "loss": 0.4571, "step": 1066 }, { "epoch": 0.8881798002219756, "grad_norm": 0.38141873478889465, "learning_rate": 8.874831322082725e-06, "loss": 0.4471, "step": 1067 }, { "epoch": 0.8890122086570478, "grad_norm": 0.3606696128845215, "learning_rate": 8.871767358427835e-06, "loss": 0.4216, "step": 1068 }, { "epoch": 0.8898446170921198, "grad_norm": 0.41676607728004456, "learning_rate": 8.868699759115604e-06, "loss": 0.4574, "step": 1069 }, { "epoch": 0.890677025527192, "grad_norm": 0.38336870074272156, "learning_rate": 8.86562852702656e-06, "loss": 0.4666, "step": 1070 }, { "epoch": 0.8915094339622641, "grad_norm": 0.41286566853523254, "learning_rate": 8.862553665044644e-06, "loss": 0.4788, "step": 1071 }, { "epoch": 0.8923418423973363, "grad_norm": 0.42297109961509705, "learning_rate": 8.859475176057208e-06, "loss": 0.4831, "step": 1072 }, { "epoch": 0.8931742508324084, "grad_norm": 0.41496801376342773, "learning_rate": 8.856393062955003e-06, "loss": 0.4696, "step": 1073 }, { "epoch": 0.8940066592674806, "grad_norm": 0.4308110177516937, "learning_rate": 8.85330732863219e-06, "loss": 0.452, "step": 1074 }, { "epoch": 0.8948390677025527, "grad_norm": 0.4222694933414459, "learning_rate": 8.850217975986326e-06, "loss": 0.4499, "step": 1075 }, { "epoch": 0.8956714761376249, "grad_norm": 0.43070679903030396, "learning_rate": 8.84712500791837e-06, "loss": 0.4717, "step": 1076 }, { "epoch": 0.896503884572697, "grad_norm": 0.3741142153739929, "learning_rate": 8.844028427332667e-06, "loss": 0.4676, "step": 1077 }, { "epoch": 0.8973362930077692, "grad_norm": 0.4449978470802307, "learning_rate": 8.840928237136967e-06, "loss": 0.4547, "step": 1078 }, { "epoch": 0.8981687014428413, "grad_norm": 0.36339613795280457, "learning_rate": 8.837824440242402e-06, "loss": 0.4672, "step": 1079 }, { "epoch": 0.8990011098779135, "grad_norm": 0.385516494512558, "learning_rate": 8.834717039563488e-06, "loss": 0.4796, "step": 1080 }, { "epoch": 0.8998335183129855, "grad_norm": 0.3869156539440155, "learning_rate": 8.83160603801813e-06, "loss": 0.4498, "step": 1081 }, { "epoch": 0.9006659267480577, "grad_norm": 0.3644091784954071, "learning_rate": 8.828491438527614e-06, "loss": 0.4296, "step": 1082 }, { "epoch": 0.9014983351831298, "grad_norm": 0.3746865689754486, "learning_rate": 8.825373244016604e-06, "loss": 0.4595, "step": 1083 }, { "epoch": 0.902330743618202, "grad_norm": 0.36787107586860657, "learning_rate": 8.822251457413138e-06, "loss": 0.4635, "step": 1084 }, { "epoch": 0.9031631520532741, "grad_norm": 0.3880446255207062, "learning_rate": 8.819126081648627e-06, "loss": 0.4619, "step": 1085 }, { "epoch": 0.9039955604883463, "grad_norm": 0.3723534047603607, "learning_rate": 8.815997119657856e-06, "loss": 0.4545, "step": 1086 }, { "epoch": 0.9048279689234184, "grad_norm": 0.3879696726799011, "learning_rate": 8.812864574378974e-06, "loss": 0.474, "step": 1087 }, { "epoch": 0.9056603773584906, "grad_norm": 0.36703255772590637, "learning_rate": 8.809728448753496e-06, "loss": 0.4609, "step": 1088 }, { "epoch": 0.9064927857935627, "grad_norm": 0.3635331094264984, "learning_rate": 8.8065887457263e-06, "loss": 0.4472, "step": 1089 }, { "epoch": 0.9073251942286349, "grad_norm": 0.3628758192062378, "learning_rate": 8.803445468245618e-06, "loss": 0.4376, "step": 1090 }, { "epoch": 0.908157602663707, "grad_norm": 0.41849446296691895, "learning_rate": 8.800298619263047e-06, "loss": 0.4313, "step": 1091 }, { "epoch": 0.9089900110987791, "grad_norm": 0.4343433976173401, "learning_rate": 8.797148201733533e-06, "loss": 0.4294, "step": 1092 }, { "epoch": 0.9098224195338512, "grad_norm": 0.37368395924568176, "learning_rate": 8.793994218615371e-06, "loss": 0.4416, "step": 1093 }, { "epoch": 0.9106548279689234, "grad_norm": 0.4110583961009979, "learning_rate": 8.79083667287021e-06, "loss": 0.4457, "step": 1094 }, { "epoch": 0.9114872364039955, "grad_norm": 0.43879762291908264, "learning_rate": 8.787675567463034e-06, "loss": 0.4837, "step": 1095 }, { "epoch": 0.9123196448390677, "grad_norm": 0.44292664527893066, "learning_rate": 8.784510905362185e-06, "loss": 0.4603, "step": 1096 }, { "epoch": 0.9131520532741398, "grad_norm": 0.4232296943664551, "learning_rate": 8.781342689539329e-06, "loss": 0.4736, "step": 1097 }, { "epoch": 0.913984461709212, "grad_norm": 0.3918737769126892, "learning_rate": 8.778170922969478e-06, "loss": 0.4531, "step": 1098 }, { "epoch": 0.9148168701442841, "grad_norm": 0.4186196029186249, "learning_rate": 8.774995608630979e-06, "loss": 0.417, "step": 1099 }, { "epoch": 0.9156492785793563, "grad_norm": 0.41225314140319824, "learning_rate": 8.771816749505504e-06, "loss": 0.4499, "step": 1100 }, { "epoch": 0.9164816870144284, "grad_norm": 0.4248989522457123, "learning_rate": 8.768634348578062e-06, "loss": 0.4186, "step": 1101 }, { "epoch": 0.9173140954495006, "grad_norm": 0.4327683746814728, "learning_rate": 8.765448408836978e-06, "loss": 0.4625, "step": 1102 }, { "epoch": 0.9181465038845728, "grad_norm": 0.4192464053630829, "learning_rate": 8.762258933273908e-06, "loss": 0.4337, "step": 1103 }, { "epoch": 0.9189789123196448, "grad_norm": 0.4388251304626465, "learning_rate": 8.759065924883827e-06, "loss": 0.4489, "step": 1104 }, { "epoch": 0.9198113207547169, "grad_norm": 0.4132365882396698, "learning_rate": 8.755869386665022e-06, "loss": 0.4482, "step": 1105 }, { "epoch": 0.9206437291897891, "grad_norm": 0.4240826964378357, "learning_rate": 8.7526693216191e-06, "loss": 0.4612, "step": 1106 }, { "epoch": 0.9214761376248612, "grad_norm": 0.4309800863265991, "learning_rate": 8.749465732750982e-06, "loss": 0.4827, "step": 1107 }, { "epoch": 0.9223085460599334, "grad_norm": 0.43188637495040894, "learning_rate": 8.746258623068886e-06, "loss": 0.4666, "step": 1108 }, { "epoch": 0.9231409544950056, "grad_norm": 0.4251968264579773, "learning_rate": 8.74304799558435e-06, "loss": 0.4654, "step": 1109 }, { "epoch": 0.9239733629300777, "grad_norm": 0.4286282956600189, "learning_rate": 8.739833853312208e-06, "loss": 0.4504, "step": 1110 }, { "epoch": 0.9248057713651499, "grad_norm": 0.39924654364585876, "learning_rate": 8.736616199270595e-06, "loss": 0.4432, "step": 1111 }, { "epoch": 0.925638179800222, "grad_norm": 0.44831129908561707, "learning_rate": 8.733395036480946e-06, "loss": 0.4497, "step": 1112 }, { "epoch": 0.9264705882352942, "grad_norm": 0.38468286395072937, "learning_rate": 8.73017036796799e-06, "loss": 0.4564, "step": 1113 }, { "epoch": 0.9273029966703663, "grad_norm": 0.380112886428833, "learning_rate": 8.726942196759744e-06, "loss": 0.4557, "step": 1114 }, { "epoch": 0.9281354051054383, "grad_norm": 0.3891477584838867, "learning_rate": 8.72371052588752e-06, "loss": 0.4478, "step": 1115 }, { "epoch": 0.9289678135405105, "grad_norm": 0.41235268115997314, "learning_rate": 8.720475358385912e-06, "loss": 0.4429, "step": 1116 }, { "epoch": 0.9298002219755827, "grad_norm": 0.3957020938396454, "learning_rate": 8.7172366972928e-06, "loss": 0.4848, "step": 1117 }, { "epoch": 0.9306326304106548, "grad_norm": 0.42292919754981995, "learning_rate": 8.713994545649343e-06, "loss": 0.466, "step": 1118 }, { "epoch": 0.931465038845727, "grad_norm": 0.40453973412513733, "learning_rate": 8.710748906499977e-06, "loss": 0.4424, "step": 1119 }, { "epoch": 0.9322974472807991, "grad_norm": 0.4147048592567444, "learning_rate": 8.707499782892414e-06, "loss": 0.471, "step": 1120 }, { "epoch": 0.9331298557158713, "grad_norm": 0.38894933462142944, "learning_rate": 8.704247177877643e-06, "loss": 0.4822, "step": 1121 }, { "epoch": 0.9339622641509434, "grad_norm": 0.39160868525505066, "learning_rate": 8.700991094509909e-06, "loss": 0.4814, "step": 1122 }, { "epoch": 0.9347946725860156, "grad_norm": 0.362416535615921, "learning_rate": 8.697731535846739e-06, "loss": 0.4605, "step": 1123 }, { "epoch": 0.9356270810210877, "grad_norm": 0.44123584032058716, "learning_rate": 8.69446850494891e-06, "loss": 0.4594, "step": 1124 }, { "epoch": 0.9364594894561599, "grad_norm": 0.3935483992099762, "learning_rate": 8.691202004880468e-06, "loss": 0.4643, "step": 1125 }, { "epoch": 0.937291897891232, "grad_norm": 0.37961041927337646, "learning_rate": 8.687932038708712e-06, "loss": 0.4547, "step": 1126 }, { "epoch": 0.9381243063263041, "grad_norm": 0.4113844037055969, "learning_rate": 8.684658609504199e-06, "loss": 0.4683, "step": 1127 }, { "epoch": 0.9389567147613762, "grad_norm": 0.4119846820831299, "learning_rate": 8.681381720340736e-06, "loss": 0.4563, "step": 1128 }, { "epoch": 0.9397891231964484, "grad_norm": 0.3995191156864166, "learning_rate": 8.67810137429538e-06, "loss": 0.4544, "step": 1129 }, { "epoch": 0.9406215316315205, "grad_norm": 0.403024822473526, "learning_rate": 8.674817574448431e-06, "loss": 0.4732, "step": 1130 }, { "epoch": 0.9414539400665927, "grad_norm": 0.38387593626976013, "learning_rate": 8.671530323883437e-06, "loss": 0.4698, "step": 1131 }, { "epoch": 0.9422863485016648, "grad_norm": 0.3703984022140503, "learning_rate": 8.668239625687183e-06, "loss": 0.4346, "step": 1132 }, { "epoch": 0.943118756936737, "grad_norm": 0.35403862595558167, "learning_rate": 8.664945482949691e-06, "loss": 0.4518, "step": 1133 }, { "epoch": 0.9439511653718091, "grad_norm": 0.3765757381916046, "learning_rate": 8.661647898764221e-06, "loss": 0.4547, "step": 1134 }, { "epoch": 0.9447835738068813, "grad_norm": 0.35943272709846497, "learning_rate": 8.658346876227261e-06, "loss": 0.4333, "step": 1135 }, { "epoch": 0.9456159822419534, "grad_norm": 0.36773252487182617, "learning_rate": 8.655042418438529e-06, "loss": 0.4498, "step": 1136 }, { "epoch": 0.9464483906770256, "grad_norm": 0.3745127022266388, "learning_rate": 8.651734528500968e-06, "loss": 0.461, "step": 1137 }, { "epoch": 0.9472807991120976, "grad_norm": 0.3643935024738312, "learning_rate": 8.648423209520746e-06, "loss": 0.4351, "step": 1138 }, { "epoch": 0.9481132075471698, "grad_norm": 0.4308556616306305, "learning_rate": 8.64510846460725e-06, "loss": 0.4828, "step": 1139 }, { "epoch": 0.9489456159822419, "grad_norm": 0.331564337015152, "learning_rate": 8.641790296873081e-06, "loss": 0.4513, "step": 1140 }, { "epoch": 0.9497780244173141, "grad_norm": 0.3650985658168793, "learning_rate": 8.638468709434057e-06, "loss": 0.4507, "step": 1141 }, { "epoch": 0.9506104328523862, "grad_norm": 0.4312916696071625, "learning_rate": 8.63514370540921e-06, "loss": 0.4469, "step": 1142 }, { "epoch": 0.9514428412874584, "grad_norm": 0.3546101748943329, "learning_rate": 8.631815287920773e-06, "loss": 0.4594, "step": 1143 }, { "epoch": 0.9522752497225305, "grad_norm": 0.3340010643005371, "learning_rate": 8.62848346009419e-06, "loss": 0.4506, "step": 1144 }, { "epoch": 0.9531076581576027, "grad_norm": 0.4434676468372345, "learning_rate": 8.625148225058107e-06, "loss": 0.4825, "step": 1145 }, { "epoch": 0.9539400665926748, "grad_norm": 0.39392799139022827, "learning_rate": 8.621809585944366e-06, "loss": 0.4532, "step": 1146 }, { "epoch": 0.954772475027747, "grad_norm": 0.4085777699947357, "learning_rate": 8.61846754588801e-06, "loss": 0.4852, "step": 1147 }, { "epoch": 0.9556048834628191, "grad_norm": 0.42657431960105896, "learning_rate": 8.61512210802727e-06, "loss": 0.4647, "step": 1148 }, { "epoch": 0.9564372918978913, "grad_norm": 0.35785362124443054, "learning_rate": 8.611773275503572e-06, "loss": 0.4393, "step": 1149 }, { "epoch": 0.9572697003329633, "grad_norm": 0.37917736172676086, "learning_rate": 8.608421051461529e-06, "loss": 0.4496, "step": 1150 }, { "epoch": 0.9581021087680355, "grad_norm": 0.37420985102653503, "learning_rate": 8.605065439048936e-06, "loss": 0.4475, "step": 1151 }, { "epoch": 0.9589345172031076, "grad_norm": 0.38180387020111084, "learning_rate": 8.601706441416776e-06, "loss": 0.4694, "step": 1152 }, { "epoch": 0.9597669256381798, "grad_norm": 0.4098156988620758, "learning_rate": 8.598344061719204e-06, "loss": 0.4602, "step": 1153 }, { "epoch": 0.9605993340732519, "grad_norm": 0.38434961438179016, "learning_rate": 8.594978303113552e-06, "loss": 0.4214, "step": 1154 }, { "epoch": 0.9614317425083241, "grad_norm": 0.37275487184524536, "learning_rate": 8.59160916876033e-06, "loss": 0.4565, "step": 1155 }, { "epoch": 0.9622641509433962, "grad_norm": 0.4662790298461914, "learning_rate": 8.588236661823209e-06, "loss": 0.4807, "step": 1156 }, { "epoch": 0.9630965593784684, "grad_norm": 0.3571013808250427, "learning_rate": 8.584860785469036e-06, "loss": 0.4392, "step": 1157 }, { "epoch": 0.9639289678135405, "grad_norm": 0.3700477182865143, "learning_rate": 8.581481542867818e-06, "loss": 0.4548, "step": 1158 }, { "epoch": 0.9647613762486127, "grad_norm": 0.4246416687965393, "learning_rate": 8.578098937192723e-06, "loss": 0.4503, "step": 1159 }, { "epoch": 0.9655937846836848, "grad_norm": 0.4045102894306183, "learning_rate": 8.574712971620075e-06, "loss": 0.4749, "step": 1160 }, { "epoch": 0.9664261931187569, "grad_norm": 0.35602596402168274, "learning_rate": 8.571323649329352e-06, "loss": 0.4275, "step": 1161 }, { "epoch": 0.967258601553829, "grad_norm": 0.49385765194892883, "learning_rate": 8.567930973503196e-06, "loss": 0.4623, "step": 1162 }, { "epoch": 0.9680910099889012, "grad_norm": 0.4097510576248169, "learning_rate": 8.564534947327381e-06, "loss": 0.46, "step": 1163 }, { "epoch": 0.9689234184239733, "grad_norm": 0.38444530963897705, "learning_rate": 8.561135573990839e-06, "loss": 0.441, "step": 1164 }, { "epoch": 0.9697558268590455, "grad_norm": 0.43890970945358276, "learning_rate": 8.55773285668564e-06, "loss": 0.4766, "step": 1165 }, { "epoch": 0.9705882352941176, "grad_norm": 0.37467607855796814, "learning_rate": 8.554326798606994e-06, "loss": 0.4743, "step": 1166 }, { "epoch": 0.9714206437291898, "grad_norm": 0.3442972004413605, "learning_rate": 8.55091740295325e-06, "loss": 0.4558, "step": 1167 }, { "epoch": 0.9722530521642619, "grad_norm": 0.3897690176963806, "learning_rate": 8.547504672925892e-06, "loss": 0.4647, "step": 1168 }, { "epoch": 0.9730854605993341, "grad_norm": 0.33819976449012756, "learning_rate": 8.544088611729533e-06, "loss": 0.4507, "step": 1169 }, { "epoch": 0.9739178690344062, "grad_norm": 0.3626435399055481, "learning_rate": 8.540669222571911e-06, "loss": 0.4365, "step": 1170 }, { "epoch": 0.9747502774694784, "grad_norm": 0.3824900686740875, "learning_rate": 8.537246508663894e-06, "loss": 0.4761, "step": 1171 }, { "epoch": 0.9755826859045506, "grad_norm": 0.3584446310997009, "learning_rate": 8.533820473219472e-06, "loss": 0.4595, "step": 1172 }, { "epoch": 0.9764150943396226, "grad_norm": 0.39771831035614014, "learning_rate": 8.53039111945575e-06, "loss": 0.4779, "step": 1173 }, { "epoch": 0.9772475027746947, "grad_norm": 0.37545955181121826, "learning_rate": 8.526958450592952e-06, "loss": 0.4357, "step": 1174 }, { "epoch": 0.9780799112097669, "grad_norm": 0.3663276731967926, "learning_rate": 8.523522469854415e-06, "loss": 0.4777, "step": 1175 }, { "epoch": 0.978912319644839, "grad_norm": 0.3252856135368347, "learning_rate": 8.520083180466585e-06, "loss": 0.4317, "step": 1176 }, { "epoch": 0.9797447280799112, "grad_norm": 0.4126368463039398, "learning_rate": 8.516640585659012e-06, "loss": 0.4715, "step": 1177 }, { "epoch": 0.9805771365149833, "grad_norm": 0.35552918910980225, "learning_rate": 8.513194688664356e-06, "loss": 0.4821, "step": 1178 }, { "epoch": 0.9814095449500555, "grad_norm": 0.34943288564682007, "learning_rate": 8.509745492718375e-06, "loss": 0.4405, "step": 1179 }, { "epoch": 0.9822419533851277, "grad_norm": 0.3988153636455536, "learning_rate": 8.506293001059922e-06, "loss": 0.4575, "step": 1180 }, { "epoch": 0.9830743618201998, "grad_norm": 0.3627513647079468, "learning_rate": 8.502837216930947e-06, "loss": 0.4551, "step": 1181 }, { "epoch": 0.983906770255272, "grad_norm": 0.3588287830352783, "learning_rate": 8.499378143576496e-06, "loss": 0.4544, "step": 1182 }, { "epoch": 0.9847391786903441, "grad_norm": 0.3817938566207886, "learning_rate": 8.495915784244694e-06, "loss": 0.458, "step": 1183 }, { "epoch": 0.9855715871254163, "grad_norm": 0.38958173990249634, "learning_rate": 8.49245014218676e-06, "loss": 0.4605, "step": 1184 }, { "epoch": 0.9864039955604883, "grad_norm": 0.3958968222141266, "learning_rate": 8.488981220656993e-06, "loss": 0.4407, "step": 1185 }, { "epoch": 0.9872364039955605, "grad_norm": 0.3889720141887665, "learning_rate": 8.48550902291277e-06, "loss": 0.4434, "step": 1186 }, { "epoch": 0.9880688124306326, "grad_norm": 0.3535591959953308, "learning_rate": 8.482033552214546e-06, "loss": 0.4687, "step": 1187 }, { "epoch": 0.9889012208657048, "grad_norm": 0.44984614849090576, "learning_rate": 8.478554811825846e-06, "loss": 0.4464, "step": 1188 }, { "epoch": 0.9897336293007769, "grad_norm": 0.3912472724914551, "learning_rate": 8.475072805013274e-06, "loss": 0.4488, "step": 1189 }, { "epoch": 0.9905660377358491, "grad_norm": 0.4023473858833313, "learning_rate": 8.471587535046487e-06, "loss": 0.4632, "step": 1190 }, { "epoch": 0.9913984461709212, "grad_norm": 0.3754764795303345, "learning_rate": 8.468099005198224e-06, "loss": 0.4345, "step": 1191 }, { "epoch": 0.9922308546059934, "grad_norm": 0.37986260652542114, "learning_rate": 8.46460721874427e-06, "loss": 0.4315, "step": 1192 }, { "epoch": 0.9930632630410655, "grad_norm": 0.37531065940856934, "learning_rate": 8.461112178963475e-06, "loss": 0.4342, "step": 1193 }, { "epoch": 0.9938956714761377, "grad_norm": 0.4208608865737915, "learning_rate": 8.45761388913774e-06, "loss": 0.4849, "step": 1194 }, { "epoch": 0.9947280799112098, "grad_norm": 0.3419855237007141, "learning_rate": 8.454112352552025e-06, "loss": 0.4543, "step": 1195 }, { "epoch": 0.9955604883462819, "grad_norm": 0.3961000144481659, "learning_rate": 8.450607572494332e-06, "loss": 0.4526, "step": 1196 }, { "epoch": 0.996392896781354, "grad_norm": 0.42482149600982666, "learning_rate": 8.447099552255708e-06, "loss": 0.4498, "step": 1197 }, { "epoch": 0.9972253052164262, "grad_norm": 0.3759855329990387, "learning_rate": 8.44358829513025e-06, "loss": 0.4251, "step": 1198 }, { "epoch": 0.9980577136514983, "grad_norm": 0.3645038306713104, "learning_rate": 8.44007380441509e-06, "loss": 0.4439, "step": 1199 }, { "epoch": 0.9988901220865705, "grad_norm": 0.46094900369644165, "learning_rate": 8.436556083410392e-06, "loss": 0.4726, "step": 1200 }, { "epoch": 0.9997225305216426, "grad_norm": 0.42478302121162415, "learning_rate": 8.433035135419358e-06, "loss": 0.4445, "step": 1201 }, { "epoch": 1.0005549389567148, "grad_norm": 0.7507914900779724, "learning_rate": 8.429510963748224e-06, "loss": 0.7647, "step": 1202 }, { "epoch": 1.001387347391787, "grad_norm": 0.43571215867996216, "learning_rate": 8.425983571706247e-06, "loss": 0.4272, "step": 1203 }, { "epoch": 1.002219755826859, "grad_norm": 0.4088655114173889, "learning_rate": 8.422452962605709e-06, "loss": 0.4526, "step": 1204 }, { "epoch": 1.0030521642619312, "grad_norm": 0.34656473994255066, "learning_rate": 8.418919139761914e-06, "loss": 0.3969, "step": 1205 }, { "epoch": 1.0038845726970034, "grad_norm": 0.4557936489582062, "learning_rate": 8.415382106493183e-06, "loss": 0.45, "step": 1206 }, { "epoch": 1.0047169811320755, "grad_norm": 0.3934694528579712, "learning_rate": 8.411841866120855e-06, "loss": 0.4424, "step": 1207 }, { "epoch": 1.0055493895671477, "grad_norm": 0.410007119178772, "learning_rate": 8.408298421969275e-06, "loss": 0.4463, "step": 1208 }, { "epoch": 1.0063817980022198, "grad_norm": 0.3848797678947449, "learning_rate": 8.4047517773658e-06, "loss": 0.4238, "step": 1209 }, { "epoch": 1.007214206437292, "grad_norm": 0.3674464821815491, "learning_rate": 8.40120193564079e-06, "loss": 0.3869, "step": 1210 }, { "epoch": 1.0080466148723641, "grad_norm": 0.42686253786087036, "learning_rate": 8.39764890012761e-06, "loss": 0.4377, "step": 1211 }, { "epoch": 1.0088790233074363, "grad_norm": 0.4299822151660919, "learning_rate": 8.394092674162625e-06, "loss": 0.4527, "step": 1212 }, { "epoch": 1.0097114317425082, "grad_norm": 0.4144967496395111, "learning_rate": 8.390533261085188e-06, "loss": 0.4261, "step": 1213 }, { "epoch": 1.0105438401775804, "grad_norm": 0.3909236192703247, "learning_rate": 8.386970664237653e-06, "loss": 0.4208, "step": 1214 }, { "epoch": 1.0113762486126525, "grad_norm": 0.404340535402298, "learning_rate": 8.383404886965361e-06, "loss": 0.4307, "step": 1215 }, { "epoch": 1.0122086570477247, "grad_norm": 0.39880749583244324, "learning_rate": 8.37983593261664e-06, "loss": 0.422, "step": 1216 }, { "epoch": 1.0130410654827968, "grad_norm": 0.37474164366722107, "learning_rate": 8.376263804542798e-06, "loss": 0.4333, "step": 1217 }, { "epoch": 1.013873473917869, "grad_norm": 0.39734119176864624, "learning_rate": 8.372688506098128e-06, "loss": 0.4147, "step": 1218 }, { "epoch": 1.0147058823529411, "grad_norm": 0.3793570399284363, "learning_rate": 8.369110040639899e-06, "loss": 0.4257, "step": 1219 }, { "epoch": 1.0155382907880133, "grad_norm": 0.38969290256500244, "learning_rate": 8.365528411528348e-06, "loss": 0.4657, "step": 1220 }, { "epoch": 1.0163706992230854, "grad_norm": 0.33815810084342957, "learning_rate": 8.361943622126694e-06, "loss": 0.3868, "step": 1221 }, { "epoch": 1.0172031076581576, "grad_norm": 0.4408543109893799, "learning_rate": 8.358355675801112e-06, "loss": 0.4481, "step": 1222 }, { "epoch": 1.0180355160932297, "grad_norm": 0.3819434940814972, "learning_rate": 8.354764575920747e-06, "loss": 0.4484, "step": 1223 }, { "epoch": 1.0188679245283019, "grad_norm": 0.3943055272102356, "learning_rate": 8.351170325857705e-06, "loss": 0.4066, "step": 1224 }, { "epoch": 1.019700332963374, "grad_norm": 0.3979198634624481, "learning_rate": 8.347572928987052e-06, "loss": 0.4461, "step": 1225 }, { "epoch": 1.0205327413984462, "grad_norm": 0.3459455370903015, "learning_rate": 8.343972388686806e-06, "loss": 0.3935, "step": 1226 }, { "epoch": 1.0213651498335183, "grad_norm": 0.4697541892528534, "learning_rate": 8.340368708337934e-06, "loss": 0.478, "step": 1227 }, { "epoch": 1.0221975582685905, "grad_norm": 0.392013818025589, "learning_rate": 8.336761891324357e-06, "loss": 0.4317, "step": 1228 }, { "epoch": 1.0230299667036626, "grad_norm": 0.3422892987728119, "learning_rate": 8.333151941032941e-06, "loss": 0.3749, "step": 1229 }, { "epoch": 1.0238623751387348, "grad_norm": 0.4656471312046051, "learning_rate": 8.32953886085349e-06, "loss": 0.4637, "step": 1230 }, { "epoch": 1.024694783573807, "grad_norm": 0.4500276744365692, "learning_rate": 8.325922654178752e-06, "loss": 0.454, "step": 1231 }, { "epoch": 1.025527192008879, "grad_norm": 0.3819878399372101, "learning_rate": 8.322303324404408e-06, "loss": 0.4158, "step": 1232 }, { "epoch": 1.0263596004439512, "grad_norm": 0.4874471426010132, "learning_rate": 8.318680874929068e-06, "loss": 0.4479, "step": 1233 }, { "epoch": 1.0271920088790234, "grad_norm": 0.39757636189460754, "learning_rate": 8.315055309154283e-06, "loss": 0.4527, "step": 1234 }, { "epoch": 1.0280244173140956, "grad_norm": 0.3715905547142029, "learning_rate": 8.311426630484513e-06, "loss": 0.4181, "step": 1235 }, { "epoch": 1.0288568257491675, "grad_norm": 0.44828590750694275, "learning_rate": 8.30779484232716e-06, "loss": 0.4454, "step": 1236 }, { "epoch": 1.0296892341842396, "grad_norm": 0.4298873245716095, "learning_rate": 8.304159948092532e-06, "loss": 0.4382, "step": 1237 }, { "epoch": 1.0305216426193118, "grad_norm": 0.36218270659446716, "learning_rate": 8.30052195119386e-06, "loss": 0.4058, "step": 1238 }, { "epoch": 1.031354051054384, "grad_norm": 0.4292842149734497, "learning_rate": 8.296880855047284e-06, "loss": 0.4512, "step": 1239 }, { "epoch": 1.032186459489456, "grad_norm": 0.3632907569408417, "learning_rate": 8.293236663071859e-06, "loss": 0.4331, "step": 1240 }, { "epoch": 1.0330188679245282, "grad_norm": 0.3891299366950989, "learning_rate": 8.289589378689548e-06, "loss": 0.4313, "step": 1241 }, { "epoch": 1.0338512763596004, "grad_norm": 0.37545377016067505, "learning_rate": 8.28593900532521e-06, "loss": 0.375, "step": 1242 }, { "epoch": 1.0346836847946725, "grad_norm": 0.3906904458999634, "learning_rate": 8.28228554640661e-06, "loss": 0.4785, "step": 1243 }, { "epoch": 1.0355160932297447, "grad_norm": 0.3426535725593567, "learning_rate": 8.278629005364412e-06, "loss": 0.387, "step": 1244 }, { "epoch": 1.0363485016648168, "grad_norm": 0.4440067410469055, "learning_rate": 8.274969385632173e-06, "loss": 0.4773, "step": 1245 }, { "epoch": 1.037180910099889, "grad_norm": 0.36117398738861084, "learning_rate": 8.271306690646336e-06, "loss": 0.4375, "step": 1246 }, { "epoch": 1.0380133185349611, "grad_norm": 0.4358084499835968, "learning_rate": 8.267640923846242e-06, "loss": 0.4346, "step": 1247 }, { "epoch": 1.0388457269700333, "grad_norm": 0.3763819932937622, "learning_rate": 8.263972088674103e-06, "loss": 0.4179, "step": 1248 }, { "epoch": 1.0396781354051055, "grad_norm": 0.3596190810203552, "learning_rate": 8.260300188575024e-06, "loss": 0.4451, "step": 1249 }, { "epoch": 1.0405105438401776, "grad_norm": 0.42740023136138916, "learning_rate": 8.256625226996981e-06, "loss": 0.4607, "step": 1250 }, { "epoch": 1.0413429522752498, "grad_norm": 0.3612176775932312, "learning_rate": 8.252947207390832e-06, "loss": 0.4038, "step": 1251 }, { "epoch": 1.042175360710322, "grad_norm": 0.3865748941898346, "learning_rate": 8.249266133210296e-06, "loss": 0.4395, "step": 1252 }, { "epoch": 1.043007769145394, "grad_norm": 0.37287238240242004, "learning_rate": 8.245582007911967e-06, "loss": 0.4055, "step": 1253 }, { "epoch": 1.0438401775804662, "grad_norm": 0.4149041771888733, "learning_rate": 8.241894834955306e-06, "loss": 0.421, "step": 1254 }, { "epoch": 1.0446725860155384, "grad_norm": 0.42680850625038147, "learning_rate": 8.238204617802633e-06, "loss": 0.4522, "step": 1255 }, { "epoch": 1.0455049944506105, "grad_norm": 0.3849968910217285, "learning_rate": 8.234511359919125e-06, "loss": 0.3983, "step": 1256 }, { "epoch": 1.0463374028856827, "grad_norm": 0.405513197183609, "learning_rate": 8.230815064772815e-06, "loss": 0.4165, "step": 1257 }, { "epoch": 1.0471698113207548, "grad_norm": 0.4122266471385956, "learning_rate": 8.22711573583459e-06, "loss": 0.4478, "step": 1258 }, { "epoch": 1.0480022197558267, "grad_norm": 0.3673398196697235, "learning_rate": 8.223413376578182e-06, "loss": 0.4134, "step": 1259 }, { "epoch": 1.048834628190899, "grad_norm": 0.40100428462028503, "learning_rate": 8.219707990480177e-06, "loss": 0.4296, "step": 1260 }, { "epoch": 1.049667036625971, "grad_norm": 0.34723445773124695, "learning_rate": 8.215999581019993e-06, "loss": 0.4036, "step": 1261 }, { "epoch": 1.0504994450610432, "grad_norm": 0.4086048901081085, "learning_rate": 8.212288151679892e-06, "loss": 0.4462, "step": 1262 }, { "epoch": 1.0513318534961154, "grad_norm": 0.37759000062942505, "learning_rate": 8.208573705944972e-06, "loss": 0.4191, "step": 1263 }, { "epoch": 1.0521642619311875, "grad_norm": 0.43619635701179504, "learning_rate": 8.204856247303163e-06, "loss": 0.4364, "step": 1264 }, { "epoch": 1.0529966703662597, "grad_norm": 0.395222544670105, "learning_rate": 8.201135779245222e-06, "loss": 0.4511, "step": 1265 }, { "epoch": 1.0538290788013318, "grad_norm": 0.4762343466281891, "learning_rate": 8.197412305264735e-06, "loss": 0.4423, "step": 1266 }, { "epoch": 1.054661487236404, "grad_norm": 0.4282004237174988, "learning_rate": 8.193685828858109e-06, "loss": 0.4506, "step": 1267 }, { "epoch": 1.0554938956714761, "grad_norm": 0.35303211212158203, "learning_rate": 8.189956353524568e-06, "loss": 0.4602, "step": 1268 }, { "epoch": 1.0563263041065483, "grad_norm": 0.3769274652004242, "learning_rate": 8.18622388276616e-06, "loss": 0.4354, "step": 1269 }, { "epoch": 1.0571587125416204, "grad_norm": 0.41085049510002136, "learning_rate": 8.182488420087737e-06, "loss": 0.4079, "step": 1270 }, { "epoch": 1.0579911209766926, "grad_norm": 0.40207546949386597, "learning_rate": 8.178749968996965e-06, "loss": 0.4262, "step": 1271 }, { "epoch": 1.0588235294117647, "grad_norm": 0.4222968518733978, "learning_rate": 8.175008533004312e-06, "loss": 0.4536, "step": 1272 }, { "epoch": 1.0596559378468369, "grad_norm": 0.4397706091403961, "learning_rate": 8.171264115623056e-06, "loss": 0.4143, "step": 1273 }, { "epoch": 1.060488346281909, "grad_norm": 0.37260058522224426, "learning_rate": 8.167516720369268e-06, "loss": 0.3961, "step": 1274 }, { "epoch": 1.0613207547169812, "grad_norm": 0.4061874449253082, "learning_rate": 8.163766350761819e-06, "loss": 0.4376, "step": 1275 }, { "epoch": 1.0621531631520533, "grad_norm": 0.4455544054508209, "learning_rate": 8.160013010322372e-06, "loss": 0.4101, "step": 1276 }, { "epoch": 1.0629855715871255, "grad_norm": 0.36185914278030396, "learning_rate": 8.156256702575378e-06, "loss": 0.4202, "step": 1277 }, { "epoch": 1.0638179800221976, "grad_norm": 0.4335435628890991, "learning_rate": 8.152497431048076e-06, "loss": 0.4351, "step": 1278 }, { "epoch": 1.0646503884572698, "grad_norm": 0.3470878303050995, "learning_rate": 8.148735199270487e-06, "loss": 0.3819, "step": 1279 }, { "epoch": 1.065482796892342, "grad_norm": 0.3797052800655365, "learning_rate": 8.144970010775417e-06, "loss": 0.4304, "step": 1280 }, { "epoch": 1.066315205327414, "grad_norm": 0.44937318563461304, "learning_rate": 8.141201869098439e-06, "loss": 0.4613, "step": 1281 }, { "epoch": 1.067147613762486, "grad_norm": 0.3713797628879547, "learning_rate": 8.137430777777904e-06, "loss": 0.4366, "step": 1282 }, { "epoch": 1.0679800221975582, "grad_norm": 0.3958394527435303, "learning_rate": 8.133656740354936e-06, "loss": 0.4576, "step": 1283 }, { "epoch": 1.0688124306326303, "grad_norm": 0.4216252267360687, "learning_rate": 8.129879760373419e-06, "loss": 0.4375, "step": 1284 }, { "epoch": 1.0696448390677025, "grad_norm": 0.3756641745567322, "learning_rate": 8.126099841380008e-06, "loss": 0.442, "step": 1285 }, { "epoch": 1.0704772475027746, "grad_norm": 0.35558274388313293, "learning_rate": 8.122316986924108e-06, "loss": 0.4089, "step": 1286 }, { "epoch": 1.0713096559378468, "grad_norm": 0.4253517985343933, "learning_rate": 8.118531200557888e-06, "loss": 0.479, "step": 1287 }, { "epoch": 1.072142064372919, "grad_norm": 0.3373982012271881, "learning_rate": 8.114742485836267e-06, "loss": 0.404, "step": 1288 }, { "epoch": 1.072974472807991, "grad_norm": 0.41005459427833557, "learning_rate": 8.110950846316915e-06, "loss": 0.4544, "step": 1289 }, { "epoch": 1.0738068812430632, "grad_norm": 0.3480115532875061, "learning_rate": 8.107156285560249e-06, "loss": 0.4235, "step": 1290 }, { "epoch": 1.0746392896781354, "grad_norm": 0.373367041349411, "learning_rate": 8.103358807129424e-06, "loss": 0.4552, "step": 1291 }, { "epoch": 1.0754716981132075, "grad_norm": 0.3816337585449219, "learning_rate": 8.099558414590343e-06, "loss": 0.4019, "step": 1292 }, { "epoch": 1.0763041065482797, "grad_norm": 0.3785970211029053, "learning_rate": 8.09575511151164e-06, "loss": 0.4361, "step": 1293 }, { "epoch": 1.0771365149833518, "grad_norm": 0.37990981340408325, "learning_rate": 8.091948901464683e-06, "loss": 0.4375, "step": 1294 }, { "epoch": 1.077968923418424, "grad_norm": 0.36004897952079773, "learning_rate": 8.088139788023568e-06, "loss": 0.4331, "step": 1295 }, { "epoch": 1.0788013318534961, "grad_norm": 0.38282451033592224, "learning_rate": 8.084327774765121e-06, "loss": 0.4659, "step": 1296 }, { "epoch": 1.0796337402885683, "grad_norm": 0.3787173330783844, "learning_rate": 8.08051286526889e-06, "loss": 0.4041, "step": 1297 }, { "epoch": 1.0804661487236404, "grad_norm": 0.366184264421463, "learning_rate": 8.076695063117141e-06, "loss": 0.4211, "step": 1298 }, { "epoch": 1.0812985571587126, "grad_norm": 0.3390844166278839, "learning_rate": 8.072874371894856e-06, "loss": 0.4288, "step": 1299 }, { "epoch": 1.0821309655937847, "grad_norm": 0.35803496837615967, "learning_rate": 8.069050795189732e-06, "loss": 0.4241, "step": 1300 }, { "epoch": 1.082963374028857, "grad_norm": 0.3761143982410431, "learning_rate": 8.065224336592175e-06, "loss": 0.4112, "step": 1301 }, { "epoch": 1.083795782463929, "grad_norm": 0.3917039930820465, "learning_rate": 8.061394999695295e-06, "loss": 0.398, "step": 1302 }, { "epoch": 1.0846281908990012, "grad_norm": 0.35649728775024414, "learning_rate": 8.057562788094909e-06, "loss": 0.4079, "step": 1303 }, { "epoch": 1.0854605993340734, "grad_norm": 0.4020186960697174, "learning_rate": 8.053727705389527e-06, "loss": 0.421, "step": 1304 }, { "epoch": 1.0862930077691453, "grad_norm": 0.3835377097129822, "learning_rate": 8.049889755180363e-06, "loss": 0.4015, "step": 1305 }, { "epoch": 1.0871254162042177, "grad_norm": 0.3527611196041107, "learning_rate": 8.046048941071316e-06, "loss": 0.4202, "step": 1306 }, { "epoch": 1.0879578246392896, "grad_norm": 0.41720572113990784, "learning_rate": 8.042205266668982e-06, "loss": 0.494, "step": 1307 }, { "epoch": 1.0887902330743617, "grad_norm": 0.4160442054271698, "learning_rate": 8.038358735582632e-06, "loss": 0.4654, "step": 1308 }, { "epoch": 1.0896226415094339, "grad_norm": 0.3363052010536194, "learning_rate": 8.034509351424231e-06, "loss": 0.3651, "step": 1309 }, { "epoch": 1.090455049944506, "grad_norm": 0.45190951228141785, "learning_rate": 8.030657117808415e-06, "loss": 0.4048, "step": 1310 }, { "epoch": 1.0912874583795782, "grad_norm": 0.3866722881793976, "learning_rate": 8.026802038352503e-06, "loss": 0.4191, "step": 1311 }, { "epoch": 1.0921198668146503, "grad_norm": 0.4028158187866211, "learning_rate": 8.02294411667648e-06, "loss": 0.3982, "step": 1312 }, { "epoch": 1.0929522752497225, "grad_norm": 0.4108687937259674, "learning_rate": 8.019083356403002e-06, "loss": 0.4326, "step": 1313 }, { "epoch": 1.0937846836847946, "grad_norm": 0.417467325925827, "learning_rate": 8.015219761157387e-06, "loss": 0.4195, "step": 1314 }, { "epoch": 1.0946170921198668, "grad_norm": 0.3949446678161621, "learning_rate": 8.011353334567625e-06, "loss": 0.4035, "step": 1315 }, { "epoch": 1.095449500554939, "grad_norm": 0.374812513589859, "learning_rate": 8.007484080264355e-06, "loss": 0.4318, "step": 1316 }, { "epoch": 1.096281908990011, "grad_norm": 0.4128841459751129, "learning_rate": 8.003612001880872e-06, "loss": 0.436, "step": 1317 }, { "epoch": 1.0971143174250833, "grad_norm": 0.4258720278739929, "learning_rate": 7.99973710305313e-06, "loss": 0.4393, "step": 1318 }, { "epoch": 1.0979467258601554, "grad_norm": 0.38994690775871277, "learning_rate": 7.995859387419726e-06, "loss": 0.4135, "step": 1319 }, { "epoch": 1.0987791342952276, "grad_norm": 0.3956961929798126, "learning_rate": 7.9919788586219e-06, "loss": 0.4261, "step": 1320 }, { "epoch": 1.0996115427302997, "grad_norm": 0.4195059537887573, "learning_rate": 7.988095520303539e-06, "loss": 0.4351, "step": 1321 }, { "epoch": 1.1004439511653719, "grad_norm": 0.3911471366882324, "learning_rate": 7.984209376111165e-06, "loss": 0.4434, "step": 1322 }, { "epoch": 1.101276359600444, "grad_norm": 0.48853620886802673, "learning_rate": 7.980320429693934e-06, "loss": 0.4585, "step": 1323 }, { "epoch": 1.1021087680355162, "grad_norm": 0.40332579612731934, "learning_rate": 7.976428684703637e-06, "loss": 0.3821, "step": 1324 }, { "epoch": 1.1029411764705883, "grad_norm": 0.41588088870048523, "learning_rate": 7.97253414479469e-06, "loss": 0.469, "step": 1325 }, { "epoch": 1.1037735849056605, "grad_norm": 0.405784010887146, "learning_rate": 7.968636813624134e-06, "loss": 0.3994, "step": 1326 }, { "epoch": 1.1046059933407326, "grad_norm": 0.39917218685150146, "learning_rate": 7.964736694851632e-06, "loss": 0.4317, "step": 1327 }, { "epoch": 1.1054384017758045, "grad_norm": 0.46947672963142395, "learning_rate": 7.960833792139461e-06, "loss": 0.4775, "step": 1328 }, { "epoch": 1.106270810210877, "grad_norm": 0.42204639315605164, "learning_rate": 7.95692810915252e-06, "loss": 0.4228, "step": 1329 }, { "epoch": 1.1071032186459488, "grad_norm": 0.41625940799713135, "learning_rate": 7.953019649558309e-06, "loss": 0.4016, "step": 1330 }, { "epoch": 1.107935627081021, "grad_norm": 0.39135950803756714, "learning_rate": 7.949108417026941e-06, "loss": 0.4445, "step": 1331 }, { "epoch": 1.1087680355160932, "grad_norm": 0.4039277136325836, "learning_rate": 7.945194415231133e-06, "loss": 0.4099, "step": 1332 }, { "epoch": 1.1096004439511653, "grad_norm": 0.3570069670677185, "learning_rate": 7.9412776478462e-06, "loss": 0.4073, "step": 1333 }, { "epoch": 1.1104328523862375, "grad_norm": 0.4042835235595703, "learning_rate": 7.937358118550058e-06, "loss": 0.4037, "step": 1334 }, { "epoch": 1.1112652608213096, "grad_norm": 0.4284457266330719, "learning_rate": 7.933435831023211e-06, "loss": 0.4517, "step": 1335 }, { "epoch": 1.1120976692563818, "grad_norm": 0.3967071771621704, "learning_rate": 7.929510788948755e-06, "loss": 0.4205, "step": 1336 }, { "epoch": 1.112930077691454, "grad_norm": 0.3720296323299408, "learning_rate": 7.925582996012375e-06, "loss": 0.4472, "step": 1337 }, { "epoch": 1.113762486126526, "grad_norm": 0.38390690088272095, "learning_rate": 7.921652455902337e-06, "loss": 0.3934, "step": 1338 }, { "epoch": 1.1145948945615982, "grad_norm": 0.48500263690948486, "learning_rate": 7.917719172309487e-06, "loss": 0.4521, "step": 1339 }, { "epoch": 1.1154273029966704, "grad_norm": 0.38441312313079834, "learning_rate": 7.913783148927246e-06, "loss": 0.4474, "step": 1340 }, { "epoch": 1.1162597114317425, "grad_norm": 0.462807297706604, "learning_rate": 7.909844389451611e-06, "loss": 0.4397, "step": 1341 }, { "epoch": 1.1170921198668147, "grad_norm": 0.38858792185783386, "learning_rate": 7.905902897581145e-06, "loss": 0.3972, "step": 1342 }, { "epoch": 1.1179245283018868, "grad_norm": 0.3973788022994995, "learning_rate": 7.901958677016977e-06, "loss": 0.4265, "step": 1343 }, { "epoch": 1.118756936736959, "grad_norm": 0.3690992593765259, "learning_rate": 7.898011731462801e-06, "loss": 0.4439, "step": 1344 }, { "epoch": 1.1195893451720311, "grad_norm": 0.5391969680786133, "learning_rate": 7.894062064624865e-06, "loss": 0.4472, "step": 1345 }, { "epoch": 1.1204217536071033, "grad_norm": 0.3407292664051056, "learning_rate": 7.890109680211979e-06, "loss": 0.3753, "step": 1346 }, { "epoch": 1.1212541620421754, "grad_norm": 0.3949512541294098, "learning_rate": 7.886154581935499e-06, "loss": 0.459, "step": 1347 }, { "epoch": 1.1220865704772476, "grad_norm": 0.3942648470401764, "learning_rate": 7.88219677350933e-06, "loss": 0.3809, "step": 1348 }, { "epoch": 1.1229189789123197, "grad_norm": 0.4611280560493469, "learning_rate": 7.878236258649927e-06, "loss": 0.4762, "step": 1349 }, { "epoch": 1.1237513873473919, "grad_norm": 0.37650808691978455, "learning_rate": 7.874273041076283e-06, "loss": 0.4166, "step": 1350 }, { "epoch": 1.1245837957824638, "grad_norm": 0.4160518944263458, "learning_rate": 7.870307124509926e-06, "loss": 0.3948, "step": 1351 }, { "epoch": 1.1254162042175362, "grad_norm": 0.39588451385498047, "learning_rate": 7.86633851267492e-06, "loss": 0.4322, "step": 1352 }, { "epoch": 1.1262486126526081, "grad_norm": 0.3730158507823944, "learning_rate": 7.862367209297864e-06, "loss": 0.4327, "step": 1353 }, { "epoch": 1.1270810210876803, "grad_norm": 0.43521034717559814, "learning_rate": 7.85839321810788e-06, "loss": 0.392, "step": 1354 }, { "epoch": 1.1279134295227524, "grad_norm": 0.4675239622592926, "learning_rate": 7.854416542836617e-06, "loss": 0.4395, "step": 1355 }, { "epoch": 1.1287458379578246, "grad_norm": 0.38083502650260925, "learning_rate": 7.85043718721824e-06, "loss": 0.417, "step": 1356 }, { "epoch": 1.1295782463928967, "grad_norm": 0.46890681982040405, "learning_rate": 7.846455154989437e-06, "loss": 0.4463, "step": 1357 }, { "epoch": 1.1304106548279689, "grad_norm": 0.4093262851238251, "learning_rate": 7.842470449889403e-06, "loss": 0.394, "step": 1358 }, { "epoch": 1.131243063263041, "grad_norm": 0.35766276717185974, "learning_rate": 7.838483075659846e-06, "loss": 0.4444, "step": 1359 }, { "epoch": 1.1320754716981132, "grad_norm": 0.4069897532463074, "learning_rate": 7.83449303604498e-06, "loss": 0.4139, "step": 1360 }, { "epoch": 1.1329078801331853, "grad_norm": 0.4673629403114319, "learning_rate": 7.830500334791525e-06, "loss": 0.4268, "step": 1361 }, { "epoch": 1.1337402885682575, "grad_norm": 0.4079434275627136, "learning_rate": 7.826504975648696e-06, "loss": 0.4431, "step": 1362 }, { "epoch": 1.1345726970033296, "grad_norm": 0.4580193758010864, "learning_rate": 7.822506962368204e-06, "loss": 0.424, "step": 1363 }, { "epoch": 1.1354051054384018, "grad_norm": 0.4982101023197174, "learning_rate": 7.818506298704254e-06, "loss": 0.4285, "step": 1364 }, { "epoch": 1.136237513873474, "grad_norm": 0.3587871491909027, "learning_rate": 7.814502988413539e-06, "loss": 0.4058, "step": 1365 }, { "epoch": 1.137069922308546, "grad_norm": 0.48973003029823303, "learning_rate": 7.810497035255239e-06, "loss": 0.4631, "step": 1366 }, { "epoch": 1.1379023307436182, "grad_norm": 0.4257657825946808, "learning_rate": 7.80648844299101e-06, "loss": 0.4013, "step": 1367 }, { "epoch": 1.1387347391786904, "grad_norm": 0.5077852010726929, "learning_rate": 7.802477215384997e-06, "loss": 0.4421, "step": 1368 }, { "epoch": 1.1395671476137625, "grad_norm": 0.41671276092529297, "learning_rate": 7.79846335620381e-06, "loss": 0.4297, "step": 1369 }, { "epoch": 1.1403995560488347, "grad_norm": 0.392742395401001, "learning_rate": 7.794446869216527e-06, "loss": 0.3575, "step": 1370 }, { "epoch": 1.1412319644839068, "grad_norm": 0.504817545413971, "learning_rate": 7.79042775819471e-06, "loss": 0.4213, "step": 1371 }, { "epoch": 1.142064372918979, "grad_norm": 0.4020238518714905, "learning_rate": 7.786406026912368e-06, "loss": 0.4363, "step": 1372 }, { "epoch": 1.1428967813540512, "grad_norm": 0.49842900037765503, "learning_rate": 7.782381679145979e-06, "loss": 0.4386, "step": 1373 }, { "epoch": 1.143729189789123, "grad_norm": 0.48727041482925415, "learning_rate": 7.778354718674475e-06, "loss": 0.463, "step": 1374 }, { "epoch": 1.1445615982241955, "grad_norm": 0.39957672357559204, "learning_rate": 7.774325149279243e-06, "loss": 0.402, "step": 1375 }, { "epoch": 1.1453940066592674, "grad_norm": 0.5904676914215088, "learning_rate": 7.770292974744119e-06, "loss": 0.4446, "step": 1376 }, { "epoch": 1.1462264150943395, "grad_norm": 0.34512779116630554, "learning_rate": 7.766258198855386e-06, "loss": 0.4331, "step": 1377 }, { "epoch": 1.1470588235294117, "grad_norm": 0.44855794310569763, "learning_rate": 7.76222082540177e-06, "loss": 0.4014, "step": 1378 }, { "epoch": 1.1478912319644838, "grad_norm": 0.5417425632476807, "learning_rate": 7.758180858174434e-06, "loss": 0.4453, "step": 1379 }, { "epoch": 1.148723640399556, "grad_norm": 0.3855729401111603, "learning_rate": 7.754138300966978e-06, "loss": 0.4654, "step": 1380 }, { "epoch": 1.1495560488346281, "grad_norm": 0.4581325650215149, "learning_rate": 7.750093157575433e-06, "loss": 0.4398, "step": 1381 }, { "epoch": 1.1503884572697003, "grad_norm": 0.4510229825973511, "learning_rate": 7.746045431798264e-06, "loss": 0.4312, "step": 1382 }, { "epoch": 1.1512208657047724, "grad_norm": 0.3615362346172333, "learning_rate": 7.74199512743635e-06, "loss": 0.4297, "step": 1383 }, { "epoch": 1.1520532741398446, "grad_norm": 0.4092762768268585, "learning_rate": 7.737942248293001e-06, "loss": 0.4472, "step": 1384 }, { "epoch": 1.1528856825749167, "grad_norm": 0.46709978580474854, "learning_rate": 7.733886798173945e-06, "loss": 0.456, "step": 1385 }, { "epoch": 1.153718091009989, "grad_norm": 0.3712662160396576, "learning_rate": 7.729828780887313e-06, "loss": 0.4129, "step": 1386 }, { "epoch": 1.154550499445061, "grad_norm": 0.35037344694137573, "learning_rate": 7.72576820024366e-06, "loss": 0.4172, "step": 1387 }, { "epoch": 1.1553829078801332, "grad_norm": 0.4096495807170868, "learning_rate": 7.72170506005594e-06, "loss": 0.4243, "step": 1388 }, { "epoch": 1.1562153163152054, "grad_norm": 0.4149743914604187, "learning_rate": 7.717639364139514e-06, "loss": 0.4728, "step": 1389 }, { "epoch": 1.1570477247502775, "grad_norm": 0.3861239552497864, "learning_rate": 7.713571116312143e-06, "loss": 0.4409, "step": 1390 }, { "epoch": 1.1578801331853497, "grad_norm": 0.3818301558494568, "learning_rate": 7.709500320393976e-06, "loss": 0.415, "step": 1391 }, { "epoch": 1.1587125416204218, "grad_norm": 0.3710886240005493, "learning_rate": 7.70542698020757e-06, "loss": 0.4251, "step": 1392 }, { "epoch": 1.159544950055494, "grad_norm": 0.3728558421134949, "learning_rate": 7.70135109957786e-06, "loss": 0.4196, "step": 1393 }, { "epoch": 1.1603773584905661, "grad_norm": 0.42186808586120605, "learning_rate": 7.697272682332168e-06, "loss": 0.4473, "step": 1394 }, { "epoch": 1.1612097669256383, "grad_norm": 0.37886783480644226, "learning_rate": 7.6931917323002e-06, "loss": 0.4295, "step": 1395 }, { "epoch": 1.1620421753607104, "grad_norm": 0.36047130823135376, "learning_rate": 7.689108253314038e-06, "loss": 0.4145, "step": 1396 }, { "epoch": 1.1628745837957823, "grad_norm": 0.41782742738723755, "learning_rate": 7.685022249208142e-06, "loss": 0.4459, "step": 1397 }, { "epoch": 1.1637069922308547, "grad_norm": 0.3524583578109741, "learning_rate": 7.680933723819343e-06, "loss": 0.4361, "step": 1398 }, { "epoch": 1.1645394006659266, "grad_norm": 0.34524109959602356, "learning_rate": 7.676842680986836e-06, "loss": 0.405, "step": 1399 }, { "epoch": 1.1653718091009988, "grad_norm": 0.40391451120376587, "learning_rate": 7.67274912455218e-06, "loss": 0.445, "step": 1400 }, { "epoch": 1.166204217536071, "grad_norm": 0.3440330922603607, "learning_rate": 7.6686530583593e-06, "loss": 0.4126, "step": 1401 }, { "epoch": 1.167036625971143, "grad_norm": 0.4131713807582855, "learning_rate": 7.664554486254468e-06, "loss": 0.4831, "step": 1402 }, { "epoch": 1.1678690344062153, "grad_norm": 0.3954821527004242, "learning_rate": 7.660453412086323e-06, "loss": 0.4501, "step": 1403 }, { "epoch": 1.1687014428412874, "grad_norm": 0.3479725420475006, "learning_rate": 7.656349839705838e-06, "loss": 0.4125, "step": 1404 }, { "epoch": 1.1695338512763596, "grad_norm": 0.4041290581226349, "learning_rate": 7.652243772966345e-06, "loss": 0.3941, "step": 1405 }, { "epoch": 1.1703662597114317, "grad_norm": 0.3935270607471466, "learning_rate": 7.648135215723511e-06, "loss": 0.4381, "step": 1406 }, { "epoch": 1.1711986681465039, "grad_norm": 0.3363988399505615, "learning_rate": 7.64402417183534e-06, "loss": 0.3909, "step": 1407 }, { "epoch": 1.172031076581576, "grad_norm": 0.358073353767395, "learning_rate": 7.639910645162179e-06, "loss": 0.4092, "step": 1408 }, { "epoch": 1.1728634850166482, "grad_norm": 0.38340896368026733, "learning_rate": 7.635794639566697e-06, "loss": 0.4579, "step": 1409 }, { "epoch": 1.1736958934517203, "grad_norm": 0.3633081912994385, "learning_rate": 7.631676158913899e-06, "loss": 0.4123, "step": 1410 }, { "epoch": 1.1745283018867925, "grad_norm": 0.4083158075809479, "learning_rate": 7.627555207071108e-06, "loss": 0.451, "step": 1411 }, { "epoch": 1.1753607103218646, "grad_norm": 0.3736763894557953, "learning_rate": 7.623431787907971e-06, "loss": 0.4061, "step": 1412 }, { "epoch": 1.1761931187569368, "grad_norm": 0.3408348858356476, "learning_rate": 7.61930590529645e-06, "loss": 0.4175, "step": 1413 }, { "epoch": 1.177025527192009, "grad_norm": 0.38261574506759644, "learning_rate": 7.6151775631108245e-06, "loss": 0.4458, "step": 1414 }, { "epoch": 1.177857935627081, "grad_norm": 0.3570031225681305, "learning_rate": 7.611046765227675e-06, "loss": 0.3869, "step": 1415 }, { "epoch": 1.1786903440621532, "grad_norm": 0.3958790600299835, "learning_rate": 7.606913515525896e-06, "loss": 0.4224, "step": 1416 }, { "epoch": 1.1795227524972254, "grad_norm": 0.43983784317970276, "learning_rate": 7.602777817886678e-06, "loss": 0.456, "step": 1417 }, { "epoch": 1.1803551609322975, "grad_norm": 0.3442942202091217, "learning_rate": 7.59863967619352e-06, "loss": 0.4149, "step": 1418 }, { "epoch": 1.1811875693673697, "grad_norm": 0.3674313426017761, "learning_rate": 7.594499094332204e-06, "loss": 0.4029, "step": 1419 }, { "epoch": 1.1820199778024416, "grad_norm": 0.4117499589920044, "learning_rate": 7.59035607619081e-06, "loss": 0.434, "step": 1420 }, { "epoch": 1.182852386237514, "grad_norm": 0.3603156507015228, "learning_rate": 7.586210625659707e-06, "loss": 0.4318, "step": 1421 }, { "epoch": 1.183684794672586, "grad_norm": 0.38503581285476685, "learning_rate": 7.582062746631542e-06, "loss": 0.4139, "step": 1422 }, { "epoch": 1.184517203107658, "grad_norm": 0.43392035365104675, "learning_rate": 7.577912443001247e-06, "loss": 0.4058, "step": 1423 }, { "epoch": 1.1853496115427302, "grad_norm": 0.3828240633010864, "learning_rate": 7.573759718666031e-06, "loss": 0.4342, "step": 1424 }, { "epoch": 1.1861820199778024, "grad_norm": 0.39743468165397644, "learning_rate": 7.569604577525376e-06, "loss": 0.4351, "step": 1425 }, { "epoch": 1.1870144284128745, "grad_norm": 0.417901873588562, "learning_rate": 7.56544702348103e-06, "loss": 0.4358, "step": 1426 }, { "epoch": 1.1878468368479467, "grad_norm": 0.4021197259426117, "learning_rate": 7.5612870604370106e-06, "loss": 0.4402, "step": 1427 }, { "epoch": 1.1886792452830188, "grad_norm": 0.32264193892478943, "learning_rate": 7.557124692299593e-06, "loss": 0.3862, "step": 1428 }, { "epoch": 1.189511653718091, "grad_norm": 0.41591644287109375, "learning_rate": 7.552959922977317e-06, "loss": 0.4142, "step": 1429 }, { "epoch": 1.1903440621531631, "grad_norm": 0.38096314668655396, "learning_rate": 7.548792756380972e-06, "loss": 0.4072, "step": 1430 }, { "epoch": 1.1911764705882353, "grad_norm": 0.3876970410346985, "learning_rate": 7.5446231964236025e-06, "loss": 0.4199, "step": 1431 }, { "epoch": 1.1920088790233074, "grad_norm": 0.36842256784439087, "learning_rate": 7.540451247020495e-06, "loss": 0.4071, "step": 1432 }, { "epoch": 1.1928412874583796, "grad_norm": 0.3903951644897461, "learning_rate": 7.536276912089187e-06, "loss": 0.4546, "step": 1433 }, { "epoch": 1.1936736958934517, "grad_norm": 0.38267070055007935, "learning_rate": 7.53210019554945e-06, "loss": 0.4082, "step": 1434 }, { "epoch": 1.1945061043285239, "grad_norm": 0.3489694893360138, "learning_rate": 7.527921101323292e-06, "loss": 0.4278, "step": 1435 }, { "epoch": 1.195338512763596, "grad_norm": 0.36168771982192993, "learning_rate": 7.523739633334959e-06, "loss": 0.4372, "step": 1436 }, { "epoch": 1.1961709211986682, "grad_norm": 0.37979066371917725, "learning_rate": 7.5195557955109225e-06, "loss": 0.4514, "step": 1437 }, { "epoch": 1.1970033296337403, "grad_norm": 0.3345673382282257, "learning_rate": 7.515369591779876e-06, "loss": 0.4163, "step": 1438 }, { "epoch": 1.1978357380688125, "grad_norm": 0.3836076855659485, "learning_rate": 7.511181026072741e-06, "loss": 0.4598, "step": 1439 }, { "epoch": 1.1986681465038846, "grad_norm": 0.37280598282814026, "learning_rate": 7.5069901023226545e-06, "loss": 0.4072, "step": 1440 }, { "epoch": 1.1995005549389568, "grad_norm": 0.35891035199165344, "learning_rate": 7.502796824464966e-06, "loss": 0.4475, "step": 1441 }, { "epoch": 1.200332963374029, "grad_norm": 0.3753512501716614, "learning_rate": 7.498601196437238e-06, "loss": 0.4583, "step": 1442 }, { "epoch": 1.2011653718091009, "grad_norm": 0.33655253052711487, "learning_rate": 7.494403222179235e-06, "loss": 0.399, "step": 1443 }, { "epoch": 1.2019977802441733, "grad_norm": 0.34640976786613464, "learning_rate": 7.490202905632933e-06, "loss": 0.401, "step": 1444 }, { "epoch": 1.2028301886792452, "grad_norm": 0.34061723947525024, "learning_rate": 7.4860002507425004e-06, "loss": 0.408, "step": 1445 }, { "epoch": 1.2036625971143173, "grad_norm": 0.36065739393234253, "learning_rate": 7.481795261454304e-06, "loss": 0.4472, "step": 1446 }, { "epoch": 1.2044950055493895, "grad_norm": 0.33626216650009155, "learning_rate": 7.477587941716904e-06, "loss": 0.4088, "step": 1447 }, { "epoch": 1.2053274139844616, "grad_norm": 0.3538858890533447, "learning_rate": 7.4733782954810444e-06, "loss": 0.4513, "step": 1448 }, { "epoch": 1.2061598224195338, "grad_norm": 0.3653189539909363, "learning_rate": 7.469166326699658e-06, "loss": 0.4268, "step": 1449 }, { "epoch": 1.206992230854606, "grad_norm": 0.3585223853588104, "learning_rate": 7.4649520393278575e-06, "loss": 0.3983, "step": 1450 }, { "epoch": 1.207824639289678, "grad_norm": 0.38904839754104614, "learning_rate": 7.460735437322933e-06, "loss": 0.4459, "step": 1451 }, { "epoch": 1.2086570477247502, "grad_norm": 0.34138140082359314, "learning_rate": 7.456516524644347e-06, "loss": 0.4143, "step": 1452 }, { "epoch": 1.2094894561598224, "grad_norm": 0.3598451614379883, "learning_rate": 7.452295305253731e-06, "loss": 0.4108, "step": 1453 }, { "epoch": 1.2103218645948945, "grad_norm": 0.35666725039482117, "learning_rate": 7.448071783114887e-06, "loss": 0.4157, "step": 1454 }, { "epoch": 1.2111542730299667, "grad_norm": 0.42830565571784973, "learning_rate": 7.443845962193775e-06, "loss": 0.4488, "step": 1455 }, { "epoch": 1.2119866814650389, "grad_norm": 0.3601066470146179, "learning_rate": 7.439617846458513e-06, "loss": 0.4221, "step": 1456 }, { "epoch": 1.212819089900111, "grad_norm": 0.36113888025283813, "learning_rate": 7.435387439879378e-06, "loss": 0.368, "step": 1457 }, { "epoch": 1.2136514983351832, "grad_norm": 0.4079122245311737, "learning_rate": 7.431154746428794e-06, "loss": 0.4567, "step": 1458 }, { "epoch": 1.2144839067702553, "grad_norm": 0.3279145658016205, "learning_rate": 7.4269197700813375e-06, "loss": 0.401, "step": 1459 }, { "epoch": 1.2153163152053275, "grad_norm": 0.3636506497859955, "learning_rate": 7.4226825148137225e-06, "loss": 0.4394, "step": 1460 }, { "epoch": 1.2161487236403996, "grad_norm": 0.3573419153690338, "learning_rate": 7.418442984604805e-06, "loss": 0.3929, "step": 1461 }, { "epoch": 1.2169811320754718, "grad_norm": 0.3526058793067932, "learning_rate": 7.414201183435581e-06, "loss": 0.4305, "step": 1462 }, { "epoch": 1.217813540510544, "grad_norm": 0.34084025025367737, "learning_rate": 7.409957115289175e-06, "loss": 0.4018, "step": 1463 }, { "epoch": 1.218645948945616, "grad_norm": 0.3763788342475891, "learning_rate": 7.40571078415084e-06, "loss": 0.4106, "step": 1464 }, { "epoch": 1.2194783573806882, "grad_norm": 0.37996524572372437, "learning_rate": 7.401462194007957e-06, "loss": 0.4664, "step": 1465 }, { "epoch": 1.2203107658157601, "grad_norm": 0.3957565724849701, "learning_rate": 7.397211348850025e-06, "loss": 0.4947, "step": 1466 }, { "epoch": 1.2211431742508325, "grad_norm": 0.33967241644859314, "learning_rate": 7.392958252668663e-06, "loss": 0.3549, "step": 1467 }, { "epoch": 1.2219755826859044, "grad_norm": 0.37266120314598083, "learning_rate": 7.388702909457603e-06, "loss": 0.4341, "step": 1468 }, { "epoch": 1.2228079911209766, "grad_norm": 0.37723714113235474, "learning_rate": 7.384445323212687e-06, "loss": 0.4435, "step": 1469 }, { "epoch": 1.2236403995560488, "grad_norm": 0.3732614815235138, "learning_rate": 7.380185497931862e-06, "loss": 0.4508, "step": 1470 }, { "epoch": 1.224472807991121, "grad_norm": 0.35898712277412415, "learning_rate": 7.375923437615179e-06, "loss": 0.4299, "step": 1471 }, { "epoch": 1.225305216426193, "grad_norm": 0.37605753540992737, "learning_rate": 7.371659146264787e-06, "loss": 0.4495, "step": 1472 }, { "epoch": 1.2261376248612652, "grad_norm": 0.35970860719680786, "learning_rate": 7.367392627884931e-06, "loss": 0.3909, "step": 1473 }, { "epoch": 1.2269700332963374, "grad_norm": 0.39131295680999756, "learning_rate": 7.363123886481947e-06, "loss": 0.4308, "step": 1474 }, { "epoch": 1.2278024417314095, "grad_norm": 0.38385316729545593, "learning_rate": 7.3588529260642564e-06, "loss": 0.4483, "step": 1475 }, { "epoch": 1.2286348501664817, "grad_norm": 0.35251525044441223, "learning_rate": 7.3545797506423655e-06, "loss": 0.4427, "step": 1476 }, { "epoch": 1.2294672586015538, "grad_norm": 0.3673636317253113, "learning_rate": 7.3503043642288614e-06, "loss": 0.4065, "step": 1477 }, { "epoch": 1.230299667036626, "grad_norm": 0.3953494727611542, "learning_rate": 7.3460267708384084e-06, "loss": 0.4341, "step": 1478 }, { "epoch": 1.2311320754716981, "grad_norm": 0.34204229712486267, "learning_rate": 7.3417469744877375e-06, "loss": 0.4064, "step": 1479 }, { "epoch": 1.2319644839067703, "grad_norm": 0.35544899106025696, "learning_rate": 7.337464979195658e-06, "loss": 0.4086, "step": 1480 }, { "epoch": 1.2327968923418424, "grad_norm": 0.37943002581596375, "learning_rate": 7.333180788983034e-06, "loss": 0.4459, "step": 1481 }, { "epoch": 1.2336293007769146, "grad_norm": 0.33336302638053894, "learning_rate": 7.328894407872797e-06, "loss": 0.409, "step": 1482 }, { "epoch": 1.2344617092119867, "grad_norm": 0.3566553294658661, "learning_rate": 7.324605839889936e-06, "loss": 0.4377, "step": 1483 }, { "epoch": 1.2352941176470589, "grad_norm": 0.39889898896217346, "learning_rate": 7.320315089061486e-06, "loss": 0.4603, "step": 1484 }, { "epoch": 1.236126526082131, "grad_norm": 0.4692656695842743, "learning_rate": 7.3160221594165415e-06, "loss": 0.4108, "step": 1485 }, { "epoch": 1.2369589345172032, "grad_norm": 0.39253804087638855, "learning_rate": 7.3117270549862385e-06, "loss": 0.4393, "step": 1486 }, { "epoch": 1.2377913429522753, "grad_norm": 0.4060220718383789, "learning_rate": 7.3074297798037515e-06, "loss": 0.423, "step": 1487 }, { "epoch": 1.2386237513873475, "grad_norm": 0.32819664478302, "learning_rate": 7.303130337904303e-06, "loss": 0.3855, "step": 1488 }, { "epoch": 1.2394561598224194, "grad_norm": 0.36876818537712097, "learning_rate": 7.298828733325138e-06, "loss": 0.4056, "step": 1489 }, { "epoch": 1.2402885682574918, "grad_norm": 0.37898144125938416, "learning_rate": 7.294524970105543e-06, "loss": 0.3985, "step": 1490 }, { "epoch": 1.2411209766925637, "grad_norm": 0.3846915662288666, "learning_rate": 7.290219052286826e-06, "loss": 0.4752, "step": 1491 }, { "epoch": 1.2419533851276359, "grad_norm": 0.4225359857082367, "learning_rate": 7.285910983912317e-06, "loss": 0.4152, "step": 1492 }, { "epoch": 1.242785793562708, "grad_norm": 0.3616348206996918, "learning_rate": 7.281600769027371e-06, "loss": 0.4244, "step": 1493 }, { "epoch": 1.2436182019977802, "grad_norm": 0.35701385140419006, "learning_rate": 7.277288411679352e-06, "loss": 0.4284, "step": 1494 }, { "epoch": 1.2444506104328523, "grad_norm": 0.37644630670547485, "learning_rate": 7.272973915917642e-06, "loss": 0.4185, "step": 1495 }, { "epoch": 1.2452830188679245, "grad_norm": 0.3375144898891449, "learning_rate": 7.268657285793625e-06, "loss": 0.3967, "step": 1496 }, { "epoch": 1.2461154273029966, "grad_norm": 0.36483046412467957, "learning_rate": 7.264338525360695e-06, "loss": 0.4346, "step": 1497 }, { "epoch": 1.2469478357380688, "grad_norm": 0.364069402217865, "learning_rate": 7.260017638674244e-06, "loss": 0.4308, "step": 1498 }, { "epoch": 1.247780244173141, "grad_norm": 0.3628464639186859, "learning_rate": 7.255694629791659e-06, "loss": 0.4345, "step": 1499 }, { "epoch": 1.248612652608213, "grad_norm": 0.33808740973472595, "learning_rate": 7.251369502772318e-06, "loss": 0.405, "step": 1500 }, { "epoch": 1.2494450610432852, "grad_norm": 0.4051111042499542, "learning_rate": 7.247042261677597e-06, "loss": 0.4147, "step": 1501 }, { "epoch": 1.2502774694783574, "grad_norm": 0.34634676575660706, "learning_rate": 7.242712910570846e-06, "loss": 0.4635, "step": 1502 }, { "epoch": 1.2511098779134295, "grad_norm": 0.42578983306884766, "learning_rate": 7.238381453517405e-06, "loss": 0.4309, "step": 1503 }, { "epoch": 1.2519422863485017, "grad_norm": 0.38502374291419983, "learning_rate": 7.234047894584586e-06, "loss": 0.4128, "step": 1504 }, { "epoch": 1.2527746947835738, "grad_norm": 0.37332308292388916, "learning_rate": 7.229712237841679e-06, "loss": 0.4229, "step": 1505 }, { "epoch": 1.253607103218646, "grad_norm": 0.3640212118625641, "learning_rate": 7.225374487359937e-06, "loss": 0.4239, "step": 1506 }, { "epoch": 1.2544395116537181, "grad_norm": 0.3998686671257019, "learning_rate": 7.221034647212588e-06, "loss": 0.3917, "step": 1507 }, { "epoch": 1.2552719200887903, "grad_norm": 0.3780185282230377, "learning_rate": 7.216692721474816e-06, "loss": 0.4168, "step": 1508 }, { "epoch": 1.2561043285238624, "grad_norm": 0.3652763068675995, "learning_rate": 7.212348714223767e-06, "loss": 0.4179, "step": 1509 }, { "epoch": 1.2569367369589346, "grad_norm": 0.3468812108039856, "learning_rate": 7.208002629538537e-06, "loss": 0.3956, "step": 1510 }, { "epoch": 1.2577691453940067, "grad_norm": 0.46192142367362976, "learning_rate": 7.203654471500179e-06, "loss": 0.4355, "step": 1511 }, { "epoch": 1.2586015538290787, "grad_norm": 0.336422324180603, "learning_rate": 7.199304244191687e-06, "loss": 0.4085, "step": 1512 }, { "epoch": 1.259433962264151, "grad_norm": 0.3533170521259308, "learning_rate": 7.1949519516980005e-06, "loss": 0.4076, "step": 1513 }, { "epoch": 1.260266370699223, "grad_norm": 0.34538713097572327, "learning_rate": 7.190597598106001e-06, "loss": 0.3822, "step": 1514 }, { "epoch": 1.2610987791342954, "grad_norm": 0.35250788927078247, "learning_rate": 7.186241187504499e-06, "loss": 0.4416, "step": 1515 }, { "epoch": 1.2619311875693673, "grad_norm": 0.38682812452316284, "learning_rate": 7.1818827239842446e-06, "loss": 0.4295, "step": 1516 }, { "epoch": 1.2627635960044394, "grad_norm": 0.3738081455230713, "learning_rate": 7.177522211637906e-06, "loss": 0.4203, "step": 1517 }, { "epoch": 1.2635960044395116, "grad_norm": 0.39243027567863464, "learning_rate": 7.173159654560087e-06, "loss": 0.4994, "step": 1518 }, { "epoch": 1.2644284128745837, "grad_norm": 0.337700754404068, "learning_rate": 7.168795056847301e-06, "loss": 0.407, "step": 1519 }, { "epoch": 1.265260821309656, "grad_norm": 0.4204353094100952, "learning_rate": 7.164428422597982e-06, "loss": 0.4189, "step": 1520 }, { "epoch": 1.266093229744728, "grad_norm": 0.38597655296325684, "learning_rate": 7.1600597559124765e-06, "loss": 0.4476, "step": 1521 }, { "epoch": 1.2669256381798002, "grad_norm": 0.32296887040138245, "learning_rate": 7.155689060893038e-06, "loss": 0.3669, "step": 1522 }, { "epoch": 1.2677580466148723, "grad_norm": 0.35379377007484436, "learning_rate": 7.151316341643828e-06, "loss": 0.4346, "step": 1523 }, { "epoch": 1.2685904550499445, "grad_norm": 0.5642543435096741, "learning_rate": 7.146941602270905e-06, "loss": 0.4237, "step": 1524 }, { "epoch": 1.2694228634850167, "grad_norm": 0.4288339614868164, "learning_rate": 7.142564846882227e-06, "loss": 0.3797, "step": 1525 }, { "epoch": 1.2702552719200888, "grad_norm": 0.36099711060523987, "learning_rate": 7.1381860795876415e-06, "loss": 0.4519, "step": 1526 }, { "epoch": 1.271087680355161, "grad_norm": 0.4087908864021301, "learning_rate": 7.13380530449889e-06, "loss": 0.4499, "step": 1527 }, { "epoch": 1.271920088790233, "grad_norm": 0.37486109137535095, "learning_rate": 7.129422525729594e-06, "loss": 0.4141, "step": 1528 }, { "epoch": 1.2727524972253053, "grad_norm": 0.3277265429496765, "learning_rate": 7.125037747395264e-06, "loss": 0.3747, "step": 1529 }, { "epoch": 1.2735849056603774, "grad_norm": 0.36414340138435364, "learning_rate": 7.120650973613279e-06, "loss": 0.4181, "step": 1530 }, { "epoch": 1.2744173140954496, "grad_norm": 0.3803533911705017, "learning_rate": 7.116262208502901e-06, "loss": 0.429, "step": 1531 }, { "epoch": 1.2752497225305217, "grad_norm": 0.3974792957305908, "learning_rate": 7.111871456185253e-06, "loss": 0.4555, "step": 1532 }, { "epoch": 1.2760821309655939, "grad_norm": 0.35720840096473694, "learning_rate": 7.107478720783332e-06, "loss": 0.4415, "step": 1533 }, { "epoch": 1.276914539400666, "grad_norm": 0.4027242362499237, "learning_rate": 7.1030840064219906e-06, "loss": 0.4108, "step": 1534 }, { "epoch": 1.277746947835738, "grad_norm": 0.38121697306632996, "learning_rate": 7.098687317227943e-06, "loss": 0.4387, "step": 1535 }, { "epoch": 1.2785793562708103, "grad_norm": 0.37340235710144043, "learning_rate": 7.09428865732976e-06, "loss": 0.4174, "step": 1536 }, { "epoch": 1.2794117647058822, "grad_norm": 0.4037769138813019, "learning_rate": 7.089888030857857e-06, "loss": 0.4613, "step": 1537 }, { "epoch": 1.2802441731409546, "grad_norm": 0.33645448088645935, "learning_rate": 7.0854854419445e-06, "loss": 0.3608, "step": 1538 }, { "epoch": 1.2810765815760266, "grad_norm": 0.4041195511817932, "learning_rate": 7.0810808947237975e-06, "loss": 0.4305, "step": 1539 }, { "epoch": 1.2819089900110987, "grad_norm": 0.34097573161125183, "learning_rate": 7.076674393331697e-06, "loss": 0.4156, "step": 1540 }, { "epoch": 1.2827413984461709, "grad_norm": 0.40564876794815063, "learning_rate": 7.0722659419059806e-06, "loss": 0.4365, "step": 1541 }, { "epoch": 1.283573806881243, "grad_norm": 0.39337435364723206, "learning_rate": 7.0678555445862605e-06, "loss": 0.4482, "step": 1542 }, { "epoch": 1.2844062153163152, "grad_norm": 0.3427252173423767, "learning_rate": 7.063443205513975e-06, "loss": 0.3806, "step": 1543 }, { "epoch": 1.2852386237513873, "grad_norm": 0.4486899673938751, "learning_rate": 7.059028928832394e-06, "loss": 0.4238, "step": 1544 }, { "epoch": 1.2860710321864595, "grad_norm": 0.37656182050704956, "learning_rate": 7.054612718686593e-06, "loss": 0.4165, "step": 1545 }, { "epoch": 1.2869034406215316, "grad_norm": 0.37508389353752136, "learning_rate": 7.0501945792234776e-06, "loss": 0.4368, "step": 1546 }, { "epoch": 1.2877358490566038, "grad_norm": 0.385955274105072, "learning_rate": 7.045774514591753e-06, "loss": 0.4263, "step": 1547 }, { "epoch": 1.288568257491676, "grad_norm": 0.3990999162197113, "learning_rate": 7.041352528941939e-06, "loss": 0.4219, "step": 1548 }, { "epoch": 1.289400665926748, "grad_norm": 0.43571391701698303, "learning_rate": 7.036928626426358e-06, "loss": 0.4525, "step": 1549 }, { "epoch": 1.2902330743618202, "grad_norm": 0.40208232402801514, "learning_rate": 7.0325028111991325e-06, "loss": 0.4166, "step": 1550 }, { "epoch": 1.2910654827968924, "grad_norm": 0.5420854687690735, "learning_rate": 7.02807508741618e-06, "loss": 0.4908, "step": 1551 }, { "epoch": 1.2918978912319645, "grad_norm": 0.3404446840286255, "learning_rate": 7.0236454592352065e-06, "loss": 0.3513, "step": 1552 }, { "epoch": 1.2927302996670367, "grad_norm": 0.37210413813591003, "learning_rate": 7.019213930815718e-06, "loss": 0.4292, "step": 1553 }, { "epoch": 1.2935627081021088, "grad_norm": 0.4521826207637787, "learning_rate": 7.01478050631899e-06, "loss": 0.4314, "step": 1554 }, { "epoch": 1.294395116537181, "grad_norm": 0.3963296413421631, "learning_rate": 7.010345189908092e-06, "loss": 0.4345, "step": 1555 }, { "epoch": 1.2952275249722531, "grad_norm": 0.3517422378063202, "learning_rate": 7.0059079857478596e-06, "loss": 0.4088, "step": 1556 }, { "epoch": 1.2960599334073253, "grad_norm": 0.4621788263320923, "learning_rate": 7.001468898004907e-06, "loss": 0.4385, "step": 1557 }, { "epoch": 1.2968923418423972, "grad_norm": 0.41421958804130554, "learning_rate": 6.997027930847614e-06, "loss": 0.4428, "step": 1558 }, { "epoch": 1.2977247502774696, "grad_norm": 0.32892701029777527, "learning_rate": 6.992585088446129e-06, "loss": 0.4213, "step": 1559 }, { "epoch": 1.2985571587125415, "grad_norm": 0.37421393394470215, "learning_rate": 6.988140374972357e-06, "loss": 0.3801, "step": 1560 }, { "epoch": 1.2993895671476139, "grad_norm": 0.4647720456123352, "learning_rate": 6.983693794599959e-06, "loss": 0.4472, "step": 1561 }, { "epoch": 1.3002219755826858, "grad_norm": 0.3581669330596924, "learning_rate": 6.979245351504358e-06, "loss": 0.4224, "step": 1562 }, { "epoch": 1.301054384017758, "grad_norm": 0.37526553869247437, "learning_rate": 6.974795049862715e-06, "loss": 0.4071, "step": 1563 }, { "epoch": 1.3018867924528301, "grad_norm": 0.3938126266002655, "learning_rate": 6.970342893853943e-06, "loss": 0.4474, "step": 1564 }, { "epoch": 1.3027192008879023, "grad_norm": 0.33351486921310425, "learning_rate": 6.965888887658695e-06, "loss": 0.3736, "step": 1565 }, { "epoch": 1.3035516093229744, "grad_norm": 0.4295842945575714, "learning_rate": 6.961433035459361e-06, "loss": 0.4689, "step": 1566 }, { "epoch": 1.3043840177580466, "grad_norm": 0.3872586786746979, "learning_rate": 6.956975341440061e-06, "loss": 0.4328, "step": 1567 }, { "epoch": 1.3052164261931187, "grad_norm": 0.3561748266220093, "learning_rate": 6.952515809786652e-06, "loss": 0.4492, "step": 1568 }, { "epoch": 1.3060488346281909, "grad_norm": 0.41043657064437866, "learning_rate": 6.948054444686709e-06, "loss": 0.4037, "step": 1569 }, { "epoch": 1.306881243063263, "grad_norm": 0.4166451394557953, "learning_rate": 6.943591250329534e-06, "loss": 0.4192, "step": 1570 }, { "epoch": 1.3077136514983352, "grad_norm": 0.37019088864326477, "learning_rate": 6.939126230906144e-06, "loss": 0.4187, "step": 1571 }, { "epoch": 1.3085460599334073, "grad_norm": 0.3775978684425354, "learning_rate": 6.934659390609271e-06, "loss": 0.4589, "step": 1572 }, { "epoch": 1.3093784683684795, "grad_norm": 0.4076300859451294, "learning_rate": 6.930190733633355e-06, "loss": 0.437, "step": 1573 }, { "epoch": 1.3102108768035516, "grad_norm": 0.36930978298187256, "learning_rate": 6.925720264174543e-06, "loss": 0.3679, "step": 1574 }, { "epoch": 1.3110432852386238, "grad_norm": 0.40303748846054077, "learning_rate": 6.921247986430686e-06, "loss": 0.4646, "step": 1575 }, { "epoch": 1.311875693673696, "grad_norm": 0.3812013566493988, "learning_rate": 6.9167739046013305e-06, "loss": 0.4285, "step": 1576 }, { "epoch": 1.312708102108768, "grad_norm": 0.4709984064102173, "learning_rate": 6.912298022887716e-06, "loss": 0.4492, "step": 1577 }, { "epoch": 1.3135405105438402, "grad_norm": 0.33343759179115295, "learning_rate": 6.907820345492775e-06, "loss": 0.3764, "step": 1578 }, { "epoch": 1.3143729189789124, "grad_norm": 0.4059264361858368, "learning_rate": 6.903340876621125e-06, "loss": 0.4234, "step": 1579 }, { "epoch": 1.3152053274139845, "grad_norm": 0.39759019017219543, "learning_rate": 6.8988596204790655e-06, "loss": 0.4593, "step": 1580 }, { "epoch": 1.3160377358490565, "grad_norm": 0.3702462315559387, "learning_rate": 6.894376581274578e-06, "loss": 0.4284, "step": 1581 }, { "epoch": 1.3168701442841289, "grad_norm": 0.3988160490989685, "learning_rate": 6.889891763217307e-06, "loss": 0.4043, "step": 1582 }, { "epoch": 1.3177025527192008, "grad_norm": 0.3909897208213806, "learning_rate": 6.8854051705185825e-06, "loss": 0.4282, "step": 1583 }, { "epoch": 1.3185349611542732, "grad_norm": 0.37735849618911743, "learning_rate": 6.880916807391388e-06, "loss": 0.4185, "step": 1584 }, { "epoch": 1.319367369589345, "grad_norm": 0.4028033912181854, "learning_rate": 6.876426678050379e-06, "loss": 0.427, "step": 1585 }, { "epoch": 1.3201997780244172, "grad_norm": 0.3585635721683502, "learning_rate": 6.871934786711866e-06, "loss": 0.401, "step": 1586 }, { "epoch": 1.3210321864594894, "grad_norm": 0.36207467317581177, "learning_rate": 6.86744113759381e-06, "loss": 0.4119, "step": 1587 }, { "epoch": 1.3218645948945615, "grad_norm": 0.3779536485671997, "learning_rate": 6.862945734915829e-06, "loss": 0.4531, "step": 1588 }, { "epoch": 1.3226970033296337, "grad_norm": 0.38031184673309326, "learning_rate": 6.858448582899183e-06, "loss": 0.4332, "step": 1589 }, { "epoch": 1.3235294117647058, "grad_norm": 0.3429410755634308, "learning_rate": 6.8539496857667785e-06, "loss": 0.3944, "step": 1590 }, { "epoch": 1.324361820199778, "grad_norm": 0.3546883463859558, "learning_rate": 6.849449047743158e-06, "loss": 0.4342, "step": 1591 }, { "epoch": 1.3251942286348501, "grad_norm": 0.39290136098861694, "learning_rate": 6.844946673054498e-06, "loss": 0.431, "step": 1592 }, { "epoch": 1.3260266370699223, "grad_norm": 0.3893829584121704, "learning_rate": 6.840442565928609e-06, "loss": 0.4844, "step": 1593 }, { "epoch": 1.3268590455049944, "grad_norm": 0.344115287065506, "learning_rate": 6.8359367305949256e-06, "loss": 0.4036, "step": 1594 }, { "epoch": 1.3276914539400666, "grad_norm": 0.3980282247066498, "learning_rate": 6.831429171284506e-06, "loss": 0.4888, "step": 1595 }, { "epoch": 1.3285238623751388, "grad_norm": 0.36405783891677856, "learning_rate": 6.8269198922300274e-06, "loss": 0.4067, "step": 1596 }, { "epoch": 1.329356270810211, "grad_norm": 0.3426407277584076, "learning_rate": 6.822408897665782e-06, "loss": 0.3926, "step": 1597 }, { "epoch": 1.330188679245283, "grad_norm": 0.35977932810783386, "learning_rate": 6.817896191827673e-06, "loss": 0.4423, "step": 1598 }, { "epoch": 1.3310210876803552, "grad_norm": 0.34656643867492676, "learning_rate": 6.81338177895321e-06, "loss": 0.4234, "step": 1599 }, { "epoch": 1.3318534961154274, "grad_norm": 0.4227568507194519, "learning_rate": 6.808865663281504e-06, "loss": 0.4866, "step": 1600 }, { "epoch": 1.3326859045504995, "grad_norm": 0.3399163782596588, "learning_rate": 6.8043478490532695e-06, "loss": 0.4099, "step": 1601 }, { "epoch": 1.3335183129855717, "grad_norm": 0.37561044096946716, "learning_rate": 6.799828340510811e-06, "loss": 0.4149, "step": 1602 }, { "epoch": 1.3343507214206438, "grad_norm": 0.40371450781822205, "learning_rate": 6.795307141898027e-06, "loss": 0.3866, "step": 1603 }, { "epoch": 1.3351831298557157, "grad_norm": 0.4308793842792511, "learning_rate": 6.790784257460403e-06, "loss": 0.4635, "step": 1604 }, { "epoch": 1.3360155382907881, "grad_norm": 0.33165696263313293, "learning_rate": 6.786259691445005e-06, "loss": 0.3694, "step": 1605 }, { "epoch": 1.33684794672586, "grad_norm": 0.37500032782554626, "learning_rate": 6.781733448100482e-06, "loss": 0.4279, "step": 1606 }, { "epoch": 1.3376803551609324, "grad_norm": 0.3972361981868744, "learning_rate": 6.777205531677052e-06, "loss": 0.4096, "step": 1607 }, { "epoch": 1.3385127635960044, "grad_norm": 0.41537413001060486, "learning_rate": 6.772675946426511e-06, "loss": 0.4562, "step": 1608 }, { "epoch": 1.3393451720310765, "grad_norm": 0.3932049870491028, "learning_rate": 6.768144696602219e-06, "loss": 0.4244, "step": 1609 }, { "epoch": 1.3401775804661487, "grad_norm": 0.3874600827693939, "learning_rate": 6.763611786459097e-06, "loss": 0.4485, "step": 1610 }, { "epoch": 1.3410099889012208, "grad_norm": 0.3644678294658661, "learning_rate": 6.759077220253628e-06, "loss": 0.3804, "step": 1611 }, { "epoch": 1.341842397336293, "grad_norm": 0.36026453971862793, "learning_rate": 6.7545410022438495e-06, "loss": 0.4498, "step": 1612 }, { "epoch": 1.342674805771365, "grad_norm": 0.3394271433353424, "learning_rate": 6.750003136689349e-06, "loss": 0.4059, "step": 1613 }, { "epoch": 1.3435072142064373, "grad_norm": 0.3929471969604492, "learning_rate": 6.745463627851261e-06, "loss": 0.4259, "step": 1614 }, { "epoch": 1.3443396226415094, "grad_norm": 0.34686771035194397, "learning_rate": 6.740922479992264e-06, "loss": 0.4158, "step": 1615 }, { "epoch": 1.3451720310765816, "grad_norm": 0.37497010827064514, "learning_rate": 6.736379697376578e-06, "loss": 0.3876, "step": 1616 }, { "epoch": 1.3460044395116537, "grad_norm": 0.4034978449344635, "learning_rate": 6.731835284269952e-06, "loss": 0.4623, "step": 1617 }, { "epoch": 1.3468368479467259, "grad_norm": 0.37284499406814575, "learning_rate": 6.727289244939671e-06, "loss": 0.3982, "step": 1618 }, { "epoch": 1.347669256381798, "grad_norm": 0.37045595049858093, "learning_rate": 6.722741583654545e-06, "loss": 0.4012, "step": 1619 }, { "epoch": 1.3485016648168702, "grad_norm": 0.39600858092308044, "learning_rate": 6.718192304684909e-06, "loss": 0.4241, "step": 1620 }, { "epoch": 1.3493340732519423, "grad_norm": 0.35542434453964233, "learning_rate": 6.713641412302614e-06, "loss": 0.4276, "step": 1621 }, { "epoch": 1.3501664816870145, "grad_norm": 0.40306901931762695, "learning_rate": 6.7090889107810275e-06, "loss": 0.4232, "step": 1622 }, { "epoch": 1.3509988901220866, "grad_norm": 0.4040415287017822, "learning_rate": 6.704534804395029e-06, "loss": 0.428, "step": 1623 }, { "epoch": 1.3518312985571588, "grad_norm": 0.41643235087394714, "learning_rate": 6.699979097421004e-06, "loss": 0.4166, "step": 1624 }, { "epoch": 1.352663706992231, "grad_norm": 0.38821861147880554, "learning_rate": 6.695421794136843e-06, "loss": 0.4237, "step": 1625 }, { "epoch": 1.353496115427303, "grad_norm": 0.42019572854042053, "learning_rate": 6.690862898821928e-06, "loss": 0.4018, "step": 1626 }, { "epoch": 1.354328523862375, "grad_norm": 0.41116052865982056, "learning_rate": 6.686302415757149e-06, "loss": 0.4343, "step": 1627 }, { "epoch": 1.3551609322974474, "grad_norm": 0.346463143825531, "learning_rate": 6.681740349224873e-06, "loss": 0.4361, "step": 1628 }, { "epoch": 1.3559933407325193, "grad_norm": 0.4325784742832184, "learning_rate": 6.677176703508963e-06, "loss": 0.4472, "step": 1629 }, { "epoch": 1.3568257491675917, "grad_norm": 0.3932659924030304, "learning_rate": 6.672611482894763e-06, "loss": 0.4421, "step": 1630 }, { "epoch": 1.3576581576026636, "grad_norm": 0.3726898729801178, "learning_rate": 6.668044691669094e-06, "loss": 0.4374, "step": 1631 }, { "epoch": 1.3584905660377358, "grad_norm": 0.3914749324321747, "learning_rate": 6.663476334120254e-06, "loss": 0.4275, "step": 1632 }, { "epoch": 1.359322974472808, "grad_norm": 0.3771442174911499, "learning_rate": 6.658906414538009e-06, "loss": 0.4356, "step": 1633 }, { "epoch": 1.36015538290788, "grad_norm": 0.41853395104408264, "learning_rate": 6.6543349372135946e-06, "loss": 0.4489, "step": 1634 }, { "epoch": 1.3609877913429522, "grad_norm": 0.3918192982673645, "learning_rate": 6.649761906439708e-06, "loss": 0.4469, "step": 1635 }, { "epoch": 1.3618201997780244, "grad_norm": 0.3729172348976135, "learning_rate": 6.6451873265105045e-06, "loss": 0.4394, "step": 1636 }, { "epoch": 1.3626526082130965, "grad_norm": 0.45446979999542236, "learning_rate": 6.6406112017215966e-06, "loss": 0.4286, "step": 1637 }, { "epoch": 1.3634850166481687, "grad_norm": 0.4050742983818054, "learning_rate": 6.6360335363700435e-06, "loss": 0.4214, "step": 1638 }, { "epoch": 1.3643174250832408, "grad_norm": 0.3428737223148346, "learning_rate": 6.631454334754353e-06, "loss": 0.4248, "step": 1639 }, { "epoch": 1.365149833518313, "grad_norm": 0.44034427404403687, "learning_rate": 6.626873601174478e-06, "loss": 0.4406, "step": 1640 }, { "epoch": 1.3659822419533851, "grad_norm": 0.38823384046554565, "learning_rate": 6.622291339931806e-06, "loss": 0.4313, "step": 1641 }, { "epoch": 1.3668146503884573, "grad_norm": 0.36457860469818115, "learning_rate": 6.61770755532916e-06, "loss": 0.4313, "step": 1642 }, { "epoch": 1.3676470588235294, "grad_norm": 0.3769315481185913, "learning_rate": 6.613122251670795e-06, "loss": 0.4295, "step": 1643 }, { "epoch": 1.3684794672586016, "grad_norm": 0.35854166746139526, "learning_rate": 6.608535433262391e-06, "loss": 0.4386, "step": 1644 }, { "epoch": 1.3693118756936737, "grad_norm": 0.33545204997062683, "learning_rate": 6.60394710441105e-06, "loss": 0.3737, "step": 1645 }, { "epoch": 1.370144284128746, "grad_norm": 0.37274548411369324, "learning_rate": 6.599357269425294e-06, "loss": 0.4362, "step": 1646 }, { "epoch": 1.370976692563818, "grad_norm": 0.3416080176830292, "learning_rate": 6.594765932615059e-06, "loss": 0.4346, "step": 1647 }, { "epoch": 1.3718091009988902, "grad_norm": 0.3786942660808563, "learning_rate": 6.59017309829169e-06, "loss": 0.4237, "step": 1648 }, { "epoch": 1.3726415094339623, "grad_norm": 0.36071640253067017, "learning_rate": 6.585578770767939e-06, "loss": 0.4231, "step": 1649 }, { "epoch": 1.3734739178690343, "grad_norm": 0.3784758746623993, "learning_rate": 6.5809829543579595e-06, "loss": 0.458, "step": 1650 }, { "epoch": 1.3743063263041067, "grad_norm": 0.31035900115966797, "learning_rate": 6.576385653377303e-06, "loss": 0.3688, "step": 1651 }, { "epoch": 1.3751387347391786, "grad_norm": 0.3701719641685486, "learning_rate": 6.5717868721429175e-06, "loss": 0.4114, "step": 1652 }, { "epoch": 1.375971143174251, "grad_norm": 0.42909175157546997, "learning_rate": 6.56718661497314e-06, "loss": 0.4538, "step": 1653 }, { "epoch": 1.3768035516093229, "grad_norm": 0.3627909719944, "learning_rate": 6.562584886187687e-06, "loss": 0.4097, "step": 1654 }, { "epoch": 1.377635960044395, "grad_norm": 0.36822018027305603, "learning_rate": 6.557981690107669e-06, "loss": 0.4246, "step": 1655 }, { "epoch": 1.3784683684794672, "grad_norm": 0.3948631286621094, "learning_rate": 6.553377031055564e-06, "loss": 0.4232, "step": 1656 }, { "epoch": 1.3793007769145393, "grad_norm": 0.4063318371772766, "learning_rate": 6.5487709133552275e-06, "loss": 0.4494, "step": 1657 }, { "epoch": 1.3801331853496115, "grad_norm": 0.36222216486930847, "learning_rate": 6.544163341331886e-06, "loss": 0.3822, "step": 1658 }, { "epoch": 1.3809655937846836, "grad_norm": 0.37411925196647644, "learning_rate": 6.539554319312129e-06, "loss": 0.4213, "step": 1659 }, { "epoch": 1.3817980022197558, "grad_norm": 0.39759203791618347, "learning_rate": 6.534943851623911e-06, "loss": 0.4124, "step": 1660 }, { "epoch": 1.382630410654828, "grad_norm": 0.3889487087726593, "learning_rate": 6.530331942596539e-06, "loss": 0.4449, "step": 1661 }, { "epoch": 1.3834628190899, "grad_norm": 0.3533934950828552, "learning_rate": 6.525718596560679e-06, "loss": 0.4068, "step": 1662 }, { "epoch": 1.3842952275249722, "grad_norm": 0.4298970699310303, "learning_rate": 6.521103817848342e-06, "loss": 0.4447, "step": 1663 }, { "epoch": 1.3851276359600444, "grad_norm": 0.3471847176551819, "learning_rate": 6.516487610792888e-06, "loss": 0.4082, "step": 1664 }, { "epoch": 1.3859600443951166, "grad_norm": 0.3483141362667084, "learning_rate": 6.511869979729013e-06, "loss": 0.4342, "step": 1665 }, { "epoch": 1.3867924528301887, "grad_norm": 0.34657421708106995, "learning_rate": 6.507250928992757e-06, "loss": 0.3625, "step": 1666 }, { "epoch": 1.3876248612652609, "grad_norm": 0.3735605478286743, "learning_rate": 6.5026304629214846e-06, "loss": 0.4006, "step": 1667 }, { "epoch": 1.388457269700333, "grad_norm": 0.35017845034599304, "learning_rate": 6.498008585853901e-06, "loss": 0.4373, "step": 1668 }, { "epoch": 1.3892896781354052, "grad_norm": 0.3881807029247284, "learning_rate": 6.493385302130023e-06, "loss": 0.3704, "step": 1669 }, { "epoch": 1.3901220865704773, "grad_norm": 0.3898671567440033, "learning_rate": 6.488760616091201e-06, "loss": 0.4166, "step": 1670 }, { "epoch": 1.3909544950055495, "grad_norm": 0.3844543695449829, "learning_rate": 6.484134532080091e-06, "loss": 0.444, "step": 1671 }, { "epoch": 1.3917869034406216, "grad_norm": 0.3733096718788147, "learning_rate": 6.479507054440671e-06, "loss": 0.4312, "step": 1672 }, { "epoch": 1.3926193118756935, "grad_norm": 0.3414270281791687, "learning_rate": 6.474878187518221e-06, "loss": 0.4285, "step": 1673 }, { "epoch": 1.393451720310766, "grad_norm": 0.3758377134799957, "learning_rate": 6.470247935659328e-06, "loss": 0.4341, "step": 1674 }, { "epoch": 1.3942841287458378, "grad_norm": 0.35513973236083984, "learning_rate": 6.465616303211881e-06, "loss": 0.4031, "step": 1675 }, { "epoch": 1.3951165371809102, "grad_norm": 0.36078712344169617, "learning_rate": 6.460983294525064e-06, "loss": 0.4193, "step": 1676 }, { "epoch": 1.3959489456159822, "grad_norm": 0.3634653091430664, "learning_rate": 6.456348913949352e-06, "loss": 0.4201, "step": 1677 }, { "epoch": 1.3967813540510543, "grad_norm": 0.36076661944389343, "learning_rate": 6.451713165836511e-06, "loss": 0.3709, "step": 1678 }, { "epoch": 1.3976137624861265, "grad_norm": 0.3891527056694031, "learning_rate": 6.447076054539588e-06, "loss": 0.4665, "step": 1679 }, { "epoch": 1.3984461709211986, "grad_norm": 0.32691043615341187, "learning_rate": 6.442437584412912e-06, "loss": 0.3864, "step": 1680 }, { "epoch": 1.3992785793562708, "grad_norm": 0.37780606746673584, "learning_rate": 6.43779775981209e-06, "loss": 0.4385, "step": 1681 }, { "epoch": 1.400110987791343, "grad_norm": 0.3391205966472626, "learning_rate": 6.433156585093994e-06, "loss": 0.4281, "step": 1682 }, { "epoch": 1.400943396226415, "grad_norm": 0.32850903272628784, "learning_rate": 6.4285140646167735e-06, "loss": 0.4434, "step": 1683 }, { "epoch": 1.4017758046614872, "grad_norm": 0.3246311843395233, "learning_rate": 6.423870202739831e-06, "loss": 0.3899, "step": 1684 }, { "epoch": 1.4026082130965594, "grad_norm": 0.36959534883499146, "learning_rate": 6.41922500382384e-06, "loss": 0.4113, "step": 1685 }, { "epoch": 1.4034406215316315, "grad_norm": 0.3844015598297119, "learning_rate": 6.414578472230719e-06, "loss": 0.4143, "step": 1686 }, { "epoch": 1.4042730299667037, "grad_norm": 0.3275579512119293, "learning_rate": 6.409930612323646e-06, "loss": 0.4199, "step": 1687 }, { "epoch": 1.4051054384017758, "grad_norm": 0.388668030500412, "learning_rate": 6.405281428467041e-06, "loss": 0.4404, "step": 1688 }, { "epoch": 1.405937846836848, "grad_norm": 0.375386506319046, "learning_rate": 6.400630925026568e-06, "loss": 0.4471, "step": 1689 }, { "epoch": 1.4067702552719201, "grad_norm": 0.3668792247772217, "learning_rate": 6.395979106369132e-06, "loss": 0.4422, "step": 1690 }, { "epoch": 1.4076026637069923, "grad_norm": 0.34209975600242615, "learning_rate": 6.391325976862872e-06, "loss": 0.3815, "step": 1691 }, { "epoch": 1.4084350721420644, "grad_norm": 0.33450645208358765, "learning_rate": 6.386671540877162e-06, "loss": 0.3906, "step": 1692 }, { "epoch": 1.4092674805771366, "grad_norm": 0.34266212582588196, "learning_rate": 6.382015802782592e-06, "loss": 0.3992, "step": 1693 }, { "epoch": 1.4100998890122087, "grad_norm": 0.38996613025665283, "learning_rate": 6.377358766950987e-06, "loss": 0.4352, "step": 1694 }, { "epoch": 1.4109322974472809, "grad_norm": 0.3618418872356415, "learning_rate": 6.372700437755381e-06, "loss": 0.4297, "step": 1695 }, { "epoch": 1.4117647058823528, "grad_norm": 0.36019423604011536, "learning_rate": 6.368040819570032e-06, "loss": 0.4046, "step": 1696 }, { "epoch": 1.4125971143174252, "grad_norm": 0.3539135158061981, "learning_rate": 6.3633799167703954e-06, "loss": 0.4039, "step": 1697 }, { "epoch": 1.4134295227524971, "grad_norm": 0.37196677923202515, "learning_rate": 6.35871773373315e-06, "loss": 0.4241, "step": 1698 }, { "epoch": 1.4142619311875695, "grad_norm": 0.33772680163383484, "learning_rate": 6.3540542748361585e-06, "loss": 0.4144, "step": 1699 }, { "epoch": 1.4150943396226414, "grad_norm": 0.36471793055534363, "learning_rate": 6.349389544458497e-06, "loss": 0.4414, "step": 1700 }, { "epoch": 1.4159267480577136, "grad_norm": 0.39760175347328186, "learning_rate": 6.3447235469804255e-06, "loss": 0.4318, "step": 1701 }, { "epoch": 1.4167591564927857, "grad_norm": 0.34777477383613586, "learning_rate": 6.3400562867833984e-06, "loss": 0.406, "step": 1702 }, { "epoch": 1.4175915649278579, "grad_norm": 0.37017935514450073, "learning_rate": 6.335387768250054e-06, "loss": 0.4616, "step": 1703 }, { "epoch": 1.41842397336293, "grad_norm": 0.36705178022384644, "learning_rate": 6.330717995764215e-06, "loss": 0.4275, "step": 1704 }, { "epoch": 1.4192563817980022, "grad_norm": 0.34226226806640625, "learning_rate": 6.326046973710878e-06, "loss": 0.428, "step": 1705 }, { "epoch": 1.4200887902330743, "grad_norm": 0.3641026020050049, "learning_rate": 6.321374706476212e-06, "loss": 0.3847, "step": 1706 }, { "epoch": 1.4209211986681465, "grad_norm": 0.39205509424209595, "learning_rate": 6.316701198447562e-06, "loss": 0.4306, "step": 1707 }, { "epoch": 1.4217536071032186, "grad_norm": 0.32389676570892334, "learning_rate": 6.312026454013431e-06, "loss": 0.4076, "step": 1708 }, { "epoch": 1.4225860155382908, "grad_norm": 0.35079190135002136, "learning_rate": 6.3073504775634884e-06, "loss": 0.4088, "step": 1709 }, { "epoch": 1.423418423973363, "grad_norm": 0.3681551218032837, "learning_rate": 6.302673273488556e-06, "loss": 0.4128, "step": 1710 }, { "epoch": 1.424250832408435, "grad_norm": 0.3662969470024109, "learning_rate": 6.297994846180611e-06, "loss": 0.4279, "step": 1711 }, { "epoch": 1.4250832408435072, "grad_norm": 0.5505783557891846, "learning_rate": 6.293315200032777e-06, "loss": 0.4447, "step": 1712 }, { "epoch": 1.4259156492785794, "grad_norm": 0.35179439187049866, "learning_rate": 6.288634339439328e-06, "loss": 0.4202, "step": 1713 }, { "epoch": 1.4267480577136515, "grad_norm": 0.3551366329193115, "learning_rate": 6.283952268795669e-06, "loss": 0.435, "step": 1714 }, { "epoch": 1.4275804661487237, "grad_norm": 0.4332524240016937, "learning_rate": 6.279268992498349e-06, "loss": 0.4654, "step": 1715 }, { "epoch": 1.4284128745837958, "grad_norm": 0.35650166869163513, "learning_rate": 6.274584514945046e-06, "loss": 0.4416, "step": 1716 }, { "epoch": 1.429245283018868, "grad_norm": 0.3940298855304718, "learning_rate": 6.269898840534566e-06, "loss": 0.4315, "step": 1717 }, { "epoch": 1.4300776914539401, "grad_norm": 0.3638506829738617, "learning_rate": 6.26521197366684e-06, "loss": 0.4081, "step": 1718 }, { "epoch": 1.430910099889012, "grad_norm": 0.4078425168991089, "learning_rate": 6.2605239187429175e-06, "loss": 0.472, "step": 1719 }, { "epoch": 1.4317425083240845, "grad_norm": 0.42004722356796265, "learning_rate": 6.255834680164966e-06, "loss": 0.3941, "step": 1720 }, { "epoch": 1.4325749167591564, "grad_norm": 0.38885217905044556, "learning_rate": 6.2511442623362585e-06, "loss": 0.4549, "step": 1721 }, { "epoch": 1.4334073251942288, "grad_norm": 0.3824394643306732, "learning_rate": 6.246452669661184e-06, "loss": 0.4207, "step": 1722 }, { "epoch": 1.4342397336293007, "grad_norm": 0.39993953704833984, "learning_rate": 6.241759906545226e-06, "loss": 0.4386, "step": 1723 }, { "epoch": 1.435072142064373, "grad_norm": 0.34289178252220154, "learning_rate": 6.237065977394976e-06, "loss": 0.4224, "step": 1724 }, { "epoch": 1.435904550499445, "grad_norm": 0.3350009024143219, "learning_rate": 6.23237088661811e-06, "loss": 0.4408, "step": 1725 }, { "epoch": 1.4367369589345171, "grad_norm": 0.337027907371521, "learning_rate": 6.227674638623406e-06, "loss": 0.3893, "step": 1726 }, { "epoch": 1.4375693673695893, "grad_norm": 0.37013328075408936, "learning_rate": 6.22297723782072e-06, "loss": 0.4376, "step": 1727 }, { "epoch": 1.4384017758046614, "grad_norm": 0.3607780337333679, "learning_rate": 6.218278688620994e-06, "loss": 0.4236, "step": 1728 }, { "epoch": 1.4392341842397336, "grad_norm": 0.3762837052345276, "learning_rate": 6.213578995436248e-06, "loss": 0.4209, "step": 1729 }, { "epoch": 1.4400665926748057, "grad_norm": 0.365888774394989, "learning_rate": 6.208878162679577e-06, "loss": 0.4778, "step": 1730 }, { "epoch": 1.440899001109878, "grad_norm": 0.37897226214408875, "learning_rate": 6.204176194765143e-06, "loss": 0.4331, "step": 1731 }, { "epoch": 1.44173140954495, "grad_norm": 0.3936547040939331, "learning_rate": 6.199473096108179e-06, "loss": 0.4007, "step": 1732 }, { "epoch": 1.4425638179800222, "grad_norm": 0.33423808217048645, "learning_rate": 6.194768871124976e-06, "loss": 0.3973, "step": 1733 }, { "epoch": 1.4433962264150944, "grad_norm": 0.395235538482666, "learning_rate": 6.190063524232883e-06, "loss": 0.4326, "step": 1734 }, { "epoch": 1.4442286348501665, "grad_norm": 0.35545146465301514, "learning_rate": 6.1853570598503045e-06, "loss": 0.3881, "step": 1735 }, { "epoch": 1.4450610432852387, "grad_norm": 0.36360153555870056, "learning_rate": 6.18064948239669e-06, "loss": 0.3702, "step": 1736 }, { "epoch": 1.4458934517203108, "grad_norm": 0.4225115478038788, "learning_rate": 6.175940796292541e-06, "loss": 0.4497, "step": 1737 }, { "epoch": 1.446725860155383, "grad_norm": 0.3660155236721039, "learning_rate": 6.171231005959393e-06, "loss": 0.4226, "step": 1738 }, { "epoch": 1.447558268590455, "grad_norm": 0.38432952761650085, "learning_rate": 6.166520115819825e-06, "loss": 0.4009, "step": 1739 }, { "epoch": 1.4483906770255273, "grad_norm": 0.3980556130409241, "learning_rate": 6.161808130297442e-06, "loss": 0.4006, "step": 1740 }, { "epoch": 1.4492230854605994, "grad_norm": 0.34962958097457886, "learning_rate": 6.157095053816882e-06, "loss": 0.4289, "step": 1741 }, { "epoch": 1.4500554938956713, "grad_norm": 0.3962990939617157, "learning_rate": 6.152380890803806e-06, "loss": 0.4005, "step": 1742 }, { "epoch": 1.4508879023307437, "grad_norm": 0.3911704123020172, "learning_rate": 6.147665645684897e-06, "loss": 0.4301, "step": 1743 }, { "epoch": 1.4517203107658156, "grad_norm": 0.35797592997550964, "learning_rate": 6.142949322887852e-06, "loss": 0.4265, "step": 1744 }, { "epoch": 1.452552719200888, "grad_norm": 0.39295870065689087, "learning_rate": 6.138231926841381e-06, "loss": 0.3893, "step": 1745 }, { "epoch": 1.45338512763596, "grad_norm": 0.387833833694458, "learning_rate": 6.1335134619751994e-06, "loss": 0.4233, "step": 1746 }, { "epoch": 1.4542175360710323, "grad_norm": 0.3874453902244568, "learning_rate": 6.128793932720031e-06, "loss": 0.4878, "step": 1747 }, { "epoch": 1.4550499445061043, "grad_norm": 0.4330272078514099, "learning_rate": 6.1240733435075946e-06, "loss": 0.4074, "step": 1748 }, { "epoch": 1.4558823529411764, "grad_norm": 0.3374917507171631, "learning_rate": 6.119351698770607e-06, "loss": 0.4121, "step": 1749 }, { "epoch": 1.4567147613762486, "grad_norm": 0.37870532274246216, "learning_rate": 6.1146290029427755e-06, "loss": 0.4317, "step": 1750 }, { "epoch": 1.4575471698113207, "grad_norm": 0.42150020599365234, "learning_rate": 6.1099052604587935e-06, "loss": 0.4376, "step": 1751 }, { "epoch": 1.4583795782463929, "grad_norm": 0.3680340051651001, "learning_rate": 6.105180475754341e-06, "loss": 0.4167, "step": 1752 }, { "epoch": 1.459211986681465, "grad_norm": 0.32261571288108826, "learning_rate": 6.100454653266068e-06, "loss": 0.3828, "step": 1753 }, { "epoch": 1.4600443951165372, "grad_norm": 0.3715158700942993, "learning_rate": 6.095727797431607e-06, "loss": 0.4435, "step": 1754 }, { "epoch": 1.4608768035516093, "grad_norm": 0.3581849932670593, "learning_rate": 6.0909999126895605e-06, "loss": 0.431, "step": 1755 }, { "epoch": 1.4617092119866815, "grad_norm": 0.3825138509273529, "learning_rate": 6.086271003479492e-06, "loss": 0.4651, "step": 1756 }, { "epoch": 1.4625416204217536, "grad_norm": 0.38319095969200134, "learning_rate": 6.081541074241932e-06, "loss": 0.4446, "step": 1757 }, { "epoch": 1.4633740288568258, "grad_norm": 0.34215807914733887, "learning_rate": 6.076810129418367e-06, "loss": 0.3892, "step": 1758 }, { "epoch": 1.464206437291898, "grad_norm": 0.3696172833442688, "learning_rate": 6.072078173451235e-06, "loss": 0.4422, "step": 1759 }, { "epoch": 1.46503884572697, "grad_norm": 0.42305469512939453, "learning_rate": 6.067345210783927e-06, "loss": 0.4343, "step": 1760 }, { "epoch": 1.4658712541620422, "grad_norm": 0.4272174537181854, "learning_rate": 6.062611245860778e-06, "loss": 0.416, "step": 1761 }, { "epoch": 1.4667036625971144, "grad_norm": 0.33553504943847656, "learning_rate": 6.057876283127062e-06, "loss": 0.3888, "step": 1762 }, { "epoch": 1.4675360710321865, "grad_norm": 0.4211355149745941, "learning_rate": 6.053140327028996e-06, "loss": 0.4565, "step": 1763 }, { "epoch": 1.4683684794672587, "grad_norm": 0.4059634208679199, "learning_rate": 6.048403382013721e-06, "loss": 0.4007, "step": 1764 }, { "epoch": 1.4692008879023306, "grad_norm": 0.3843512535095215, "learning_rate": 6.043665452529315e-06, "loss": 0.4307, "step": 1765 }, { "epoch": 1.470033296337403, "grad_norm": 0.44470739364624023, "learning_rate": 6.038926543024774e-06, "loss": 0.4408, "step": 1766 }, { "epoch": 1.470865704772475, "grad_norm": 0.3952719271183014, "learning_rate": 6.034186657950019e-06, "loss": 0.3943, "step": 1767 }, { "epoch": 1.4716981132075473, "grad_norm": 0.4131404161453247, "learning_rate": 6.029445801755884e-06, "loss": 0.4505, "step": 1768 }, { "epoch": 1.4725305216426192, "grad_norm": 0.4458027184009552, "learning_rate": 6.024703978894118e-06, "loss": 0.4549, "step": 1769 }, { "epoch": 1.4733629300776916, "grad_norm": 0.3414630889892578, "learning_rate": 6.019961193817371e-06, "loss": 0.3777, "step": 1770 }, { "epoch": 1.4741953385127635, "grad_norm": 0.3965328633785248, "learning_rate": 6.015217450979206e-06, "loss": 0.4192, "step": 1771 }, { "epoch": 1.4750277469478357, "grad_norm": 0.4391860365867615, "learning_rate": 6.010472754834078e-06, "loss": 0.471, "step": 1772 }, { "epoch": 1.4758601553829078, "grad_norm": 0.34298470616340637, "learning_rate": 6.00572710983734e-06, "loss": 0.4105, "step": 1773 }, { "epoch": 1.47669256381798, "grad_norm": 0.37611621618270874, "learning_rate": 6.000980520445237e-06, "loss": 0.3869, "step": 1774 }, { "epoch": 1.4775249722530521, "grad_norm": 0.3933986723423004, "learning_rate": 5.9962329911148985e-06, "loss": 0.4365, "step": 1775 }, { "epoch": 1.4783573806881243, "grad_norm": 0.36345475912094116, "learning_rate": 5.991484526304338e-06, "loss": 0.401, "step": 1776 }, { "epoch": 1.4791897891231964, "grad_norm": 0.4031011462211609, "learning_rate": 5.986735130472449e-06, "loss": 0.4051, "step": 1777 }, { "epoch": 1.4800221975582686, "grad_norm": 0.3900188207626343, "learning_rate": 5.981984808078993e-06, "loss": 0.4398, "step": 1778 }, { "epoch": 1.4808546059933407, "grad_norm": 0.3524153232574463, "learning_rate": 5.97723356358461e-06, "loss": 0.4145, "step": 1779 }, { "epoch": 1.4816870144284129, "grad_norm": 0.3924466669559479, "learning_rate": 5.972481401450798e-06, "loss": 0.4317, "step": 1780 }, { "epoch": 1.482519422863485, "grad_norm": 0.33220553398132324, "learning_rate": 5.967728326139926e-06, "loss": 0.4181, "step": 1781 }, { "epoch": 1.4833518312985572, "grad_norm": 0.39167076349258423, "learning_rate": 5.962974342115209e-06, "loss": 0.4228, "step": 1782 }, { "epoch": 1.4841842397336293, "grad_norm": 0.3629782497882843, "learning_rate": 5.9582194538407235e-06, "loss": 0.4085, "step": 1783 }, { "epoch": 1.4850166481687015, "grad_norm": 0.3289967179298401, "learning_rate": 5.9534636657813935e-06, "loss": 0.4189, "step": 1784 }, { "epoch": 1.4858490566037736, "grad_norm": 0.36576929688453674, "learning_rate": 5.948706982402987e-06, "loss": 0.452, "step": 1785 }, { "epoch": 1.4866814650388458, "grad_norm": 0.3725149631500244, "learning_rate": 5.9439494081721125e-06, "loss": 0.4378, "step": 1786 }, { "epoch": 1.487513873473918, "grad_norm": 0.3603670299053192, "learning_rate": 5.939190947556216e-06, "loss": 0.4281, "step": 1787 }, { "epoch": 1.4883462819089899, "grad_norm": 0.35488083958625793, "learning_rate": 5.934431605023575e-06, "loss": 0.4333, "step": 1788 }, { "epoch": 1.4891786903440623, "grad_norm": 0.33910173177719116, "learning_rate": 5.929671385043296e-06, "loss": 0.3941, "step": 1789 }, { "epoch": 1.4900110987791342, "grad_norm": 0.4183632433414459, "learning_rate": 5.924910292085308e-06, "loss": 0.4207, "step": 1790 }, { "epoch": 1.4908435072142066, "grad_norm": 0.3865908086299896, "learning_rate": 5.920148330620362e-06, "loss": 0.4192, "step": 1791 }, { "epoch": 1.4916759156492785, "grad_norm": 0.34254950284957886, "learning_rate": 5.915385505120024e-06, "loss": 0.4404, "step": 1792 }, { "epoch": 1.4925083240843509, "grad_norm": 0.399553120136261, "learning_rate": 5.9106218200566646e-06, "loss": 0.4519, "step": 1793 }, { "epoch": 1.4933407325194228, "grad_norm": 0.33408740162849426, "learning_rate": 5.905857279903475e-06, "loss": 0.3971, "step": 1794 }, { "epoch": 1.494173140954495, "grad_norm": 0.3779832422733307, "learning_rate": 5.9010918891344375e-06, "loss": 0.4478, "step": 1795 }, { "epoch": 1.495005549389567, "grad_norm": 0.3475707471370697, "learning_rate": 5.896325652224339e-06, "loss": 0.4329, "step": 1796 }, { "epoch": 1.4958379578246392, "grad_norm": 0.3472834527492523, "learning_rate": 5.891558573648759e-06, "loss": 0.4036, "step": 1797 }, { "epoch": 1.4966703662597114, "grad_norm": 0.36704128980636597, "learning_rate": 5.886790657884067e-06, "loss": 0.4479, "step": 1798 }, { "epoch": 1.4975027746947835, "grad_norm": 0.39827099442481995, "learning_rate": 5.8820219094074215e-06, "loss": 0.4401, "step": 1799 }, { "epoch": 1.4983351831298557, "grad_norm": 0.37675169110298157, "learning_rate": 5.877252332696759e-06, "loss": 0.439, "step": 1800 }, { "epoch": 1.4991675915649278, "grad_norm": 0.3367178738117218, "learning_rate": 5.8724819322307955e-06, "loss": 0.4434, "step": 1801 }, { "epoch": 1.5, "grad_norm": 0.3833068013191223, "learning_rate": 5.8677107124890206e-06, "loss": 0.4348, "step": 1802 }, { "epoch": 1.5008324084350722, "grad_norm": 0.333822101354599, "learning_rate": 5.862938677951695e-06, "loss": 0.4245, "step": 1803 }, { "epoch": 1.5016648168701443, "grad_norm": 0.33005255460739136, "learning_rate": 5.85816583309984e-06, "loss": 0.4149, "step": 1804 }, { "epoch": 1.5024972253052165, "grad_norm": 0.3507244288921356, "learning_rate": 5.853392182415244e-06, "loss": 0.4029, "step": 1805 }, { "epoch": 1.5033296337402886, "grad_norm": 0.34437042474746704, "learning_rate": 5.848617730380444e-06, "loss": 0.3865, "step": 1806 }, { "epoch": 1.5041620421753608, "grad_norm": 0.33490440249443054, "learning_rate": 5.843842481478739e-06, "loss": 0.4468, "step": 1807 }, { "epoch": 1.504994450610433, "grad_norm": 0.34904393553733826, "learning_rate": 5.839066440194165e-06, "loss": 0.4286, "step": 1808 }, { "epoch": 1.5058268590455048, "grad_norm": 0.3498096168041229, "learning_rate": 5.834289611011515e-06, "loss": 0.4244, "step": 1809 }, { "epoch": 1.5066592674805772, "grad_norm": 0.3397802710533142, "learning_rate": 5.82951199841631e-06, "loss": 0.4203, "step": 1810 }, { "epoch": 1.5074916759156491, "grad_norm": 0.34732288122177124, "learning_rate": 5.824733606894818e-06, "loss": 0.4276, "step": 1811 }, { "epoch": 1.5083240843507215, "grad_norm": 0.3793588876724243, "learning_rate": 5.819954440934026e-06, "loss": 0.4587, "step": 1812 }, { "epoch": 1.5091564927857934, "grad_norm": 0.339600145816803, "learning_rate": 5.815174505021659e-06, "loss": 0.3988, "step": 1813 }, { "epoch": 1.5099889012208658, "grad_norm": 0.37940147519111633, "learning_rate": 5.810393803646157e-06, "loss": 0.4525, "step": 1814 }, { "epoch": 1.5108213096559377, "grad_norm": 0.36495909094810486, "learning_rate": 5.805612341296685e-06, "loss": 0.3835, "step": 1815 }, { "epoch": 1.5116537180910101, "grad_norm": 0.38348937034606934, "learning_rate": 5.800830122463117e-06, "loss": 0.4352, "step": 1816 }, { "epoch": 1.512486126526082, "grad_norm": 0.35472238063812256, "learning_rate": 5.7960471516360435e-06, "loss": 0.4223, "step": 1817 }, { "epoch": 1.5133185349611544, "grad_norm": 0.4020749032497406, "learning_rate": 5.791263433306758e-06, "loss": 0.4726, "step": 1818 }, { "epoch": 1.5141509433962264, "grad_norm": 0.3420332372188568, "learning_rate": 5.786478971967249e-06, "loss": 0.3763, "step": 1819 }, { "epoch": 1.5149833518312985, "grad_norm": 0.35677722096443176, "learning_rate": 5.781693772110219e-06, "loss": 0.4016, "step": 1820 }, { "epoch": 1.5158157602663707, "grad_norm": 0.3244923949241638, "learning_rate": 5.776907838229049e-06, "loss": 0.4264, "step": 1821 }, { "epoch": 1.5166481687014428, "grad_norm": 0.3919678032398224, "learning_rate": 5.772121174817816e-06, "loss": 0.451, "step": 1822 }, { "epoch": 1.517480577136515, "grad_norm": 0.3128311336040497, "learning_rate": 5.767333786371279e-06, "loss": 0.3684, "step": 1823 }, { "epoch": 1.5183129855715871, "grad_norm": 0.3211652338504791, "learning_rate": 5.762545677384884e-06, "loss": 0.4176, "step": 1824 }, { "epoch": 1.5191453940066593, "grad_norm": 0.3597281277179718, "learning_rate": 5.757756852354743e-06, "loss": 0.4235, "step": 1825 }, { "epoch": 1.5199778024417314, "grad_norm": 0.3358439803123474, "learning_rate": 5.752967315777653e-06, "loss": 0.3894, "step": 1826 }, { "epoch": 1.5208102108768036, "grad_norm": 0.3780466616153717, "learning_rate": 5.748177072151068e-06, "loss": 0.4414, "step": 1827 }, { "epoch": 1.5216426193118757, "grad_norm": 0.341908723115921, "learning_rate": 5.743386125973112e-06, "loss": 0.4439, "step": 1828 }, { "epoch": 1.5224750277469479, "grad_norm": 0.3324446976184845, "learning_rate": 5.738594481742568e-06, "loss": 0.4143, "step": 1829 }, { "epoch": 1.52330743618202, "grad_norm": 0.37258070707321167, "learning_rate": 5.733802143958872e-06, "loss": 0.4369, "step": 1830 }, { "epoch": 1.5241398446170922, "grad_norm": 0.34372997283935547, "learning_rate": 5.729009117122117e-06, "loss": 0.4175, "step": 1831 }, { "epoch": 1.524972253052164, "grad_norm": 0.3553544580936432, "learning_rate": 5.724215405733033e-06, "loss": 0.4233, "step": 1832 }, { "epoch": 1.5258046614872365, "grad_norm": 0.34760257601737976, "learning_rate": 5.7194210142930065e-06, "loss": 0.4174, "step": 1833 }, { "epoch": 1.5266370699223084, "grad_norm": 0.37886732816696167, "learning_rate": 5.714625947304048e-06, "loss": 0.4328, "step": 1834 }, { "epoch": 1.5274694783573808, "grad_norm": 0.35653242468833923, "learning_rate": 5.709830209268814e-06, "loss": 0.4064, "step": 1835 }, { "epoch": 1.5283018867924527, "grad_norm": 0.4173387885093689, "learning_rate": 5.705033804690583e-06, "loss": 0.4177, "step": 1836 }, { "epoch": 1.529134295227525, "grad_norm": 0.33839157223701477, "learning_rate": 5.7002367380732685e-06, "loss": 0.3887, "step": 1837 }, { "epoch": 1.529966703662597, "grad_norm": 0.35368192195892334, "learning_rate": 5.695439013921391e-06, "loss": 0.3985, "step": 1838 }, { "epoch": 1.5307991120976694, "grad_norm": 0.3887537121772766, "learning_rate": 5.6906406367401075e-06, "loss": 0.4437, "step": 1839 }, { "epoch": 1.5316315205327413, "grad_norm": 0.3811011016368866, "learning_rate": 5.6858416110351715e-06, "loss": 0.4423, "step": 1840 }, { "epoch": 1.5324639289678137, "grad_norm": 0.33723312616348267, "learning_rate": 5.681041941312954e-06, "loss": 0.411, "step": 1841 }, { "epoch": 1.5332963374028856, "grad_norm": 0.4133073389530182, "learning_rate": 5.676241632080429e-06, "loss": 0.4312, "step": 1842 }, { "epoch": 1.5341287458379578, "grad_norm": 0.3302142918109894, "learning_rate": 5.6714406878451715e-06, "loss": 0.3994, "step": 1843 }, { "epoch": 1.53496115427303, "grad_norm": 0.36849483847618103, "learning_rate": 5.666639113115351e-06, "loss": 0.4616, "step": 1844 }, { "epoch": 1.535793562708102, "grad_norm": 0.38487333059310913, "learning_rate": 5.661836912399731e-06, "loss": 0.4145, "step": 1845 }, { "epoch": 1.5366259711431742, "grad_norm": 0.37898313999176025, "learning_rate": 5.657034090207663e-06, "loss": 0.4573, "step": 1846 }, { "epoch": 1.5374583795782464, "grad_norm": 0.35560891032218933, "learning_rate": 5.652230651049077e-06, "loss": 0.4162, "step": 1847 }, { "epoch": 1.5382907880133185, "grad_norm": 0.3300834894180298, "learning_rate": 5.647426599434493e-06, "loss": 0.4577, "step": 1848 }, { "epoch": 1.5391231964483907, "grad_norm": 0.32240939140319824, "learning_rate": 5.642621939874995e-06, "loss": 0.3774, "step": 1849 }, { "epoch": 1.5399556048834628, "grad_norm": 0.41031232476234436, "learning_rate": 5.637816676882244e-06, "loss": 0.4475, "step": 1850 }, { "epoch": 1.540788013318535, "grad_norm": 0.32196757197380066, "learning_rate": 5.633010814968465e-06, "loss": 0.4096, "step": 1851 }, { "epoch": 1.5416204217536071, "grad_norm": 0.38092145323753357, "learning_rate": 5.628204358646448e-06, "loss": 0.4481, "step": 1852 }, { "epoch": 1.5424528301886793, "grad_norm": 0.34755241870880127, "learning_rate": 5.623397312429537e-06, "loss": 0.431, "step": 1853 }, { "epoch": 1.5432852386237514, "grad_norm": 0.33400028944015503, "learning_rate": 5.618589680831636e-06, "loss": 0.3853, "step": 1854 }, { "epoch": 1.5441176470588234, "grad_norm": 0.35710418224334717, "learning_rate": 5.6137814683671935e-06, "loss": 0.4172, "step": 1855 }, { "epoch": 1.5449500554938957, "grad_norm": 0.37034469842910767, "learning_rate": 5.608972679551205e-06, "loss": 0.3994, "step": 1856 }, { "epoch": 1.5457824639289677, "grad_norm": 0.33173617720603943, "learning_rate": 5.604163318899207e-06, "loss": 0.4299, "step": 1857 }, { "epoch": 1.54661487236404, "grad_norm": 0.3156852424144745, "learning_rate": 5.599353390927275e-06, "loss": 0.4213, "step": 1858 }, { "epoch": 1.547447280799112, "grad_norm": 0.3290148377418518, "learning_rate": 5.594542900152015e-06, "loss": 0.4192, "step": 1859 }, { "epoch": 1.5482796892341844, "grad_norm": 0.3706801235675812, "learning_rate": 5.589731851090559e-06, "loss": 0.4677, "step": 1860 }, { "epoch": 1.5491120976692563, "grad_norm": 0.33532100915908813, "learning_rate": 5.584920248260572e-06, "loss": 0.4131, "step": 1861 }, { "epoch": 1.5499445061043287, "grad_norm": 0.3689619302749634, "learning_rate": 5.580108096180229e-06, "loss": 0.4205, "step": 1862 }, { "epoch": 1.5507769145394006, "grad_norm": 0.32412606477737427, "learning_rate": 5.575295399368228e-06, "loss": 0.4127, "step": 1863 }, { "epoch": 1.551609322974473, "grad_norm": 0.363091379404068, "learning_rate": 5.570482162343772e-06, "loss": 0.4362, "step": 1864 }, { "epoch": 1.552441731409545, "grad_norm": 0.37017905712127686, "learning_rate": 5.5656683896265786e-06, "loss": 0.4014, "step": 1865 }, { "epoch": 1.553274139844617, "grad_norm": 0.3684568405151367, "learning_rate": 5.560854085736861e-06, "loss": 0.4703, "step": 1866 }, { "epoch": 1.5541065482796892, "grad_norm": 0.32582852244377136, "learning_rate": 5.556039255195338e-06, "loss": 0.4202, "step": 1867 }, { "epoch": 1.5549389567147613, "grad_norm": 0.37376418709754944, "learning_rate": 5.551223902523218e-06, "loss": 0.4237, "step": 1868 }, { "epoch": 1.5557713651498335, "grad_norm": 0.3530517518520355, "learning_rate": 5.546408032242202e-06, "loss": 0.4311, "step": 1869 }, { "epoch": 1.5566037735849056, "grad_norm": 0.34218358993530273, "learning_rate": 5.541591648874476e-06, "loss": 0.4404, "step": 1870 }, { "epoch": 1.5574361820199778, "grad_norm": 0.31669047474861145, "learning_rate": 5.53677475694271e-06, "loss": 0.4103, "step": 1871 }, { "epoch": 1.55826859045505, "grad_norm": 0.3490392863750458, "learning_rate": 5.531957360970048e-06, "loss": 0.3915, "step": 1872 }, { "epoch": 1.559100998890122, "grad_norm": 0.3820657730102539, "learning_rate": 5.527139465480109e-06, "loss": 0.4277, "step": 1873 }, { "epoch": 1.5599334073251943, "grad_norm": 0.35189470648765564, "learning_rate": 5.5223210749969845e-06, "loss": 0.4014, "step": 1874 }, { "epoch": 1.5607658157602664, "grad_norm": 0.36156129837036133, "learning_rate": 5.5175021940452225e-06, "loss": 0.413, "step": 1875 }, { "epoch": 1.5615982241953386, "grad_norm": 0.32655417919158936, "learning_rate": 5.512682827149841e-06, "loss": 0.4177, "step": 1876 }, { "epoch": 1.5624306326304107, "grad_norm": 0.3407149314880371, "learning_rate": 5.507862978836306e-06, "loss": 0.4112, "step": 1877 }, { "epoch": 1.5632630410654826, "grad_norm": 0.41151124238967896, "learning_rate": 5.503042653630543e-06, "loss": 0.4631, "step": 1878 }, { "epoch": 1.564095449500555, "grad_norm": 0.3386502265930176, "learning_rate": 5.49822185605892e-06, "loss": 0.3822, "step": 1879 }, { "epoch": 1.564927857935627, "grad_norm": 0.3886801302433014, "learning_rate": 5.4934005906482525e-06, "loss": 0.4602, "step": 1880 }, { "epoch": 1.5657602663706993, "grad_norm": 0.3777741491794586, "learning_rate": 5.488578861925788e-06, "loss": 0.4215, "step": 1881 }, { "epoch": 1.5665926748057712, "grad_norm": 0.3248192369937897, "learning_rate": 5.4837566744192196e-06, "loss": 0.3973, "step": 1882 }, { "epoch": 1.5674250832408436, "grad_norm": 0.3611052930355072, "learning_rate": 5.478934032656663e-06, "loss": 0.4408, "step": 1883 }, { "epoch": 1.5682574916759155, "grad_norm": 0.35995787382125854, "learning_rate": 5.4741109411666635e-06, "loss": 0.401, "step": 1884 }, { "epoch": 1.569089900110988, "grad_norm": 0.39710691571235657, "learning_rate": 5.46928740447819e-06, "loss": 0.4711, "step": 1885 }, { "epoch": 1.5699223085460599, "grad_norm": 0.34556522965431213, "learning_rate": 5.464463427120626e-06, "loss": 0.4347, "step": 1886 }, { "epoch": 1.5707547169811322, "grad_norm": 0.3444367051124573, "learning_rate": 5.459639013623772e-06, "loss": 0.3928, "step": 1887 }, { "epoch": 1.5715871254162042, "grad_norm": 0.3429131805896759, "learning_rate": 5.454814168517836e-06, "loss": 0.4362, "step": 1888 }, { "epoch": 1.5724195338512763, "grad_norm": 0.3870256841182709, "learning_rate": 5.449988896333431e-06, "loss": 0.4537, "step": 1889 }, { "epoch": 1.5732519422863485, "grad_norm": 0.3829275071620941, "learning_rate": 5.445163201601575e-06, "loss": 0.4134, "step": 1890 }, { "epoch": 1.5740843507214206, "grad_norm": 0.37084445357322693, "learning_rate": 5.440337088853679e-06, "loss": 0.3903, "step": 1891 }, { "epoch": 1.5749167591564928, "grad_norm": 0.36003977060317993, "learning_rate": 5.435510562621544e-06, "loss": 0.4641, "step": 1892 }, { "epoch": 1.575749167591565, "grad_norm": 0.4320586025714874, "learning_rate": 5.4306836274373675e-06, "loss": 0.4261, "step": 1893 }, { "epoch": 1.576581576026637, "grad_norm": 0.35853341221809387, "learning_rate": 5.425856287833723e-06, "loss": 0.4363, "step": 1894 }, { "epoch": 1.5774139844617092, "grad_norm": 0.3714522123336792, "learning_rate": 5.421028548343568e-06, "loss": 0.4157, "step": 1895 }, { "epoch": 1.5782463928967814, "grad_norm": 0.37098610401153564, "learning_rate": 5.4162004135002336e-06, "loss": 0.4233, "step": 1896 }, { "epoch": 1.5790788013318535, "grad_norm": 0.35609301924705505, "learning_rate": 5.411371887837423e-06, "loss": 0.4317, "step": 1897 }, { "epoch": 1.5799112097669257, "grad_norm": 0.31282979249954224, "learning_rate": 5.406542975889209e-06, "loss": 0.3921, "step": 1898 }, { "epoch": 1.5807436182019978, "grad_norm": 0.34338200092315674, "learning_rate": 5.40171368219002e-06, "loss": 0.4239, "step": 1899 }, { "epoch": 1.58157602663707, "grad_norm": 0.357689768075943, "learning_rate": 5.396884011274651e-06, "loss": 0.4402, "step": 1900 }, { "epoch": 1.582408435072142, "grad_norm": 0.3389754891395569, "learning_rate": 5.3920539676782455e-06, "loss": 0.3947, "step": 1901 }, { "epoch": 1.5832408435072143, "grad_norm": 0.348406046628952, "learning_rate": 5.387223555936301e-06, "loss": 0.4213, "step": 1902 }, { "epoch": 1.5840732519422862, "grad_norm": 0.3109308183193207, "learning_rate": 5.382392780584655e-06, "loss": 0.3912, "step": 1903 }, { "epoch": 1.5849056603773586, "grad_norm": 0.3686921298503876, "learning_rate": 5.377561646159495e-06, "loss": 0.4632, "step": 1904 }, { "epoch": 1.5857380688124305, "grad_norm": 0.33451026678085327, "learning_rate": 5.372730157197338e-06, "loss": 0.3807, "step": 1905 }, { "epoch": 1.5865704772475029, "grad_norm": 0.3415146768093109, "learning_rate": 5.367898318235037e-06, "loss": 0.4282, "step": 1906 }, { "epoch": 1.5874028856825748, "grad_norm": 0.3252672553062439, "learning_rate": 5.363066133809773e-06, "loss": 0.4125, "step": 1907 }, { "epoch": 1.5882352941176472, "grad_norm": 0.35067319869995117, "learning_rate": 5.3582336084590535e-06, "loss": 0.399, "step": 1908 }, { "epoch": 1.5890677025527191, "grad_norm": 0.33104994893074036, "learning_rate": 5.3534007467207024e-06, "loss": 0.4203, "step": 1909 }, { "epoch": 1.5899001109877915, "grad_norm": 0.33715182542800903, "learning_rate": 5.348567553132862e-06, "loss": 0.4237, "step": 1910 }, { "epoch": 1.5907325194228634, "grad_norm": 0.3445633053779602, "learning_rate": 5.343734032233986e-06, "loss": 0.4466, "step": 1911 }, { "epoch": 1.5915649278579356, "grad_norm": 0.3408707082271576, "learning_rate": 5.338900188562836e-06, "loss": 0.3845, "step": 1912 }, { "epoch": 1.5923973362930077, "grad_norm": 0.3361123204231262, "learning_rate": 5.334066026658475e-06, "loss": 0.4134, "step": 1913 }, { "epoch": 1.5932297447280799, "grad_norm": 0.3556373119354248, "learning_rate": 5.329231551060264e-06, "loss": 0.416, "step": 1914 }, { "epoch": 1.594062153163152, "grad_norm": 0.34920355677604675, "learning_rate": 5.324396766307863e-06, "loss": 0.4377, "step": 1915 }, { "epoch": 1.5948945615982242, "grad_norm": 0.3253536820411682, "learning_rate": 5.31956167694122e-06, "loss": 0.4011, "step": 1916 }, { "epoch": 1.5957269700332963, "grad_norm": 0.35277408361434937, "learning_rate": 5.314726287500565e-06, "loss": 0.4428, "step": 1917 }, { "epoch": 1.5965593784683685, "grad_norm": 0.36568132042884827, "learning_rate": 5.309890602526416e-06, "loss": 0.438, "step": 1918 }, { "epoch": 1.5973917869034406, "grad_norm": 0.31992262601852417, "learning_rate": 5.305054626559565e-06, "loss": 0.3983, "step": 1919 }, { "epoch": 1.5982241953385128, "grad_norm": 0.3432292640209198, "learning_rate": 5.30021836414108e-06, "loss": 0.446, "step": 1920 }, { "epoch": 1.599056603773585, "grad_norm": 0.3381594121456146, "learning_rate": 5.295381819812293e-06, "loss": 0.43, "step": 1921 }, { "epoch": 1.599889012208657, "grad_norm": 0.32569068670272827, "learning_rate": 5.290544998114805e-06, "loss": 0.4224, "step": 1922 }, { "epoch": 1.6007214206437292, "grad_norm": 0.32701462507247925, "learning_rate": 5.2857079035904764e-06, "loss": 0.4351, "step": 1923 }, { "epoch": 1.6015538290788012, "grad_norm": 0.32411783933639526, "learning_rate": 5.280870540781425e-06, "loss": 0.4015, "step": 1924 }, { "epoch": 1.6023862375138735, "grad_norm": 0.35999277234077454, "learning_rate": 5.2760329142300174e-06, "loss": 0.4268, "step": 1925 }, { "epoch": 1.6032186459489455, "grad_norm": 0.3214718997478485, "learning_rate": 5.271195028478871e-06, "loss": 0.3986, "step": 1926 }, { "epoch": 1.6040510543840178, "grad_norm": 0.3498198986053467, "learning_rate": 5.266356888070843e-06, "loss": 0.4462, "step": 1927 }, { "epoch": 1.6048834628190898, "grad_norm": 0.3669768273830414, "learning_rate": 5.261518497549033e-06, "loss": 0.4052, "step": 1928 }, { "epoch": 1.6057158712541622, "grad_norm": 0.3158254623413086, "learning_rate": 5.256679861456776e-06, "loss": 0.3936, "step": 1929 }, { "epoch": 1.606548279689234, "grad_norm": 0.33665063977241516, "learning_rate": 5.251840984337634e-06, "loss": 0.4338, "step": 1930 }, { "epoch": 1.6073806881243065, "grad_norm": 0.37514346837997437, "learning_rate": 5.247001870735398e-06, "loss": 0.464, "step": 1931 }, { "epoch": 1.6082130965593784, "grad_norm": 0.33054375648498535, "learning_rate": 5.242162525194082e-06, "loss": 0.3726, "step": 1932 }, { "epoch": 1.6090455049944508, "grad_norm": 0.3735671937465668, "learning_rate": 5.237322952257915e-06, "loss": 0.4846, "step": 1933 }, { "epoch": 1.6098779134295227, "grad_norm": 0.30594101548194885, "learning_rate": 5.232483156471339e-06, "loss": 0.3673, "step": 1934 }, { "epoch": 1.6107103218645948, "grad_norm": 0.3897792100906372, "learning_rate": 5.227643142379009e-06, "loss": 0.4665, "step": 1935 }, { "epoch": 1.611542730299667, "grad_norm": 0.3239308297634125, "learning_rate": 5.222802914525782e-06, "loss": 0.4004, "step": 1936 }, { "epoch": 1.6123751387347391, "grad_norm": 0.33833977580070496, "learning_rate": 5.217962477456718e-06, "loss": 0.4278, "step": 1937 }, { "epoch": 1.6132075471698113, "grad_norm": 0.3144145905971527, "learning_rate": 5.21312183571707e-06, "loss": 0.3737, "step": 1938 }, { "epoch": 1.6140399556048834, "grad_norm": 0.34893402457237244, "learning_rate": 5.208280993852287e-06, "loss": 0.4249, "step": 1939 }, { "epoch": 1.6148723640399556, "grad_norm": 0.32780539989471436, "learning_rate": 5.203439956408005e-06, "loss": 0.4224, "step": 1940 }, { "epoch": 1.6157047724750278, "grad_norm": 0.3463350534439087, "learning_rate": 5.198598727930041e-06, "loss": 0.395, "step": 1941 }, { "epoch": 1.6165371809101, "grad_norm": 0.3848700225353241, "learning_rate": 5.193757312964394e-06, "loss": 0.3988, "step": 1942 }, { "epoch": 1.617369589345172, "grad_norm": 0.3813340961933136, "learning_rate": 5.188915716057238e-06, "loss": 0.4296, "step": 1943 }, { "epoch": 1.6182019977802442, "grad_norm": 0.396963894367218, "learning_rate": 5.184073941754916e-06, "loss": 0.473, "step": 1944 }, { "epoch": 1.6190344062153164, "grad_norm": 0.34140685200691223, "learning_rate": 5.1792319946039405e-06, "loss": 0.3853, "step": 1945 }, { "epoch": 1.6198668146503885, "grad_norm": 0.3750185966491699, "learning_rate": 5.174389879150985e-06, "loss": 0.4811, "step": 1946 }, { "epoch": 1.6206992230854604, "grad_norm": 0.28287845849990845, "learning_rate": 5.169547599942877e-06, "loss": 0.3697, "step": 1947 }, { "epoch": 1.6215316315205328, "grad_norm": 0.3522551357746124, "learning_rate": 5.164705161526605e-06, "loss": 0.482, "step": 1948 }, { "epoch": 1.6223640399556047, "grad_norm": 0.35294002294540405, "learning_rate": 5.159862568449302e-06, "loss": 0.4473, "step": 1949 }, { "epoch": 1.6231964483906771, "grad_norm": 0.34549230337142944, "learning_rate": 5.155019825258251e-06, "loss": 0.3954, "step": 1950 }, { "epoch": 1.624028856825749, "grad_norm": 0.3237864673137665, "learning_rate": 5.1501769365008654e-06, "loss": 0.4288, "step": 1951 }, { "epoch": 1.6248612652608214, "grad_norm": 0.3312610685825348, "learning_rate": 5.14533390672471e-06, "loss": 0.4135, "step": 1952 }, { "epoch": 1.6256936736958933, "grad_norm": 0.39398959279060364, "learning_rate": 5.140490740477471e-06, "loss": 0.4262, "step": 1953 }, { "epoch": 1.6265260821309657, "grad_norm": 0.30289697647094727, "learning_rate": 5.135647442306966e-06, "loss": 0.3799, "step": 1954 }, { "epoch": 1.6273584905660377, "grad_norm": 0.3318493366241455, "learning_rate": 5.130804016761138e-06, "loss": 0.436, "step": 1955 }, { "epoch": 1.62819089900111, "grad_norm": 0.3526514172554016, "learning_rate": 5.1259604683880485e-06, "loss": 0.4636, "step": 1956 }, { "epoch": 1.629023307436182, "grad_norm": 0.3857317864894867, "learning_rate": 5.121116801735873e-06, "loss": 0.443, "step": 1957 }, { "epoch": 1.629855715871254, "grad_norm": 0.3597477078437805, "learning_rate": 5.1162730213529e-06, "loss": 0.4177, "step": 1958 }, { "epoch": 1.6306881243063263, "grad_norm": 0.34153300523757935, "learning_rate": 5.1114291317875244e-06, "loss": 0.4612, "step": 1959 }, { "epoch": 1.6315205327413984, "grad_norm": 0.3484551012516022, "learning_rate": 5.1065851375882425e-06, "loss": 0.4088, "step": 1960 }, { "epoch": 1.6323529411764706, "grad_norm": 0.36641940474510193, "learning_rate": 5.101741043303651e-06, "loss": 0.4182, "step": 1961 }, { "epoch": 1.6331853496115427, "grad_norm": 0.3681751787662506, "learning_rate": 5.096896853482437e-06, "loss": 0.4161, "step": 1962 }, { "epoch": 1.6340177580466149, "grad_norm": 0.3790402114391327, "learning_rate": 5.092052572673383e-06, "loss": 0.4297, "step": 1963 }, { "epoch": 1.634850166481687, "grad_norm": 0.3269292116165161, "learning_rate": 5.087208205425349e-06, "loss": 0.3941, "step": 1964 }, { "epoch": 1.6356825749167592, "grad_norm": 0.3894573748111725, "learning_rate": 5.082363756287285e-06, "loss": 0.4084, "step": 1965 }, { "epoch": 1.6365149833518313, "grad_norm": 0.39526158571243286, "learning_rate": 5.077519229808211e-06, "loss": 0.4229, "step": 1966 }, { "epoch": 1.6373473917869035, "grad_norm": 0.3372214436531067, "learning_rate": 5.072674630537223e-06, "loss": 0.4109, "step": 1967 }, { "epoch": 1.6381798002219756, "grad_norm": 0.4343195855617523, "learning_rate": 5.067829963023485e-06, "loss": 0.4377, "step": 1968 }, { "epoch": 1.6390122086570478, "grad_norm": 0.3801560699939728, "learning_rate": 5.062985231816225e-06, "loss": 0.4452, "step": 1969 }, { "epoch": 1.6398446170921197, "grad_norm": 0.3717063367366791, "learning_rate": 5.0581404414647276e-06, "loss": 0.4361, "step": 1970 }, { "epoch": 1.640677025527192, "grad_norm": 0.3857610523700714, "learning_rate": 5.053295596518337e-06, "loss": 0.4063, "step": 1971 }, { "epoch": 1.641509433962264, "grad_norm": 0.3806234300136566, "learning_rate": 5.04845070152645e-06, "loss": 0.4423, "step": 1972 }, { "epoch": 1.6423418423973364, "grad_norm": 0.39662259817123413, "learning_rate": 5.043605761038505e-06, "loss": 0.4482, "step": 1973 }, { "epoch": 1.6431742508324083, "grad_norm": 0.3193778395652771, "learning_rate": 5.038760779603989e-06, "loss": 0.3858, "step": 1974 }, { "epoch": 1.6440066592674807, "grad_norm": 0.35678938031196594, "learning_rate": 5.033915761772419e-06, "loss": 0.4691, "step": 1975 }, { "epoch": 1.6448390677025526, "grad_norm": 0.3585291802883148, "learning_rate": 5.029070712093357e-06, "loss": 0.3971, "step": 1976 }, { "epoch": 1.645671476137625, "grad_norm": 0.3424661457538605, "learning_rate": 5.024225635116386e-06, "loss": 0.4139, "step": 1977 }, { "epoch": 1.646503884572697, "grad_norm": 0.35479363799095154, "learning_rate": 5.01938053539112e-06, "loss": 0.4225, "step": 1978 }, { "epoch": 1.6473362930077693, "grad_norm": 0.3378525972366333, "learning_rate": 5.014535417467191e-06, "loss": 0.4122, "step": 1979 }, { "epoch": 1.6481687014428412, "grad_norm": 0.36042535305023193, "learning_rate": 5.009690285894252e-06, "loss": 0.4601, "step": 1980 }, { "epoch": 1.6490011098779136, "grad_norm": 0.3910295069217682, "learning_rate": 5.004845145221965e-06, "loss": 0.4279, "step": 1981 }, { "epoch": 1.6498335183129855, "grad_norm": 0.3552689552307129, "learning_rate": 5e-06, "loss": 0.4187, "step": 1982 }, { "epoch": 1.6506659267480577, "grad_norm": 0.33062514662742615, "learning_rate": 4.995154854778036e-06, "loss": 0.4125, "step": 1983 }, { "epoch": 1.6514983351831298, "grad_norm": 0.3373256325721741, "learning_rate": 4.99030971410575e-06, "loss": 0.444, "step": 1984 }, { "epoch": 1.652330743618202, "grad_norm": 0.3146938383579254, "learning_rate": 4.9854645825328096e-06, "loss": 0.3551, "step": 1985 }, { "epoch": 1.6531631520532741, "grad_norm": 0.34619879722595215, "learning_rate": 4.980619464608881e-06, "loss": 0.4253, "step": 1986 }, { "epoch": 1.6539955604883463, "grad_norm": 0.3549228012561798, "learning_rate": 4.975774364883617e-06, "loss": 0.434, "step": 1987 }, { "epoch": 1.6548279689234184, "grad_norm": 0.3266083896160126, "learning_rate": 4.9709292879066464e-06, "loss": 0.4061, "step": 1988 }, { "epoch": 1.6556603773584906, "grad_norm": 0.30864620208740234, "learning_rate": 4.966084238227582e-06, "loss": 0.4607, "step": 1989 }, { "epoch": 1.6564927857935627, "grad_norm": 0.357719361782074, "learning_rate": 4.961239220396014e-06, "loss": 0.4443, "step": 1990 }, { "epoch": 1.657325194228635, "grad_norm": 0.36556276679039, "learning_rate": 4.956394238961497e-06, "loss": 0.3715, "step": 1991 }, { "epoch": 1.658157602663707, "grad_norm": 0.3319959342479706, "learning_rate": 4.951549298473552e-06, "loss": 0.4252, "step": 1992 }, { "epoch": 1.658990011098779, "grad_norm": 0.35500237345695496, "learning_rate": 4.946704403481663e-06, "loss": 0.4447, "step": 1993 }, { "epoch": 1.6598224195338513, "grad_norm": 0.38562414050102234, "learning_rate": 4.941859558535275e-06, "loss": 0.4528, "step": 1994 }, { "epoch": 1.6606548279689233, "grad_norm": 0.3319378197193146, "learning_rate": 4.937014768183778e-06, "loss": 0.3608, "step": 1995 }, { "epoch": 1.6614872364039956, "grad_norm": 0.34083935618400574, "learning_rate": 4.9321700369765165e-06, "loss": 0.4559, "step": 1996 }, { "epoch": 1.6623196448390676, "grad_norm": 0.3207685947418213, "learning_rate": 4.927325369462777e-06, "loss": 0.375, "step": 1997 }, { "epoch": 1.66315205327414, "grad_norm": 0.3647385835647583, "learning_rate": 4.92248077019179e-06, "loss": 0.4085, "step": 1998 }, { "epoch": 1.6639844617092119, "grad_norm": 0.3710618317127228, "learning_rate": 4.917636243712716e-06, "loss": 0.4605, "step": 1999 }, { "epoch": 1.6648168701442843, "grad_norm": 0.35661581158638, "learning_rate": 4.912791794574653e-06, "loss": 0.407, "step": 2000 }, { "epoch": 1.6656492785793562, "grad_norm": 0.39443153142929077, "learning_rate": 4.9079474273266195e-06, "loss": 0.4512, "step": 2001 }, { "epoch": 1.6664816870144286, "grad_norm": 0.36028945446014404, "learning_rate": 4.903103146517564e-06, "loss": 0.4333, "step": 2002 }, { "epoch": 1.6673140954495005, "grad_norm": 0.3345014154911041, "learning_rate": 4.898258956696351e-06, "loss": 0.3976, "step": 2003 }, { "epoch": 1.6681465038845729, "grad_norm": 0.3746415376663208, "learning_rate": 4.893414862411759e-06, "loss": 0.4256, "step": 2004 }, { "epoch": 1.6689789123196448, "grad_norm": 0.35627481341362, "learning_rate": 4.888570868212478e-06, "loss": 0.4167, "step": 2005 }, { "epoch": 1.669811320754717, "grad_norm": 0.30849483609199524, "learning_rate": 4.883726978647101e-06, "loss": 0.3915, "step": 2006 }, { "epoch": 1.670643729189789, "grad_norm": 0.3932332396507263, "learning_rate": 4.878883198264129e-06, "loss": 0.461, "step": 2007 }, { "epoch": 1.6714761376248612, "grad_norm": 0.32576408982276917, "learning_rate": 4.874039531611954e-06, "loss": 0.4585, "step": 2008 }, { "epoch": 1.6723085460599334, "grad_norm": 0.3283080756664276, "learning_rate": 4.8691959832388635e-06, "loss": 0.3801, "step": 2009 }, { "epoch": 1.6731409544950056, "grad_norm": 0.40403053164482117, "learning_rate": 4.864352557693035e-06, "loss": 0.4624, "step": 2010 }, { "epoch": 1.6739733629300777, "grad_norm": 0.3406950831413269, "learning_rate": 4.859509259522531e-06, "loss": 0.4116, "step": 2011 }, { "epoch": 1.6748057713651499, "grad_norm": 0.3636447787284851, "learning_rate": 4.854666093275291e-06, "loss": 0.4214, "step": 2012 }, { "epoch": 1.675638179800222, "grad_norm": 0.34086188673973083, "learning_rate": 4.849823063499136e-06, "loss": 0.4077, "step": 2013 }, { "epoch": 1.6764705882352942, "grad_norm": 0.345948189496994, "learning_rate": 4.844980174741752e-06, "loss": 0.4133, "step": 2014 }, { "epoch": 1.6773029966703663, "grad_norm": 0.339345782995224, "learning_rate": 4.840137431550698e-06, "loss": 0.431, "step": 2015 }, { "epoch": 1.6781354051054382, "grad_norm": 0.3828071057796478, "learning_rate": 4.835294838473396e-06, "loss": 0.4574, "step": 2016 }, { "epoch": 1.6789678135405106, "grad_norm": 0.3178552985191345, "learning_rate": 4.8304524000571255e-06, "loss": 0.4198, "step": 2017 }, { "epoch": 1.6798002219755825, "grad_norm": 0.35455217957496643, "learning_rate": 4.825610120849018e-06, "loss": 0.4129, "step": 2018 }, { "epoch": 1.680632630410655, "grad_norm": 0.35618969798088074, "learning_rate": 4.8207680053960594e-06, "loss": 0.4373, "step": 2019 }, { "epoch": 1.6814650388457268, "grad_norm": 0.33656421303749084, "learning_rate": 4.815926058245085e-06, "loss": 0.4209, "step": 2020 }, { "epoch": 1.6822974472807992, "grad_norm": 0.34791067242622375, "learning_rate": 4.811084283942764e-06, "loss": 0.4398, "step": 2021 }, { "epoch": 1.6831298557158711, "grad_norm": 0.32604023814201355, "learning_rate": 4.806242687035608e-06, "loss": 0.4346, "step": 2022 }, { "epoch": 1.6839622641509435, "grad_norm": 0.3848741352558136, "learning_rate": 4.80140127206996e-06, "loss": 0.4152, "step": 2023 }, { "epoch": 1.6847946725860155, "grad_norm": 0.34885889291763306, "learning_rate": 4.796560043591996e-06, "loss": 0.3928, "step": 2024 }, { "epoch": 1.6856270810210878, "grad_norm": 0.3154467046260834, "learning_rate": 4.791719006147714e-06, "loss": 0.4268, "step": 2025 }, { "epoch": 1.6864594894561598, "grad_norm": 0.34713122248649597, "learning_rate": 4.7868781642829326e-06, "loss": 0.4561, "step": 2026 }, { "epoch": 1.6872918978912321, "grad_norm": 0.2982744872570038, "learning_rate": 4.782037522543283e-06, "loss": 0.3945, "step": 2027 }, { "epoch": 1.688124306326304, "grad_norm": 0.33279117941856384, "learning_rate": 4.777197085474219e-06, "loss": 0.4445, "step": 2028 }, { "epoch": 1.6889567147613762, "grad_norm": 0.3618778586387634, "learning_rate": 4.772356857620992e-06, "loss": 0.4938, "step": 2029 }, { "epoch": 1.6897891231964484, "grad_norm": 0.311985582113266, "learning_rate": 4.767516843528664e-06, "loss": 0.4027, "step": 2030 }, { "epoch": 1.6906215316315205, "grad_norm": 0.30986541509628296, "learning_rate": 4.762677047742088e-06, "loss": 0.3541, "step": 2031 }, { "epoch": 1.6914539400665927, "grad_norm": 0.3394172191619873, "learning_rate": 4.757837474805918e-06, "loss": 0.4316, "step": 2032 }, { "epoch": 1.6922863485016648, "grad_norm": 0.331486314535141, "learning_rate": 4.7529981292646025e-06, "loss": 0.4357, "step": 2033 }, { "epoch": 1.693118756936737, "grad_norm": 0.3562561869621277, "learning_rate": 4.748159015662367e-06, "loss": 0.4528, "step": 2034 }, { "epoch": 1.6939511653718091, "grad_norm": 0.3469091057777405, "learning_rate": 4.743320138543225e-06, "loss": 0.4298, "step": 2035 }, { "epoch": 1.6947835738068813, "grad_norm": 0.3128523528575897, "learning_rate": 4.738481502450967e-06, "loss": 0.43, "step": 2036 }, { "epoch": 1.6956159822419534, "grad_norm": 0.36427655816078186, "learning_rate": 4.733643111929159e-06, "loss": 0.4474, "step": 2037 }, { "epoch": 1.6964483906770256, "grad_norm": 0.32562559843063354, "learning_rate": 4.728804971521132e-06, "loss": 0.4002, "step": 2038 }, { "epoch": 1.6972807991120975, "grad_norm": 0.3482353091239929, "learning_rate": 4.723967085769985e-06, "loss": 0.3841, "step": 2039 }, { "epoch": 1.6981132075471699, "grad_norm": 0.3613423705101013, "learning_rate": 4.719129459218575e-06, "loss": 0.4263, "step": 2040 }, { "epoch": 1.6989456159822418, "grad_norm": 0.3004484474658966, "learning_rate": 4.714292096409524e-06, "loss": 0.389, "step": 2041 }, { "epoch": 1.6997780244173142, "grad_norm": 0.3427788317203522, "learning_rate": 4.709455001885196e-06, "loss": 0.4658, "step": 2042 }, { "epoch": 1.7006104328523861, "grad_norm": 0.3731909394264221, "learning_rate": 4.704618180187709e-06, "loss": 0.4452, "step": 2043 }, { "epoch": 1.7014428412874585, "grad_norm": 0.34222519397735596, "learning_rate": 4.699781635858923e-06, "loss": 0.4187, "step": 2044 }, { "epoch": 1.7022752497225304, "grad_norm": 0.32763800024986267, "learning_rate": 4.694945373440435e-06, "loss": 0.4081, "step": 2045 }, { "epoch": 1.7031076581576028, "grad_norm": 0.3098640739917755, "learning_rate": 4.690109397473586e-06, "loss": 0.3837, "step": 2046 }, { "epoch": 1.7039400665926747, "grad_norm": 0.3192623555660248, "learning_rate": 4.685273712499436e-06, "loss": 0.429, "step": 2047 }, { "epoch": 1.704772475027747, "grad_norm": 0.34456512331962585, "learning_rate": 4.680438323058783e-06, "loss": 0.4263, "step": 2048 }, { "epoch": 1.705604883462819, "grad_norm": 0.3312320411205292, "learning_rate": 4.675603233692137e-06, "loss": 0.4054, "step": 2049 }, { "epoch": 1.7064372918978914, "grad_norm": 0.33208805322647095, "learning_rate": 4.670768448939737e-06, "loss": 0.3949, "step": 2050 }, { "epoch": 1.7072697003329633, "grad_norm": 0.3596723973751068, "learning_rate": 4.665933973341527e-06, "loss": 0.4395, "step": 2051 }, { "epoch": 1.7081021087680355, "grad_norm": 0.3322398066520691, "learning_rate": 4.661099811437166e-06, "loss": 0.399, "step": 2052 }, { "epoch": 1.7089345172031076, "grad_norm": 0.3125096261501312, "learning_rate": 4.656265967766014e-06, "loss": 0.4142, "step": 2053 }, { "epoch": 1.7097669256381798, "grad_norm": 0.3498634696006775, "learning_rate": 4.651432446867139e-06, "loss": 0.4149, "step": 2054 }, { "epoch": 1.710599334073252, "grad_norm": 0.34815841913223267, "learning_rate": 4.646599253279299e-06, "loss": 0.4006, "step": 2055 }, { "epoch": 1.711431742508324, "grad_norm": 0.3256421983242035, "learning_rate": 4.641766391540949e-06, "loss": 0.4225, "step": 2056 }, { "epoch": 1.7122641509433962, "grad_norm": 0.3041287362575531, "learning_rate": 4.636933866190228e-06, "loss": 0.4198, "step": 2057 }, { "epoch": 1.7130965593784684, "grad_norm": 0.3252321481704712, "learning_rate": 4.632101681764964e-06, "loss": 0.4309, "step": 2058 }, { "epoch": 1.7139289678135405, "grad_norm": 0.3307684361934662, "learning_rate": 4.627269842802664e-06, "loss": 0.4331, "step": 2059 }, { "epoch": 1.7147613762486127, "grad_norm": 0.34094369411468506, "learning_rate": 4.622438353840506e-06, "loss": 0.4036, "step": 2060 }, { "epoch": 1.7155937846836848, "grad_norm": 0.31734779477119446, "learning_rate": 4.617607219415346e-06, "loss": 0.4096, "step": 2061 }, { "epoch": 1.7164261931187568, "grad_norm": 0.3557928502559662, "learning_rate": 4.6127764440637e-06, "loss": 0.4343, "step": 2062 }, { "epoch": 1.7172586015538291, "grad_norm": 0.35101959109306335, "learning_rate": 4.607946032321755e-06, "loss": 0.418, "step": 2063 }, { "epoch": 1.718091009988901, "grad_norm": 0.35029613971710205, "learning_rate": 4.603115988725351e-06, "loss": 0.4141, "step": 2064 }, { "epoch": 1.7189234184239734, "grad_norm": 0.333061546087265, "learning_rate": 4.598286317809983e-06, "loss": 0.4064, "step": 2065 }, { "epoch": 1.7197558268590454, "grad_norm": 0.32927843928337097, "learning_rate": 4.593457024110792e-06, "loss": 0.4149, "step": 2066 }, { "epoch": 1.7205882352941178, "grad_norm": 0.33346185088157654, "learning_rate": 4.588628112162578e-06, "loss": 0.4092, "step": 2067 }, { "epoch": 1.7214206437291897, "grad_norm": 0.32560572028160095, "learning_rate": 4.583799586499768e-06, "loss": 0.4182, "step": 2068 }, { "epoch": 1.722253052164262, "grad_norm": 0.3583771586418152, "learning_rate": 4.578971451656435e-06, "loss": 0.4472, "step": 2069 }, { "epoch": 1.723085460599334, "grad_norm": 0.3493313491344452, "learning_rate": 4.574143712166279e-06, "loss": 0.4249, "step": 2070 }, { "epoch": 1.7239178690344064, "grad_norm": 0.3568575978279114, "learning_rate": 4.569316372562634e-06, "loss": 0.4193, "step": 2071 }, { "epoch": 1.7247502774694783, "grad_norm": 0.33443883061408997, "learning_rate": 4.564489437378457e-06, "loss": 0.4556, "step": 2072 }, { "epoch": 1.7255826859045507, "grad_norm": 0.31167763471603394, "learning_rate": 4.559662911146324e-06, "loss": 0.3944, "step": 2073 }, { "epoch": 1.7264150943396226, "grad_norm": 0.32042884826660156, "learning_rate": 4.554836798398425e-06, "loss": 0.4126, "step": 2074 }, { "epoch": 1.7272475027746947, "grad_norm": 0.33394762873649597, "learning_rate": 4.550011103666568e-06, "loss": 0.4307, "step": 2075 }, { "epoch": 1.728079911209767, "grad_norm": 0.3465491831302643, "learning_rate": 4.545185831482166e-06, "loss": 0.4582, "step": 2076 }, { "epoch": 1.728912319644839, "grad_norm": 0.30535024404525757, "learning_rate": 4.5403609863762295e-06, "loss": 0.3633, "step": 2077 }, { "epoch": 1.7297447280799112, "grad_norm": 0.3142050802707672, "learning_rate": 4.535536572879376e-06, "loss": 0.4242, "step": 2078 }, { "epoch": 1.7305771365149833, "grad_norm": 0.3482121229171753, "learning_rate": 4.53071259552181e-06, "loss": 0.447, "step": 2079 }, { "epoch": 1.7314095449500555, "grad_norm": 0.3179130256175995, "learning_rate": 4.525889058833337e-06, "loss": 0.4052, "step": 2080 }, { "epoch": 1.7322419533851277, "grad_norm": 0.2993527352809906, "learning_rate": 4.5210659673433386e-06, "loss": 0.3627, "step": 2081 }, { "epoch": 1.7330743618201998, "grad_norm": 0.3593730032444, "learning_rate": 4.516243325580782e-06, "loss": 0.4697, "step": 2082 }, { "epoch": 1.733906770255272, "grad_norm": 0.3211425542831421, "learning_rate": 4.511421138074213e-06, "loss": 0.401, "step": 2083 }, { "epoch": 1.734739178690344, "grad_norm": 0.33504122495651245, "learning_rate": 4.50659940935175e-06, "loss": 0.4107, "step": 2084 }, { "epoch": 1.7355715871254163, "grad_norm": 0.36016252636909485, "learning_rate": 4.5017781439410806e-06, "loss": 0.4275, "step": 2085 }, { "epoch": 1.7364039955604884, "grad_norm": 0.3467753827571869, "learning_rate": 4.496957346369458e-06, "loss": 0.4255, "step": 2086 }, { "epoch": 1.7372364039955603, "grad_norm": 0.37390729784965515, "learning_rate": 4.492137021163694e-06, "loss": 0.4342, "step": 2087 }, { "epoch": 1.7380688124306327, "grad_norm": 0.38485151529312134, "learning_rate": 4.4873171728501604e-06, "loss": 0.3993, "step": 2088 }, { "epoch": 1.7389012208657046, "grad_norm": 0.37274548411369324, "learning_rate": 4.482497805954779e-06, "loss": 0.4139, "step": 2089 }, { "epoch": 1.739733629300777, "grad_norm": 0.3811771273612976, "learning_rate": 4.477678925003018e-06, "loss": 0.4109, "step": 2090 }, { "epoch": 1.740566037735849, "grad_norm": 0.37200403213500977, "learning_rate": 4.472860534519893e-06, "loss": 0.4366, "step": 2091 }, { "epoch": 1.7413984461709213, "grad_norm": 0.3533194065093994, "learning_rate": 4.468042639029952e-06, "loss": 0.3646, "step": 2092 }, { "epoch": 1.7422308546059933, "grad_norm": 0.3777799904346466, "learning_rate": 4.463225243057292e-06, "loss": 0.4576, "step": 2093 }, { "epoch": 1.7430632630410656, "grad_norm": 0.34515514969825745, "learning_rate": 4.458408351125525e-06, "loss": 0.421, "step": 2094 }, { "epoch": 1.7438956714761376, "grad_norm": 0.340119868516922, "learning_rate": 4.453591967757801e-06, "loss": 0.41, "step": 2095 }, { "epoch": 1.74472807991121, "grad_norm": 0.3491295576095581, "learning_rate": 4.4487760974767835e-06, "loss": 0.4169, "step": 2096 }, { "epoch": 1.7455604883462819, "grad_norm": 0.35444924235343933, "learning_rate": 4.4439607448046636e-06, "loss": 0.4326, "step": 2097 }, { "epoch": 1.746392896781354, "grad_norm": 0.36455461382865906, "learning_rate": 4.43914591426314e-06, "loss": 0.3829, "step": 2098 }, { "epoch": 1.7472253052164262, "grad_norm": 0.3415604531764984, "learning_rate": 4.434331610373424e-06, "loss": 0.4204, "step": 2099 }, { "epoch": 1.7480577136514983, "grad_norm": 0.3621603846549988, "learning_rate": 4.4295178376562285e-06, "loss": 0.4484, "step": 2100 }, { "epoch": 1.7488901220865705, "grad_norm": 0.3243803381919861, "learning_rate": 4.424704600631774e-06, "loss": 0.3944, "step": 2101 }, { "epoch": 1.7497225305216426, "grad_norm": 0.32423099875450134, "learning_rate": 4.419891903819773e-06, "loss": 0.3849, "step": 2102 }, { "epoch": 1.7505549389567148, "grad_norm": 0.35436517000198364, "learning_rate": 4.4150797517394295e-06, "loss": 0.4817, "step": 2103 }, { "epoch": 1.751387347391787, "grad_norm": 0.3532561659812927, "learning_rate": 4.410268148909441e-06, "loss": 0.448, "step": 2104 }, { "epoch": 1.752219755826859, "grad_norm": 0.360914409160614, "learning_rate": 4.405457099847986e-06, "loss": 0.4412, "step": 2105 }, { "epoch": 1.7530521642619312, "grad_norm": 0.34743252396583557, "learning_rate": 4.400646609072727e-06, "loss": 0.4094, "step": 2106 }, { "epoch": 1.7538845726970034, "grad_norm": 0.3823837339878082, "learning_rate": 4.395836681100794e-06, "loss": 0.4252, "step": 2107 }, { "epoch": 1.7547169811320755, "grad_norm": 0.36400553584098816, "learning_rate": 4.391027320448798e-06, "loss": 0.4383, "step": 2108 }, { "epoch": 1.7555493895671477, "grad_norm": 0.3543497920036316, "learning_rate": 4.386218531632808e-06, "loss": 0.3985, "step": 2109 }, { "epoch": 1.7563817980022196, "grad_norm": 0.33355042338371277, "learning_rate": 4.3814103191683655e-06, "loss": 0.447, "step": 2110 }, { "epoch": 1.757214206437292, "grad_norm": 0.31347766518592834, "learning_rate": 4.376602687570464e-06, "loss": 0.4034, "step": 2111 }, { "epoch": 1.758046614872364, "grad_norm": 0.3570155203342438, "learning_rate": 4.371795641353555e-06, "loss": 0.3879, "step": 2112 }, { "epoch": 1.7588790233074363, "grad_norm": 0.38342198729515076, "learning_rate": 4.366989185031536e-06, "loss": 0.4377, "step": 2113 }, { "epoch": 1.7597114317425082, "grad_norm": 0.3486819565296173, "learning_rate": 4.362183323117757e-06, "loss": 0.4352, "step": 2114 }, { "epoch": 1.7605438401775806, "grad_norm": 0.3399284780025482, "learning_rate": 4.357378060125007e-06, "loss": 0.4193, "step": 2115 }, { "epoch": 1.7613762486126525, "grad_norm": 0.36731529235839844, "learning_rate": 4.3525734005655085e-06, "loss": 0.4648, "step": 2116 }, { "epoch": 1.762208657047725, "grad_norm": 0.3027511537075043, "learning_rate": 4.347769348950922e-06, "loss": 0.4082, "step": 2117 }, { "epoch": 1.7630410654827968, "grad_norm": 0.31376221776008606, "learning_rate": 4.342965909792338e-06, "loss": 0.4031, "step": 2118 }, { "epoch": 1.7638734739178692, "grad_norm": 0.31292369961738586, "learning_rate": 4.338163087600271e-06, "loss": 0.4102, "step": 2119 }, { "epoch": 1.7647058823529411, "grad_norm": 0.29499831795692444, "learning_rate": 4.33336088688465e-06, "loss": 0.3865, "step": 2120 }, { "epoch": 1.7655382907880133, "grad_norm": 0.34691059589385986, "learning_rate": 4.328559312154831e-06, "loss": 0.4631, "step": 2121 }, { "epoch": 1.7663706992230854, "grad_norm": 0.32868635654449463, "learning_rate": 4.323758367919572e-06, "loss": 0.4256, "step": 2122 }, { "epoch": 1.7672031076581576, "grad_norm": 0.33149829506874084, "learning_rate": 4.318958058687047e-06, "loss": 0.4242, "step": 2123 }, { "epoch": 1.7680355160932297, "grad_norm": 0.3130408823490143, "learning_rate": 4.31415838896483e-06, "loss": 0.4296, "step": 2124 }, { "epoch": 1.7688679245283019, "grad_norm": 0.3201256990432739, "learning_rate": 4.309359363259895e-06, "loss": 0.4092, "step": 2125 }, { "epoch": 1.769700332963374, "grad_norm": 0.3389187455177307, "learning_rate": 4.304560986078609e-06, "loss": 0.4557, "step": 2126 }, { "epoch": 1.7705327413984462, "grad_norm": 0.32760846614837646, "learning_rate": 4.299763261926734e-06, "loss": 0.4082, "step": 2127 }, { "epoch": 1.7713651498335183, "grad_norm": 0.3498232960700989, "learning_rate": 4.294966195309418e-06, "loss": 0.4329, "step": 2128 }, { "epoch": 1.7721975582685905, "grad_norm": 0.3230178952217102, "learning_rate": 4.2901697907311876e-06, "loss": 0.4315, "step": 2129 }, { "epoch": 1.7730299667036626, "grad_norm": 0.3200167715549469, "learning_rate": 4.285374052695953e-06, "loss": 0.3921, "step": 2130 }, { "epoch": 1.7738623751387348, "grad_norm": 0.3524058163166046, "learning_rate": 4.280578985706995e-06, "loss": 0.4731, "step": 2131 }, { "epoch": 1.774694783573807, "grad_norm": 0.2892106771469116, "learning_rate": 4.2757845942669674e-06, "loss": 0.3912, "step": 2132 }, { "epoch": 1.7755271920088789, "grad_norm": 0.3491550087928772, "learning_rate": 4.270990882877885e-06, "loss": 0.4164, "step": 2133 }, { "epoch": 1.7763596004439512, "grad_norm": 0.3432190418243408, "learning_rate": 4.2661978560411274e-06, "loss": 0.4305, "step": 2134 }, { "epoch": 1.7771920088790232, "grad_norm": 0.2854191064834595, "learning_rate": 4.261405518257434e-06, "loss": 0.3578, "step": 2135 }, { "epoch": 1.7780244173140956, "grad_norm": 0.36918261647224426, "learning_rate": 4.25661387402689e-06, "loss": 0.4849, "step": 2136 }, { "epoch": 1.7788568257491675, "grad_norm": 0.3176991939544678, "learning_rate": 4.251822927848934e-06, "loss": 0.3844, "step": 2137 }, { "epoch": 1.7796892341842399, "grad_norm": 0.3359120488166809, "learning_rate": 4.24703268422235e-06, "loss": 0.4387, "step": 2138 }, { "epoch": 1.7805216426193118, "grad_norm": 0.3274317681789398, "learning_rate": 4.242243147645257e-06, "loss": 0.4009, "step": 2139 }, { "epoch": 1.7813540510543842, "grad_norm": 0.3508370518684387, "learning_rate": 4.237454322615118e-06, "loss": 0.4086, "step": 2140 }, { "epoch": 1.782186459489456, "grad_norm": 0.3434849977493286, "learning_rate": 4.232666213628722e-06, "loss": 0.4478, "step": 2141 }, { "epoch": 1.7830188679245285, "grad_norm": 0.32151880860328674, "learning_rate": 4.227878825182186e-06, "loss": 0.3955, "step": 2142 }, { "epoch": 1.7838512763596004, "grad_norm": 0.40656089782714844, "learning_rate": 4.223092161770952e-06, "loss": 0.4138, "step": 2143 }, { "epoch": 1.7846836847946725, "grad_norm": 0.3283717930316925, "learning_rate": 4.218306227889782e-06, "loss": 0.4156, "step": 2144 }, { "epoch": 1.7855160932297447, "grad_norm": 0.32961905002593994, "learning_rate": 4.213521028032751e-06, "loss": 0.3817, "step": 2145 }, { "epoch": 1.7863485016648168, "grad_norm": 0.35188883543014526, "learning_rate": 4.2087365666932456e-06, "loss": 0.4086, "step": 2146 }, { "epoch": 1.787180910099889, "grad_norm": 0.36771219968795776, "learning_rate": 4.203952848363957e-06, "loss": 0.4582, "step": 2147 }, { "epoch": 1.7880133185349611, "grad_norm": 0.305984765291214, "learning_rate": 4.199169877536884e-06, "loss": 0.371, "step": 2148 }, { "epoch": 1.7888457269700333, "grad_norm": 0.3949022591114044, "learning_rate": 4.194387658703317e-06, "loss": 0.4562, "step": 2149 }, { "epoch": 1.7896781354051055, "grad_norm": 0.3683740496635437, "learning_rate": 4.189606196353844e-06, "loss": 0.4009, "step": 2150 }, { "epoch": 1.7905105438401776, "grad_norm": 0.32549092173576355, "learning_rate": 4.184825494978342e-06, "loss": 0.4445, "step": 2151 }, { "epoch": 1.7913429522752498, "grad_norm": 0.35574018955230713, "learning_rate": 4.180045559065974e-06, "loss": 0.4337, "step": 2152 }, { "epoch": 1.792175360710322, "grad_norm": 0.3322107195854187, "learning_rate": 4.175266393105183e-06, "loss": 0.3973, "step": 2153 }, { "epoch": 1.793007769145394, "grad_norm": 0.3085898458957672, "learning_rate": 4.1704880015836905e-06, "loss": 0.4016, "step": 2154 }, { "epoch": 1.7938401775804662, "grad_norm": 0.3286598026752472, "learning_rate": 4.165710388988487e-06, "loss": 0.4377, "step": 2155 }, { "epoch": 1.7946725860155381, "grad_norm": 0.32184141874313354, "learning_rate": 4.1609335598058355e-06, "loss": 0.3827, "step": 2156 }, { "epoch": 1.7955049944506105, "grad_norm": 0.3248361051082611, "learning_rate": 4.156157518521264e-06, "loss": 0.4447, "step": 2157 }, { "epoch": 1.7963374028856824, "grad_norm": 0.35561513900756836, "learning_rate": 4.151382269619558e-06, "loss": 0.4322, "step": 2158 }, { "epoch": 1.7971698113207548, "grad_norm": 0.33568158745765686, "learning_rate": 4.146607817584759e-06, "loss": 0.417, "step": 2159 }, { "epoch": 1.7980022197558267, "grad_norm": 0.325347363948822, "learning_rate": 4.14183416690016e-06, "loss": 0.4263, "step": 2160 }, { "epoch": 1.7988346281908991, "grad_norm": 0.31573471426963806, "learning_rate": 4.137061322048307e-06, "loss": 0.4142, "step": 2161 }, { "epoch": 1.799667036625971, "grad_norm": 0.33735018968582153, "learning_rate": 4.13228928751098e-06, "loss": 0.4247, "step": 2162 }, { "epoch": 1.8004994450610434, "grad_norm": 0.3238171339035034, "learning_rate": 4.127518067769206e-06, "loss": 0.3978, "step": 2163 }, { "epoch": 1.8013318534961154, "grad_norm": 0.33231449127197266, "learning_rate": 4.122747667303242e-06, "loss": 0.4361, "step": 2164 }, { "epoch": 1.8021642619311877, "grad_norm": 0.32602575421333313, "learning_rate": 4.11797809059258e-06, "loss": 0.3914, "step": 2165 }, { "epoch": 1.8029966703662597, "grad_norm": 0.29990747570991516, "learning_rate": 4.1132093421159335e-06, "loss": 0.3839, "step": 2166 }, { "epoch": 1.8038290788013318, "grad_norm": 0.3527059257030487, "learning_rate": 4.108441426351243e-06, "loss": 0.4338, "step": 2167 }, { "epoch": 1.804661487236404, "grad_norm": 0.3696962893009186, "learning_rate": 4.103674347775663e-06, "loss": 0.4063, "step": 2168 }, { "epoch": 1.8054938956714761, "grad_norm": 0.3006901144981384, "learning_rate": 4.098908110865563e-06, "loss": 0.3799, "step": 2169 }, { "epoch": 1.8063263041065483, "grad_norm": 0.350769579410553, "learning_rate": 4.094142720096526e-06, "loss": 0.4381, "step": 2170 }, { "epoch": 1.8071587125416204, "grad_norm": 0.3283845782279968, "learning_rate": 4.089378179943336e-06, "loss": 0.4216, "step": 2171 }, { "epoch": 1.8079911209766926, "grad_norm": 0.2863730490207672, "learning_rate": 4.084614494879979e-06, "loss": 0.4155, "step": 2172 }, { "epoch": 1.8088235294117647, "grad_norm": 0.3665149211883545, "learning_rate": 4.079851669379638e-06, "loss": 0.4351, "step": 2173 }, { "epoch": 1.8096559378468369, "grad_norm": 0.39417773485183716, "learning_rate": 4.0750897079146924e-06, "loss": 0.4559, "step": 2174 }, { "epoch": 1.810488346281909, "grad_norm": 0.3096025288105011, "learning_rate": 4.070328614956705e-06, "loss": 0.4012, "step": 2175 }, { "epoch": 1.8113207547169812, "grad_norm": 0.3567507863044739, "learning_rate": 4.065568394976426e-06, "loss": 0.416, "step": 2176 }, { "epoch": 1.8121531631520533, "grad_norm": 0.29146334528923035, "learning_rate": 4.060809052443784e-06, "loss": 0.3917, "step": 2177 }, { "epoch": 1.8129855715871255, "grad_norm": 0.3747054636478424, "learning_rate": 4.056050591827888e-06, "loss": 0.4189, "step": 2178 }, { "epoch": 1.8138179800221974, "grad_norm": 0.3492047190666199, "learning_rate": 4.051293017597014e-06, "loss": 0.4179, "step": 2179 }, { "epoch": 1.8146503884572698, "grad_norm": 0.3406948745250702, "learning_rate": 4.046536334218609e-06, "loss": 0.421, "step": 2180 }, { "epoch": 1.8154827968923417, "grad_norm": 0.34447532892227173, "learning_rate": 4.0417805461592764e-06, "loss": 0.4349, "step": 2181 }, { "epoch": 1.816315205327414, "grad_norm": 0.3566656708717346, "learning_rate": 4.037025657884793e-06, "loss": 0.4326, "step": 2182 }, { "epoch": 1.817147613762486, "grad_norm": 0.3476259112358093, "learning_rate": 4.032271673860077e-06, "loss": 0.4038, "step": 2183 }, { "epoch": 1.8179800221975584, "grad_norm": 0.3235064446926117, "learning_rate": 4.0275185985492025e-06, "loss": 0.412, "step": 2184 }, { "epoch": 1.8188124306326303, "grad_norm": 0.37167060375213623, "learning_rate": 4.022766436415392e-06, "loss": 0.4514, "step": 2185 }, { "epoch": 1.8196448390677027, "grad_norm": 0.31849905848503113, "learning_rate": 4.018015191921008e-06, "loss": 0.4254, "step": 2186 }, { "epoch": 1.8204772475027746, "grad_norm": 0.2981938123703003, "learning_rate": 4.013264869527553e-06, "loss": 0.379, "step": 2187 }, { "epoch": 1.821309655937847, "grad_norm": 0.36852484941482544, "learning_rate": 4.008515473695663e-06, "loss": 0.4213, "step": 2188 }, { "epoch": 1.822142064372919, "grad_norm": 0.37913084030151367, "learning_rate": 4.003767008885102e-06, "loss": 0.4507, "step": 2189 }, { "epoch": 1.822974472807991, "grad_norm": 0.30263346433639526, "learning_rate": 3.999019479554764e-06, "loss": 0.3763, "step": 2190 }, { "epoch": 1.8238068812430632, "grad_norm": 0.33267539739608765, "learning_rate": 3.9942728901626605e-06, "loss": 0.4024, "step": 2191 }, { "epoch": 1.8246392896781354, "grad_norm": 0.32982137799263, "learning_rate": 3.989527245165924e-06, "loss": 0.4079, "step": 2192 }, { "epoch": 1.8254716981132075, "grad_norm": 0.35067951679229736, "learning_rate": 3.984782549020797e-06, "loss": 0.4182, "step": 2193 }, { "epoch": 1.8263041065482797, "grad_norm": 0.328118234872818, "learning_rate": 3.980038806182629e-06, "loss": 0.4588, "step": 2194 }, { "epoch": 1.8271365149833518, "grad_norm": 0.37364038825035095, "learning_rate": 3.975296021105885e-06, "loss": 0.4546, "step": 2195 }, { "epoch": 1.827968923418424, "grad_norm": 0.3203776180744171, "learning_rate": 3.970554198244116e-06, "loss": 0.4091, "step": 2196 }, { "epoch": 1.8288013318534961, "grad_norm": 0.36616021394729614, "learning_rate": 3.965813342049983e-06, "loss": 0.4341, "step": 2197 }, { "epoch": 1.8296337402885683, "grad_norm": 0.3346719741821289, "learning_rate": 3.961073456975227e-06, "loss": 0.4067, "step": 2198 }, { "epoch": 1.8304661487236404, "grad_norm": 0.3459213674068451, "learning_rate": 3.956334547470686e-06, "loss": 0.3825, "step": 2199 }, { "epoch": 1.8312985571587126, "grad_norm": 0.336300253868103, "learning_rate": 3.95159661798628e-06, "loss": 0.4056, "step": 2200 }, { "epoch": 1.8321309655937847, "grad_norm": 0.3746432662010193, "learning_rate": 3.946859672971006e-06, "loss": 0.4649, "step": 2201 }, { "epoch": 1.8329633740288567, "grad_norm": 0.3622547388076782, "learning_rate": 3.9421237168729386e-06, "loss": 0.3967, "step": 2202 }, { "epoch": 1.833795782463929, "grad_norm": 0.3507779836654663, "learning_rate": 3.937388754139223e-06, "loss": 0.4041, "step": 2203 }, { "epoch": 1.834628190899001, "grad_norm": 0.37171730399131775, "learning_rate": 3.9326547892160746e-06, "loss": 0.4587, "step": 2204 }, { "epoch": 1.8354605993340734, "grad_norm": 0.3501371443271637, "learning_rate": 3.927921826548767e-06, "loss": 0.4174, "step": 2205 }, { "epoch": 1.8362930077691453, "grad_norm": 0.3328426778316498, "learning_rate": 3.923189870581636e-06, "loss": 0.4201, "step": 2206 }, { "epoch": 1.8371254162042177, "grad_norm": 0.3560123145580292, "learning_rate": 3.918458925758068e-06, "loss": 0.4023, "step": 2207 }, { "epoch": 1.8379578246392896, "grad_norm": 0.35025811195373535, "learning_rate": 3.9137289965205086e-06, "loss": 0.4353, "step": 2208 }, { "epoch": 1.838790233074362, "grad_norm": 0.29967784881591797, "learning_rate": 3.909000087310441e-06, "loss": 0.4074, "step": 2209 }, { "epoch": 1.8396226415094339, "grad_norm": 0.35869383811950684, "learning_rate": 3.9042722025683945e-06, "loss": 0.3998, "step": 2210 }, { "epoch": 1.8404550499445063, "grad_norm": 0.3381917178630829, "learning_rate": 3.899545346733933e-06, "loss": 0.4623, "step": 2211 }, { "epoch": 1.8412874583795782, "grad_norm": 0.30779141187667847, "learning_rate": 3.894819524245661e-06, "loss": 0.395, "step": 2212 }, { "epoch": 1.8421198668146503, "grad_norm": 0.35666483640670776, "learning_rate": 3.890094739541207e-06, "loss": 0.3997, "step": 2213 }, { "epoch": 1.8429522752497225, "grad_norm": 0.3682236969470978, "learning_rate": 3.885370997057225e-06, "loss": 0.4708, "step": 2214 }, { "epoch": 1.8437846836847946, "grad_norm": 0.3033082187175751, "learning_rate": 3.880648301229394e-06, "loss": 0.368, "step": 2215 }, { "epoch": 1.8446170921198668, "grad_norm": 0.335415244102478, "learning_rate": 3.875926656492406e-06, "loss": 0.4432, "step": 2216 }, { "epoch": 1.845449500554939, "grad_norm": 0.35151904821395874, "learning_rate": 3.871206067279971e-06, "loss": 0.4582, "step": 2217 }, { "epoch": 1.846281908990011, "grad_norm": 0.32559242844581604, "learning_rate": 3.866486538024802e-06, "loss": 0.4067, "step": 2218 }, { "epoch": 1.8471143174250833, "grad_norm": 0.3076154589653015, "learning_rate": 3.861768073158623e-06, "loss": 0.4061, "step": 2219 }, { "epoch": 1.8479467258601554, "grad_norm": 0.29312387108802795, "learning_rate": 3.8570506771121484e-06, "loss": 0.3832, "step": 2220 }, { "epoch": 1.8487791342952276, "grad_norm": 0.3544323742389679, "learning_rate": 3.852334354315104e-06, "loss": 0.4602, "step": 2221 }, { "epoch": 1.8496115427302997, "grad_norm": 0.3405257761478424, "learning_rate": 3.847619109196195e-06, "loss": 0.4202, "step": 2222 }, { "epoch": 1.8504439511653719, "grad_norm": 0.35631346702575684, "learning_rate": 3.842904946183121e-06, "loss": 0.4349, "step": 2223 }, { "epoch": 1.851276359600444, "grad_norm": 0.35186129808425903, "learning_rate": 3.83819186970256e-06, "loss": 0.4067, "step": 2224 }, { "epoch": 1.852108768035516, "grad_norm": 0.3434910476207733, "learning_rate": 3.833479884180177e-06, "loss": 0.4053, "step": 2225 }, { "epoch": 1.8529411764705883, "grad_norm": 0.3569307029247284, "learning_rate": 3.828768994040608e-06, "loss": 0.4356, "step": 2226 }, { "epoch": 1.8537735849056602, "grad_norm": 0.33176884055137634, "learning_rate": 3.824059203707461e-06, "loss": 0.4422, "step": 2227 }, { "epoch": 1.8546059933407326, "grad_norm": 0.2953312397003174, "learning_rate": 3.81935051760331e-06, "loss": 0.3875, "step": 2228 }, { "epoch": 1.8554384017758045, "grad_norm": 0.336987167596817, "learning_rate": 3.8146429401496963e-06, "loss": 0.4396, "step": 2229 }, { "epoch": 1.856270810210877, "grad_norm": 0.3004775643348694, "learning_rate": 3.8099364757671188e-06, "loss": 0.4045, "step": 2230 }, { "epoch": 1.8571032186459488, "grad_norm": 0.3074621856212616, "learning_rate": 3.8052311288750255e-06, "loss": 0.3944, "step": 2231 }, { "epoch": 1.8579356270810212, "grad_norm": 0.31900620460510254, "learning_rate": 3.800526903891823e-06, "loss": 0.4074, "step": 2232 }, { "epoch": 1.8587680355160932, "grad_norm": 0.3192499876022339, "learning_rate": 3.795823805234857e-06, "loss": 0.4026, "step": 2233 }, { "epoch": 1.8596004439511655, "grad_norm": 0.3406783640384674, "learning_rate": 3.791121837320425e-06, "loss": 0.4335, "step": 2234 }, { "epoch": 1.8604328523862375, "grad_norm": 0.33335891366004944, "learning_rate": 3.786421004563753e-06, "loss": 0.4305, "step": 2235 }, { "epoch": 1.8612652608213096, "grad_norm": 0.30208635330200195, "learning_rate": 3.7817213113790088e-06, "loss": 0.4136, "step": 2236 }, { "epoch": 1.8620976692563818, "grad_norm": 0.3383508026599884, "learning_rate": 3.7770227621792815e-06, "loss": 0.4167, "step": 2237 }, { "epoch": 1.862930077691454, "grad_norm": 0.32437726855278015, "learning_rate": 3.7723253613765954e-06, "loss": 0.4547, "step": 2238 }, { "epoch": 1.863762486126526, "grad_norm": 0.3173195719718933, "learning_rate": 3.767629113381891e-06, "loss": 0.4056, "step": 2239 }, { "epoch": 1.8645948945615982, "grad_norm": 0.3287826180458069, "learning_rate": 3.762934022605027e-06, "loss": 0.4125, "step": 2240 }, { "epoch": 1.8654273029966704, "grad_norm": 0.33371874690055847, "learning_rate": 3.758240093454775e-06, "loss": 0.4262, "step": 2241 }, { "epoch": 1.8662597114317425, "grad_norm": 0.3588714599609375, "learning_rate": 3.7535473303388175e-06, "loss": 0.3972, "step": 2242 }, { "epoch": 1.8670921198668147, "grad_norm": 0.32598569989204407, "learning_rate": 3.7488557376637436e-06, "loss": 0.3772, "step": 2243 }, { "epoch": 1.8679245283018868, "grad_norm": 0.3502293825149536, "learning_rate": 3.744165319835037e-06, "loss": 0.4538, "step": 2244 }, { "epoch": 1.868756936736959, "grad_norm": 0.31929466128349304, "learning_rate": 3.739476081257085e-06, "loss": 0.4287, "step": 2245 }, { "epoch": 1.8695893451720311, "grad_norm": 0.366178959608078, "learning_rate": 3.7347880263331603e-06, "loss": 0.442, "step": 2246 }, { "epoch": 1.8704217536071033, "grad_norm": 0.40710756182670593, "learning_rate": 3.730101159465435e-06, "loss": 0.441, "step": 2247 }, { "epoch": 1.8712541620421752, "grad_norm": 0.3121664226055145, "learning_rate": 3.725415485054955e-06, "loss": 0.3975, "step": 2248 }, { "epoch": 1.8720865704772476, "grad_norm": 0.3380068242549896, "learning_rate": 3.7207310075016533e-06, "loss": 0.4533, "step": 2249 }, { "epoch": 1.8729189789123195, "grad_norm": 0.35721099376678467, "learning_rate": 3.716047731204332e-06, "loss": 0.4422, "step": 2250 }, { "epoch": 1.8737513873473919, "grad_norm": 0.35127249360084534, "learning_rate": 3.711365660560674e-06, "loss": 0.4263, "step": 2251 }, { "epoch": 1.8745837957824638, "grad_norm": 0.36538636684417725, "learning_rate": 3.706684799967224e-06, "loss": 0.4003, "step": 2252 }, { "epoch": 1.8754162042175362, "grad_norm": 0.3314541280269623, "learning_rate": 3.702005153819391e-06, "loss": 0.369, "step": 2253 }, { "epoch": 1.8762486126526081, "grad_norm": 0.35317155718803406, "learning_rate": 3.6973267265114456e-06, "loss": 0.4318, "step": 2254 }, { "epoch": 1.8770810210876805, "grad_norm": 0.397045373916626, "learning_rate": 3.6926495224365124e-06, "loss": 0.4304, "step": 2255 }, { "epoch": 1.8779134295227524, "grad_norm": 0.34217485785484314, "learning_rate": 3.6879735459865708e-06, "loss": 0.3961, "step": 2256 }, { "epoch": 1.8787458379578248, "grad_norm": 0.3433960974216461, "learning_rate": 3.68329880155244e-06, "loss": 0.4528, "step": 2257 }, { "epoch": 1.8795782463928967, "grad_norm": 0.36350932717323303, "learning_rate": 3.6786252935237886e-06, "loss": 0.4229, "step": 2258 }, { "epoch": 1.8804106548279689, "grad_norm": 0.3405548334121704, "learning_rate": 3.6739530262891245e-06, "loss": 0.4055, "step": 2259 }, { "epoch": 1.881243063263041, "grad_norm": 0.3336378037929535, "learning_rate": 3.669282004235787e-06, "loss": 0.3886, "step": 2260 }, { "epoch": 1.8820754716981132, "grad_norm": 0.3233022093772888, "learning_rate": 3.6646122317499465e-06, "loss": 0.4265, "step": 2261 }, { "epoch": 1.8829078801331853, "grad_norm": 0.3442718982696533, "learning_rate": 3.6599437132166036e-06, "loss": 0.4322, "step": 2262 }, { "epoch": 1.8837402885682575, "grad_norm": 0.3712242841720581, "learning_rate": 3.655276453019575e-06, "loss": 0.4382, "step": 2263 }, { "epoch": 1.8845726970033296, "grad_norm": 0.3307363986968994, "learning_rate": 3.650610455541504e-06, "loss": 0.387, "step": 2264 }, { "epoch": 1.8854051054384018, "grad_norm": 0.3208552896976471, "learning_rate": 3.6459457251638423e-06, "loss": 0.4071, "step": 2265 }, { "epoch": 1.886237513873474, "grad_norm": 0.345205157995224, "learning_rate": 3.641282266266853e-06, "loss": 0.4206, "step": 2266 }, { "epoch": 1.887069922308546, "grad_norm": 0.4148430824279785, "learning_rate": 3.636620083229604e-06, "loss": 0.445, "step": 2267 }, { "epoch": 1.8879023307436182, "grad_norm": 0.33523353934288025, "learning_rate": 3.6319591804299703e-06, "loss": 0.389, "step": 2268 }, { "epoch": 1.8887347391786904, "grad_norm": 0.3195513188838959, "learning_rate": 3.6272995622446204e-06, "loss": 0.4133, "step": 2269 }, { "epoch": 1.8895671476137625, "grad_norm": 0.36299553513526917, "learning_rate": 3.622641233049016e-06, "loss": 0.4297, "step": 2270 }, { "epoch": 1.8903995560488345, "grad_norm": 0.3698355555534363, "learning_rate": 3.617984197217409e-06, "loss": 0.4338, "step": 2271 }, { "epoch": 1.8912319644839068, "grad_norm": 0.332303524017334, "learning_rate": 3.6133284591228403e-06, "loss": 0.3874, "step": 2272 }, { "epoch": 1.8920643729189788, "grad_norm": 0.34952232241630554, "learning_rate": 3.608674023137129e-06, "loss": 0.425, "step": 2273 }, { "epoch": 1.8928967813540512, "grad_norm": 0.3578547239303589, "learning_rate": 3.6040208936308697e-06, "loss": 0.4576, "step": 2274 }, { "epoch": 1.893729189789123, "grad_norm": 0.3056720793247223, "learning_rate": 3.599369074973433e-06, "loss": 0.4044, "step": 2275 }, { "epoch": 1.8945615982241955, "grad_norm": 0.3444676101207733, "learning_rate": 3.5947185715329614e-06, "loss": 0.3809, "step": 2276 }, { "epoch": 1.8953940066592674, "grad_norm": 0.3490372896194458, "learning_rate": 3.5900693876763556e-06, "loss": 0.4221, "step": 2277 }, { "epoch": 1.8962264150943398, "grad_norm": 0.3269912600517273, "learning_rate": 3.585421527769283e-06, "loss": 0.4139, "step": 2278 }, { "epoch": 1.8970588235294117, "grad_norm": 0.3209238350391388, "learning_rate": 3.580774996176162e-06, "loss": 0.3649, "step": 2279 }, { "epoch": 1.897891231964484, "grad_norm": 0.3572242259979248, "learning_rate": 3.5761297972601695e-06, "loss": 0.4546, "step": 2280 }, { "epoch": 1.898723640399556, "grad_norm": 0.31195253133773804, "learning_rate": 3.5714859353832286e-06, "loss": 0.4141, "step": 2281 }, { "epoch": 1.8995560488346281, "grad_norm": 0.3187641501426697, "learning_rate": 3.5668434149060076e-06, "loss": 0.4276, "step": 2282 }, { "epoch": 1.9003884572697003, "grad_norm": 0.34391510486602783, "learning_rate": 3.562202240187913e-06, "loss": 0.4086, "step": 2283 }, { "epoch": 1.9012208657047724, "grad_norm": 0.35638102889060974, "learning_rate": 3.5575624155870885e-06, "loss": 0.4511, "step": 2284 }, { "epoch": 1.9020532741398446, "grad_norm": 0.30889418721199036, "learning_rate": 3.552923945460413e-06, "loss": 0.417, "step": 2285 }, { "epoch": 1.9028856825749167, "grad_norm": 0.3231523036956787, "learning_rate": 3.548286834163491e-06, "loss": 0.3931, "step": 2286 }, { "epoch": 1.903718091009989, "grad_norm": 0.3281930983066559, "learning_rate": 3.543651086050649e-06, "loss": 0.3971, "step": 2287 }, { "epoch": 1.904550499445061, "grad_norm": 0.3393603563308716, "learning_rate": 3.5390167054749363e-06, "loss": 0.3878, "step": 2288 }, { "epoch": 1.9053829078801332, "grad_norm": 0.33854469656944275, "learning_rate": 3.5343836967881194e-06, "loss": 0.4232, "step": 2289 }, { "epoch": 1.9062153163152054, "grad_norm": 0.3317667245864868, "learning_rate": 3.529752064340673e-06, "loss": 0.3732, "step": 2290 }, { "epoch": 1.9070477247502775, "grad_norm": 0.31549349427223206, "learning_rate": 3.5251218124817803e-06, "loss": 0.4203, "step": 2291 }, { "epoch": 1.9078801331853497, "grad_norm": 0.3274786174297333, "learning_rate": 3.5204929455593316e-06, "loss": 0.4142, "step": 2292 }, { "epoch": 1.9087125416204218, "grad_norm": 0.3583471477031708, "learning_rate": 3.51586546791991e-06, "loss": 0.4494, "step": 2293 }, { "epoch": 1.9095449500554937, "grad_norm": 0.34534788131713867, "learning_rate": 3.511239383908801e-06, "loss": 0.4386, "step": 2294 }, { "epoch": 1.9103773584905661, "grad_norm": 0.34133318066596985, "learning_rate": 3.5066146978699785e-06, "loss": 0.4227, "step": 2295 }, { "epoch": 1.911209766925638, "grad_norm": 0.3159574568271637, "learning_rate": 3.501991414146102e-06, "loss": 0.42, "step": 2296 }, { "epoch": 1.9120421753607104, "grad_norm": 0.321329265832901, "learning_rate": 3.4973695370785154e-06, "loss": 0.391, "step": 2297 }, { "epoch": 1.9128745837957823, "grad_norm": 0.3425721526145935, "learning_rate": 3.4927490710072454e-06, "loss": 0.427, "step": 2298 }, { "epoch": 1.9137069922308547, "grad_norm": 0.34851303696632385, "learning_rate": 3.488130020270989e-06, "loss": 0.4335, "step": 2299 }, { "epoch": 1.9145394006659266, "grad_norm": 0.26636067032814026, "learning_rate": 3.4835123892071145e-06, "loss": 0.3935, "step": 2300 }, { "epoch": 1.915371809100999, "grad_norm": 0.30461832880973816, "learning_rate": 3.4788961821516576e-06, "loss": 0.3859, "step": 2301 }, { "epoch": 1.916204217536071, "grad_norm": 0.3343258500099182, "learning_rate": 3.4742814034393224e-06, "loss": 0.4512, "step": 2302 }, { "epoch": 1.9170366259711433, "grad_norm": 0.3263416588306427, "learning_rate": 3.4696680574034613e-06, "loss": 0.4181, "step": 2303 }, { "epoch": 1.9178690344062153, "grad_norm": 0.30046147108078003, "learning_rate": 3.46505614837609e-06, "loss": 0.3963, "step": 2304 }, { "epoch": 1.9187014428412874, "grad_norm": 0.3026406764984131, "learning_rate": 3.4604456806878704e-06, "loss": 0.4206, "step": 2305 }, { "epoch": 1.9195338512763596, "grad_norm": 0.30180978775024414, "learning_rate": 3.4558366586681152e-06, "loss": 0.4089, "step": 2306 }, { "epoch": 1.9203662597114317, "grad_norm": 0.2909837067127228, "learning_rate": 3.451229086644774e-06, "loss": 0.3858, "step": 2307 }, { "epoch": 1.9211986681465039, "grad_norm": 0.30994731187820435, "learning_rate": 3.4466229689444384e-06, "loss": 0.3947, "step": 2308 }, { "epoch": 1.922031076581576, "grad_norm": 0.3551662862300873, "learning_rate": 3.442018309892333e-06, "loss": 0.4318, "step": 2309 }, { "epoch": 1.9228634850166482, "grad_norm": 0.3257605731487274, "learning_rate": 3.4374151138123135e-06, "loss": 0.4569, "step": 2310 }, { "epoch": 1.9236958934517203, "grad_norm": 0.28167983889579773, "learning_rate": 3.432813385026862e-06, "loss": 0.3689, "step": 2311 }, { "epoch": 1.9245283018867925, "grad_norm": 0.36538660526275635, "learning_rate": 3.4282131278570833e-06, "loss": 0.4454, "step": 2312 }, { "epoch": 1.9253607103218646, "grad_norm": 0.33179739117622375, "learning_rate": 3.423614346622698e-06, "loss": 0.3962, "step": 2313 }, { "epoch": 1.9261931187569368, "grad_norm": 0.31999471783638, "learning_rate": 3.4190170456420413e-06, "loss": 0.3943, "step": 2314 }, { "epoch": 1.927025527192009, "grad_norm": 0.3487468659877777, "learning_rate": 3.4144212292320634e-06, "loss": 0.4468, "step": 2315 }, { "epoch": 1.927857935627081, "grad_norm": 0.3570094406604767, "learning_rate": 3.409826901708312e-06, "loss": 0.4241, "step": 2316 }, { "epoch": 1.928690344062153, "grad_norm": 0.354623407125473, "learning_rate": 3.4052340673849426e-06, "loss": 0.4656, "step": 2317 }, { "epoch": 1.9295227524972254, "grad_norm": 0.29796338081359863, "learning_rate": 3.400642730574706e-06, "loss": 0.422, "step": 2318 }, { "epoch": 1.9303551609322973, "grad_norm": 0.32531046867370605, "learning_rate": 3.3960528955889516e-06, "loss": 0.417, "step": 2319 }, { "epoch": 1.9311875693673697, "grad_norm": 0.3640810251235962, "learning_rate": 3.391464566737611e-06, "loss": 0.4625, "step": 2320 }, { "epoch": 1.9320199778024416, "grad_norm": 0.3448679745197296, "learning_rate": 3.386877748329208e-06, "loss": 0.4093, "step": 2321 }, { "epoch": 1.932852386237514, "grad_norm": 0.3299039900302887, "learning_rate": 3.382292444670843e-06, "loss": 0.4068, "step": 2322 }, { "epoch": 1.933684794672586, "grad_norm": 0.3216043710708618, "learning_rate": 3.3777086600681954e-06, "loss": 0.4023, "step": 2323 }, { "epoch": 1.9345172031076583, "grad_norm": 0.3004588484764099, "learning_rate": 3.3731263988255223e-06, "loss": 0.3881, "step": 2324 }, { "epoch": 1.9353496115427302, "grad_norm": 0.3696260452270508, "learning_rate": 3.3685456652456484e-06, "loss": 0.4373, "step": 2325 }, { "epoch": 1.9361820199778026, "grad_norm": 0.35806453227996826, "learning_rate": 3.3639664636299586e-06, "loss": 0.4515, "step": 2326 }, { "epoch": 1.9370144284128745, "grad_norm": 0.28217509388923645, "learning_rate": 3.3593887982784047e-06, "loss": 0.3911, "step": 2327 }, { "epoch": 1.9378468368479467, "grad_norm": 0.3391430974006653, "learning_rate": 3.354812673489497e-06, "loss": 0.4178, "step": 2328 }, { "epoch": 1.9386792452830188, "grad_norm": 0.3683750629425049, "learning_rate": 3.3502380935602942e-06, "loss": 0.4342, "step": 2329 }, { "epoch": 1.939511653718091, "grad_norm": 0.34491217136383057, "learning_rate": 3.3456650627864075e-06, "loss": 0.389, "step": 2330 }, { "epoch": 1.9403440621531631, "grad_norm": 0.3198521137237549, "learning_rate": 3.341093585461992e-06, "loss": 0.3983, "step": 2331 }, { "epoch": 1.9411764705882353, "grad_norm": 0.34554436802864075, "learning_rate": 3.336523665879748e-06, "loss": 0.4338, "step": 2332 }, { "epoch": 1.9420088790233074, "grad_norm": 0.32669246196746826, "learning_rate": 3.331955308330907e-06, "loss": 0.3832, "step": 2333 }, { "epoch": 1.9428412874583796, "grad_norm": 0.3620488941669464, "learning_rate": 3.327388517105239e-06, "loss": 0.493, "step": 2334 }, { "epoch": 1.9436736958934517, "grad_norm": 0.2980594038963318, "learning_rate": 3.3228232964910377e-06, "loss": 0.4098, "step": 2335 }, { "epoch": 1.9445061043285239, "grad_norm": 0.32015350461006165, "learning_rate": 3.3182596507751288e-06, "loss": 0.3994, "step": 2336 }, { "epoch": 1.945338512763596, "grad_norm": 0.32040566205978394, "learning_rate": 3.313697584242853e-06, "loss": 0.4373, "step": 2337 }, { "epoch": 1.9461709211986682, "grad_norm": 0.3319530785083771, "learning_rate": 3.309137101178073e-06, "loss": 0.4209, "step": 2338 }, { "epoch": 1.9470033296337403, "grad_norm": 0.3502316474914551, "learning_rate": 3.3045782058631597e-06, "loss": 0.4313, "step": 2339 }, { "epoch": 1.9478357380688123, "grad_norm": 0.3295128047466278, "learning_rate": 3.3000209025789965e-06, "loss": 0.429, "step": 2340 }, { "epoch": 1.9486681465038846, "grad_norm": 0.30817556381225586, "learning_rate": 3.295465195604972e-06, "loss": 0.3908, "step": 2341 }, { "epoch": 1.9495005549389566, "grad_norm": 0.34200671315193176, "learning_rate": 3.2909110892189745e-06, "loss": 0.4588, "step": 2342 }, { "epoch": 1.950332963374029, "grad_norm": 0.3436935842037201, "learning_rate": 3.286358587697388e-06, "loss": 0.4262, "step": 2343 }, { "epoch": 1.9511653718091009, "grad_norm": 0.29214903712272644, "learning_rate": 3.2818076953150917e-06, "loss": 0.4161, "step": 2344 }, { "epoch": 1.9519977802441733, "grad_norm": 0.2840372920036316, "learning_rate": 3.277258416345456e-06, "loss": 0.3877, "step": 2345 }, { "epoch": 1.9528301886792452, "grad_norm": 0.34386762976646423, "learning_rate": 3.2727107550603305e-06, "loss": 0.4524, "step": 2346 }, { "epoch": 1.9536625971143176, "grad_norm": 0.3042285740375519, "learning_rate": 3.26816471573005e-06, "loss": 0.4206, "step": 2347 }, { "epoch": 1.9544950055493895, "grad_norm": 0.28492388129234314, "learning_rate": 3.2636203026234236e-06, "loss": 0.3565, "step": 2348 }, { "epoch": 1.9553274139844619, "grad_norm": 0.33080917596817017, "learning_rate": 3.2590775200077364e-06, "loss": 0.4431, "step": 2349 }, { "epoch": 1.9561598224195338, "grad_norm": 0.3305113613605499, "learning_rate": 3.25453637214874e-06, "loss": 0.4194, "step": 2350 }, { "epoch": 1.956992230854606, "grad_norm": 0.3372589647769928, "learning_rate": 3.249996863310654e-06, "loss": 0.4155, "step": 2351 }, { "epoch": 1.957824639289678, "grad_norm": 0.33676356077194214, "learning_rate": 3.2454589977561513e-06, "loss": 0.3825, "step": 2352 }, { "epoch": 1.9586570477247502, "grad_norm": 0.37880995869636536, "learning_rate": 3.2409227797463727e-06, "loss": 0.4373, "step": 2353 }, { "epoch": 1.9594894561598224, "grad_norm": 0.3162456750869751, "learning_rate": 3.236388213540904e-06, "loss": 0.3984, "step": 2354 }, { "epoch": 1.9603218645948945, "grad_norm": 0.33563530445098877, "learning_rate": 3.231855303397783e-06, "loss": 0.4313, "step": 2355 }, { "epoch": 1.9611542730299667, "grad_norm": 0.31390097737312317, "learning_rate": 3.2273240535734895e-06, "loss": 0.4177, "step": 2356 }, { "epoch": 1.9619866814650389, "grad_norm": 0.3237724006175995, "learning_rate": 3.2227944683229484e-06, "loss": 0.4041, "step": 2357 }, { "epoch": 1.962819089900111, "grad_norm": 0.32047805190086365, "learning_rate": 3.2182665518995203e-06, "loss": 0.4324, "step": 2358 }, { "epoch": 1.9636514983351832, "grad_norm": 0.2876075506210327, "learning_rate": 3.2137403085549962e-06, "loss": 0.3837, "step": 2359 }, { "epoch": 1.9644839067702553, "grad_norm": 0.2909439504146576, "learning_rate": 3.2092157425395996e-06, "loss": 0.4206, "step": 2360 }, { "epoch": 1.9653163152053275, "grad_norm": 0.29555991291999817, "learning_rate": 3.2046928581019744e-06, "loss": 0.3688, "step": 2361 }, { "epoch": 1.9661487236403996, "grad_norm": 0.35118407011032104, "learning_rate": 3.20017165948919e-06, "loss": 0.4504, "step": 2362 }, { "epoch": 1.9669811320754715, "grad_norm": 0.309773325920105, "learning_rate": 3.195652150946732e-06, "loss": 0.3804, "step": 2363 }, { "epoch": 1.967813540510544, "grad_norm": 0.32229289412498474, "learning_rate": 3.1911343367184977e-06, "loss": 0.4497, "step": 2364 }, { "epoch": 1.9686459489456158, "grad_norm": 0.30695047974586487, "learning_rate": 3.1866182210467923e-06, "loss": 0.4304, "step": 2365 }, { "epoch": 1.9694783573806882, "grad_norm": 0.33170729875564575, "learning_rate": 3.1821038081723283e-06, "loss": 0.4071, "step": 2366 }, { "epoch": 1.9703107658157601, "grad_norm": 0.30925998091697693, "learning_rate": 3.1775911023342197e-06, "loss": 0.3772, "step": 2367 }, { "epoch": 1.9711431742508325, "grad_norm": 0.3299291729927063, "learning_rate": 3.1730801077699747e-06, "loss": 0.4137, "step": 2368 }, { "epoch": 1.9719755826859044, "grad_norm": 0.3230315148830414, "learning_rate": 3.168570828715496e-06, "loss": 0.4289, "step": 2369 }, { "epoch": 1.9728079911209768, "grad_norm": 0.33906981348991394, "learning_rate": 3.1640632694050753e-06, "loss": 0.4349, "step": 2370 }, { "epoch": 1.9736403995560488, "grad_norm": 0.31247228384017944, "learning_rate": 3.159557434071393e-06, "loss": 0.4172, "step": 2371 }, { "epoch": 1.9744728079911211, "grad_norm": 0.31131434440612793, "learning_rate": 3.155053326945503e-06, "loss": 0.4167, "step": 2372 }, { "epoch": 1.975305216426193, "grad_norm": 0.3348797857761383, "learning_rate": 3.1505509522568444e-06, "loss": 0.4398, "step": 2373 }, { "epoch": 1.9761376248612652, "grad_norm": 0.33107826113700867, "learning_rate": 3.1460503142332227e-06, "loss": 0.394, "step": 2374 }, { "epoch": 1.9769700332963374, "grad_norm": 0.30621546506881714, "learning_rate": 3.1415514171008176e-06, "loss": 0.4005, "step": 2375 }, { "epoch": 1.9778024417314095, "grad_norm": 0.3266296088695526, "learning_rate": 3.137054265084173e-06, "loss": 0.4249, "step": 2376 }, { "epoch": 1.9786348501664817, "grad_norm": 0.3151954412460327, "learning_rate": 3.1325588624061925e-06, "loss": 0.3982, "step": 2377 }, { "epoch": 1.9794672586015538, "grad_norm": 0.32240432500839233, "learning_rate": 3.128065213288136e-06, "loss": 0.3918, "step": 2378 }, { "epoch": 1.980299667036626, "grad_norm": 0.3490481376647949, "learning_rate": 3.123573321949621e-06, "loss": 0.4313, "step": 2379 }, { "epoch": 1.9811320754716981, "grad_norm": 0.3069087862968445, "learning_rate": 3.119083192608614e-06, "loss": 0.3813, "step": 2380 }, { "epoch": 1.9819644839067703, "grad_norm": 0.3291850984096527, "learning_rate": 3.114594829481421e-06, "loss": 0.4728, "step": 2381 }, { "epoch": 1.9827968923418424, "grad_norm": 0.33388224244117737, "learning_rate": 3.110108236782694e-06, "loss": 0.4257, "step": 2382 }, { "epoch": 1.9836293007769146, "grad_norm": 0.31437182426452637, "learning_rate": 3.105623418725424e-06, "loss": 0.3869, "step": 2383 }, { "epoch": 1.9844617092119867, "grad_norm": 0.31633225083351135, "learning_rate": 3.101140379520935e-06, "loss": 0.3808, "step": 2384 }, { "epoch": 1.9852941176470589, "grad_norm": 0.3501662015914917, "learning_rate": 3.0966591233788757e-06, "loss": 0.4576, "step": 2385 }, { "epoch": 1.9861265260821308, "grad_norm": 0.3176516592502594, "learning_rate": 3.092179654507227e-06, "loss": 0.4181, "step": 2386 }, { "epoch": 1.9869589345172032, "grad_norm": 0.354187935590744, "learning_rate": 3.0877019771122848e-06, "loss": 0.4359, "step": 2387 }, { "epoch": 1.987791342952275, "grad_norm": 0.32240673899650574, "learning_rate": 3.0832260953986716e-06, "loss": 0.3794, "step": 2388 }, { "epoch": 1.9886237513873475, "grad_norm": 0.3612145781517029, "learning_rate": 3.078752013569315e-06, "loss": 0.4573, "step": 2389 }, { "epoch": 1.9894561598224194, "grad_norm": 0.29033204913139343, "learning_rate": 3.0742797358254584e-06, "loss": 0.4075, "step": 2390 }, { "epoch": 1.9902885682574918, "grad_norm": 0.3590858280658722, "learning_rate": 3.069809266366647e-06, "loss": 0.4718, "step": 2391 }, { "epoch": 1.9911209766925637, "grad_norm": 0.3385516405105591, "learning_rate": 3.06534060939073e-06, "loss": 0.4209, "step": 2392 }, { "epoch": 1.991953385127636, "grad_norm": 0.32453739643096924, "learning_rate": 3.060873769093858e-06, "loss": 0.3889, "step": 2393 }, { "epoch": 1.992785793562708, "grad_norm": 0.3326762616634369, "learning_rate": 3.0564087496704676e-06, "loss": 0.4476, "step": 2394 }, { "epoch": 1.9936182019977804, "grad_norm": 0.30887943506240845, "learning_rate": 3.0519455553132914e-06, "loss": 0.422, "step": 2395 }, { "epoch": 1.9944506104328523, "grad_norm": 0.32724782824516296, "learning_rate": 3.047484190213349e-06, "loss": 0.4304, "step": 2396 }, { "epoch": 1.9952830188679245, "grad_norm": 0.3449188768863678, "learning_rate": 3.0430246585599402e-06, "loss": 0.4361, "step": 2397 }, { "epoch": 1.9961154273029966, "grad_norm": 0.3099921941757202, "learning_rate": 3.0385669645406413e-06, "loss": 0.378, "step": 2398 }, { "epoch": 1.9969478357380688, "grad_norm": 0.3242183327674866, "learning_rate": 3.034111112341307e-06, "loss": 0.4265, "step": 2399 }, { "epoch": 1.997780244173141, "grad_norm": 0.3651365637779236, "learning_rate": 3.029657106146057e-06, "loss": 0.4341, "step": 2400 }, { "epoch": 1.998612652608213, "grad_norm": 0.32321080565452576, "learning_rate": 3.025204950137286e-06, "loss": 0.4153, "step": 2401 }, { "epoch": 1.9994450610432852, "grad_norm": 0.3187124729156494, "learning_rate": 3.020754648495644e-06, "loss": 0.3873, "step": 2402 }, { "epoch": 2.000277469478357, "grad_norm": 0.69718337059021, "learning_rate": 3.0163062054000424e-06, "loss": 0.683, "step": 2403 }, { "epoch": 2.0011098779134295, "grad_norm": 0.33480337262153625, "learning_rate": 3.0118596250276453e-06, "loss": 0.4026, "step": 2404 }, { "epoch": 2.0019422863485015, "grad_norm": 0.3858184814453125, "learning_rate": 3.0074149115538725e-06, "loss": 0.4126, "step": 2405 }, { "epoch": 2.002774694783574, "grad_norm": 0.3335935175418854, "learning_rate": 3.0029720691523873e-06, "loss": 0.372, "step": 2406 }, { "epoch": 2.0036071032186458, "grad_norm": 0.3692198693752289, "learning_rate": 2.9985311019950945e-06, "loss": 0.4148, "step": 2407 }, { "epoch": 2.004439511653718, "grad_norm": 0.333741158246994, "learning_rate": 2.9940920142521413e-06, "loss": 0.3529, "step": 2408 }, { "epoch": 2.00527192008879, "grad_norm": 0.3616425395011902, "learning_rate": 2.9896548100919087e-06, "loss": 0.3824, "step": 2409 }, { "epoch": 2.0061043285238624, "grad_norm": 0.34554699063301086, "learning_rate": 2.985219493681011e-06, "loss": 0.3984, "step": 2410 }, { "epoch": 2.0069367369589344, "grad_norm": 0.35826876759529114, "learning_rate": 2.980786069184285e-06, "loss": 0.4494, "step": 2411 }, { "epoch": 2.0077691453940067, "grad_norm": 0.31827691197395325, "learning_rate": 2.976354540764793e-06, "loss": 0.3831, "step": 2412 }, { "epoch": 2.0086015538290787, "grad_norm": 0.3482878506183624, "learning_rate": 2.971924912583822e-06, "loss": 0.3803, "step": 2413 }, { "epoch": 2.009433962264151, "grad_norm": 0.3719305098056793, "learning_rate": 2.9674971888008696e-06, "loss": 0.4542, "step": 2414 }, { "epoch": 2.010266370699223, "grad_norm": 0.30256187915802, "learning_rate": 2.9630713735736428e-06, "loss": 0.3519, "step": 2415 }, { "epoch": 2.0110987791342954, "grad_norm": 0.3034214973449707, "learning_rate": 2.9586474710580627e-06, "loss": 0.3865, "step": 2416 }, { "epoch": 2.0119311875693673, "grad_norm": 0.3587876260280609, "learning_rate": 2.954225485408248e-06, "loss": 0.4371, "step": 2417 }, { "epoch": 2.0127635960044397, "grad_norm": 0.35651448369026184, "learning_rate": 2.9498054207765237e-06, "loss": 0.3977, "step": 2418 }, { "epoch": 2.0135960044395116, "grad_norm": 0.2968187630176544, "learning_rate": 2.945387281313408e-06, "loss": 0.3755, "step": 2419 }, { "epoch": 2.014428412874584, "grad_norm": 0.34130021929740906, "learning_rate": 2.940971071167608e-06, "loss": 0.4274, "step": 2420 }, { "epoch": 2.015260821309656, "grad_norm": 0.3059341609477997, "learning_rate": 2.936556794486024e-06, "loss": 0.3419, "step": 2421 }, { "epoch": 2.0160932297447283, "grad_norm": 0.3298993408679962, "learning_rate": 2.932144455413741e-06, "loss": 0.4018, "step": 2422 }, { "epoch": 2.0169256381798, "grad_norm": 0.3346230387687683, "learning_rate": 2.9277340580940215e-06, "loss": 0.4095, "step": 2423 }, { "epoch": 2.0177580466148726, "grad_norm": 0.35374805331230164, "learning_rate": 2.9233256066683047e-06, "loss": 0.3878, "step": 2424 }, { "epoch": 2.0185904550499445, "grad_norm": 0.2975861430168152, "learning_rate": 2.9189191052762038e-06, "loss": 0.3523, "step": 2425 }, { "epoch": 2.0194228634850164, "grad_norm": 0.3865160048007965, "learning_rate": 2.914514558055502e-06, "loss": 0.4404, "step": 2426 }, { "epoch": 2.020255271920089, "grad_norm": 0.2803787887096405, "learning_rate": 2.9101119691421453e-06, "loss": 0.3509, "step": 2427 }, { "epoch": 2.0210876803551607, "grad_norm": 0.35009345412254333, "learning_rate": 2.905711342670242e-06, "loss": 0.3977, "step": 2428 }, { "epoch": 2.021920088790233, "grad_norm": 0.3857073485851288, "learning_rate": 2.901312682772058e-06, "loss": 0.4167, "step": 2429 }, { "epoch": 2.022752497225305, "grad_norm": 0.3246917426586151, "learning_rate": 2.896915993578011e-06, "loss": 0.3457, "step": 2430 }, { "epoch": 2.0235849056603774, "grad_norm": 0.32466015219688416, "learning_rate": 2.8925212792166694e-06, "loss": 0.3807, "step": 2431 }, { "epoch": 2.0244173140954493, "grad_norm": 0.3205777704715729, "learning_rate": 2.8881285438147477e-06, "loss": 0.3801, "step": 2432 }, { "epoch": 2.0252497225305217, "grad_norm": 0.31383705139160156, "learning_rate": 2.8837377914971003e-06, "loss": 0.3692, "step": 2433 }, { "epoch": 2.0260821309655936, "grad_norm": 0.36327627301216125, "learning_rate": 2.8793490263867212e-06, "loss": 0.395, "step": 2434 }, { "epoch": 2.026914539400666, "grad_norm": 0.33775612711906433, "learning_rate": 2.8749622526047373e-06, "loss": 0.3764, "step": 2435 }, { "epoch": 2.027746947835738, "grad_norm": 0.31958720088005066, "learning_rate": 2.8705774742704063e-06, "loss": 0.4473, "step": 2436 }, { "epoch": 2.0285793562708103, "grad_norm": 0.3061386048793793, "learning_rate": 2.8661946955011145e-06, "loss": 0.4101, "step": 2437 }, { "epoch": 2.0294117647058822, "grad_norm": 0.27900204062461853, "learning_rate": 2.8618139204123597e-06, "loss": 0.3726, "step": 2438 }, { "epoch": 2.0302441731409546, "grad_norm": 0.32336297631263733, "learning_rate": 2.8574351531177747e-06, "loss": 0.403, "step": 2439 }, { "epoch": 2.0310765815760266, "grad_norm": 0.3574887216091156, "learning_rate": 2.853058397729095e-06, "loss": 0.354, "step": 2440 }, { "epoch": 2.031908990011099, "grad_norm": 0.3319253623485565, "learning_rate": 2.8486836583561737e-06, "loss": 0.4185, "step": 2441 }, { "epoch": 2.032741398446171, "grad_norm": 0.31709638237953186, "learning_rate": 2.8443109391069616e-06, "loss": 0.3966, "step": 2442 }, { "epoch": 2.0335738068812432, "grad_norm": 0.28898531198501587, "learning_rate": 2.8399402440875248e-06, "loss": 0.3537, "step": 2443 }, { "epoch": 2.034406215316315, "grad_norm": 0.32134294509887695, "learning_rate": 2.835571577402021e-06, "loss": 0.3991, "step": 2444 }, { "epoch": 2.0352386237513875, "grad_norm": 0.3410552144050598, "learning_rate": 2.831204943152701e-06, "loss": 0.4234, "step": 2445 }, { "epoch": 2.0360710321864595, "grad_norm": 0.31080541014671326, "learning_rate": 2.8268403454399154e-06, "loss": 0.3578, "step": 2446 }, { "epoch": 2.036903440621532, "grad_norm": 0.3171520531177521, "learning_rate": 2.8224777883620926e-06, "loss": 0.3696, "step": 2447 }, { "epoch": 2.0377358490566038, "grad_norm": 0.3251802921295166, "learning_rate": 2.8181172760157575e-06, "loss": 0.4128, "step": 2448 }, { "epoch": 2.0385682574916757, "grad_norm": 0.3506574034690857, "learning_rate": 2.8137588124955017e-06, "loss": 0.3945, "step": 2449 }, { "epoch": 2.039400665926748, "grad_norm": 0.32733920216560364, "learning_rate": 2.8094024018940012e-06, "loss": 0.4038, "step": 2450 }, { "epoch": 2.04023307436182, "grad_norm": 0.3144792914390564, "learning_rate": 2.8050480483020003e-06, "loss": 0.4005, "step": 2451 }, { "epoch": 2.0410654827968924, "grad_norm": 0.27931925654411316, "learning_rate": 2.8006957558083147e-06, "loss": 0.3664, "step": 2452 }, { "epoch": 2.0418978912319643, "grad_norm": 0.3300316333770752, "learning_rate": 2.7963455284998225e-06, "loss": 0.438, "step": 2453 }, { "epoch": 2.0427302996670367, "grad_norm": 0.31316685676574707, "learning_rate": 2.7919973704614632e-06, "loss": 0.3973, "step": 2454 }, { "epoch": 2.0435627081021086, "grad_norm": 0.3359510600566864, "learning_rate": 2.7876512857762343e-06, "loss": 0.3725, "step": 2455 }, { "epoch": 2.044395116537181, "grad_norm": 0.3232194483280182, "learning_rate": 2.7833072785251846e-06, "loss": 0.4057, "step": 2456 }, { "epoch": 2.045227524972253, "grad_norm": 0.31120437383651733, "learning_rate": 2.778965352787413e-06, "loss": 0.3684, "step": 2457 }, { "epoch": 2.0460599334073253, "grad_norm": 0.32518357038497925, "learning_rate": 2.774625512640064e-06, "loss": 0.4007, "step": 2458 }, { "epoch": 2.046892341842397, "grad_norm": 0.3096177875995636, "learning_rate": 2.7702877621583234e-06, "loss": 0.3879, "step": 2459 }, { "epoch": 2.0477247502774696, "grad_norm": 0.3457062840461731, "learning_rate": 2.7659521054154147e-06, "loss": 0.4057, "step": 2460 }, { "epoch": 2.0485571587125415, "grad_norm": 0.33439356088638306, "learning_rate": 2.7616185464825963e-06, "loss": 0.4092, "step": 2461 }, { "epoch": 2.049389567147614, "grad_norm": 0.33163490891456604, "learning_rate": 2.7572870894291542e-06, "loss": 0.4207, "step": 2462 }, { "epoch": 2.050221975582686, "grad_norm": 0.3037160336971283, "learning_rate": 2.752957738322406e-06, "loss": 0.3703, "step": 2463 }, { "epoch": 2.051054384017758, "grad_norm": 0.3286396563053131, "learning_rate": 2.748630497227682e-06, "loss": 0.4138, "step": 2464 }, { "epoch": 2.05188679245283, "grad_norm": 0.3129012882709503, "learning_rate": 2.744305370208342e-06, "loss": 0.3747, "step": 2465 }, { "epoch": 2.0527192008879025, "grad_norm": 0.3447505831718445, "learning_rate": 2.7399823613257565e-06, "loss": 0.3802, "step": 2466 }, { "epoch": 2.0535516093229744, "grad_norm": 0.32165050506591797, "learning_rate": 2.7356614746393063e-06, "loss": 0.4049, "step": 2467 }, { "epoch": 2.054384017758047, "grad_norm": 0.31354621052742004, "learning_rate": 2.7313427142063742e-06, "loss": 0.3912, "step": 2468 }, { "epoch": 2.0552164261931187, "grad_norm": 0.3174445331096649, "learning_rate": 2.7270260840823588e-06, "loss": 0.3831, "step": 2469 }, { "epoch": 2.056048834628191, "grad_norm": 0.3428478538990021, "learning_rate": 2.72271158832065e-06, "loss": 0.3866, "step": 2470 }, { "epoch": 2.056881243063263, "grad_norm": 0.35326361656188965, "learning_rate": 2.718399230972632e-06, "loss": 0.4147, "step": 2471 }, { "epoch": 2.057713651498335, "grad_norm": 0.31062963604927063, "learning_rate": 2.714089016087683e-06, "loss": 0.3806, "step": 2472 }, { "epoch": 2.0585460599334073, "grad_norm": 0.3279687166213989, "learning_rate": 2.7097809477131754e-06, "loss": 0.4389, "step": 2473 }, { "epoch": 2.0593784683684793, "grad_norm": 0.33881494402885437, "learning_rate": 2.705475029894459e-06, "loss": 0.4214, "step": 2474 }, { "epoch": 2.0602108768035516, "grad_norm": 0.3012351989746094, "learning_rate": 2.7011712666748636e-06, "loss": 0.3647, "step": 2475 }, { "epoch": 2.0610432852386236, "grad_norm": 0.36581099033355713, "learning_rate": 2.696869662095698e-06, "loss": 0.4215, "step": 2476 }, { "epoch": 2.061875693673696, "grad_norm": 0.3162044286727905, "learning_rate": 2.6925702201962493e-06, "loss": 0.4061, "step": 2477 }, { "epoch": 2.062708102108768, "grad_norm": 0.30521532893180847, "learning_rate": 2.6882729450137636e-06, "loss": 0.368, "step": 2478 }, { "epoch": 2.0635405105438402, "grad_norm": 0.36388954520225525, "learning_rate": 2.6839778405834593e-06, "loss": 0.3992, "step": 2479 }, { "epoch": 2.064372918978912, "grad_norm": 0.3033269941806793, "learning_rate": 2.6796849109385147e-06, "loss": 0.3603, "step": 2480 }, { "epoch": 2.0652053274139845, "grad_norm": 0.32970061898231506, "learning_rate": 2.6753941601100662e-06, "loss": 0.4103, "step": 2481 }, { "epoch": 2.0660377358490565, "grad_norm": 0.3365088105201721, "learning_rate": 2.6711055921272033e-06, "loss": 0.43, "step": 2482 }, { "epoch": 2.066870144284129, "grad_norm": 0.3210076689720154, "learning_rate": 2.6668192110169664e-06, "loss": 0.3518, "step": 2483 }, { "epoch": 2.067702552719201, "grad_norm": 0.37223634123802185, "learning_rate": 2.6625350208043432e-06, "loss": 0.3965, "step": 2484 }, { "epoch": 2.068534961154273, "grad_norm": 0.3405874967575073, "learning_rate": 2.658253025512263e-06, "loss": 0.401, "step": 2485 }, { "epoch": 2.069367369589345, "grad_norm": 0.32502299547195435, "learning_rate": 2.6539732291615937e-06, "loss": 0.365, "step": 2486 }, { "epoch": 2.0701997780244175, "grad_norm": 0.36859750747680664, "learning_rate": 2.6496956357711402e-06, "loss": 0.4403, "step": 2487 }, { "epoch": 2.0710321864594894, "grad_norm": 0.3416479527950287, "learning_rate": 2.6454202493576366e-06, "loss": 0.3544, "step": 2488 }, { "epoch": 2.0718645948945618, "grad_norm": 0.3147336542606354, "learning_rate": 2.641147073935746e-06, "loss": 0.3804, "step": 2489 }, { "epoch": 2.0726970033296337, "grad_norm": 0.3625737428665161, "learning_rate": 2.6368761135180544e-06, "loss": 0.4079, "step": 2490 }, { "epoch": 2.073529411764706, "grad_norm": 0.3262125849723816, "learning_rate": 2.632607372115069e-06, "loss": 0.4015, "step": 2491 }, { "epoch": 2.074361820199778, "grad_norm": 0.3019062280654907, "learning_rate": 2.628340853735213e-06, "loss": 0.3562, "step": 2492 }, { "epoch": 2.0751942286348504, "grad_norm": 0.35128647089004517, "learning_rate": 2.624076562384823e-06, "loss": 0.416, "step": 2493 }, { "epoch": 2.0760266370699223, "grad_norm": 0.3422330915927887, "learning_rate": 2.619814502068139e-06, "loss": 0.3876, "step": 2494 }, { "epoch": 2.0768590455049942, "grad_norm": 0.3407348692417145, "learning_rate": 2.6155546767873136e-06, "loss": 0.4153, "step": 2495 }, { "epoch": 2.0776914539400666, "grad_norm": 0.3248727023601532, "learning_rate": 2.611297090542399e-06, "loss": 0.3867, "step": 2496 }, { "epoch": 2.0785238623751385, "grad_norm": 0.2963961958885193, "learning_rate": 2.607041747331339e-06, "loss": 0.372, "step": 2497 }, { "epoch": 2.079356270810211, "grad_norm": 0.3318076431751251, "learning_rate": 2.6027886511499756e-06, "loss": 0.3884, "step": 2498 }, { "epoch": 2.080188679245283, "grad_norm": 0.32221075892448425, "learning_rate": 2.598537805992044e-06, "loss": 0.3999, "step": 2499 }, { "epoch": 2.081021087680355, "grad_norm": 0.3247624635696411, "learning_rate": 2.5942892158491626e-06, "loss": 0.4089, "step": 2500 }, { "epoch": 2.081853496115427, "grad_norm": 0.33710792660713196, "learning_rate": 2.590042884710828e-06, "loss": 0.4056, "step": 2501 }, { "epoch": 2.0826859045504995, "grad_norm": 0.3118056058883667, "learning_rate": 2.585798816564419e-06, "loss": 0.4116, "step": 2502 }, { "epoch": 2.0835183129855714, "grad_norm": 0.2979108691215515, "learning_rate": 2.5815570153951942e-06, "loss": 0.3746, "step": 2503 }, { "epoch": 2.084350721420644, "grad_norm": 0.33021363615989685, "learning_rate": 2.5773174851862796e-06, "loss": 0.394, "step": 2504 }, { "epoch": 2.0851831298557157, "grad_norm": 0.32653099298477173, "learning_rate": 2.573080229918664e-06, "loss": 0.4189, "step": 2505 }, { "epoch": 2.086015538290788, "grad_norm": 0.3247196674346924, "learning_rate": 2.568845253571204e-06, "loss": 0.3851, "step": 2506 }, { "epoch": 2.08684794672586, "grad_norm": 0.3410961329936981, "learning_rate": 2.564612560120623e-06, "loss": 0.4209, "step": 2507 }, { "epoch": 2.0876803551609324, "grad_norm": 0.3214920163154602, "learning_rate": 2.5603821535414874e-06, "loss": 0.3618, "step": 2508 }, { "epoch": 2.0885127635960044, "grad_norm": 0.34245097637176514, "learning_rate": 2.556154037806226e-06, "loss": 0.4046, "step": 2509 }, { "epoch": 2.0893451720310767, "grad_norm": 0.3048022389411926, "learning_rate": 2.5519282168851134e-06, "loss": 0.3755, "step": 2510 }, { "epoch": 2.0901775804661487, "grad_norm": 0.3066607415676117, "learning_rate": 2.547704694746269e-06, "loss": 0.3606, "step": 2511 }, { "epoch": 2.091009988901221, "grad_norm": 0.30674123764038086, "learning_rate": 2.543483475355654e-06, "loss": 0.3706, "step": 2512 }, { "epoch": 2.091842397336293, "grad_norm": 0.32365816831588745, "learning_rate": 2.5392645626770686e-06, "loss": 0.3973, "step": 2513 }, { "epoch": 2.0926748057713653, "grad_norm": 0.3154730498790741, "learning_rate": 2.5350479606721433e-06, "loss": 0.4019, "step": 2514 }, { "epoch": 2.0935072142064373, "grad_norm": 0.28480201959609985, "learning_rate": 2.5308336733003435e-06, "loss": 0.362, "step": 2515 }, { "epoch": 2.0943396226415096, "grad_norm": 0.3131362497806549, "learning_rate": 2.5266217045189572e-06, "loss": 0.4336, "step": 2516 }, { "epoch": 2.0951720310765816, "grad_norm": 0.3055063784122467, "learning_rate": 2.522412058283098e-06, "loss": 0.3658, "step": 2517 }, { "epoch": 2.0960044395116535, "grad_norm": 0.3128129839897156, "learning_rate": 2.5182047385456967e-06, "loss": 0.3963, "step": 2518 }, { "epoch": 2.096836847946726, "grad_norm": 0.3318641483783722, "learning_rate": 2.513999749257501e-06, "loss": 0.4023, "step": 2519 }, { "epoch": 2.097669256381798, "grad_norm": 0.30515703558921814, "learning_rate": 2.509797094367068e-06, "loss": 0.3823, "step": 2520 }, { "epoch": 2.09850166481687, "grad_norm": 0.3311365842819214, "learning_rate": 2.505596777820766e-06, "loss": 0.3941, "step": 2521 }, { "epoch": 2.099334073251942, "grad_norm": 0.3067466914653778, "learning_rate": 2.5013988035627656e-06, "loss": 0.4081, "step": 2522 }, { "epoch": 2.1001664816870145, "grad_norm": 0.344066858291626, "learning_rate": 2.4972031755350366e-06, "loss": 0.3707, "step": 2523 }, { "epoch": 2.1009988901220864, "grad_norm": 0.3539952039718628, "learning_rate": 2.493009897677346e-06, "loss": 0.4235, "step": 2524 }, { "epoch": 2.101831298557159, "grad_norm": 0.34413817524909973, "learning_rate": 2.4888189739272587e-06, "loss": 0.382, "step": 2525 }, { "epoch": 2.1026637069922307, "grad_norm": 0.30042779445648193, "learning_rate": 2.484630408220126e-06, "loss": 0.3718, "step": 2526 }, { "epoch": 2.103496115427303, "grad_norm": 0.3354708254337311, "learning_rate": 2.480444204489081e-06, "loss": 0.4273, "step": 2527 }, { "epoch": 2.104328523862375, "grad_norm": 0.31707751750946045, "learning_rate": 2.476260366665041e-06, "loss": 0.4271, "step": 2528 }, { "epoch": 2.1051609322974474, "grad_norm": 0.28172051906585693, "learning_rate": 2.472078898676708e-06, "loss": 0.3713, "step": 2529 }, { "epoch": 2.1059933407325193, "grad_norm": 0.31706756353378296, "learning_rate": 2.467899804450553e-06, "loss": 0.4136, "step": 2530 }, { "epoch": 2.1068257491675917, "grad_norm": 0.31650689244270325, "learning_rate": 2.463723087910815e-06, "loss": 0.3989, "step": 2531 }, { "epoch": 2.1076581576026636, "grad_norm": 0.3059511184692383, "learning_rate": 2.4595487529795044e-06, "loss": 0.3801, "step": 2532 }, { "epoch": 2.108490566037736, "grad_norm": 0.30133718252182007, "learning_rate": 2.4553768035763996e-06, "loss": 0.3809, "step": 2533 }, { "epoch": 2.109322974472808, "grad_norm": 0.3258849084377289, "learning_rate": 2.451207243619029e-06, "loss": 0.4455, "step": 2534 }, { "epoch": 2.1101553829078803, "grad_norm": 0.29617834091186523, "learning_rate": 2.447040077022685e-06, "loss": 0.3728, "step": 2535 }, { "epoch": 2.1109877913429522, "grad_norm": 0.3328491747379303, "learning_rate": 2.4428753077004067e-06, "loss": 0.3945, "step": 2536 }, { "epoch": 2.1118201997780246, "grad_norm": 0.3035510778427124, "learning_rate": 2.438712939562992e-06, "loss": 0.3755, "step": 2537 }, { "epoch": 2.1126526082130965, "grad_norm": 0.31253474950790405, "learning_rate": 2.434552976518971e-06, "loss": 0.3796, "step": 2538 }, { "epoch": 2.113485016648169, "grad_norm": 0.35178011655807495, "learning_rate": 2.430395422474625e-06, "loss": 0.4061, "step": 2539 }, { "epoch": 2.114317425083241, "grad_norm": 0.3211081326007843, "learning_rate": 2.426240281333969e-06, "loss": 0.3668, "step": 2540 }, { "epoch": 2.1151498335183128, "grad_norm": 0.35287511348724365, "learning_rate": 2.422087556998754e-06, "loss": 0.4332, "step": 2541 }, { "epoch": 2.115982241953385, "grad_norm": 0.31688159704208374, "learning_rate": 2.41793725336846e-06, "loss": 0.392, "step": 2542 }, { "epoch": 2.116814650388457, "grad_norm": 0.3495129644870758, "learning_rate": 2.4137893743402954e-06, "loss": 0.3871, "step": 2543 }, { "epoch": 2.1176470588235294, "grad_norm": 0.29781919717788696, "learning_rate": 2.409643923809191e-06, "loss": 0.366, "step": 2544 }, { "epoch": 2.1184794672586014, "grad_norm": 0.3296952545642853, "learning_rate": 2.4055009056677977e-06, "loss": 0.4213, "step": 2545 }, { "epoch": 2.1193118756936737, "grad_norm": 0.3212566077709198, "learning_rate": 2.4013603238064814e-06, "loss": 0.3989, "step": 2546 }, { "epoch": 2.1201442841287457, "grad_norm": 0.31362348794937134, "learning_rate": 2.397222182113322e-06, "loss": 0.364, "step": 2547 }, { "epoch": 2.120976692563818, "grad_norm": 0.3248312473297119, "learning_rate": 2.393086484474108e-06, "loss": 0.3903, "step": 2548 }, { "epoch": 2.12180910099889, "grad_norm": 0.3590147793292999, "learning_rate": 2.3889532347723266e-06, "loss": 0.418, "step": 2549 }, { "epoch": 2.1226415094339623, "grad_norm": 0.3201029300689697, "learning_rate": 2.384822436889177e-06, "loss": 0.4164, "step": 2550 }, { "epoch": 2.1234739178690343, "grad_norm": 0.2909408211708069, "learning_rate": 2.3806940947035497e-06, "loss": 0.35, "step": 2551 }, { "epoch": 2.1243063263041067, "grad_norm": 0.29795750975608826, "learning_rate": 2.3765682120920315e-06, "loss": 0.3996, "step": 2552 }, { "epoch": 2.1251387347391786, "grad_norm": 0.31154143810272217, "learning_rate": 2.3724447929288925e-06, "loss": 0.3876, "step": 2553 }, { "epoch": 2.125971143174251, "grad_norm": 0.33874204754829407, "learning_rate": 2.368323841086102e-06, "loss": 0.3969, "step": 2554 }, { "epoch": 2.126803551609323, "grad_norm": 0.3370445966720581, "learning_rate": 2.3642053604333032e-06, "loss": 0.4318, "step": 2555 }, { "epoch": 2.1276359600443953, "grad_norm": 0.30151233077049255, "learning_rate": 2.3600893548378238e-06, "loss": 0.3805, "step": 2556 }, { "epoch": 2.128468368479467, "grad_norm": 0.3622559607028961, "learning_rate": 2.3559758281646615e-06, "loss": 0.418, "step": 2557 }, { "epoch": 2.1293007769145396, "grad_norm": 0.307041734457016, "learning_rate": 2.35186478427649e-06, "loss": 0.3494, "step": 2558 }, { "epoch": 2.1301331853496115, "grad_norm": 0.2808810770511627, "learning_rate": 2.3477562270336564e-06, "loss": 0.4012, "step": 2559 }, { "epoch": 2.130965593784684, "grad_norm": 0.2891574800014496, "learning_rate": 2.343650160294163e-06, "loss": 0.358, "step": 2560 }, { "epoch": 2.131798002219756, "grad_norm": 0.32515814900398254, "learning_rate": 2.3395465879136795e-06, "loss": 0.4128, "step": 2561 }, { "epoch": 2.132630410654828, "grad_norm": 0.30551859736442566, "learning_rate": 2.3354455137455312e-06, "loss": 0.3673, "step": 2562 }, { "epoch": 2.1334628190899, "grad_norm": 0.3107419013977051, "learning_rate": 2.3313469416407037e-06, "loss": 0.3966, "step": 2563 }, { "epoch": 2.134295227524972, "grad_norm": 0.3009784519672394, "learning_rate": 2.3272508754478224e-06, "loss": 0.3692, "step": 2564 }, { "epoch": 2.1351276359600444, "grad_norm": 0.3275810778141022, "learning_rate": 2.3231573190131666e-06, "loss": 0.4089, "step": 2565 }, { "epoch": 2.1359600443951163, "grad_norm": 0.32291144132614136, "learning_rate": 2.3190662761806586e-06, "loss": 0.363, "step": 2566 }, { "epoch": 2.1367924528301887, "grad_norm": 0.32055914402008057, "learning_rate": 2.3149777507918587e-06, "loss": 0.4111, "step": 2567 }, { "epoch": 2.1376248612652606, "grad_norm": 0.31367000937461853, "learning_rate": 2.310891746685963e-06, "loss": 0.3907, "step": 2568 }, { "epoch": 2.138457269700333, "grad_norm": 0.31795892119407654, "learning_rate": 2.3068082676998022e-06, "loss": 0.4011, "step": 2569 }, { "epoch": 2.139289678135405, "grad_norm": 0.30362680554389954, "learning_rate": 2.3027273176678337e-06, "loss": 0.3924, "step": 2570 }, { "epoch": 2.1401220865704773, "grad_norm": 0.3192872405052185, "learning_rate": 2.298648900422141e-06, "loss": 0.382, "step": 2571 }, { "epoch": 2.1409544950055492, "grad_norm": 0.33818933367729187, "learning_rate": 2.2945730197924303e-06, "loss": 0.4072, "step": 2572 }, { "epoch": 2.1417869034406216, "grad_norm": 0.3056696951389313, "learning_rate": 2.2904996796060243e-06, "loss": 0.3397, "step": 2573 }, { "epoch": 2.1426193118756935, "grad_norm": 0.3521409034729004, "learning_rate": 2.2864288836878616e-06, "loss": 0.4124, "step": 2574 }, { "epoch": 2.143451720310766, "grad_norm": 0.3235545754432678, "learning_rate": 2.2823606358604868e-06, "loss": 0.3854, "step": 2575 }, { "epoch": 2.144284128745838, "grad_norm": 0.33744242787361145, "learning_rate": 2.278294939944061e-06, "loss": 0.3948, "step": 2576 }, { "epoch": 2.14511653718091, "grad_norm": 0.30026137828826904, "learning_rate": 2.2742317997563407e-06, "loss": 0.3687, "step": 2577 }, { "epoch": 2.145948945615982, "grad_norm": 0.3329930007457733, "learning_rate": 2.2701712191126895e-06, "loss": 0.4255, "step": 2578 }, { "epoch": 2.1467813540510545, "grad_norm": 0.3148747682571411, "learning_rate": 2.266113201826057e-06, "loss": 0.3728, "step": 2579 }, { "epoch": 2.1476137624861265, "grad_norm": 0.300738126039505, "learning_rate": 2.2620577517069986e-06, "loss": 0.3788, "step": 2580 }, { "epoch": 2.148446170921199, "grad_norm": 0.31569600105285645, "learning_rate": 2.2580048725636506e-06, "loss": 0.4349, "step": 2581 }, { "epoch": 2.1492785793562708, "grad_norm": 0.3264215290546417, "learning_rate": 2.2539545682017394e-06, "loss": 0.3995, "step": 2582 }, { "epoch": 2.150110987791343, "grad_norm": 0.3219297230243683, "learning_rate": 2.2499068424245667e-06, "loss": 0.3659, "step": 2583 }, { "epoch": 2.150943396226415, "grad_norm": 0.32074615359306335, "learning_rate": 2.245861699033023e-06, "loss": 0.3935, "step": 2584 }, { "epoch": 2.1517758046614874, "grad_norm": 0.31240931153297424, "learning_rate": 2.2418191418255684e-06, "loss": 0.4093, "step": 2585 }, { "epoch": 2.1526082130965594, "grad_norm": 0.30113837122917175, "learning_rate": 2.2377791745982323e-06, "loss": 0.367, "step": 2586 }, { "epoch": 2.1534406215316313, "grad_norm": 0.3172069489955902, "learning_rate": 2.2337418011446154e-06, "loss": 0.3933, "step": 2587 }, { "epoch": 2.1542730299667037, "grad_norm": 0.3283001780509949, "learning_rate": 2.229707025255881e-06, "loss": 0.4171, "step": 2588 }, { "epoch": 2.1551054384017756, "grad_norm": 0.30055010318756104, "learning_rate": 2.225674850720759e-06, "loss": 0.3517, "step": 2589 }, { "epoch": 2.155937846836848, "grad_norm": 0.33835846185684204, "learning_rate": 2.2216452813255273e-06, "loss": 0.3955, "step": 2590 }, { "epoch": 2.15677025527192, "grad_norm": 0.3244904577732086, "learning_rate": 2.2176183208540236e-06, "loss": 0.4064, "step": 2591 }, { "epoch": 2.1576026637069923, "grad_norm": 0.32495352625846863, "learning_rate": 2.2135939730876344e-06, "loss": 0.4063, "step": 2592 }, { "epoch": 2.158435072142064, "grad_norm": 0.32691752910614014, "learning_rate": 2.2095722418052916e-06, "loss": 0.3824, "step": 2593 }, { "epoch": 2.1592674805771366, "grad_norm": 0.33834582567214966, "learning_rate": 2.2055531307834734e-06, "loss": 0.3959, "step": 2594 }, { "epoch": 2.1600998890122085, "grad_norm": 0.3010924756526947, "learning_rate": 2.2015366437961932e-06, "loss": 0.3838, "step": 2595 }, { "epoch": 2.160932297447281, "grad_norm": 0.3164057731628418, "learning_rate": 2.197522784615004e-06, "loss": 0.3656, "step": 2596 }, { "epoch": 2.161764705882353, "grad_norm": 0.3379717171192169, "learning_rate": 2.1935115570089897e-06, "loss": 0.4121, "step": 2597 }, { "epoch": 2.162597114317425, "grad_norm": 0.2782793939113617, "learning_rate": 2.189502964744763e-06, "loss": 0.3191, "step": 2598 }, { "epoch": 2.163429522752497, "grad_norm": 0.2974027991294861, "learning_rate": 2.1854970115864623e-06, "loss": 0.4204, "step": 2599 }, { "epoch": 2.1642619311875695, "grad_norm": 0.2943418622016907, "learning_rate": 2.1814937012957476e-06, "loss": 0.4057, "step": 2600 }, { "epoch": 2.1650943396226414, "grad_norm": 0.30796945095062256, "learning_rate": 2.1774930376317976e-06, "loss": 0.3434, "step": 2601 }, { "epoch": 2.165926748057714, "grad_norm": 0.35572606325149536, "learning_rate": 2.1734950243513054e-06, "loss": 0.4329, "step": 2602 }, { "epoch": 2.1667591564927857, "grad_norm": 0.30681103467941284, "learning_rate": 2.1694996652084752e-06, "loss": 0.3559, "step": 2603 }, { "epoch": 2.167591564927858, "grad_norm": 0.30400606989860535, "learning_rate": 2.165506963955022e-06, "loss": 0.3995, "step": 2604 }, { "epoch": 2.16842397336293, "grad_norm": 0.30469441413879395, "learning_rate": 2.1615169243401557e-06, "loss": 0.3993, "step": 2605 }, { "epoch": 2.1692563817980024, "grad_norm": 0.3207082748413086, "learning_rate": 2.1575295501105987e-06, "loss": 0.4184, "step": 2606 }, { "epoch": 2.1700887902330743, "grad_norm": 0.30796340107917786, "learning_rate": 2.1535448450105644e-06, "loss": 0.3651, "step": 2607 }, { "epoch": 2.1709211986681467, "grad_norm": 0.3026013672351837, "learning_rate": 2.1495628127817618e-06, "loss": 0.3816, "step": 2608 }, { "epoch": 2.1717536071032186, "grad_norm": 0.33026769757270813, "learning_rate": 2.1455834571633836e-06, "loss": 0.4104, "step": 2609 }, { "epoch": 2.1725860155382906, "grad_norm": 0.32119134068489075, "learning_rate": 2.14160678189212e-06, "loss": 0.4247, "step": 2610 }, { "epoch": 2.173418423973363, "grad_norm": 0.30723845958709717, "learning_rate": 2.1376327907021385e-06, "loss": 0.3703, "step": 2611 }, { "epoch": 2.1742508324084353, "grad_norm": 0.29472365975379944, "learning_rate": 2.133661487325082e-06, "loss": 0.3878, "step": 2612 }, { "epoch": 2.1750832408435072, "grad_norm": 0.3001120090484619, "learning_rate": 2.1296928754900753e-06, "loss": 0.3795, "step": 2613 }, { "epoch": 2.175915649278579, "grad_norm": 0.31931760907173157, "learning_rate": 2.125726958923718e-06, "loss": 0.3901, "step": 2614 }, { "epoch": 2.1767480577136515, "grad_norm": 0.3011782765388489, "learning_rate": 2.1217637413500735e-06, "loss": 0.3744, "step": 2615 }, { "epoch": 2.1775804661487235, "grad_norm": 0.3152015507221222, "learning_rate": 2.1178032264906704e-06, "loss": 0.382, "step": 2616 }, { "epoch": 2.178412874583796, "grad_norm": 0.3104303479194641, "learning_rate": 2.1138454180645035e-06, "loss": 0.3745, "step": 2617 }, { "epoch": 2.1792452830188678, "grad_norm": 0.32811030745506287, "learning_rate": 2.109890319788023e-06, "loss": 0.3892, "step": 2618 }, { "epoch": 2.18007769145394, "grad_norm": 0.32053300738334656, "learning_rate": 2.105937935375136e-06, "loss": 0.372, "step": 2619 }, { "epoch": 2.180910099889012, "grad_norm": 0.3680615723133087, "learning_rate": 2.1019882685372016e-06, "loss": 0.4414, "step": 2620 }, { "epoch": 2.1817425083240845, "grad_norm": 0.29054370522499084, "learning_rate": 2.0980413229830248e-06, "loss": 0.3299, "step": 2621 }, { "epoch": 2.1825749167591564, "grad_norm": 0.31951627135276794, "learning_rate": 2.094097102418857e-06, "loss": 0.3982, "step": 2622 }, { "epoch": 2.1834073251942288, "grad_norm": 0.3075707256793976, "learning_rate": 2.09015561054839e-06, "loss": 0.3976, "step": 2623 }, { "epoch": 2.1842397336293007, "grad_norm": 0.28511252999305725, "learning_rate": 2.0862168510727545e-06, "loss": 0.3907, "step": 2624 }, { "epoch": 2.185072142064373, "grad_norm": 0.2891198694705963, "learning_rate": 2.0822808276905144e-06, "loss": 0.3904, "step": 2625 }, { "epoch": 2.185904550499445, "grad_norm": 0.3299737870693207, "learning_rate": 2.0783475440976635e-06, "loss": 0.3974, "step": 2626 }, { "epoch": 2.1867369589345174, "grad_norm": 0.3298386335372925, "learning_rate": 2.0744170039876255e-06, "loss": 0.3923, "step": 2627 }, { "epoch": 2.1875693673695893, "grad_norm": 0.295673131942749, "learning_rate": 2.0704892110512458e-06, "loss": 0.3811, "step": 2628 }, { "epoch": 2.1884017758046617, "grad_norm": 0.3085336983203888, "learning_rate": 2.0665641689767902e-06, "loss": 0.392, "step": 2629 }, { "epoch": 2.1892341842397336, "grad_norm": 0.3050670921802521, "learning_rate": 2.0626418814499428e-06, "loss": 0.3935, "step": 2630 }, { "epoch": 2.190066592674806, "grad_norm": 0.32354554533958435, "learning_rate": 2.0587223521537996e-06, "loss": 0.4429, "step": 2631 }, { "epoch": 2.190899001109878, "grad_norm": 0.3790428638458252, "learning_rate": 2.0548055847688676e-06, "loss": 0.3709, "step": 2632 }, { "epoch": 2.19173140954495, "grad_norm": 0.2934742271900177, "learning_rate": 2.0508915829730595e-06, "loss": 0.3677, "step": 2633 }, { "epoch": 2.192563817980022, "grad_norm": 0.30265501141548157, "learning_rate": 2.046980350441694e-06, "loss": 0.4166, "step": 2634 }, { "epoch": 2.1933962264150946, "grad_norm": 0.3148394525051117, "learning_rate": 2.0430718908474813e-06, "loss": 0.3848, "step": 2635 }, { "epoch": 2.1942286348501665, "grad_norm": 0.35043779015541077, "learning_rate": 2.0391662078605383e-06, "loss": 0.4055, "step": 2636 }, { "epoch": 2.1950610432852384, "grad_norm": 0.27816712856292725, "learning_rate": 2.0352633051483705e-06, "loss": 0.3301, "step": 2637 }, { "epoch": 2.195893451720311, "grad_norm": 0.3395947813987732, "learning_rate": 2.0313631863758677e-06, "loss": 0.4272, "step": 2638 }, { "epoch": 2.1967258601553827, "grad_norm": 0.3466813564300537, "learning_rate": 2.02746585520531e-06, "loss": 0.3971, "step": 2639 }, { "epoch": 2.197558268590455, "grad_norm": 0.32729557156562805, "learning_rate": 2.0235713152963627e-06, "loss": 0.3828, "step": 2640 }, { "epoch": 2.198390677025527, "grad_norm": 0.35937994718551636, "learning_rate": 2.019679570306068e-06, "loss": 0.3956, "step": 2641 }, { "epoch": 2.1992230854605994, "grad_norm": 0.2958609163761139, "learning_rate": 2.0157906238888376e-06, "loss": 0.3495, "step": 2642 }, { "epoch": 2.2000554938956713, "grad_norm": 0.31037241220474243, "learning_rate": 2.0119044796964614e-06, "loss": 0.3748, "step": 2643 }, { "epoch": 2.2008879023307437, "grad_norm": 0.3517056107521057, "learning_rate": 2.008021141378102e-06, "loss": 0.4095, "step": 2644 }, { "epoch": 2.2017203107658156, "grad_norm": 0.3355526030063629, "learning_rate": 2.0041406125802764e-06, "loss": 0.3765, "step": 2645 }, { "epoch": 2.202552719200888, "grad_norm": 0.32547831535339355, "learning_rate": 2.0002628969468713e-06, "loss": 0.4366, "step": 2646 }, { "epoch": 2.20338512763596, "grad_norm": 0.2848454415798187, "learning_rate": 1.9963879981191288e-06, "loss": 0.3593, "step": 2647 }, { "epoch": 2.2042175360710323, "grad_norm": 0.3068901598453522, "learning_rate": 1.9925159197356475e-06, "loss": 0.38, "step": 2648 }, { "epoch": 2.2050499445061043, "grad_norm": 0.33143675327301025, "learning_rate": 1.9886466654323765e-06, "loss": 0.4175, "step": 2649 }, { "epoch": 2.2058823529411766, "grad_norm": 0.323045551776886, "learning_rate": 1.9847802388426137e-06, "loss": 0.4157, "step": 2650 }, { "epoch": 2.2067147613762486, "grad_norm": 0.27590107917785645, "learning_rate": 1.9809166435970006e-06, "loss": 0.3653, "step": 2651 }, { "epoch": 2.207547169811321, "grad_norm": 0.3093126714229584, "learning_rate": 1.9770558833235215e-06, "loss": 0.4101, "step": 2652 }, { "epoch": 2.208379578246393, "grad_norm": 0.32659175992012024, "learning_rate": 1.973197961647498e-06, "loss": 0.4096, "step": 2653 }, { "epoch": 2.2092119866814652, "grad_norm": 0.29490897059440613, "learning_rate": 1.969342882191585e-06, "loss": 0.3598, "step": 2654 }, { "epoch": 2.210044395116537, "grad_norm": 0.2893614172935486, "learning_rate": 1.9654906485757707e-06, "loss": 0.3784, "step": 2655 }, { "epoch": 2.210876803551609, "grad_norm": 0.2968060374259949, "learning_rate": 1.9616412644173697e-06, "loss": 0.3807, "step": 2656 }, { "epoch": 2.2117092119866815, "grad_norm": 0.2902612090110779, "learning_rate": 1.957794733331021e-06, "loss": 0.3769, "step": 2657 }, { "epoch": 2.212541620421754, "grad_norm": 0.30103838443756104, "learning_rate": 1.9539510589286848e-06, "loss": 0.3738, "step": 2658 }, { "epoch": 2.2133740288568258, "grad_norm": 0.3171117901802063, "learning_rate": 1.950110244819638e-06, "loss": 0.3899, "step": 2659 }, { "epoch": 2.2142064372918977, "grad_norm": 0.32826194167137146, "learning_rate": 1.9462722946104727e-06, "loss": 0.3894, "step": 2660 }, { "epoch": 2.21503884572697, "grad_norm": 0.32006773352622986, "learning_rate": 1.942437211905092e-06, "loss": 0.3982, "step": 2661 }, { "epoch": 2.215871254162042, "grad_norm": 0.33344799280166626, "learning_rate": 1.9386050003047047e-06, "loss": 0.3706, "step": 2662 }, { "epoch": 2.2167036625971144, "grad_norm": 0.30667856335639954, "learning_rate": 1.9347756634078273e-06, "loss": 0.4086, "step": 2663 }, { "epoch": 2.2175360710321863, "grad_norm": 0.31673869490623474, "learning_rate": 1.93094920481027e-06, "loss": 0.4003, "step": 2664 }, { "epoch": 2.2183684794672587, "grad_norm": 0.3328154981136322, "learning_rate": 1.9271256281051443e-06, "loss": 0.4012, "step": 2665 }, { "epoch": 2.2192008879023306, "grad_norm": 0.32742953300476074, "learning_rate": 1.92330493688286e-06, "loss": 0.4188, "step": 2666 }, { "epoch": 2.220033296337403, "grad_norm": 0.27765265107154846, "learning_rate": 1.9194871347311115e-06, "loss": 0.39, "step": 2667 }, { "epoch": 2.220865704772475, "grad_norm": 0.28916940093040466, "learning_rate": 1.91567222523488e-06, "loss": 0.389, "step": 2668 }, { "epoch": 2.2216981132075473, "grad_norm": 0.2962568402290344, "learning_rate": 1.9118602119764325e-06, "loss": 0.3644, "step": 2669 }, { "epoch": 2.222530521642619, "grad_norm": 0.33371245861053467, "learning_rate": 1.90805109853532e-06, "loss": 0.4107, "step": 2670 }, { "epoch": 2.2233629300776916, "grad_norm": 0.3214453160762787, "learning_rate": 1.9042448884883618e-06, "loss": 0.4116, "step": 2671 }, { "epoch": 2.2241953385127635, "grad_norm": 0.2842710614204407, "learning_rate": 1.9004415854096586e-06, "loss": 0.3673, "step": 2672 }, { "epoch": 2.225027746947836, "grad_norm": 0.3369787633419037, "learning_rate": 1.8966411928705757e-06, "loss": 0.3995, "step": 2673 }, { "epoch": 2.225860155382908, "grad_norm": 0.3225659132003784, "learning_rate": 1.8928437144397538e-06, "loss": 0.403, "step": 2674 }, { "epoch": 2.22669256381798, "grad_norm": 0.32059574127197266, "learning_rate": 1.8890491536830863e-06, "loss": 0.356, "step": 2675 }, { "epoch": 2.227524972253052, "grad_norm": 0.3181704580783844, "learning_rate": 1.8852575141637347e-06, "loss": 0.4074, "step": 2676 }, { "epoch": 2.2283573806881245, "grad_norm": 0.317874938249588, "learning_rate": 1.8814687994421138e-06, "loss": 0.3938, "step": 2677 }, { "epoch": 2.2291897891231964, "grad_norm": 0.3188163638114929, "learning_rate": 1.8776830130758939e-06, "loss": 0.3644, "step": 2678 }, { "epoch": 2.2300221975582684, "grad_norm": 0.3100307285785675, "learning_rate": 1.873900158619994e-06, "loss": 0.4062, "step": 2679 }, { "epoch": 2.2308546059933407, "grad_norm": 0.3106209635734558, "learning_rate": 1.8701202396265815e-06, "loss": 0.3857, "step": 2680 }, { "epoch": 2.231687014428413, "grad_norm": 0.35905230045318604, "learning_rate": 1.866343259645066e-06, "loss": 0.4164, "step": 2681 }, { "epoch": 2.232519422863485, "grad_norm": 0.28830498456954956, "learning_rate": 1.8625692222220977e-06, "loss": 0.3477, "step": 2682 }, { "epoch": 2.233351831298557, "grad_norm": 0.30920305848121643, "learning_rate": 1.8587981309015635e-06, "loss": 0.4061, "step": 2683 }, { "epoch": 2.2341842397336293, "grad_norm": 0.3114646375179291, "learning_rate": 1.8550299892245854e-06, "loss": 0.3915, "step": 2684 }, { "epoch": 2.2350166481687013, "grad_norm": 0.3108902871608734, "learning_rate": 1.851264800729513e-06, "loss": 0.3879, "step": 2685 }, { "epoch": 2.2358490566037736, "grad_norm": 0.2933717370033264, "learning_rate": 1.8475025689519256e-06, "loss": 0.3962, "step": 2686 }, { "epoch": 2.2366814650388456, "grad_norm": 0.29493871331214905, "learning_rate": 1.8437432974246238e-06, "loss": 0.3765, "step": 2687 }, { "epoch": 2.237513873473918, "grad_norm": 0.31428149342536926, "learning_rate": 1.8399869896776296e-06, "loss": 0.4303, "step": 2688 }, { "epoch": 2.23834628190899, "grad_norm": 0.30307847261428833, "learning_rate": 1.8362336492381832e-06, "loss": 0.3772, "step": 2689 }, { "epoch": 2.2391786903440623, "grad_norm": 0.31176915764808655, "learning_rate": 1.8324832796307323e-06, "loss": 0.3836, "step": 2690 }, { "epoch": 2.240011098779134, "grad_norm": 0.2853330075740814, "learning_rate": 1.8287358843769448e-06, "loss": 0.3783, "step": 2691 }, { "epoch": 2.2408435072142066, "grad_norm": 0.28568682074546814, "learning_rate": 1.8249914669956886e-06, "loss": 0.392, "step": 2692 }, { "epoch": 2.2416759156492785, "grad_norm": 0.2875567674636841, "learning_rate": 1.8212500310030385e-06, "loss": 0.4076, "step": 2693 }, { "epoch": 2.242508324084351, "grad_norm": 0.28917601704597473, "learning_rate": 1.8175115799122656e-06, "loss": 0.3889, "step": 2694 }, { "epoch": 2.243340732519423, "grad_norm": 0.29513514041900635, "learning_rate": 1.8137761172338404e-06, "loss": 0.4111, "step": 2695 }, { "epoch": 2.244173140954495, "grad_norm": 0.2941649854183197, "learning_rate": 1.810043646475431e-06, "loss": 0.3975, "step": 2696 }, { "epoch": 2.245005549389567, "grad_norm": 0.28224435448646545, "learning_rate": 1.8063141711418941e-06, "loss": 0.3946, "step": 2697 }, { "epoch": 2.2458379578246395, "grad_norm": 0.286742627620697, "learning_rate": 1.8025876947352677e-06, "loss": 0.3954, "step": 2698 }, { "epoch": 2.2466703662597114, "grad_norm": 0.3206160366535187, "learning_rate": 1.7988642207547784e-06, "loss": 0.4243, "step": 2699 }, { "epoch": 2.2475027746947838, "grad_norm": 0.3006853759288788, "learning_rate": 1.795143752696839e-06, "loss": 0.3759, "step": 2700 }, { "epoch": 2.2483351831298557, "grad_norm": 0.29394397139549255, "learning_rate": 1.7914262940550292e-06, "loss": 0.389, "step": 2701 }, { "epoch": 2.2491675915649276, "grad_norm": 0.3121004104614258, "learning_rate": 1.7877118483201095e-06, "loss": 0.3977, "step": 2702 }, { "epoch": 2.25, "grad_norm": 0.3045893609523773, "learning_rate": 1.784000418980007e-06, "loss": 0.4071, "step": 2703 }, { "epoch": 2.2508324084350724, "grad_norm": 0.30945533514022827, "learning_rate": 1.7802920095198246e-06, "loss": 0.3923, "step": 2704 }, { "epoch": 2.2516648168701443, "grad_norm": 0.28781136870384216, "learning_rate": 1.7765866234218187e-06, "loss": 0.3648, "step": 2705 }, { "epoch": 2.2524972253052162, "grad_norm": 0.32264959812164307, "learning_rate": 1.7728842641654125e-06, "loss": 0.4221, "step": 2706 }, { "epoch": 2.2533296337402886, "grad_norm": 0.30960628390312195, "learning_rate": 1.7691849352271872e-06, "loss": 0.3859, "step": 2707 }, { "epoch": 2.2541620421753605, "grad_norm": 0.3129541873931885, "learning_rate": 1.7654886400808774e-06, "loss": 0.3869, "step": 2708 }, { "epoch": 2.254994450610433, "grad_norm": 0.2916148900985718, "learning_rate": 1.7617953821973682e-06, "loss": 0.363, "step": 2709 }, { "epoch": 2.255826859045505, "grad_norm": 0.3174515664577484, "learning_rate": 1.758105165044694e-06, "loss": 0.4119, "step": 2710 }, { "epoch": 2.256659267480577, "grad_norm": 0.28080281615257263, "learning_rate": 1.7544179920880333e-06, "loss": 0.3623, "step": 2711 }, { "epoch": 2.257491675915649, "grad_norm": 0.31006425619125366, "learning_rate": 1.7507338667897062e-06, "loss": 0.4584, "step": 2712 }, { "epoch": 2.2583240843507215, "grad_norm": 0.3159677982330322, "learning_rate": 1.7470527926091702e-06, "loss": 0.3642, "step": 2713 }, { "epoch": 2.2591564927857934, "grad_norm": 0.34636712074279785, "learning_rate": 1.7433747730030188e-06, "loss": 0.3917, "step": 2714 }, { "epoch": 2.259988901220866, "grad_norm": 0.28924471139907837, "learning_rate": 1.7396998114249786e-06, "loss": 0.3665, "step": 2715 }, { "epoch": 2.2608213096559377, "grad_norm": 0.3206512928009033, "learning_rate": 1.7360279113258977e-06, "loss": 0.3851, "step": 2716 }, { "epoch": 2.26165371809101, "grad_norm": 0.30951130390167236, "learning_rate": 1.7323590761537595e-06, "loss": 0.4265, "step": 2717 }, { "epoch": 2.262486126526082, "grad_norm": 0.318645179271698, "learning_rate": 1.7286933093536634e-06, "loss": 0.3968, "step": 2718 }, { "epoch": 2.2633185349611544, "grad_norm": 0.31277957558631897, "learning_rate": 1.7250306143678292e-06, "loss": 0.3947, "step": 2719 }, { "epoch": 2.2641509433962264, "grad_norm": 0.30135253071784973, "learning_rate": 1.7213709946355879e-06, "loss": 0.3715, "step": 2720 }, { "epoch": 2.2649833518312987, "grad_norm": 0.3124295771121979, "learning_rate": 1.7177144535933903e-06, "loss": 0.4376, "step": 2721 }, { "epoch": 2.2658157602663707, "grad_norm": 0.30148327350616455, "learning_rate": 1.7140609946747915e-06, "loss": 0.382, "step": 2722 }, { "epoch": 2.266648168701443, "grad_norm": 0.32914572954177856, "learning_rate": 1.7104106213104554e-06, "loss": 0.4407, "step": 2723 }, { "epoch": 2.267480577136515, "grad_norm": 0.3174681067466736, "learning_rate": 1.7067633369281422e-06, "loss": 0.3585, "step": 2724 }, { "epoch": 2.268312985571587, "grad_norm": 0.29542532563209534, "learning_rate": 1.7031191449527162e-06, "loss": 0.3803, "step": 2725 }, { "epoch": 2.2691453940066593, "grad_norm": 0.2985629439353943, "learning_rate": 1.699478048806143e-06, "loss": 0.3619, "step": 2726 }, { "epoch": 2.2699778024417316, "grad_norm": 0.3362717628479004, "learning_rate": 1.6958400519074696e-06, "loss": 0.3884, "step": 2727 }, { "epoch": 2.2708102108768036, "grad_norm": 0.33345919847488403, "learning_rate": 1.6922051576728415e-06, "loss": 0.4431, "step": 2728 }, { "epoch": 2.2716426193118755, "grad_norm": 0.2820887267589569, "learning_rate": 1.6885733695154855e-06, "loss": 0.319, "step": 2729 }, { "epoch": 2.272475027746948, "grad_norm": 0.2951180636882782, "learning_rate": 1.6849446908457201e-06, "loss": 0.3953, "step": 2730 }, { "epoch": 2.27330743618202, "grad_norm": 0.2896723747253418, "learning_rate": 1.6813191250709326e-06, "loss": 0.4086, "step": 2731 }, { "epoch": 2.274139844617092, "grad_norm": 0.29083332419395447, "learning_rate": 1.6776966755955941e-06, "loss": 0.3748, "step": 2732 }, { "epoch": 2.274972253052164, "grad_norm": 0.3114146888256073, "learning_rate": 1.674077345821249e-06, "loss": 0.4199, "step": 2733 }, { "epoch": 2.2758046614872365, "grad_norm": 0.311087965965271, "learning_rate": 1.6704611391465103e-06, "loss": 0.4003, "step": 2734 }, { "epoch": 2.2766370699223084, "grad_norm": 0.3135244846343994, "learning_rate": 1.6668480589670604e-06, "loss": 0.3827, "step": 2735 }, { "epoch": 2.277469478357381, "grad_norm": 0.3102346956729889, "learning_rate": 1.6632381086756439e-06, "loss": 0.3987, "step": 2736 }, { "epoch": 2.2783018867924527, "grad_norm": 0.2921448349952698, "learning_rate": 1.6596312916620677e-06, "loss": 0.3694, "step": 2737 }, { "epoch": 2.279134295227525, "grad_norm": 0.31638967990875244, "learning_rate": 1.6560276113131968e-06, "loss": 0.4139, "step": 2738 }, { "epoch": 2.279966703662597, "grad_norm": 0.29766252636909485, "learning_rate": 1.6524270710129491e-06, "loss": 0.3582, "step": 2739 }, { "epoch": 2.2807991120976694, "grad_norm": 0.308046817779541, "learning_rate": 1.6488296741422955e-06, "loss": 0.3882, "step": 2740 }, { "epoch": 2.2816315205327413, "grad_norm": 0.30890682339668274, "learning_rate": 1.6452354240792561e-06, "loss": 0.4078, "step": 2741 }, { "epoch": 2.2824639289678137, "grad_norm": 0.31466445326805115, "learning_rate": 1.64164432419889e-06, "loss": 0.3866, "step": 2742 }, { "epoch": 2.2832963374028856, "grad_norm": 0.3218589425086975, "learning_rate": 1.6380563778733078e-06, "loss": 0.3788, "step": 2743 }, { "epoch": 2.284128745837958, "grad_norm": 0.3170068860054016, "learning_rate": 1.6344715884716517e-06, "loss": 0.3912, "step": 2744 }, { "epoch": 2.28496115427303, "grad_norm": 0.28382614254951477, "learning_rate": 1.630889959360104e-06, "loss": 0.3923, "step": 2745 }, { "epoch": 2.2857935627081023, "grad_norm": 0.3070010840892792, "learning_rate": 1.627311493901872e-06, "loss": 0.4414, "step": 2746 }, { "epoch": 2.2866259711431742, "grad_norm": 0.2942955493927002, "learning_rate": 1.6237361954572023e-06, "loss": 0.3476, "step": 2747 }, { "epoch": 2.287458379578246, "grad_norm": 0.2962301969528198, "learning_rate": 1.6201640673833613e-06, "loss": 0.3807, "step": 2748 }, { "epoch": 2.2882907880133185, "grad_norm": 0.3063211441040039, "learning_rate": 1.6165951130346408e-06, "loss": 0.4005, "step": 2749 }, { "epoch": 2.289123196448391, "grad_norm": 0.30910301208496094, "learning_rate": 1.6130293357623473e-06, "loss": 0.4213, "step": 2750 }, { "epoch": 2.289955604883463, "grad_norm": 0.3249872922897339, "learning_rate": 1.6094667389148128e-06, "loss": 0.3776, "step": 2751 }, { "epoch": 2.2907880133185348, "grad_norm": 0.3110400140285492, "learning_rate": 1.605907325837378e-06, "loss": 0.4065, "step": 2752 }, { "epoch": 2.291620421753607, "grad_norm": 0.28765764832496643, "learning_rate": 1.6023510998723906e-06, "loss": 0.3952, "step": 2753 }, { "epoch": 2.292452830188679, "grad_norm": 0.3131961524486542, "learning_rate": 1.598798064359211e-06, "loss": 0.4005, "step": 2754 }, { "epoch": 2.2932852386237514, "grad_norm": 0.30940961837768555, "learning_rate": 1.5952482226342003e-06, "loss": 0.3693, "step": 2755 }, { "epoch": 2.2941176470588234, "grad_norm": 0.32620513439178467, "learning_rate": 1.5917015780307265e-06, "loss": 0.418, "step": 2756 }, { "epoch": 2.2949500554938957, "grad_norm": 0.2872724235057831, "learning_rate": 1.5881581338791462e-06, "loss": 0.343, "step": 2757 }, { "epoch": 2.2957824639289677, "grad_norm": 0.28437381982803345, "learning_rate": 1.5846178935068173e-06, "loss": 0.3797, "step": 2758 }, { "epoch": 2.29661487236404, "grad_norm": 0.29751530289649963, "learning_rate": 1.5810808602380872e-06, "loss": 0.4177, "step": 2759 }, { "epoch": 2.297447280799112, "grad_norm": 0.29856476187705994, "learning_rate": 1.5775470373942926e-06, "loss": 0.3655, "step": 2760 }, { "epoch": 2.2982796892341844, "grad_norm": 0.3321475684642792, "learning_rate": 1.5740164282937548e-06, "loss": 0.4332, "step": 2761 }, { "epoch": 2.2991120976692563, "grad_norm": 0.2865569591522217, "learning_rate": 1.5704890362517772e-06, "loss": 0.3488, "step": 2762 }, { "epoch": 2.2999445061043287, "grad_norm": 0.30639970302581787, "learning_rate": 1.5669648645806428e-06, "loss": 0.3751, "step": 2763 }, { "epoch": 2.3007769145394006, "grad_norm": 0.3253311812877655, "learning_rate": 1.5634439165896103e-06, "loss": 0.3768, "step": 2764 }, { "epoch": 2.301609322974473, "grad_norm": 0.33678269386291504, "learning_rate": 1.5599261955849126e-06, "loss": 0.3722, "step": 2765 }, { "epoch": 2.302441731409545, "grad_norm": 0.32921579480171204, "learning_rate": 1.5564117048697503e-06, "loss": 0.425, "step": 2766 }, { "epoch": 2.3032741398446173, "grad_norm": 0.29400959610939026, "learning_rate": 1.5529004477442921e-06, "loss": 0.3497, "step": 2767 }, { "epoch": 2.304106548279689, "grad_norm": 0.30234500765800476, "learning_rate": 1.5493924275056699e-06, "loss": 0.3947, "step": 2768 }, { "epoch": 2.3049389567147616, "grad_norm": 0.29784688353538513, "learning_rate": 1.5458876474479757e-06, "loss": 0.3688, "step": 2769 }, { "epoch": 2.3057713651498335, "grad_norm": 0.3415769636631012, "learning_rate": 1.5423861108622601e-06, "loss": 0.4432, "step": 2770 }, { "epoch": 2.3066037735849054, "grad_norm": 0.2958844304084778, "learning_rate": 1.5388878210365283e-06, "loss": 0.3377, "step": 2771 }, { "epoch": 2.307436182019978, "grad_norm": 0.3629142940044403, "learning_rate": 1.5353927812557306e-06, "loss": 0.4083, "step": 2772 }, { "epoch": 2.30826859045505, "grad_norm": 0.3380175530910492, "learning_rate": 1.5319009948017765e-06, "loss": 0.3929, "step": 2773 }, { "epoch": 2.309100998890122, "grad_norm": 0.2930530607700348, "learning_rate": 1.528412464953512e-06, "loss": 0.389, "step": 2774 }, { "epoch": 2.309933407325194, "grad_norm": 0.3223347067832947, "learning_rate": 1.5249271949867294e-06, "loss": 0.4326, "step": 2775 }, { "epoch": 2.3107658157602664, "grad_norm": 0.2680168151855469, "learning_rate": 1.5214451881741544e-06, "loss": 0.3597, "step": 2776 }, { "epoch": 2.3115982241953383, "grad_norm": 0.29601550102233887, "learning_rate": 1.5179664477854556e-06, "loss": 0.3948, "step": 2777 }, { "epoch": 2.3124306326304107, "grad_norm": 0.31549885869026184, "learning_rate": 1.5144909770872324e-06, "loss": 0.4028, "step": 2778 }, { "epoch": 2.3132630410654826, "grad_norm": 0.3089504837989807, "learning_rate": 1.5110187793430086e-06, "loss": 0.3954, "step": 2779 }, { "epoch": 2.314095449500555, "grad_norm": 0.3053048551082611, "learning_rate": 1.5075498578132398e-06, "loss": 0.4161, "step": 2780 }, { "epoch": 2.314927857935627, "grad_norm": 0.3325631320476532, "learning_rate": 1.504084215755306e-06, "loss": 0.4186, "step": 2781 }, { "epoch": 2.3157602663706993, "grad_norm": 0.2945425510406494, "learning_rate": 1.5006218564235058e-06, "loss": 0.3561, "step": 2782 }, { "epoch": 2.3165926748057712, "grad_norm": 0.35173118114471436, "learning_rate": 1.4971627830690533e-06, "loss": 0.4315, "step": 2783 }, { "epoch": 2.3174250832408436, "grad_norm": 0.326571524143219, "learning_rate": 1.4937069989400782e-06, "loss": 0.3789, "step": 2784 }, { "epoch": 2.3182574916759155, "grad_norm": 0.32909318804740906, "learning_rate": 1.4902545072816266e-06, "loss": 0.4114, "step": 2785 }, { "epoch": 2.319089900110988, "grad_norm": 0.3058091998100281, "learning_rate": 1.4868053113356446e-06, "loss": 0.3674, "step": 2786 }, { "epoch": 2.31992230854606, "grad_norm": 0.31880733370780945, "learning_rate": 1.483359414340989e-06, "loss": 0.4133, "step": 2787 }, { "epoch": 2.3207547169811322, "grad_norm": 0.28259992599487305, "learning_rate": 1.4799168195334174e-06, "loss": 0.3619, "step": 2788 }, { "epoch": 2.321587125416204, "grad_norm": 0.33174195885658264, "learning_rate": 1.4764775301455859e-06, "loss": 0.41, "step": 2789 }, { "epoch": 2.3224195338512765, "grad_norm": 0.3177911043167114, "learning_rate": 1.4730415494070482e-06, "loss": 0.3861, "step": 2790 }, { "epoch": 2.3232519422863485, "grad_norm": 0.2958433926105499, "learning_rate": 1.4696088805442505e-06, "loss": 0.3628, "step": 2791 }, { "epoch": 2.324084350721421, "grad_norm": 0.2952271103858948, "learning_rate": 1.466179526780529e-06, "loss": 0.3991, "step": 2792 }, { "epoch": 2.3249167591564928, "grad_norm": 0.3214179575443268, "learning_rate": 1.4627534913361064e-06, "loss": 0.409, "step": 2793 }, { "epoch": 2.3257491675915647, "grad_norm": 0.31064051389694214, "learning_rate": 1.4593307774280895e-06, "loss": 0.4192, "step": 2794 }, { "epoch": 2.326581576026637, "grad_norm": 0.30104267597198486, "learning_rate": 1.4559113882704683e-06, "loss": 0.3648, "step": 2795 }, { "epoch": 2.3274139844617094, "grad_norm": 0.3028804659843445, "learning_rate": 1.4524953270741077e-06, "loss": 0.3694, "step": 2796 }, { "epoch": 2.3282463928967814, "grad_norm": 0.309151828289032, "learning_rate": 1.4490825970467493e-06, "loss": 0.4093, "step": 2797 }, { "epoch": 2.3290788013318533, "grad_norm": 0.31857770681381226, "learning_rate": 1.4456732013930064e-06, "loss": 0.3984, "step": 2798 }, { "epoch": 2.3299112097669257, "grad_norm": 0.3101446032524109, "learning_rate": 1.442267143314361e-06, "loss": 0.3722, "step": 2799 }, { "epoch": 2.3307436182019976, "grad_norm": 0.29778382182121277, "learning_rate": 1.4388644260091617e-06, "loss": 0.3865, "step": 2800 }, { "epoch": 2.33157602663707, "grad_norm": 0.309423565864563, "learning_rate": 1.435465052672621e-06, "loss": 0.3819, "step": 2801 }, { "epoch": 2.332408435072142, "grad_norm": 0.301540732383728, "learning_rate": 1.432069026496805e-06, "loss": 0.3653, "step": 2802 }, { "epoch": 2.3332408435072143, "grad_norm": 0.28494688868522644, "learning_rate": 1.4286763506706474e-06, "loss": 0.3809, "step": 2803 }, { "epoch": 2.334073251942286, "grad_norm": 0.2828214764595032, "learning_rate": 1.425287028379929e-06, "loss": 0.4076, "step": 2804 }, { "epoch": 2.3349056603773586, "grad_norm": 0.3035745620727539, "learning_rate": 1.4219010628072806e-06, "loss": 0.4237, "step": 2805 }, { "epoch": 2.3357380688124305, "grad_norm": 0.30221620202064514, "learning_rate": 1.418518457132182e-06, "loss": 0.4185, "step": 2806 }, { "epoch": 2.336570477247503, "grad_norm": 0.29032158851623535, "learning_rate": 1.4151392145309634e-06, "loss": 0.4039, "step": 2807 }, { "epoch": 2.337402885682575, "grad_norm": 0.3030306398868561, "learning_rate": 1.4117633381767925e-06, "loss": 0.4108, "step": 2808 }, { "epoch": 2.338235294117647, "grad_norm": 0.3300309181213379, "learning_rate": 1.4083908312396727e-06, "loss": 0.4123, "step": 2809 }, { "epoch": 2.339067702552719, "grad_norm": 0.3179803490638733, "learning_rate": 1.4050216968864477e-06, "loss": 0.3796, "step": 2810 }, { "epoch": 2.3399001109877915, "grad_norm": 0.28045690059661865, "learning_rate": 1.401655938280798e-06, "loss": 0.368, "step": 2811 }, { "epoch": 2.3407325194228634, "grad_norm": 0.32688331604003906, "learning_rate": 1.3982935585832253e-06, "loss": 0.3847, "step": 2812 }, { "epoch": 2.341564927857936, "grad_norm": 0.29030436277389526, "learning_rate": 1.3949345609510645e-06, "loss": 0.3711, "step": 2813 }, { "epoch": 2.3423973362930077, "grad_norm": 0.33373093605041504, "learning_rate": 1.3915789485384718e-06, "loss": 0.4009, "step": 2814 }, { "epoch": 2.34322974472808, "grad_norm": 0.3265412151813507, "learning_rate": 1.3882267244964304e-06, "loss": 0.4302, "step": 2815 }, { "epoch": 2.344062153163152, "grad_norm": 0.30916792154312134, "learning_rate": 1.3848778919727324e-06, "loss": 0.392, "step": 2816 }, { "epoch": 2.344894561598224, "grad_norm": 0.30912598967552185, "learning_rate": 1.3815324541119924e-06, "loss": 0.3968, "step": 2817 }, { "epoch": 2.3457269700332963, "grad_norm": 0.31527265906333923, "learning_rate": 1.3781904140556352e-06, "loss": 0.3742, "step": 2818 }, { "epoch": 2.3465593784683687, "grad_norm": 0.2920280694961548, "learning_rate": 1.3748517749418944e-06, "loss": 0.3646, "step": 2819 }, { "epoch": 2.3473917869034406, "grad_norm": 0.3410710394382477, "learning_rate": 1.3715165399058106e-06, "loss": 0.426, "step": 2820 }, { "epoch": 2.3482241953385126, "grad_norm": 0.3250875771045685, "learning_rate": 1.368184712079228e-06, "loss": 0.417, "step": 2821 }, { "epoch": 2.349056603773585, "grad_norm": 0.2889362573623657, "learning_rate": 1.3648562945907916e-06, "loss": 0.3865, "step": 2822 }, { "epoch": 2.349889012208657, "grad_norm": 0.3278926610946655, "learning_rate": 1.3615312905659434e-06, "loss": 0.4317, "step": 2823 }, { "epoch": 2.3507214206437292, "grad_norm": 0.3064905107021332, "learning_rate": 1.3582097031269208e-06, "loss": 0.3656, "step": 2824 }, { "epoch": 2.351553829078801, "grad_norm": 0.31251177191734314, "learning_rate": 1.3548915353927516e-06, "loss": 0.415, "step": 2825 }, { "epoch": 2.3523862375138735, "grad_norm": 0.3331948220729828, "learning_rate": 1.3515767904792548e-06, "loss": 0.3624, "step": 2826 }, { "epoch": 2.3532186459489455, "grad_norm": 0.31363222002983093, "learning_rate": 1.3482654714990323e-06, "loss": 0.3953, "step": 2827 }, { "epoch": 2.354051054384018, "grad_norm": 0.30317604541778564, "learning_rate": 1.3449575815614719e-06, "loss": 0.4045, "step": 2828 }, { "epoch": 2.35488346281909, "grad_norm": 0.30388328433036804, "learning_rate": 1.3416531237727398e-06, "loss": 0.4031, "step": 2829 }, { "epoch": 2.355715871254162, "grad_norm": 0.30564767122268677, "learning_rate": 1.338352101235781e-06, "loss": 0.3676, "step": 2830 }, { "epoch": 2.356548279689234, "grad_norm": 0.3132300078868866, "learning_rate": 1.3350545170503087e-06, "loss": 0.4245, "step": 2831 }, { "epoch": 2.3573806881243065, "grad_norm": 0.3112890124320984, "learning_rate": 1.3317603743128177e-06, "loss": 0.3984, "step": 2832 }, { "epoch": 2.3582130965593784, "grad_norm": 0.32551440596580505, "learning_rate": 1.3284696761165634e-06, "loss": 0.4194, "step": 2833 }, { "epoch": 2.3590455049944508, "grad_norm": 0.3218403160572052, "learning_rate": 1.3251824255515704e-06, "loss": 0.4249, "step": 2834 }, { "epoch": 2.3598779134295227, "grad_norm": 0.2741534113883972, "learning_rate": 1.3218986257046217e-06, "loss": 0.3283, "step": 2835 }, { "epoch": 2.360710321864595, "grad_norm": 0.33001041412353516, "learning_rate": 1.3186182796592634e-06, "loss": 0.4112, "step": 2836 }, { "epoch": 2.361542730299667, "grad_norm": 0.29912424087524414, "learning_rate": 1.3153413904958024e-06, "loss": 0.3664, "step": 2837 }, { "epoch": 2.3623751387347394, "grad_norm": 0.2997702360153198, "learning_rate": 1.3120679612912896e-06, "loss": 0.3769, "step": 2838 }, { "epoch": 2.3632075471698113, "grad_norm": 0.295955628156662, "learning_rate": 1.308797995119534e-06, "loss": 0.3931, "step": 2839 }, { "epoch": 2.3640399556048832, "grad_norm": 0.3053940236568451, "learning_rate": 1.30553149505109e-06, "loss": 0.3741, "step": 2840 }, { "epoch": 2.3648723640399556, "grad_norm": 0.3083863854408264, "learning_rate": 1.302268464153263e-06, "loss": 0.4192, "step": 2841 }, { "epoch": 2.365704772475028, "grad_norm": 0.3068256378173828, "learning_rate": 1.2990089054900918e-06, "loss": 0.4053, "step": 2842 }, { "epoch": 2.3665371809101, "grad_norm": 0.3323042392730713, "learning_rate": 1.2957528221223591e-06, "loss": 0.44, "step": 2843 }, { "epoch": 2.367369589345172, "grad_norm": 0.29590079188346863, "learning_rate": 1.2925002171075846e-06, "loss": 0.3591, "step": 2844 }, { "epoch": 2.368201997780244, "grad_norm": 0.3084840476512909, "learning_rate": 1.2892510935000252e-06, "loss": 0.4094, "step": 2845 }, { "epoch": 2.369034406215316, "grad_norm": 0.2761549651622772, "learning_rate": 1.2860054543506595e-06, "loss": 0.349, "step": 2846 }, { "epoch": 2.3698668146503885, "grad_norm": 0.30774176120758057, "learning_rate": 1.2827633027072017e-06, "loss": 0.4483, "step": 2847 }, { "epoch": 2.3706992230854604, "grad_norm": 0.2745443284511566, "learning_rate": 1.2795246416140895e-06, "loss": 0.3591, "step": 2848 }, { "epoch": 2.371531631520533, "grad_norm": 0.3255792558193207, "learning_rate": 1.2762894741124814e-06, "loss": 0.4007, "step": 2849 }, { "epoch": 2.3723640399556047, "grad_norm": 0.34070661664009094, "learning_rate": 1.273057803240257e-06, "loss": 0.4215, "step": 2850 }, { "epoch": 2.373196448390677, "grad_norm": 0.31039872765541077, "learning_rate": 1.2698296320320113e-06, "loss": 0.3986, "step": 2851 }, { "epoch": 2.374028856825749, "grad_norm": 0.2800928056240082, "learning_rate": 1.2666049635190535e-06, "loss": 0.3828, "step": 2852 }, { "epoch": 2.3748612652608214, "grad_norm": 0.28755927085876465, "learning_rate": 1.2633838007294048e-06, "loss": 0.3997, "step": 2853 }, { "epoch": 2.3756936736958933, "grad_norm": 0.28583160042762756, "learning_rate": 1.260166146687793e-06, "loss": 0.3762, "step": 2854 }, { "epoch": 2.3765260821309657, "grad_norm": 0.33797693252563477, "learning_rate": 1.2569520044156509e-06, "loss": 0.415, "step": 2855 }, { "epoch": 2.3773584905660377, "grad_norm": 0.31852585077285767, "learning_rate": 1.2537413769311163e-06, "loss": 0.4078, "step": 2856 }, { "epoch": 2.37819089900111, "grad_norm": 0.3200073838233948, "learning_rate": 1.25053426724902e-06, "loss": 0.3974, "step": 2857 }, { "epoch": 2.379023307436182, "grad_norm": 0.3143557608127594, "learning_rate": 1.247330678380899e-06, "loss": 0.3691, "step": 2858 }, { "epoch": 2.3798557158712543, "grad_norm": 0.30389317870140076, "learning_rate": 1.2441306133349785e-06, "loss": 0.4205, "step": 2859 }, { "epoch": 2.3806881243063263, "grad_norm": 0.2985035181045532, "learning_rate": 1.2409340751161753e-06, "loss": 0.3629, "step": 2860 }, { "epoch": 2.3815205327413986, "grad_norm": 0.3365674316883087, "learning_rate": 1.2377410667260914e-06, "loss": 0.4372, "step": 2861 }, { "epoch": 2.3823529411764706, "grad_norm": 0.31339481472969055, "learning_rate": 1.2345515911630223e-06, "loss": 0.4141, "step": 2862 }, { "epoch": 2.3831853496115425, "grad_norm": 0.3070381283760071, "learning_rate": 1.2313656514219408e-06, "loss": 0.3372, "step": 2863 }, { "epoch": 2.384017758046615, "grad_norm": 0.2941051423549652, "learning_rate": 1.2281832504944967e-06, "loss": 0.383, "step": 2864 }, { "epoch": 2.3848501664816872, "grad_norm": 0.2965616285800934, "learning_rate": 1.2250043913690235e-06, "loss": 0.4028, "step": 2865 }, { "epoch": 2.385682574916759, "grad_norm": 0.30054527521133423, "learning_rate": 1.2218290770305218e-06, "loss": 0.3639, "step": 2866 }, { "epoch": 2.386514983351831, "grad_norm": 0.3061928451061249, "learning_rate": 1.2186573104606735e-06, "loss": 0.4202, "step": 2867 }, { "epoch": 2.3873473917869035, "grad_norm": 0.29752233624458313, "learning_rate": 1.2154890946378178e-06, "loss": 0.3646, "step": 2868 }, { "epoch": 2.3881798002219754, "grad_norm": 0.3129211664199829, "learning_rate": 1.2123244325369665e-06, "loss": 0.4046, "step": 2869 }, { "epoch": 2.3890122086570478, "grad_norm": 0.3117455244064331, "learning_rate": 1.2091633271297916e-06, "loss": 0.4251, "step": 2870 }, { "epoch": 2.3898446170921197, "grad_norm": 0.2949092984199524, "learning_rate": 1.20600578138463e-06, "loss": 0.3705, "step": 2871 }, { "epoch": 2.390677025527192, "grad_norm": 0.31923535466194153, "learning_rate": 1.2028517982664683e-06, "loss": 0.4063, "step": 2872 }, { "epoch": 2.391509433962264, "grad_norm": 0.30522382259368896, "learning_rate": 1.1997013807369535e-06, "loss": 0.3575, "step": 2873 }, { "epoch": 2.3923418423973364, "grad_norm": 0.29187336564064026, "learning_rate": 1.196554531754383e-06, "loss": 0.3778, "step": 2874 }, { "epoch": 2.3931742508324083, "grad_norm": 0.3098593056201935, "learning_rate": 1.193411254273703e-06, "loss": 0.4229, "step": 2875 }, { "epoch": 2.3940066592674807, "grad_norm": 0.2713955342769623, "learning_rate": 1.1902715512465057e-06, "loss": 0.3677, "step": 2876 }, { "epoch": 2.3948390677025526, "grad_norm": 0.26880118250846863, "learning_rate": 1.1871354256210277e-06, "loss": 0.4, "step": 2877 }, { "epoch": 2.395671476137625, "grad_norm": 0.31260430812835693, "learning_rate": 1.1840028803421455e-06, "loss": 0.4367, "step": 2878 }, { "epoch": 2.396503884572697, "grad_norm": 0.28427496552467346, "learning_rate": 1.1808739183513745e-06, "loss": 0.3538, "step": 2879 }, { "epoch": 2.3973362930077693, "grad_norm": 0.32038092613220215, "learning_rate": 1.1777485425868639e-06, "loss": 0.368, "step": 2880 }, { "epoch": 2.398168701442841, "grad_norm": 0.32951870560646057, "learning_rate": 1.1746267559833973e-06, "loss": 0.3891, "step": 2881 }, { "epoch": 2.3990011098779136, "grad_norm": 0.2811376452445984, "learning_rate": 1.1715085614723881e-06, "loss": 0.375, "step": 2882 }, { "epoch": 2.3998335183129855, "grad_norm": 0.29977062344551086, "learning_rate": 1.1683939619818708e-06, "loss": 0.4002, "step": 2883 }, { "epoch": 2.400665926748058, "grad_norm": 0.303155779838562, "learning_rate": 1.1652829604365135e-06, "loss": 0.3454, "step": 2884 }, { "epoch": 2.40149833518313, "grad_norm": 0.3123762309551239, "learning_rate": 1.1621755597575996e-06, "loss": 0.4271, "step": 2885 }, { "epoch": 2.4023307436182018, "grad_norm": 0.28050366044044495, "learning_rate": 1.1590717628630337e-06, "loss": 0.3879, "step": 2886 }, { "epoch": 2.403163152053274, "grad_norm": 0.2736967206001282, "learning_rate": 1.155971572667332e-06, "loss": 0.3857, "step": 2887 }, { "epoch": 2.4039955604883465, "grad_norm": 0.3153398334980011, "learning_rate": 1.1528749920816319e-06, "loss": 0.391, "step": 2888 }, { "epoch": 2.4048279689234184, "grad_norm": 0.3044677972793579, "learning_rate": 1.1497820240136753e-06, "loss": 0.4164, "step": 2889 }, { "epoch": 2.4056603773584904, "grad_norm": 0.3012680411338806, "learning_rate": 1.1466926713678117e-06, "loss": 0.382, "step": 2890 }, { "epoch": 2.4064927857935627, "grad_norm": 0.32723140716552734, "learning_rate": 1.143606937044997e-06, "loss": 0.4242, "step": 2891 }, { "epoch": 2.4073251942286347, "grad_norm": 0.3037180006504059, "learning_rate": 1.140524823942793e-06, "loss": 0.4052, "step": 2892 }, { "epoch": 2.408157602663707, "grad_norm": 0.300297349691391, "learning_rate": 1.137446334955357e-06, "loss": 0.3693, "step": 2893 }, { "epoch": 2.408990011098779, "grad_norm": 0.30542272329330444, "learning_rate": 1.1343714729734424e-06, "loss": 0.3466, "step": 2894 }, { "epoch": 2.4098224195338513, "grad_norm": 0.32545021176338196, "learning_rate": 1.1313002408843986e-06, "loss": 0.3899, "step": 2895 }, { "epoch": 2.4106548279689233, "grad_norm": 0.30949637293815613, "learning_rate": 1.1282326415721657e-06, "loss": 0.4122, "step": 2896 }, { "epoch": 2.4114872364039956, "grad_norm": 0.27907073497772217, "learning_rate": 1.1251686779172772e-06, "loss": 0.3984, "step": 2897 }, { "epoch": 2.4123196448390676, "grad_norm": 0.3177613914012909, "learning_rate": 1.122108352796844e-06, "loss": 0.3756, "step": 2898 }, { "epoch": 2.41315205327414, "grad_norm": 0.31491708755493164, "learning_rate": 1.119051669084567e-06, "loss": 0.4117, "step": 2899 }, { "epoch": 2.413984461709212, "grad_norm": 0.2828831374645233, "learning_rate": 1.1159986296507259e-06, "loss": 0.3907, "step": 2900 }, { "epoch": 2.4148168701442843, "grad_norm": 0.281389057636261, "learning_rate": 1.112949237362177e-06, "loss": 0.3957, "step": 2901 }, { "epoch": 2.415649278579356, "grad_norm": 0.29541143774986267, "learning_rate": 1.1099034950823539e-06, "loss": 0.3692, "step": 2902 }, { "epoch": 2.4164816870144286, "grad_norm": 0.3205885887145996, "learning_rate": 1.1068614056712624e-06, "loss": 0.4481, "step": 2903 }, { "epoch": 2.4173140954495005, "grad_norm": 0.3070029020309448, "learning_rate": 1.103822971985477e-06, "loss": 0.3735, "step": 2904 }, { "epoch": 2.418146503884573, "grad_norm": 0.31543856859207153, "learning_rate": 1.1007881968781403e-06, "loss": 0.3967, "step": 2905 }, { "epoch": 2.418978912319645, "grad_norm": 0.29121309518814087, "learning_rate": 1.0977570831989593e-06, "loss": 0.3626, "step": 2906 }, { "epoch": 2.419811320754717, "grad_norm": 0.3155393600463867, "learning_rate": 1.0947296337942026e-06, "loss": 0.4054, "step": 2907 }, { "epoch": 2.420643729189789, "grad_norm": 0.3240245282649994, "learning_rate": 1.091705851506698e-06, "loss": 0.4109, "step": 2908 }, { "epoch": 2.421476137624861, "grad_norm": 0.27017274498939514, "learning_rate": 1.088685739175831e-06, "loss": 0.3569, "step": 2909 }, { "epoch": 2.4223085460599334, "grad_norm": 0.3277089595794678, "learning_rate": 1.085669299637539e-06, "loss": 0.4304, "step": 2910 }, { "epoch": 2.4231409544950058, "grad_norm": 0.30966663360595703, "learning_rate": 1.0826565357243125e-06, "loss": 0.3984, "step": 2911 }, { "epoch": 2.4239733629300777, "grad_norm": 0.28824105858802795, "learning_rate": 1.0796474502651893e-06, "loss": 0.3827, "step": 2912 }, { "epoch": 2.4248057713651496, "grad_norm": 0.29764944314956665, "learning_rate": 1.0766420460857507e-06, "loss": 0.3925, "step": 2913 }, { "epoch": 2.425638179800222, "grad_norm": 0.31288978457450867, "learning_rate": 1.0736403260081279e-06, "loss": 0.3844, "step": 2914 }, { "epoch": 2.426470588235294, "grad_norm": 0.2968754172325134, "learning_rate": 1.070642292850987e-06, "loss": 0.3646, "step": 2915 }, { "epoch": 2.4273029966703663, "grad_norm": 0.3165760636329651, "learning_rate": 1.067647949429534e-06, "loss": 0.4002, "step": 2916 }, { "epoch": 2.4281354051054382, "grad_norm": 0.2920931875705719, "learning_rate": 1.0646572985555071e-06, "loss": 0.3664, "step": 2917 }, { "epoch": 2.4289678135405106, "grad_norm": 0.2827328145503998, "learning_rate": 1.0616703430371833e-06, "loss": 0.4066, "step": 2918 }, { "epoch": 2.4298002219755825, "grad_norm": 0.29017403721809387, "learning_rate": 1.0586870856793657e-06, "loss": 0.3744, "step": 2919 }, { "epoch": 2.430632630410655, "grad_norm": 0.3129844665527344, "learning_rate": 1.0557075292833836e-06, "loss": 0.4184, "step": 2920 }, { "epoch": 2.431465038845727, "grad_norm": 0.3073255121707916, "learning_rate": 1.052731676647092e-06, "loss": 0.3929, "step": 2921 }, { "epoch": 2.432297447280799, "grad_norm": 0.2809605300426483, "learning_rate": 1.049759530564871e-06, "loss": 0.3918, "step": 2922 }, { "epoch": 2.433129855715871, "grad_norm": 0.29906994104385376, "learning_rate": 1.0467910938276182e-06, "loss": 0.4187, "step": 2923 }, { "epoch": 2.4339622641509435, "grad_norm": 0.2949870228767395, "learning_rate": 1.0438263692227452e-06, "loss": 0.3776, "step": 2924 }, { "epoch": 2.4347946725860155, "grad_norm": 0.33523961901664734, "learning_rate": 1.0408653595341812e-06, "loss": 0.4238, "step": 2925 }, { "epoch": 2.435627081021088, "grad_norm": 0.3024636507034302, "learning_rate": 1.0379080675423664e-06, "loss": 0.4127, "step": 2926 }, { "epoch": 2.4364594894561598, "grad_norm": 0.2905980944633484, "learning_rate": 1.0349544960242496e-06, "loss": 0.3608, "step": 2927 }, { "epoch": 2.437291897891232, "grad_norm": 0.3159032464027405, "learning_rate": 1.0320046477532864e-06, "loss": 0.4126, "step": 2928 }, { "epoch": 2.438124306326304, "grad_norm": 0.3154861629009247, "learning_rate": 1.0290585254994356e-06, "loss": 0.4027, "step": 2929 }, { "epoch": 2.4389567147613764, "grad_norm": 0.29416272044181824, "learning_rate": 1.0261161320291586e-06, "loss": 0.4003, "step": 2930 }, { "epoch": 2.4397891231964484, "grad_norm": 0.30298373103141785, "learning_rate": 1.0231774701054126e-06, "loss": 0.3521, "step": 2931 }, { "epoch": 2.4406215316315203, "grad_norm": 0.3236266076564789, "learning_rate": 1.020242542487654e-06, "loss": 0.4003, "step": 2932 }, { "epoch": 2.4414539400665927, "grad_norm": 0.2998720407485962, "learning_rate": 1.017311351931831e-06, "loss": 0.3764, "step": 2933 }, { "epoch": 2.442286348501665, "grad_norm": 0.36174139380455017, "learning_rate": 1.0143839011903822e-06, "loss": 0.3811, "step": 2934 }, { "epoch": 2.443118756936737, "grad_norm": 0.306686133146286, "learning_rate": 1.0114601930122363e-06, "loss": 0.3726, "step": 2935 }, { "epoch": 2.443951165371809, "grad_norm": 0.3181160092353821, "learning_rate": 1.0085402301428055e-06, "loss": 0.4274, "step": 2936 }, { "epoch": 2.4447835738068813, "grad_norm": 0.3226320743560791, "learning_rate": 1.005624015323986e-06, "loss": 0.4226, "step": 2937 }, { "epoch": 2.445615982241953, "grad_norm": 0.28478971123695374, "learning_rate": 1.0027115512941549e-06, "loss": 0.3913, "step": 2938 }, { "epoch": 2.4464483906770256, "grad_norm": 0.2990976572036743, "learning_rate": 9.998028407881672e-07, "loss": 0.3943, "step": 2939 }, { "epoch": 2.4472807991120975, "grad_norm": 0.326755166053772, "learning_rate": 9.96897886537353e-07, "loss": 0.4393, "step": 2940 }, { "epoch": 2.44811320754717, "grad_norm": 0.29832005500793457, "learning_rate": 9.939966912695143e-07, "loss": 0.3817, "step": 2941 }, { "epoch": 2.448945615982242, "grad_norm": 0.3191179037094116, "learning_rate": 9.910992577089269e-07, "loss": 0.3915, "step": 2942 }, { "epoch": 2.449778024417314, "grad_norm": 0.29789450764656067, "learning_rate": 9.882055885763264e-07, "loss": 0.4053, "step": 2943 }, { "epoch": 2.450610432852386, "grad_norm": 0.3055591285228729, "learning_rate": 9.853156865889234e-07, "loss": 0.4068, "step": 2944 }, { "epoch": 2.4514428412874585, "grad_norm": 0.2884024381637573, "learning_rate": 9.824295544603863e-07, "loss": 0.3908, "step": 2945 }, { "epoch": 2.4522752497225304, "grad_norm": 0.29266828298568726, "learning_rate": 9.795471949008411e-07, "loss": 0.3763, "step": 2946 }, { "epoch": 2.453107658157603, "grad_norm": 0.28186294436454773, "learning_rate": 9.766686106168744e-07, "loss": 0.3785, "step": 2947 }, { "epoch": 2.4539400665926747, "grad_norm": 0.2927301228046417, "learning_rate": 9.73793804311529e-07, "loss": 0.4081, "step": 2948 }, { "epoch": 2.454772475027747, "grad_norm": 0.30842170119285583, "learning_rate": 9.70922778684299e-07, "loss": 0.4011, "step": 2949 }, { "epoch": 2.455604883462819, "grad_norm": 0.30753591656684875, "learning_rate": 9.680555364311251e-07, "loss": 0.391, "step": 2950 }, { "epoch": 2.4564372918978914, "grad_norm": 0.3024488091468811, "learning_rate": 9.651920802443971e-07, "loss": 0.3796, "step": 2951 }, { "epoch": 2.4572697003329633, "grad_norm": 0.29086968302726746, "learning_rate": 9.623324128129557e-07, "loss": 0.3864, "step": 2952 }, { "epoch": 2.4581021087680357, "grad_norm": 0.28945299983024597, "learning_rate": 9.594765368220737e-07, "loss": 0.3656, "step": 2953 }, { "epoch": 2.4589345172031076, "grad_norm": 0.30900830030441284, "learning_rate": 9.56624454953471e-07, "loss": 0.4122, "step": 2954 }, { "epoch": 2.4597669256381796, "grad_norm": 0.3098362982273102, "learning_rate": 9.537761698853016e-07, "loss": 0.3975, "step": 2955 }, { "epoch": 2.460599334073252, "grad_norm": 0.2963133454322815, "learning_rate": 9.509316842921551e-07, "loss": 0.3775, "step": 2956 }, { "epoch": 2.4614317425083243, "grad_norm": 0.28471919894218445, "learning_rate": 9.480910008450534e-07, "loss": 0.3706, "step": 2957 }, { "epoch": 2.4622641509433962, "grad_norm": 0.32631343603134155, "learning_rate": 9.452541222114481e-07, "loss": 0.4478, "step": 2958 }, { "epoch": 2.463096559378468, "grad_norm": 0.2862439751625061, "learning_rate": 9.424210510552179e-07, "loss": 0.35, "step": 2959 }, { "epoch": 2.4639289678135405, "grad_norm": 0.2939037084579468, "learning_rate": 9.395917900366663e-07, "loss": 0.4093, "step": 2960 }, { "epoch": 2.4647613762486125, "grad_norm": 0.27758315205574036, "learning_rate": 9.36766341812519e-07, "loss": 0.3851, "step": 2961 }, { "epoch": 2.465593784683685, "grad_norm": 0.3188072443008423, "learning_rate": 9.33944709035921e-07, "loss": 0.4501, "step": 2962 }, { "epoch": 2.4664261931187568, "grad_norm": 0.29086801409721375, "learning_rate": 9.31126894356435e-07, "loss": 0.386, "step": 2963 }, { "epoch": 2.467258601553829, "grad_norm": 0.30861911177635193, "learning_rate": 9.283129004200381e-07, "loss": 0.4013, "step": 2964 }, { "epoch": 2.468091009988901, "grad_norm": 0.31104356050491333, "learning_rate": 9.255027298691205e-07, "loss": 0.3772, "step": 2965 }, { "epoch": 2.4689234184239734, "grad_norm": 0.3085804879665375, "learning_rate": 9.226963853424815e-07, "loss": 0.3887, "step": 2966 }, { "epoch": 2.4697558268590454, "grad_norm": 0.3194611072540283, "learning_rate": 9.198938694753268e-07, "loss": 0.4214, "step": 2967 }, { "epoch": 2.4705882352941178, "grad_norm": 0.30297356843948364, "learning_rate": 9.170951848992693e-07, "loss": 0.4159, "step": 2968 }, { "epoch": 2.4714206437291897, "grad_norm": 0.3000932037830353, "learning_rate": 9.143003342423212e-07, "loss": 0.4052, "step": 2969 }, { "epoch": 2.472253052164262, "grad_norm": 0.300144225358963, "learning_rate": 9.115093201288977e-07, "loss": 0.4005, "step": 2970 }, { "epoch": 2.473085460599334, "grad_norm": 0.304669588804245, "learning_rate": 9.0872214517981e-07, "loss": 0.3844, "step": 2971 }, { "epoch": 2.4739178690344064, "grad_norm": 0.2950827181339264, "learning_rate": 9.059388120122626e-07, "loss": 0.3914, "step": 2972 }, { "epoch": 2.4747502774694783, "grad_norm": 0.2657037079334259, "learning_rate": 9.031593232398539e-07, "loss": 0.3532, "step": 2973 }, { "epoch": 2.4755826859045507, "grad_norm": 0.3045920133590698, "learning_rate": 9.003836814725742e-07, "loss": 0.4007, "step": 2974 }, { "epoch": 2.4764150943396226, "grad_norm": 0.295762300491333, "learning_rate": 8.976118893168006e-07, "loss": 0.3683, "step": 2975 }, { "epoch": 2.477247502774695, "grad_norm": 0.29629120230674744, "learning_rate": 8.94843949375292e-07, "loss": 0.4194, "step": 2976 }, { "epoch": 2.478079911209767, "grad_norm": 0.28778138756752014, "learning_rate": 8.920798642471918e-07, "loss": 0.3705, "step": 2977 }, { "epoch": 2.478912319644839, "grad_norm": 0.32166433334350586, "learning_rate": 8.893196365280282e-07, "loss": 0.4143, "step": 2978 }, { "epoch": 2.479744728079911, "grad_norm": 0.301952987909317, "learning_rate": 8.865632688097004e-07, "loss": 0.381, "step": 2979 }, { "epoch": 2.4805771365149836, "grad_norm": 0.2990775406360626, "learning_rate": 8.83810763680486e-07, "loss": 0.3671, "step": 2980 }, { "epoch": 2.4814095449500555, "grad_norm": 0.31099602580070496, "learning_rate": 8.810621237250355e-07, "loss": 0.4344, "step": 2981 }, { "epoch": 2.4822419533851274, "grad_norm": 0.2816608250141144, "learning_rate": 8.783173515243725e-07, "loss": 0.3796, "step": 2982 }, { "epoch": 2.4830743618202, "grad_norm": 0.27356284856796265, "learning_rate": 8.755764496558838e-07, "loss": 0.3578, "step": 2983 }, { "epoch": 2.4839067702552717, "grad_norm": 0.32336634397506714, "learning_rate": 8.728394206933239e-07, "loss": 0.4262, "step": 2984 }, { "epoch": 2.484739178690344, "grad_norm": 0.3039683699607849, "learning_rate": 8.701062672068122e-07, "loss": 0.3967, "step": 2985 }, { "epoch": 2.485571587125416, "grad_norm": 0.2810039520263672, "learning_rate": 8.673769917628272e-07, "loss": 0.3698, "step": 2986 }, { "epoch": 2.4864039955604884, "grad_norm": 0.2974560260772705, "learning_rate": 8.646515969242065e-07, "loss": 0.3925, "step": 2987 }, { "epoch": 2.4872364039955603, "grad_norm": 0.3146367371082306, "learning_rate": 8.619300852501427e-07, "loss": 0.425, "step": 2988 }, { "epoch": 2.4880688124306327, "grad_norm": 0.29236578941345215, "learning_rate": 8.592124592961843e-07, "loss": 0.3563, "step": 2989 }, { "epoch": 2.4889012208657046, "grad_norm": 0.29718172550201416, "learning_rate": 8.56498721614229e-07, "loss": 0.3998, "step": 2990 }, { "epoch": 2.489733629300777, "grad_norm": 0.313275009393692, "learning_rate": 8.537888747525236e-07, "loss": 0.4364, "step": 2991 }, { "epoch": 2.490566037735849, "grad_norm": 0.30545952916145325, "learning_rate": 8.51082921255662e-07, "loss": 0.4099, "step": 2992 }, { "epoch": 2.4913984461709213, "grad_norm": 0.33663320541381836, "learning_rate": 8.483808636645824e-07, "loss": 0.4288, "step": 2993 }, { "epoch": 2.4922308546059933, "grad_norm": 0.3012736141681671, "learning_rate": 8.456827045165638e-07, "loss": 0.3524, "step": 2994 }, { "epoch": 2.4930632630410656, "grad_norm": 0.31417447328567505, "learning_rate": 8.429884463452248e-07, "loss": 0.3816, "step": 2995 }, { "epoch": 2.4938956714761376, "grad_norm": 0.31972286105155945, "learning_rate": 8.402980916805215e-07, "loss": 0.4074, "step": 2996 }, { "epoch": 2.49472807991121, "grad_norm": 0.33134639263153076, "learning_rate": 8.376116430487441e-07, "loss": 0.4231, "step": 2997 }, { "epoch": 2.495560488346282, "grad_norm": 0.28740981221199036, "learning_rate": 8.349291029725126e-07, "loss": 0.369, "step": 2998 }, { "epoch": 2.4963928967813542, "grad_norm": 0.29424378275871277, "learning_rate": 8.322504739707821e-07, "loss": 0.3857, "step": 2999 }, { "epoch": 2.497225305216426, "grad_norm": 0.275423526763916, "learning_rate": 8.295757585588304e-07, "loss": 0.3734, "step": 3000 }, { "epoch": 2.498057713651498, "grad_norm": 0.3008045256137848, "learning_rate": 8.269049592482648e-07, "loss": 0.3864, "step": 3001 }, { "epoch": 2.4988901220865705, "grad_norm": 0.2958872318267822, "learning_rate": 8.242380785470088e-07, "loss": 0.3839, "step": 3002 }, { "epoch": 2.499722530521643, "grad_norm": 0.31030040979385376, "learning_rate": 8.215751189593107e-07, "loss": 0.4039, "step": 3003 }, { "epoch": 2.5005549389567148, "grad_norm": 0.3011883497238159, "learning_rate": 8.189160829857396e-07, "loss": 0.3923, "step": 3004 }, { "epoch": 2.5013873473917867, "grad_norm": 0.314978688955307, "learning_rate": 8.16260973123173e-07, "loss": 0.3885, "step": 3005 }, { "epoch": 2.502219755826859, "grad_norm": 0.31006646156311035, "learning_rate": 8.136097918648073e-07, "loss": 0.3841, "step": 3006 }, { "epoch": 2.5030521642619314, "grad_norm": 0.3044472336769104, "learning_rate": 8.109625417001465e-07, "loss": 0.4087, "step": 3007 }, { "epoch": 2.5038845726970034, "grad_norm": 0.3212951421737671, "learning_rate": 8.08319225115009e-07, "loss": 0.3969, "step": 3008 }, { "epoch": 2.5047169811320753, "grad_norm": 0.3018619120121002, "learning_rate": 8.056798445915115e-07, "loss": 0.3801, "step": 3009 }, { "epoch": 2.5055493895671477, "grad_norm": 0.3130761384963989, "learning_rate": 8.030444026080791e-07, "loss": 0.3594, "step": 3010 }, { "epoch": 2.5063817980022196, "grad_norm": 0.30970242619514465, "learning_rate": 8.004129016394374e-07, "loss": 0.4117, "step": 3011 }, { "epoch": 2.507214206437292, "grad_norm": 0.2875807583332062, "learning_rate": 7.977853441566152e-07, "loss": 0.3824, "step": 3012 }, { "epoch": 2.508046614872364, "grad_norm": 0.3063216805458069, "learning_rate": 7.951617326269318e-07, "loss": 0.3899, "step": 3013 }, { "epoch": 2.5088790233074363, "grad_norm": 0.2871801257133484, "learning_rate": 7.925420695140052e-07, "loss": 0.3954, "step": 3014 }, { "epoch": 2.509711431742508, "grad_norm": 0.30276262760162354, "learning_rate": 7.899263572777454e-07, "loss": 0.4196, "step": 3015 }, { "epoch": 2.5105438401775806, "grad_norm": 0.3048497140407562, "learning_rate": 7.873145983743513e-07, "loss": 0.3925, "step": 3016 }, { "epoch": 2.5113762486126525, "grad_norm": 0.3085763454437256, "learning_rate": 7.847067952563103e-07, "loss": 0.3928, "step": 3017 }, { "epoch": 2.512208657047725, "grad_norm": 0.3082400858402252, "learning_rate": 7.821029503723959e-07, "loss": 0.3913, "step": 3018 }, { "epoch": 2.513041065482797, "grad_norm": 0.2877948582172394, "learning_rate": 7.795030661676633e-07, "loss": 0.3754, "step": 3019 }, { "epoch": 2.513873473917869, "grad_norm": 0.29405730962753296, "learning_rate": 7.769071450834498e-07, "loss": 0.3884, "step": 3020 }, { "epoch": 2.514705882352941, "grad_norm": 0.31952765583992004, "learning_rate": 7.743151895573703e-07, "loss": 0.4085, "step": 3021 }, { "epoch": 2.5155382907880135, "grad_norm": 0.3090550899505615, "learning_rate": 7.717272020233169e-07, "loss": 0.4127, "step": 3022 }, { "epoch": 2.5163706992230854, "grad_norm": 0.27301451563835144, "learning_rate": 7.691431849114561e-07, "loss": 0.3753, "step": 3023 }, { "epoch": 2.5172031076581574, "grad_norm": 0.31073275208473206, "learning_rate": 7.665631406482216e-07, "loss": 0.4172, "step": 3024 }, { "epoch": 2.5180355160932297, "grad_norm": 0.32717081904411316, "learning_rate": 7.639870716563236e-07, "loss": 0.4001, "step": 3025 }, { "epoch": 2.518867924528302, "grad_norm": 0.27241167426109314, "learning_rate": 7.614149803547354e-07, "loss": 0.3609, "step": 3026 }, { "epoch": 2.519700332963374, "grad_norm": 0.31278035044670105, "learning_rate": 7.588468691586964e-07, "loss": 0.3819, "step": 3027 }, { "epoch": 2.520532741398446, "grad_norm": 0.3114195764064789, "learning_rate": 7.562827404797046e-07, "loss": 0.3831, "step": 3028 }, { "epoch": 2.5213651498335183, "grad_norm": 0.29425179958343506, "learning_rate": 7.537225967255252e-07, "loss": 0.3664, "step": 3029 }, { "epoch": 2.5221975582685907, "grad_norm": 0.2959001958370209, "learning_rate": 7.511664403001778e-07, "loss": 0.3783, "step": 3030 }, { "epoch": 2.5230299667036626, "grad_norm": 0.3024890720844269, "learning_rate": 7.486142736039364e-07, "loss": 0.3908, "step": 3031 }, { "epoch": 2.5238623751387346, "grad_norm": 0.3172726333141327, "learning_rate": 7.460660990333307e-07, "loss": 0.422, "step": 3032 }, { "epoch": 2.524694783573807, "grad_norm": 0.26689743995666504, "learning_rate": 7.435219189811404e-07, "loss": 0.381, "step": 3033 }, { "epoch": 2.525527192008879, "grad_norm": 0.29678985476493835, "learning_rate": 7.409817358363986e-07, "loss": 0.3954, "step": 3034 }, { "epoch": 2.5263596004439512, "grad_norm": 0.2859072983264923, "learning_rate": 7.38445551984378e-07, "loss": 0.3582, "step": 3035 }, { "epoch": 2.527192008879023, "grad_norm": 0.29791417717933655, "learning_rate": 7.359133698066012e-07, "loss": 0.4127, "step": 3036 }, { "epoch": 2.5280244173140956, "grad_norm": 0.27158403396606445, "learning_rate": 7.333851916808298e-07, "loss": 0.3757, "step": 3037 }, { "epoch": 2.5288568257491675, "grad_norm": 0.2889906167984009, "learning_rate": 7.308610199810717e-07, "loss": 0.367, "step": 3038 }, { "epoch": 2.52968923418424, "grad_norm": 0.31212955713272095, "learning_rate": 7.28340857077564e-07, "loss": 0.4116, "step": 3039 }, { "epoch": 2.530521642619312, "grad_norm": 0.30906566977500916, "learning_rate": 7.258247053367856e-07, "loss": 0.4001, "step": 3040 }, { "epoch": 2.531354051054384, "grad_norm": 0.27870288491249084, "learning_rate": 7.233125671214469e-07, "loss": 0.3846, "step": 3041 }, { "epoch": 2.532186459489456, "grad_norm": 0.2938634157180786, "learning_rate": 7.208044447904893e-07, "loss": 0.4034, "step": 3042 }, { "epoch": 2.5330188679245285, "grad_norm": 0.2890358865261078, "learning_rate": 7.183003406990841e-07, "loss": 0.3691, "step": 3043 }, { "epoch": 2.5338512763596004, "grad_norm": 0.28360605239868164, "learning_rate": 7.158002571986283e-07, "loss": 0.3825, "step": 3044 }, { "epoch": 2.5346836847946728, "grad_norm": 0.277585506439209, "learning_rate": 7.133041966367443e-07, "loss": 0.3718, "step": 3045 }, { "epoch": 2.5355160932297447, "grad_norm": 0.2901941239833832, "learning_rate": 7.108121613572771e-07, "loss": 0.3666, "step": 3046 }, { "epoch": 2.5363485016648166, "grad_norm": 0.2988186776638031, "learning_rate": 7.083241537002905e-07, "loss": 0.3939, "step": 3047 }, { "epoch": 2.537180910099889, "grad_norm": 0.31158897280693054, "learning_rate": 7.058401760020689e-07, "loss": 0.4082, "step": 3048 }, { "epoch": 2.5380133185349614, "grad_norm": 0.29564598202705383, "learning_rate": 7.033602305951104e-07, "loss": 0.3702, "step": 3049 }, { "epoch": 2.5388457269700333, "grad_norm": 0.29291555285453796, "learning_rate": 7.008843198081239e-07, "loss": 0.4032, "step": 3050 }, { "epoch": 2.5396781354051052, "grad_norm": 0.29521968960762024, "learning_rate": 6.984124459660374e-07, "loss": 0.3727, "step": 3051 }, { "epoch": 2.5405105438401776, "grad_norm": 0.29539427161216736, "learning_rate": 6.95944611389982e-07, "loss": 0.4056, "step": 3052 }, { "epoch": 2.54134295227525, "grad_norm": 0.3013669550418854, "learning_rate": 6.934808183972986e-07, "loss": 0.377, "step": 3053 }, { "epoch": 2.542175360710322, "grad_norm": 0.3182709217071533, "learning_rate": 6.910210693015285e-07, "loss": 0.4025, "step": 3054 }, { "epoch": 2.543007769145394, "grad_norm": 0.3235478401184082, "learning_rate": 6.885653664124226e-07, "loss": 0.4023, "step": 3055 }, { "epoch": 2.543840177580466, "grad_norm": 0.28089457750320435, "learning_rate": 6.861137120359296e-07, "loss": 0.3754, "step": 3056 }, { "epoch": 2.544672586015538, "grad_norm": 0.3240266740322113, "learning_rate": 6.836661084741924e-07, "loss": 0.4258, "step": 3057 }, { "epoch": 2.5455049944506105, "grad_norm": 0.30531975626945496, "learning_rate": 6.812225580255549e-07, "loss": 0.4008, "step": 3058 }, { "epoch": 2.5463374028856824, "grad_norm": 0.2742519676685333, "learning_rate": 6.787830629845549e-07, "loss": 0.3597, "step": 3059 }, { "epoch": 2.547169811320755, "grad_norm": 0.3020766079425812, "learning_rate": 6.763476256419215e-07, "loss": 0.3942, "step": 3060 }, { "epoch": 2.5480022197558267, "grad_norm": 0.314591646194458, "learning_rate": 6.739162482845707e-07, "loss": 0.3769, "step": 3061 }, { "epoch": 2.548834628190899, "grad_norm": 0.308002233505249, "learning_rate": 6.714889331956087e-07, "loss": 0.3942, "step": 3062 }, { "epoch": 2.549667036625971, "grad_norm": 0.300611287355423, "learning_rate": 6.690656826543285e-07, "loss": 0.4121, "step": 3063 }, { "epoch": 2.5504994450610434, "grad_norm": 0.27304670214653015, "learning_rate": 6.666464989362054e-07, "loss": 0.3678, "step": 3064 }, { "epoch": 2.5513318534961154, "grad_norm": 0.2949070632457733, "learning_rate": 6.642313843128922e-07, "loss": 0.4077, "step": 3065 }, { "epoch": 2.5521642619311877, "grad_norm": 0.2929401695728302, "learning_rate": 6.618203410522262e-07, "loss": 0.376, "step": 3066 }, { "epoch": 2.5529966703662597, "grad_norm": 0.30417951941490173, "learning_rate": 6.594133714182178e-07, "loss": 0.4095, "step": 3067 }, { "epoch": 2.553829078801332, "grad_norm": 0.32251882553100586, "learning_rate": 6.570104776710551e-07, "loss": 0.4429, "step": 3068 }, { "epoch": 2.554661487236404, "grad_norm": 0.2788321077823639, "learning_rate": 6.546116620670961e-07, "loss": 0.3727, "step": 3069 }, { "epoch": 2.555493895671476, "grad_norm": 0.3155818581581116, "learning_rate": 6.522169268588713e-07, "loss": 0.3847, "step": 3070 }, { "epoch": 2.5563263041065483, "grad_norm": 0.3051152527332306, "learning_rate": 6.49826274295079e-07, "loss": 0.3391, "step": 3071 }, { "epoch": 2.5571587125416206, "grad_norm": 0.2933105230331421, "learning_rate": 6.474397066205834e-07, "loss": 0.3977, "step": 3072 }, { "epoch": 2.5579911209766926, "grad_norm": 0.2970825135707855, "learning_rate": 6.450572260764137e-07, "loss": 0.4147, "step": 3073 }, { "epoch": 2.5588235294117645, "grad_norm": 0.31705281138420105, "learning_rate": 6.42678834899761e-07, "loss": 0.4048, "step": 3074 }, { "epoch": 2.559655937846837, "grad_norm": 0.28795450925827026, "learning_rate": 6.403045353239757e-07, "loss": 0.4011, "step": 3075 }, { "epoch": 2.5604883462819092, "grad_norm": 0.3047225773334503, "learning_rate": 6.379343295785673e-07, "loss": 0.3888, "step": 3076 }, { "epoch": 2.561320754716981, "grad_norm": 0.29493415355682373, "learning_rate": 6.355682198892005e-07, "loss": 0.3831, "step": 3077 }, { "epoch": 2.562153163152053, "grad_norm": 0.29540005326271057, "learning_rate": 6.33206208477693e-07, "loss": 0.4203, "step": 3078 }, { "epoch": 2.5629855715871255, "grad_norm": 0.266559898853302, "learning_rate": 6.308482975620161e-07, "loss": 0.3354, "step": 3079 }, { "epoch": 2.5638179800221974, "grad_norm": 0.29652097821235657, "learning_rate": 6.284944893562872e-07, "loss": 0.4215, "step": 3080 }, { "epoch": 2.56465038845727, "grad_norm": 0.3205231726169586, "learning_rate": 6.261447860707753e-07, "loss": 0.4226, "step": 3081 }, { "epoch": 2.5654827968923417, "grad_norm": 0.3019580841064453, "learning_rate": 6.23799189911894e-07, "loss": 0.3837, "step": 3082 }, { "epoch": 2.566315205327414, "grad_norm": 0.26572704315185547, "learning_rate": 6.214577030821967e-07, "loss": 0.3495, "step": 3083 }, { "epoch": 2.567147613762486, "grad_norm": 0.2902132272720337, "learning_rate": 6.191203277803798e-07, "loss": 0.4022, "step": 3084 }, { "epoch": 2.5679800221975584, "grad_norm": 0.2867758572101593, "learning_rate": 6.167870662012831e-07, "loss": 0.3833, "step": 3085 }, { "epoch": 2.5688124306326303, "grad_norm": 0.27622854709625244, "learning_rate": 6.144579205358786e-07, "loss": 0.3983, "step": 3086 }, { "epoch": 2.5696448390677027, "grad_norm": 0.2973408102989197, "learning_rate": 6.121328929712739e-07, "loss": 0.3968, "step": 3087 }, { "epoch": 2.5704772475027746, "grad_norm": 0.3048017919063568, "learning_rate": 6.098119856907103e-07, "loss": 0.431, "step": 3088 }, { "epoch": 2.571309655937847, "grad_norm": 0.29575997591018677, "learning_rate": 6.074952008735624e-07, "loss": 0.3685, "step": 3089 }, { "epoch": 2.572142064372919, "grad_norm": 0.3209153413772583, "learning_rate": 6.051825406953316e-07, "loss": 0.4429, "step": 3090 }, { "epoch": 2.5729744728079913, "grad_norm": 0.30033037066459656, "learning_rate": 6.02874007327644e-07, "loss": 0.364, "step": 3091 }, { "epoch": 2.5738068812430632, "grad_norm": 0.2988393306732178, "learning_rate": 6.005696029382535e-07, "loss": 0.3802, "step": 3092 }, { "epoch": 2.574639289678135, "grad_norm": 0.3061862289905548, "learning_rate": 5.982693296910386e-07, "loss": 0.3916, "step": 3093 }, { "epoch": 2.5754716981132075, "grad_norm": 0.31209561228752136, "learning_rate": 5.959731897459936e-07, "loss": 0.4099, "step": 3094 }, { "epoch": 2.57630410654828, "grad_norm": 0.3136289417743683, "learning_rate": 5.93681185259235e-07, "loss": 0.3519, "step": 3095 }, { "epoch": 2.577136514983352, "grad_norm": 0.29192236065864563, "learning_rate": 5.91393318382995e-07, "loss": 0.3679, "step": 3096 }, { "epoch": 2.5779689234184238, "grad_norm": 0.3013075590133667, "learning_rate": 5.891095912656208e-07, "loss": 0.471, "step": 3097 }, { "epoch": 2.578801331853496, "grad_norm": 0.2735782265663147, "learning_rate": 5.86830006051572e-07, "loss": 0.3486, "step": 3098 }, { "epoch": 2.5796337402885685, "grad_norm": 0.28365886211395264, "learning_rate": 5.845545648814188e-07, "loss": 0.3943, "step": 3099 }, { "epoch": 2.5804661487236404, "grad_norm": 0.262623131275177, "learning_rate": 5.822832698918413e-07, "loss": 0.3549, "step": 3100 }, { "epoch": 2.5812985571587124, "grad_norm": 0.3032148778438568, "learning_rate": 5.800161232156238e-07, "loss": 0.3379, "step": 3101 }, { "epoch": 2.5821309655937847, "grad_norm": 0.3096363842487335, "learning_rate": 5.777531269816577e-07, "loss": 0.4437, "step": 3102 }, { "epoch": 2.5829633740288567, "grad_norm": 0.2988356649875641, "learning_rate": 5.754942833149363e-07, "loss": 0.3767, "step": 3103 }, { "epoch": 2.583795782463929, "grad_norm": 0.29221031069755554, "learning_rate": 5.732395943365526e-07, "loss": 0.3716, "step": 3104 }, { "epoch": 2.584628190899001, "grad_norm": 0.2928762137889862, "learning_rate": 5.709890621636993e-07, "loss": 0.3776, "step": 3105 }, { "epoch": 2.5854605993340734, "grad_norm": 0.29212868213653564, "learning_rate": 5.687426889096659e-07, "loss": 0.3987, "step": 3106 }, { "epoch": 2.5862930077691453, "grad_norm": 0.34992143511772156, "learning_rate": 5.665004766838356e-07, "loss": 0.4072, "step": 3107 }, { "epoch": 2.5871254162042177, "grad_norm": 0.3027849495410919, "learning_rate": 5.642624275916852e-07, "loss": 0.3612, "step": 3108 }, { "epoch": 2.5879578246392896, "grad_norm": 0.29607823491096497, "learning_rate": 5.620285437347834e-07, "loss": 0.4058, "step": 3109 }, { "epoch": 2.588790233074362, "grad_norm": 0.28273841738700867, "learning_rate": 5.597988272107824e-07, "loss": 0.3771, "step": 3110 }, { "epoch": 2.589622641509434, "grad_norm": 0.293728232383728, "learning_rate": 5.575732801134287e-07, "loss": 0.3977, "step": 3111 }, { "epoch": 2.5904550499445063, "grad_norm": 0.28861936926841736, "learning_rate": 5.553519045325501e-07, "loss": 0.3839, "step": 3112 }, { "epoch": 2.591287458379578, "grad_norm": 0.2736756503582001, "learning_rate": 5.531347025540546e-07, "loss": 0.374, "step": 3113 }, { "epoch": 2.5921198668146506, "grad_norm": 0.2778788208961487, "learning_rate": 5.509216762599339e-07, "loss": 0.3654, "step": 3114 }, { "epoch": 2.5929522752497225, "grad_norm": 0.29448696970939636, "learning_rate": 5.487128277282605e-07, "loss": 0.3952, "step": 3115 }, { "epoch": 2.5937846836847944, "grad_norm": 0.31573858857154846, "learning_rate": 5.465081590331817e-07, "loss": 0.4203, "step": 3116 }, { "epoch": 2.594617092119867, "grad_norm": 0.2849540114402771, "learning_rate": 5.443076722449186e-07, "loss": 0.3638, "step": 3117 }, { "epoch": 2.595449500554939, "grad_norm": 0.2975523769855499, "learning_rate": 5.421113694297664e-07, "loss": 0.3668, "step": 3118 }, { "epoch": 2.596281908990011, "grad_norm": 0.3016306459903717, "learning_rate": 5.399192526500946e-07, "loss": 0.4132, "step": 3119 }, { "epoch": 2.597114317425083, "grad_norm": 0.2861490845680237, "learning_rate": 5.377313239643367e-07, "loss": 0.3821, "step": 3120 }, { "epoch": 2.5979467258601554, "grad_norm": 0.2761737108230591, "learning_rate": 5.355475854269964e-07, "loss": 0.3899, "step": 3121 }, { "epoch": 2.5987791342952278, "grad_norm": 0.2985495328903198, "learning_rate": 5.333680390886426e-07, "loss": 0.4112, "step": 3122 }, { "epoch": 2.5996115427302997, "grad_norm": 0.2759629786014557, "learning_rate": 5.311926869959094e-07, "loss": 0.3573, "step": 3123 }, { "epoch": 2.6004439511653716, "grad_norm": 0.343628466129303, "learning_rate": 5.290215311914881e-07, "loss": 0.4135, "step": 3124 }, { "epoch": 2.601276359600444, "grad_norm": 0.2960171103477478, "learning_rate": 5.268545737141323e-07, "loss": 0.3888, "step": 3125 }, { "epoch": 2.602108768035516, "grad_norm": 0.277442991733551, "learning_rate": 5.246918165986537e-07, "loss": 0.3336, "step": 3126 }, { "epoch": 2.6029411764705883, "grad_norm": 0.31210342049598694, "learning_rate": 5.225332618759193e-07, "loss": 0.4181, "step": 3127 }, { "epoch": 2.6037735849056602, "grad_norm": 0.27689164876937866, "learning_rate": 5.203789115728486e-07, "loss": 0.3818, "step": 3128 }, { "epoch": 2.6046059933407326, "grad_norm": 0.28218722343444824, "learning_rate": 5.182287677124159e-07, "loss": 0.3801, "step": 3129 }, { "epoch": 2.6054384017758045, "grad_norm": 0.29998162388801575, "learning_rate": 5.160828323136424e-07, "loss": 0.4012, "step": 3130 }, { "epoch": 2.606270810210877, "grad_norm": 0.2883782684803009, "learning_rate": 5.139411073916001e-07, "loss": 0.3856, "step": 3131 }, { "epoch": 2.607103218645949, "grad_norm": 0.3021693825721741, "learning_rate": 5.118035949574057e-07, "loss": 0.4013, "step": 3132 }, { "epoch": 2.6079356270810212, "grad_norm": 0.26575303077697754, "learning_rate": 5.096702970182204e-07, "loss": 0.3247, "step": 3133 }, { "epoch": 2.608768035516093, "grad_norm": 0.3065766394138336, "learning_rate": 5.075412155772492e-07, "loss": 0.382, "step": 3134 }, { "epoch": 2.6096004439511655, "grad_norm": 0.30673807859420776, "learning_rate": 5.054163526337364e-07, "loss": 0.4343, "step": 3135 }, { "epoch": 2.6104328523862375, "grad_norm": 0.3258250951766968, "learning_rate": 5.032957101829661e-07, "loss": 0.3875, "step": 3136 }, { "epoch": 2.61126526082131, "grad_norm": 0.2834281623363495, "learning_rate": 5.011792902162572e-07, "loss": 0.379, "step": 3137 }, { "epoch": 2.6120976692563818, "grad_norm": 0.3125614523887634, "learning_rate": 4.990670947209675e-07, "loss": 0.3715, "step": 3138 }, { "epoch": 2.6129300776914537, "grad_norm": 0.29795554280281067, "learning_rate": 4.969591256804824e-07, "loss": 0.402, "step": 3139 }, { "epoch": 2.613762486126526, "grad_norm": 0.30222252011299133, "learning_rate": 4.948553850742238e-07, "loss": 0.392, "step": 3140 }, { "epoch": 2.6145948945615984, "grad_norm": 0.30855488777160645, "learning_rate": 4.927558748776412e-07, "loss": 0.4307, "step": 3141 }, { "epoch": 2.6154273029966704, "grad_norm": 0.27991849184036255, "learning_rate": 4.906605970622114e-07, "loss": 0.3593, "step": 3142 }, { "epoch": 2.6162597114317423, "grad_norm": 0.269772469997406, "learning_rate": 4.885695535954361e-07, "loss": 0.3599, "step": 3143 }, { "epoch": 2.6170921198668147, "grad_norm": 0.2982839345932007, "learning_rate": 4.8648274644084e-07, "loss": 0.4084, "step": 3144 }, { "epoch": 2.617924528301887, "grad_norm": 0.30452045798301697, "learning_rate": 4.844001775579766e-07, "loss": 0.4204, "step": 3145 }, { "epoch": 2.618756936736959, "grad_norm": 0.28936830163002014, "learning_rate": 4.8232184890241e-07, "loss": 0.377, "step": 3146 }, { "epoch": 2.619589345172031, "grad_norm": 0.29566147923469543, "learning_rate": 4.802477624257285e-07, "loss": 0.399, "step": 3147 }, { "epoch": 2.6204217536071033, "grad_norm": 0.29598045349121094, "learning_rate": 4.781779200755354e-07, "loss": 0.3818, "step": 3148 }, { "epoch": 2.621254162042175, "grad_norm": 0.2791507840156555, "learning_rate": 4.7611232379545124e-07, "loss": 0.3618, "step": 3149 }, { "epoch": 2.6220865704772476, "grad_norm": 0.28765299916267395, "learning_rate": 4.740509755251038e-07, "loss": 0.3683, "step": 3150 }, { "epoch": 2.6229189789123195, "grad_norm": 0.2902831435203552, "learning_rate": 4.71993877200137e-07, "loss": 0.3989, "step": 3151 }, { "epoch": 2.623751387347392, "grad_norm": 0.29683712124824524, "learning_rate": 4.6994103075220175e-07, "loss": 0.4098, "step": 3152 }, { "epoch": 2.624583795782464, "grad_norm": 0.28899383544921875, "learning_rate": 4.678924381089567e-07, "loss": 0.3437, "step": 3153 }, { "epoch": 2.625416204217536, "grad_norm": 0.29837122559547424, "learning_rate": 4.658481011940663e-07, "loss": 0.4139, "step": 3154 }, { "epoch": 2.626248612652608, "grad_norm": 0.31046080589294434, "learning_rate": 4.63808021927199e-07, "loss": 0.3861, "step": 3155 }, { "epoch": 2.6270810210876805, "grad_norm": 0.3166535198688507, "learning_rate": 4.617722022240245e-07, "loss": 0.4264, "step": 3156 }, { "epoch": 2.6279134295227524, "grad_norm": 0.3086240589618683, "learning_rate": 4.597406439962138e-07, "loss": 0.4188, "step": 3157 }, { "epoch": 2.628745837957825, "grad_norm": 0.29972097277641296, "learning_rate": 4.5771334915143516e-07, "loss": 0.3938, "step": 3158 }, { "epoch": 2.6295782463928967, "grad_norm": 0.3148014545440674, "learning_rate": 4.5569031959335374e-07, "loss": 0.402, "step": 3159 }, { "epoch": 2.630410654827969, "grad_norm": 0.31971946358680725, "learning_rate": 4.536715572216299e-07, "loss": 0.3992, "step": 3160 }, { "epoch": 2.631243063263041, "grad_norm": 0.3092584013938904, "learning_rate": 4.5165706393191676e-07, "loss": 0.4067, "step": 3161 }, { "epoch": 2.632075471698113, "grad_norm": 0.306518018245697, "learning_rate": 4.496468416158595e-07, "loss": 0.3765, "step": 3162 }, { "epoch": 2.6329078801331853, "grad_norm": 0.306640625, "learning_rate": 4.4764089216109144e-07, "loss": 0.4048, "step": 3163 }, { "epoch": 2.6337402885682577, "grad_norm": 0.2979956269264221, "learning_rate": 4.456392174512347e-07, "loss": 0.3945, "step": 3164 }, { "epoch": 2.6345726970033296, "grad_norm": 0.29669734835624695, "learning_rate": 4.4364181936589536e-07, "loss": 0.417, "step": 3165 }, { "epoch": 2.6354051054384016, "grad_norm": 0.28595003485679626, "learning_rate": 4.4164869978066684e-07, "loss": 0.3818, "step": 3166 }, { "epoch": 2.636237513873474, "grad_norm": 0.2954268455505371, "learning_rate": 4.3965986056712316e-07, "loss": 0.3785, "step": 3167 }, { "epoch": 2.6370699223085463, "grad_norm": 0.3013925850391388, "learning_rate": 4.376753035928194e-07, "loss": 0.4055, "step": 3168 }, { "epoch": 2.6379023307436182, "grad_norm": 0.2962302565574646, "learning_rate": 4.3569503072128703e-07, "loss": 0.4024, "step": 3169 }, { "epoch": 2.63873473917869, "grad_norm": 0.2931113839149475, "learning_rate": 4.3371904381203976e-07, "loss": 0.3846, "step": 3170 }, { "epoch": 2.6395671476137625, "grad_norm": 0.290157675743103, "learning_rate": 4.3174734472056334e-07, "loss": 0.3722, "step": 3171 }, { "epoch": 2.6403995560488345, "grad_norm": 0.3102055490016937, "learning_rate": 4.2977993529831675e-07, "loss": 0.3862, "step": 3172 }, { "epoch": 2.641231964483907, "grad_norm": 0.30981799960136414, "learning_rate": 4.278168173927322e-07, "loss": 0.3891, "step": 3173 }, { "epoch": 2.6420643729189788, "grad_norm": 0.28204596042633057, "learning_rate": 4.258579928472106e-07, "loss": 0.36, "step": 3174 }, { "epoch": 2.642896781354051, "grad_norm": 0.30792534351348877, "learning_rate": 4.2390346350112634e-07, "loss": 0.4047, "step": 3175 }, { "epoch": 2.643729189789123, "grad_norm": 0.2940234839916229, "learning_rate": 4.219532311898128e-07, "loss": 0.4073, "step": 3176 }, { "epoch": 2.6445615982241955, "grad_norm": 0.27562248706817627, "learning_rate": 4.200072977445735e-07, "loss": 0.4024, "step": 3177 }, { "epoch": 2.6453940066592674, "grad_norm": 0.2981874942779541, "learning_rate": 4.180656649926745e-07, "loss": 0.413, "step": 3178 }, { "epoch": 2.6462264150943398, "grad_norm": 0.31285783648490906, "learning_rate": 4.161283347573425e-07, "loss": 0.4159, "step": 3179 }, { "epoch": 2.6470588235294117, "grad_norm": 0.2978651523590088, "learning_rate": 4.141953088577644e-07, "loss": 0.3846, "step": 3180 }, { "epoch": 2.647891231964484, "grad_norm": 0.28265270590782166, "learning_rate": 4.12266589109086e-07, "loss": 0.3768, "step": 3181 }, { "epoch": 2.648723640399556, "grad_norm": 0.2694186270236969, "learning_rate": 4.103421773224081e-07, "loss": 0.3781, "step": 3182 }, { "epoch": 2.6495560488346284, "grad_norm": 0.2963635325431824, "learning_rate": 4.0842207530478793e-07, "loss": 0.4111, "step": 3183 }, { "epoch": 2.6503884572697003, "grad_norm": 0.29516974091529846, "learning_rate": 4.0650628485923385e-07, "loss": 0.3699, "step": 3184 }, { "epoch": 2.6512208657047722, "grad_norm": 0.3196733295917511, "learning_rate": 4.0459480778470786e-07, "loss": 0.4343, "step": 3185 }, { "epoch": 2.6520532741398446, "grad_norm": 0.28815674781799316, "learning_rate": 4.026876458761192e-07, "loss": 0.3925, "step": 3186 }, { "epoch": 2.652885682574917, "grad_norm": 0.30133241415023804, "learning_rate": 4.0078480092432705e-07, "loss": 0.3972, "step": 3187 }, { "epoch": 2.653718091009989, "grad_norm": 0.3248240351676941, "learning_rate": 3.9888627471613595e-07, "loss": 0.405, "step": 3188 }, { "epoch": 2.654550499445061, "grad_norm": 0.3046268820762634, "learning_rate": 3.969920690342954e-07, "loss": 0.3948, "step": 3189 }, { "epoch": 2.655382907880133, "grad_norm": 0.29888278245925903, "learning_rate": 3.9510218565749823e-07, "loss": 0.3884, "step": 3190 }, { "epoch": 2.6562153163152056, "grad_norm": 0.2877199351787567, "learning_rate": 3.9321662636037537e-07, "loss": 0.3705, "step": 3191 }, { "epoch": 2.6570477247502775, "grad_norm": 0.2959778606891632, "learning_rate": 3.913353929135033e-07, "loss": 0.3945, "step": 3192 }, { "epoch": 2.6578801331853494, "grad_norm": 0.27556687593460083, "learning_rate": 3.8945848708339173e-07, "loss": 0.369, "step": 3193 }, { "epoch": 2.658712541620422, "grad_norm": 0.27974632382392883, "learning_rate": 3.8758591063248864e-07, "loss": 0.3902, "step": 3194 }, { "epoch": 2.6595449500554937, "grad_norm": 0.2980170249938965, "learning_rate": 3.8571766531917466e-07, "loss": 0.4005, "step": 3195 }, { "epoch": 2.660377358490566, "grad_norm": 0.26741084456443787, "learning_rate": 3.838537528977659e-07, "loss": 0.3446, "step": 3196 }, { "epoch": 2.661209766925638, "grad_norm": 0.2738521993160248, "learning_rate": 3.8199417511851023e-07, "loss": 0.3572, "step": 3197 }, { "epoch": 2.6620421753607104, "grad_norm": 0.30660688877105713, "learning_rate": 3.8013893372758125e-07, "loss": 0.398, "step": 3198 }, { "epoch": 2.6628745837957823, "grad_norm": 0.31190434098243713, "learning_rate": 3.782880304670833e-07, "loss": 0.3662, "step": 3199 }, { "epoch": 2.6637069922308547, "grad_norm": 0.27871236205101013, "learning_rate": 3.7644146707504826e-07, "loss": 0.3878, "step": 3200 }, { "epoch": 2.6645394006659266, "grad_norm": 0.28115060925483704, "learning_rate": 3.7459924528543247e-07, "loss": 0.3932, "step": 3201 }, { "epoch": 2.665371809100999, "grad_norm": 0.29334890842437744, "learning_rate": 3.727613668281116e-07, "loss": 0.4139, "step": 3202 }, { "epoch": 2.666204217536071, "grad_norm": 0.30630093812942505, "learning_rate": 3.709278334288874e-07, "loss": 0.4072, "step": 3203 }, { "epoch": 2.6670366259711433, "grad_norm": 0.2759256958961487, "learning_rate": 3.6909864680947815e-07, "loss": 0.396, "step": 3204 }, { "epoch": 2.6678690344062153, "grad_norm": 0.289121150970459, "learning_rate": 3.672738086875255e-07, "loss": 0.393, "step": 3205 }, { "epoch": 2.6687014428412876, "grad_norm": 0.3110848069190979, "learning_rate": 3.6545332077658146e-07, "loss": 0.4241, "step": 3206 }, { "epoch": 2.6695338512763596, "grad_norm": 0.2876022160053253, "learning_rate": 3.63637184786117e-07, "loss": 0.3783, "step": 3207 }, { "epoch": 2.6703662597114315, "grad_norm": 0.28069189190864563, "learning_rate": 3.618254024215156e-07, "loss": 0.4027, "step": 3208 }, { "epoch": 2.671198668146504, "grad_norm": 0.2795030474662781, "learning_rate": 3.6001797538407214e-07, "loss": 0.3889, "step": 3209 }, { "epoch": 2.6720310765815762, "grad_norm": 0.28693515062332153, "learning_rate": 3.582149053709932e-07, "loss": 0.3846, "step": 3210 }, { "epoch": 2.672863485016648, "grad_norm": 0.30839934945106506, "learning_rate": 3.564161940753924e-07, "loss": 0.4299, "step": 3211 }, { "epoch": 2.67369589345172, "grad_norm": 0.2880555987358093, "learning_rate": 3.5462184318629134e-07, "loss": 0.3687, "step": 3212 }, { "epoch": 2.6745283018867925, "grad_norm": 0.3026648759841919, "learning_rate": 3.528318543886172e-07, "loss": 0.3653, "step": 3213 }, { "epoch": 2.675360710321865, "grad_norm": 0.2983366847038269, "learning_rate": 3.510462293632e-07, "loss": 0.3878, "step": 3214 }, { "epoch": 2.6761931187569368, "grad_norm": 0.2791326344013214, "learning_rate": 3.4926496978677393e-07, "loss": 0.3921, "step": 3215 }, { "epoch": 2.6770255271920087, "grad_norm": 0.29134318232536316, "learning_rate": 3.4748807733197223e-07, "loss": 0.3876, "step": 3216 }, { "epoch": 2.677857935627081, "grad_norm": 0.3067108392715454, "learning_rate": 3.457155536673279e-07, "loss": 0.3933, "step": 3217 }, { "epoch": 2.678690344062153, "grad_norm": 0.31518760323524475, "learning_rate": 3.439474004572724e-07, "loss": 0.4012, "step": 3218 }, { "epoch": 2.6795227524972254, "grad_norm": 0.27954304218292236, "learning_rate": 3.4218361936213195e-07, "loss": 0.3536, "step": 3219 }, { "epoch": 2.6803551609322973, "grad_norm": 0.30983665585517883, "learning_rate": 3.4042421203812904e-07, "loss": 0.4255, "step": 3220 }, { "epoch": 2.6811875693673697, "grad_norm": 0.28379014134407043, "learning_rate": 3.386691801373754e-07, "loss": 0.3885, "step": 3221 }, { "epoch": 2.6820199778024416, "grad_norm": 0.2883853316307068, "learning_rate": 3.369185253078794e-07, "loss": 0.3971, "step": 3222 }, { "epoch": 2.682852386237514, "grad_norm": 0.28160157799720764, "learning_rate": 3.3517224919353555e-07, "loss": 0.3796, "step": 3223 }, { "epoch": 2.683684794672586, "grad_norm": 0.2900620102882385, "learning_rate": 3.334303534341277e-07, "loss": 0.4238, "step": 3224 }, { "epoch": 2.6845172031076583, "grad_norm": 0.2966312766075134, "learning_rate": 3.3169283966532517e-07, "loss": 0.3779, "step": 3225 }, { "epoch": 2.68534961154273, "grad_norm": 0.27945834398269653, "learning_rate": 3.2995970951868574e-07, "loss": 0.3731, "step": 3226 }, { "epoch": 2.6861820199778026, "grad_norm": 0.31320464611053467, "learning_rate": 3.2823096462164915e-07, "loss": 0.3988, "step": 3227 }, { "epoch": 2.6870144284128745, "grad_norm": 0.28876248002052307, "learning_rate": 3.265066065975353e-07, "loss": 0.3668, "step": 3228 }, { "epoch": 2.687846836847947, "grad_norm": 0.2871847152709961, "learning_rate": 3.2478663706554724e-07, "loss": 0.4239, "step": 3229 }, { "epoch": 2.688679245283019, "grad_norm": 0.31527018547058105, "learning_rate": 3.2307105764076694e-07, "loss": 0.3833, "step": 3230 }, { "epoch": 2.6895116537180908, "grad_norm": 0.31052833795547485, "learning_rate": 3.213598699341547e-07, "loss": 0.3889, "step": 3231 }, { "epoch": 2.690344062153163, "grad_norm": 0.28673428297042847, "learning_rate": 3.1965307555254343e-07, "loss": 0.4006, "step": 3232 }, { "epoch": 2.6911764705882355, "grad_norm": 0.28148752450942993, "learning_rate": 3.1795067609864395e-07, "loss": 0.3643, "step": 3233 }, { "epoch": 2.6920088790233074, "grad_norm": 0.28988078236579895, "learning_rate": 3.162526731710386e-07, "loss": 0.4032, "step": 3234 }, { "epoch": 2.6928412874583794, "grad_norm": 0.2785470485687256, "learning_rate": 3.14559068364183e-07, "loss": 0.3915, "step": 3235 }, { "epoch": 2.6936736958934517, "grad_norm": 0.27616074681282043, "learning_rate": 3.1286986326840076e-07, "loss": 0.3812, "step": 3236 }, { "epoch": 2.694506104328524, "grad_norm": 0.28551867604255676, "learning_rate": 3.1118505946988506e-07, "loss": 0.3644, "step": 3237 }, { "epoch": 2.695338512763596, "grad_norm": 0.31371888518333435, "learning_rate": 3.095046585506967e-07, "loss": 0.4134, "step": 3238 }, { "epoch": 2.696170921198668, "grad_norm": 0.299771785736084, "learning_rate": 3.0782866208876163e-07, "loss": 0.3827, "step": 3239 }, { "epoch": 2.6970033296337403, "grad_norm": 0.29559215903282166, "learning_rate": 3.0615707165786937e-07, "loss": 0.385, "step": 3240 }, { "epoch": 2.6978357380688123, "grad_norm": 0.27681657671928406, "learning_rate": 3.044898888276726e-07, "loss": 0.3581, "step": 3241 }, { "epoch": 2.6986681465038846, "grad_norm": 0.29248616099357605, "learning_rate": 3.0282711516368524e-07, "loss": 0.3735, "step": 3242 }, { "epoch": 2.6995005549389566, "grad_norm": 0.3053610026836395, "learning_rate": 3.011687522272816e-07, "loss": 0.4158, "step": 3243 }, { "epoch": 2.700332963374029, "grad_norm": 0.31310683488845825, "learning_rate": 2.995148015756927e-07, "loss": 0.3995, "step": 3244 }, { "epoch": 2.701165371809101, "grad_norm": 0.30295878648757935, "learning_rate": 2.978652647620073e-07, "loss": 0.4164, "step": 3245 }, { "epoch": 2.7019977802441733, "grad_norm": 0.2850441336631775, "learning_rate": 2.962201433351697e-07, "loss": 0.3935, "step": 3246 }, { "epoch": 2.702830188679245, "grad_norm": 0.2884097397327423, "learning_rate": 2.9457943883997696e-07, "loss": 0.3741, "step": 3247 }, { "epoch": 2.7036625971143176, "grad_norm": 0.2926424741744995, "learning_rate": 2.929431528170801e-07, "loss": 0.3759, "step": 3248 }, { "epoch": 2.7044950055493895, "grad_norm": 0.29279258847236633, "learning_rate": 2.91311286802981e-07, "loss": 0.3754, "step": 3249 }, { "epoch": 2.705327413984462, "grad_norm": 0.2717744708061218, "learning_rate": 2.8968384233002855e-07, "loss": 0.3732, "step": 3250 }, { "epoch": 2.706159822419534, "grad_norm": 0.29794079065322876, "learning_rate": 2.8806082092642186e-07, "loss": 0.3902, "step": 3251 }, { "epoch": 2.706992230854606, "grad_norm": 0.31436416506767273, "learning_rate": 2.8644222411620793e-07, "loss": 0.4225, "step": 3252 }, { "epoch": 2.707824639289678, "grad_norm": 0.2860269546508789, "learning_rate": 2.848280534192777e-07, "loss": 0.3869, "step": 3253 }, { "epoch": 2.70865704772475, "grad_norm": 0.2989305257797241, "learning_rate": 2.832183103513636e-07, "loss": 0.3976, "step": 3254 }, { "epoch": 2.7094894561598224, "grad_norm": 0.30432286858558655, "learning_rate": 2.816129964240433e-07, "loss": 0.4112, "step": 3255 }, { "epoch": 2.7103218645948948, "grad_norm": 0.3015022575855255, "learning_rate": 2.800121131447353e-07, "loss": 0.404, "step": 3256 }, { "epoch": 2.7111542730299667, "grad_norm": 0.2788195013999939, "learning_rate": 2.784156620166983e-07, "loss": 0.3442, "step": 3257 }, { "epoch": 2.7119866814650386, "grad_norm": 0.32228487730026245, "learning_rate": 2.7682364453902487e-07, "loss": 0.4665, "step": 3258 }, { "epoch": 2.712819089900111, "grad_norm": 0.2844350039958954, "learning_rate": 2.7523606220664854e-07, "loss": 0.3602, "step": 3259 }, { "epoch": 2.7136514983351834, "grad_norm": 0.270535409450531, "learning_rate": 2.736529165103385e-07, "loss": 0.3578, "step": 3260 }, { "epoch": 2.7144839067702553, "grad_norm": 0.31310030817985535, "learning_rate": 2.7207420893669455e-07, "loss": 0.3897, "step": 3261 }, { "epoch": 2.7153163152053272, "grad_norm": 0.3238847851753235, "learning_rate": 2.704999409681508e-07, "loss": 0.4193, "step": 3262 }, { "epoch": 2.7161487236403996, "grad_norm": 0.2968219220638275, "learning_rate": 2.6893011408297196e-07, "loss": 0.3856, "step": 3263 }, { "epoch": 2.7169811320754715, "grad_norm": 0.2701689600944519, "learning_rate": 2.6736472975525564e-07, "loss": 0.3865, "step": 3264 }, { "epoch": 2.717813540510544, "grad_norm": 0.30333271622657776, "learning_rate": 2.65803789454922e-07, "loss": 0.4149, "step": 3265 }, { "epoch": 2.718645948945616, "grad_norm": 0.2998022735118866, "learning_rate": 2.6424729464772316e-07, "loss": 0.3999, "step": 3266 }, { "epoch": 2.719478357380688, "grad_norm": 0.2897208034992218, "learning_rate": 2.626952467952343e-07, "loss": 0.388, "step": 3267 }, { "epoch": 2.72031076581576, "grad_norm": 0.27508842945098877, "learning_rate": 2.611476473548552e-07, "loss": 0.3711, "step": 3268 }, { "epoch": 2.7211431742508325, "grad_norm": 0.30373603105545044, "learning_rate": 2.596044977798101e-07, "loss": 0.4133, "step": 3269 }, { "epoch": 2.7219755826859044, "grad_norm": 0.30435818433761597, "learning_rate": 2.5806579951914214e-07, "loss": 0.411, "step": 3270 }, { "epoch": 2.722807991120977, "grad_norm": 0.28073209524154663, "learning_rate": 2.5653155401771655e-07, "loss": 0.3691, "step": 3271 }, { "epoch": 2.7236403995560488, "grad_norm": 0.330555260181427, "learning_rate": 2.550017627162166e-07, "loss": 0.4357, "step": 3272 }, { "epoch": 2.724472807991121, "grad_norm": 0.25710800290107727, "learning_rate": 2.534764270511431e-07, "loss": 0.3366, "step": 3273 }, { "epoch": 2.725305216426193, "grad_norm": 0.29701754450798035, "learning_rate": 2.5195554845481306e-07, "loss": 0.4171, "step": 3274 }, { "epoch": 2.7261376248612654, "grad_norm": 0.2765384316444397, "learning_rate": 2.5043912835535867e-07, "loss": 0.3471, "step": 3275 }, { "epoch": 2.7269700332963374, "grad_norm": 0.29196688532829285, "learning_rate": 2.4892716817672304e-07, "loss": 0.4051, "step": 3276 }, { "epoch": 2.7278024417314093, "grad_norm": 0.2899174690246582, "learning_rate": 2.474196693386649e-07, "loss": 0.3756, "step": 3277 }, { "epoch": 2.7286348501664817, "grad_norm": 0.2992348074913025, "learning_rate": 2.45916633256752e-07, "loss": 0.4266, "step": 3278 }, { "epoch": 2.729467258601554, "grad_norm": 0.29235145449638367, "learning_rate": 2.4441806134236137e-07, "loss": 0.4102, "step": 3279 }, { "epoch": 2.730299667036626, "grad_norm": 0.26745110750198364, "learning_rate": 2.4292395500267796e-07, "loss": 0.3314, "step": 3280 }, { "epoch": 2.731132075471698, "grad_norm": 0.3117216229438782, "learning_rate": 2.4143431564069344e-07, "loss": 0.4081, "step": 3281 }, { "epoch": 2.7319644839067703, "grad_norm": 0.2923487424850464, "learning_rate": 2.39949144655206e-07, "loss": 0.3971, "step": 3282 }, { "epoch": 2.7327968923418426, "grad_norm": 0.2777581810951233, "learning_rate": 2.38468443440818e-07, "loss": 0.3751, "step": 3283 }, { "epoch": 2.7336293007769146, "grad_norm": 0.30919012427330017, "learning_rate": 2.3699221338793155e-07, "loss": 0.4077, "step": 3284 }, { "epoch": 2.7344617092119865, "grad_norm": 0.2907434105873108, "learning_rate": 2.355204558827534e-07, "loss": 0.3853, "step": 3285 }, { "epoch": 2.735294117647059, "grad_norm": 0.2914058566093445, "learning_rate": 2.340531723072914e-07, "loss": 0.3711, "step": 3286 }, { "epoch": 2.736126526082131, "grad_norm": 0.2743302583694458, "learning_rate": 2.3259036403934843e-07, "loss": 0.4138, "step": 3287 }, { "epoch": 2.736958934517203, "grad_norm": 0.30281293392181396, "learning_rate": 2.3113203245252734e-07, "loss": 0.3994, "step": 3288 }, { "epoch": 2.737791342952275, "grad_norm": 0.303541898727417, "learning_rate": 2.2967817891622724e-07, "loss": 0.3588, "step": 3289 }, { "epoch": 2.7386237513873475, "grad_norm": 0.31531253457069397, "learning_rate": 2.2822880479564325e-07, "loss": 0.3762, "step": 3290 }, { "epoch": 2.7394561598224194, "grad_norm": 0.29251205921173096, "learning_rate": 2.2678391145176115e-07, "loss": 0.3847, "step": 3291 }, { "epoch": 2.740288568257492, "grad_norm": 0.2942062318325043, "learning_rate": 2.2534350024136232e-07, "loss": 0.3904, "step": 3292 }, { "epoch": 2.7411209766925637, "grad_norm": 0.30196094512939453, "learning_rate": 2.2390757251701756e-07, "loss": 0.3586, "step": 3293 }, { "epoch": 2.741953385127636, "grad_norm": 0.3154551386833191, "learning_rate": 2.224761296270883e-07, "loss": 0.4193, "step": 3294 }, { "epoch": 2.742785793562708, "grad_norm": 0.2840886116027832, "learning_rate": 2.2104917291572435e-07, "loss": 0.3629, "step": 3295 }, { "epoch": 2.7436182019977804, "grad_norm": 0.28531089425086975, "learning_rate": 2.196267037228633e-07, "loss": 0.389, "step": 3296 }, { "epoch": 2.7444506104328523, "grad_norm": 0.2992643713951111, "learning_rate": 2.1820872338422838e-07, "loss": 0.3948, "step": 3297 }, { "epoch": 2.7452830188679247, "grad_norm": 0.296223908662796, "learning_rate": 2.1679523323132835e-07, "loss": 0.3902, "step": 3298 }, { "epoch": 2.7461154273029966, "grad_norm": 0.270108699798584, "learning_rate": 2.153862345914548e-07, "loss": 0.3622, "step": 3299 }, { "epoch": 2.7469478357380686, "grad_norm": 0.2876308262348175, "learning_rate": 2.139817287876822e-07, "loss": 0.3826, "step": 3300 }, { "epoch": 2.747780244173141, "grad_norm": 0.28309187293052673, "learning_rate": 2.125817171388672e-07, "loss": 0.3575, "step": 3301 }, { "epoch": 2.7486126526082133, "grad_norm": 0.31576332449913025, "learning_rate": 2.111862009596427e-07, "loss": 0.425, "step": 3302 }, { "epoch": 2.7494450610432852, "grad_norm": 0.3037078082561493, "learning_rate": 2.097951815604249e-07, "loss": 0.3768, "step": 3303 }, { "epoch": 2.750277469478357, "grad_norm": 0.3079027831554413, "learning_rate": 2.0840866024740502e-07, "loss": 0.3602, "step": 3304 }, { "epoch": 2.7511098779134295, "grad_norm": 0.3146269917488098, "learning_rate": 2.070266383225511e-07, "loss": 0.4562, "step": 3305 }, { "epoch": 2.751942286348502, "grad_norm": 0.2855137288570404, "learning_rate": 2.0564911708360447e-07, "loss": 0.3737, "step": 3306 }, { "epoch": 2.752774694783574, "grad_norm": 0.30360037088394165, "learning_rate": 2.0427609782408265e-07, "loss": 0.4307, "step": 3307 }, { "epoch": 2.7536071032186458, "grad_norm": 0.25891706347465515, "learning_rate": 2.029075818332754e-07, "loss": 0.3776, "step": 3308 }, { "epoch": 2.754439511653718, "grad_norm": 0.2823725938796997, "learning_rate": 2.0154357039624317e-07, "loss": 0.3631, "step": 3309 }, { "epoch": 2.75527192008879, "grad_norm": 0.30981922149658203, "learning_rate": 2.0018406479381525e-07, "loss": 0.4291, "step": 3310 }, { "epoch": 2.7561043285238624, "grad_norm": 0.29691869020462036, "learning_rate": 1.9882906630259158e-07, "loss": 0.3768, "step": 3311 }, { "epoch": 2.7569367369589344, "grad_norm": 0.28396308422088623, "learning_rate": 1.9747857619494105e-07, "loss": 0.3652, "step": 3312 }, { "epoch": 2.7577691453940067, "grad_norm": 0.2920913100242615, "learning_rate": 1.961325957389959e-07, "loss": 0.3836, "step": 3313 }, { "epoch": 2.7586015538290787, "grad_norm": 0.2953694760799408, "learning_rate": 1.9479112619865513e-07, "loss": 0.3953, "step": 3314 }, { "epoch": 2.759433962264151, "grad_norm": 0.3114762604236603, "learning_rate": 1.934541688335828e-07, "loss": 0.4324, "step": 3315 }, { "epoch": 2.760266370699223, "grad_norm": 0.28087711334228516, "learning_rate": 1.9212172489920632e-07, "loss": 0.3553, "step": 3316 }, { "epoch": 2.7610987791342954, "grad_norm": 0.2810649871826172, "learning_rate": 1.9079379564671207e-07, "loss": 0.3584, "step": 3317 }, { "epoch": 2.7619311875693673, "grad_norm": 0.3019134998321533, "learning_rate": 1.8947038232304981e-07, "loss": 0.4394, "step": 3318 }, { "epoch": 2.7627635960044397, "grad_norm": 0.2816515266895294, "learning_rate": 1.8815148617092772e-07, "loss": 0.3543, "step": 3319 }, { "epoch": 2.7635960044395116, "grad_norm": 0.30382615327835083, "learning_rate": 1.8683710842881174e-07, "loss": 0.4034, "step": 3320 }, { "epoch": 2.764428412874584, "grad_norm": 0.29469168186187744, "learning_rate": 1.8552725033092635e-07, "loss": 0.3681, "step": 3321 }, { "epoch": 2.765260821309656, "grad_norm": 0.3065103590488434, "learning_rate": 1.8422191310725147e-07, "loss": 0.4048, "step": 3322 }, { "epoch": 2.766093229744728, "grad_norm": 0.29562950134277344, "learning_rate": 1.8292109798352054e-07, "loss": 0.3898, "step": 3323 }, { "epoch": 2.7669256381798, "grad_norm": 0.28922104835510254, "learning_rate": 1.816248061812226e-07, "loss": 0.3753, "step": 3324 }, { "epoch": 2.7677580466148726, "grad_norm": 0.2866446375846863, "learning_rate": 1.8033303891759835e-07, "loss": 0.3665, "step": 3325 }, { "epoch": 2.7685904550499445, "grad_norm": 0.3014658987522125, "learning_rate": 1.7904579740563921e-07, "loss": 0.3785, "step": 3326 }, { "epoch": 2.7694228634850164, "grad_norm": 0.28989580273628235, "learning_rate": 1.7776308285408826e-07, "loss": 0.4271, "step": 3327 }, { "epoch": 2.770255271920089, "grad_norm": 0.27226322889328003, "learning_rate": 1.7648489646743648e-07, "loss": 0.3668, "step": 3328 }, { "epoch": 2.771087680355161, "grad_norm": 0.286642462015152, "learning_rate": 1.752112394459232e-07, "loss": 0.4169, "step": 3329 }, { "epoch": 2.771920088790233, "grad_norm": 0.2984205186367035, "learning_rate": 1.7394211298553508e-07, "loss": 0.3907, "step": 3330 }, { "epoch": 2.772752497225305, "grad_norm": 0.26750481128692627, "learning_rate": 1.726775182780044e-07, "loss": 0.3586, "step": 3331 }, { "epoch": 2.7735849056603774, "grad_norm": 0.2971529960632324, "learning_rate": 1.7141745651080565e-07, "loss": 0.4357, "step": 3332 }, { "epoch": 2.7744173140954493, "grad_norm": 0.2746163308620453, "learning_rate": 1.7016192886716132e-07, "loss": 0.3554, "step": 3333 }, { "epoch": 2.7752497225305217, "grad_norm": 0.2701033651828766, "learning_rate": 1.689109365260333e-07, "loss": 0.392, "step": 3334 }, { "epoch": 2.7760821309655936, "grad_norm": 0.3108593225479126, "learning_rate": 1.676644806621247e-07, "loss": 0.38, "step": 3335 }, { "epoch": 2.776914539400666, "grad_norm": 0.28853678703308105, "learning_rate": 1.664225624458793e-07, "loss": 0.381, "step": 3336 }, { "epoch": 2.777746947835738, "grad_norm": 0.30430543422698975, "learning_rate": 1.651851830434803e-07, "loss": 0.4189, "step": 3337 }, { "epoch": 2.7785793562708103, "grad_norm": 0.2828444838523865, "learning_rate": 1.6395234361684943e-07, "loss": 0.3827, "step": 3338 }, { "epoch": 2.7794117647058822, "grad_norm": 0.2710029184818268, "learning_rate": 1.6272404532364337e-07, "loss": 0.4074, "step": 3339 }, { "epoch": 2.7802441731409546, "grad_norm": 0.2796766757965088, "learning_rate": 1.615002893172557e-07, "loss": 0.3915, "step": 3340 }, { "epoch": 2.7810765815760266, "grad_norm": 0.31362029910087585, "learning_rate": 1.6028107674681547e-07, "loss": 0.395, "step": 3341 }, { "epoch": 2.781908990011099, "grad_norm": 0.30837786197662354, "learning_rate": 1.5906640875718525e-07, "loss": 0.3694, "step": 3342 }, { "epoch": 2.782741398446171, "grad_norm": 0.290594220161438, "learning_rate": 1.5785628648895767e-07, "loss": 0.3946, "step": 3343 }, { "epoch": 2.7835738068812432, "grad_norm": 0.28468722105026245, "learning_rate": 1.5665071107845987e-07, "loss": 0.3844, "step": 3344 }, { "epoch": 2.784406215316315, "grad_norm": 0.2934969365596771, "learning_rate": 1.5544968365774792e-07, "loss": 0.3824, "step": 3345 }, { "epoch": 2.785238623751387, "grad_norm": 0.295932799577713, "learning_rate": 1.542532053546081e-07, "loss": 0.4102, "step": 3346 }, { "epoch": 2.7860710321864595, "grad_norm": 0.292585551738739, "learning_rate": 1.5306127729255382e-07, "loss": 0.3556, "step": 3347 }, { "epoch": 2.786903440621532, "grad_norm": 0.30441945791244507, "learning_rate": 1.5187390059082706e-07, "loss": 0.3754, "step": 3348 }, { "epoch": 2.7877358490566038, "grad_norm": 0.2834872305393219, "learning_rate": 1.5069107636439484e-07, "loss": 0.4026, "step": 3349 }, { "epoch": 2.7885682574916757, "grad_norm": 0.29688331484794617, "learning_rate": 1.4951280572394977e-07, "loss": 0.3684, "step": 3350 }, { "epoch": 2.789400665926748, "grad_norm": 0.28374260663986206, "learning_rate": 1.483390897759096e-07, "loss": 0.4047, "step": 3351 }, { "epoch": 2.7902330743618204, "grad_norm": 0.30708324909210205, "learning_rate": 1.4716992962241272e-07, "loss": 0.3917, "step": 3352 }, { "epoch": 2.7910654827968924, "grad_norm": 0.2824838161468506, "learning_rate": 1.4600532636132147e-07, "loss": 0.3869, "step": 3353 }, { "epoch": 2.7918978912319643, "grad_norm": 0.294177383184433, "learning_rate": 1.4484528108621942e-07, "loss": 0.4117, "step": 3354 }, { "epoch": 2.7927302996670367, "grad_norm": 0.3030126988887787, "learning_rate": 1.4368979488640855e-07, "loss": 0.3446, "step": 3355 }, { "epoch": 2.7935627081021086, "grad_norm": 0.2987135350704193, "learning_rate": 1.4253886884691148e-07, "loss": 0.4387, "step": 3356 }, { "epoch": 2.794395116537181, "grad_norm": 0.28978827595710754, "learning_rate": 1.4139250404846704e-07, "loss": 0.384, "step": 3357 }, { "epoch": 2.795227524972253, "grad_norm": 0.28679752349853516, "learning_rate": 1.4025070156753196e-07, "loss": 0.346, "step": 3358 }, { "epoch": 2.7960599334073253, "grad_norm": 0.341983437538147, "learning_rate": 1.391134624762791e-07, "loss": 0.4132, "step": 3359 }, { "epoch": 2.796892341842397, "grad_norm": 0.3002639412879944, "learning_rate": 1.3798078784259594e-07, "loss": 0.399, "step": 3360 }, { "epoch": 2.7977247502774696, "grad_norm": 0.3027034103870392, "learning_rate": 1.368526787300839e-07, "loss": 0.4074, "step": 3361 }, { "epoch": 2.7985571587125415, "grad_norm": 0.2931459844112396, "learning_rate": 1.3572913619805616e-07, "loss": 0.374, "step": 3362 }, { "epoch": 2.799389567147614, "grad_norm": 0.2804553806781769, "learning_rate": 1.3461016130153993e-07, "loss": 0.3715, "step": 3363 }, { "epoch": 2.800221975582686, "grad_norm": 0.2980172038078308, "learning_rate": 1.3349575509127244e-07, "loss": 0.371, "step": 3364 }, { "epoch": 2.801054384017758, "grad_norm": 0.28683868050575256, "learning_rate": 1.3238591861369943e-07, "loss": 0.3872, "step": 3365 }, { "epoch": 2.80188679245283, "grad_norm": 0.28891491889953613, "learning_rate": 1.3128065291097724e-07, "loss": 0.3925, "step": 3366 }, { "epoch": 2.8027192008879025, "grad_norm": 0.3039287030696869, "learning_rate": 1.3017995902097013e-07, "loss": 0.4261, "step": 3367 }, { "epoch": 2.8035516093229744, "grad_norm": 0.30227887630462646, "learning_rate": 1.290838379772491e-07, "loss": 0.3702, "step": 3368 }, { "epoch": 2.8043840177580464, "grad_norm": 0.3222786784172058, "learning_rate": 1.2799229080909026e-07, "loss": 0.3801, "step": 3369 }, { "epoch": 2.8052164261931187, "grad_norm": 0.2965812683105469, "learning_rate": 1.2690531854147537e-07, "loss": 0.3763, "step": 3370 }, { "epoch": 2.806048834628191, "grad_norm": 0.2899739146232605, "learning_rate": 1.2582292219509184e-07, "loss": 0.3972, "step": 3371 }, { "epoch": 2.806881243063263, "grad_norm": 0.3159641921520233, "learning_rate": 1.2474510278632779e-07, "loss": 0.4052, "step": 3372 }, { "epoch": 2.807713651498335, "grad_norm": 0.30823588371276855, "learning_rate": 1.2367186132727415e-07, "loss": 0.3822, "step": 3373 }, { "epoch": 2.8085460599334073, "grad_norm": 0.286836177110672, "learning_rate": 1.2260319882572425e-07, "loss": 0.3836, "step": 3374 }, { "epoch": 2.8093784683684797, "grad_norm": 0.2870038151741028, "learning_rate": 1.2153911628517036e-07, "loss": 0.3896, "step": 3375 }, { "epoch": 2.8102108768035516, "grad_norm": 0.3150666058063507, "learning_rate": 1.2047961470480485e-07, "loss": 0.4282, "step": 3376 }, { "epoch": 2.8110432852386236, "grad_norm": 0.28661397099494934, "learning_rate": 1.1942469507951803e-07, "loss": 0.3712, "step": 3377 }, { "epoch": 2.811875693673696, "grad_norm": 0.29480233788490295, "learning_rate": 1.1837435839989808e-07, "loss": 0.416, "step": 3378 }, { "epoch": 2.812708102108768, "grad_norm": 0.2871578335762024, "learning_rate": 1.1732860565222936e-07, "loss": 0.3905, "step": 3379 }, { "epoch": 2.8135405105438402, "grad_norm": 0.27761033177375793, "learning_rate": 1.162874378184925e-07, "loss": 0.3864, "step": 3380 }, { "epoch": 2.814372918978912, "grad_norm": 0.29687756299972534, "learning_rate": 1.1525085587636209e-07, "loss": 0.3494, "step": 3381 }, { "epoch": 2.8152053274139845, "grad_norm": 0.2945989668369293, "learning_rate": 1.1421886079920619e-07, "loss": 0.3968, "step": 3382 }, { "epoch": 2.8160377358490565, "grad_norm": 0.2789284884929657, "learning_rate": 1.1319145355608684e-07, "loss": 0.381, "step": 3383 }, { "epoch": 2.816870144284129, "grad_norm": 0.3032738268375397, "learning_rate": 1.1216863511175736e-07, "loss": 0.3858, "step": 3384 }, { "epoch": 2.817702552719201, "grad_norm": 0.30183014273643494, "learning_rate": 1.111504064266622e-07, "loss": 0.398, "step": 3385 }, { "epoch": 2.818534961154273, "grad_norm": 0.29048189520835876, "learning_rate": 1.1013676845693544e-07, "loss": 0.4148, "step": 3386 }, { "epoch": 2.819367369589345, "grad_norm": 0.2871493101119995, "learning_rate": 1.0912772215440182e-07, "loss": 0.3881, "step": 3387 }, { "epoch": 2.8201997780244175, "grad_norm": 0.30180609226226807, "learning_rate": 1.0812326846657228e-07, "loss": 0.3678, "step": 3388 }, { "epoch": 2.8210321864594894, "grad_norm": 0.2673914134502411, "learning_rate": 1.0712340833664737e-07, "loss": 0.3572, "step": 3389 }, { "epoch": 2.8218645948945618, "grad_norm": 0.28488829731941223, "learning_rate": 1.0612814270351324e-07, "loss": 0.4244, "step": 3390 }, { "epoch": 2.8226970033296337, "grad_norm": 0.2882390320301056, "learning_rate": 1.0513747250174123e-07, "loss": 0.3718, "step": 3391 }, { "epoch": 2.8235294117647056, "grad_norm": 0.30586186051368713, "learning_rate": 1.0415139866158774e-07, "loss": 0.3935, "step": 3392 }, { "epoch": 2.824361820199778, "grad_norm": 0.296487033367157, "learning_rate": 1.0316992210899435e-07, "loss": 0.38, "step": 3393 }, { "epoch": 2.8251942286348504, "grad_norm": 0.26386237144470215, "learning_rate": 1.0219304376558492e-07, "loss": 0.3801, "step": 3394 }, { "epoch": 2.8260266370699223, "grad_norm": 0.26951250433921814, "learning_rate": 1.0122076454866347e-07, "loss": 0.3868, "step": 3395 }, { "epoch": 2.8268590455049942, "grad_norm": 0.30076080560684204, "learning_rate": 1.0025308537121859e-07, "loss": 0.4074, "step": 3396 }, { "epoch": 2.8276914539400666, "grad_norm": 0.3036608397960663, "learning_rate": 9.929000714191838e-08, "loss": 0.4016, "step": 3397 }, { "epoch": 2.828523862375139, "grad_norm": 0.2856789231300354, "learning_rate": 9.833153076510893e-08, "loss": 0.3614, "step": 3398 }, { "epoch": 2.829356270810211, "grad_norm": 0.3074534237384796, "learning_rate": 9.737765714081748e-08, "loss": 0.3782, "step": 3399 }, { "epoch": 2.830188679245283, "grad_norm": 0.2953753173351288, "learning_rate": 9.642838716474645e-08, "loss": 0.4025, "step": 3400 }, { "epoch": 2.831021087680355, "grad_norm": 0.2885286808013916, "learning_rate": 9.548372172827946e-08, "loss": 0.3511, "step": 3401 }, { "epoch": 2.831853496115427, "grad_norm": 0.2874261736869812, "learning_rate": 9.454366171847196e-08, "loss": 0.3967, "step": 3402 }, { "epoch": 2.8326859045504995, "grad_norm": 0.29668954014778137, "learning_rate": 9.360820801805726e-08, "loss": 0.4091, "step": 3403 }, { "epoch": 2.8335183129855714, "grad_norm": 0.28590044379234314, "learning_rate": 9.267736150544271e-08, "loss": 0.3797, "step": 3404 }, { "epoch": 2.834350721420644, "grad_norm": 0.31228768825531006, "learning_rate": 9.175112305470913e-08, "loss": 0.4138, "step": 3405 }, { "epoch": 2.8351831298557157, "grad_norm": 0.29430899024009705, "learning_rate": 9.082949353561187e-08, "loss": 0.3866, "step": 3406 }, { "epoch": 2.836015538290788, "grad_norm": 0.30940887331962585, "learning_rate": 8.991247381357593e-08, "loss": 0.4006, "step": 3407 }, { "epoch": 2.83684794672586, "grad_norm": 0.29085415601730347, "learning_rate": 8.900006474969913e-08, "loss": 0.3581, "step": 3408 }, { "epoch": 2.8376803551609324, "grad_norm": 0.29941922426223755, "learning_rate": 8.809226720075059e-08, "loss": 0.4113, "step": 3409 }, { "epoch": 2.8385127635960044, "grad_norm": 0.2955285906791687, "learning_rate": 8.718908201916676e-08, "loss": 0.3789, "step": 3410 }, { "epoch": 2.8393451720310767, "grad_norm": 0.2934470772743225, "learning_rate": 8.629051005305478e-08, "loss": 0.3859, "step": 3411 }, { "epoch": 2.8401775804661487, "grad_norm": 0.28304851055145264, "learning_rate": 8.539655214618969e-08, "loss": 0.3857, "step": 3412 }, { "epoch": 2.841009988901221, "grad_norm": 0.2950180172920227, "learning_rate": 8.450720913801336e-08, "loss": 0.3564, "step": 3413 }, { "epoch": 2.841842397336293, "grad_norm": 0.2903018593788147, "learning_rate": 8.362248186363441e-08, "loss": 0.41, "step": 3414 }, { "epoch": 2.842674805771365, "grad_norm": 0.3056175708770752, "learning_rate": 8.274237115382777e-08, "loss": 0.381, "step": 3415 }, { "epoch": 2.8435072142064373, "grad_norm": 0.2846352756023407, "learning_rate": 8.186687783503289e-08, "loss": 0.376, "step": 3416 }, { "epoch": 2.8443396226415096, "grad_norm": 0.29959842562675476, "learning_rate": 8.09960027293516e-08, "loss": 0.395, "step": 3417 }, { "epoch": 2.8451720310765816, "grad_norm": 0.30339470505714417, "learning_rate": 8.012974665455308e-08, "loss": 0.3928, "step": 3418 }, { "epoch": 2.8460044395116535, "grad_norm": 0.3100747764110565, "learning_rate": 7.926811042406557e-08, "loss": 0.4121, "step": 3419 }, { "epoch": 2.846836847946726, "grad_norm": 0.28325793147087097, "learning_rate": 7.841109484698184e-08, "loss": 0.4077, "step": 3420 }, { "epoch": 2.8476692563817982, "grad_norm": 0.30448758602142334, "learning_rate": 7.755870072805316e-08, "loss": 0.3853, "step": 3421 }, { "epoch": 2.84850166481687, "grad_norm": 0.3250058591365814, "learning_rate": 7.67109288676926e-08, "loss": 0.4308, "step": 3422 }, { "epoch": 2.849334073251942, "grad_norm": 0.27683284878730774, "learning_rate": 7.586778006197337e-08, "loss": 0.3575, "step": 3423 }, { "epoch": 2.8501664816870145, "grad_norm": 0.30187785625457764, "learning_rate": 7.50292551026277e-08, "loss": 0.4145, "step": 3424 }, { "epoch": 2.8509988901220864, "grad_norm": 0.2923542261123657, "learning_rate": 7.419535477704354e-08, "loss": 0.3981, "step": 3425 }, { "epoch": 2.851831298557159, "grad_norm": 0.3036320209503174, "learning_rate": 7.336607986826839e-08, "loss": 0.3879, "step": 3426 }, { "epoch": 2.8526637069922307, "grad_norm": 0.29370933771133423, "learning_rate": 7.254143115500711e-08, "loss": 0.3971, "step": 3427 }, { "epoch": 2.853496115427303, "grad_norm": 0.307032972574234, "learning_rate": 7.17214094116181e-08, "loss": 0.398, "step": 3428 }, { "epoch": 2.854328523862375, "grad_norm": 0.3023212254047394, "learning_rate": 7.090601540811648e-08, "loss": 0.3883, "step": 3429 }, { "epoch": 2.8551609322974474, "grad_norm": 0.2827087938785553, "learning_rate": 7.009524991017091e-08, "loss": 0.3788, "step": 3430 }, { "epoch": 2.8559933407325193, "grad_norm": 0.28021904826164246, "learning_rate": 6.928911367910573e-08, "loss": 0.3843, "step": 3431 }, { "epoch": 2.8568257491675917, "grad_norm": 0.2957780063152313, "learning_rate": 6.848760747189598e-08, "loss": 0.3939, "step": 3432 }, { "epoch": 2.8576581576026636, "grad_norm": 0.307271808385849, "learning_rate": 6.769073204117016e-08, "loss": 0.4193, "step": 3433 }, { "epoch": 2.858490566037736, "grad_norm": 0.3014625012874603, "learning_rate": 6.689848813520805e-08, "loss": 0.3848, "step": 3434 }, { "epoch": 2.859322974472808, "grad_norm": 0.2933945059776306, "learning_rate": 6.611087649794124e-08, "loss": 0.3986, "step": 3435 }, { "epoch": 2.8601553829078803, "grad_norm": 0.29639217257499695, "learning_rate": 6.532789786895033e-08, "loss": 0.3729, "step": 3436 }, { "epoch": 2.8609877913429522, "grad_norm": 0.31975582242012024, "learning_rate": 6.454955298346555e-08, "loss": 0.4172, "step": 3437 }, { "epoch": 2.861820199778024, "grad_norm": 0.2899722754955292, "learning_rate": 6.377584257236724e-08, "loss": 0.3822, "step": 3438 }, { "epoch": 2.8626526082130965, "grad_norm": 0.27522486448287964, "learning_rate": 6.300676736218258e-08, "loss": 0.3855, "step": 3439 }, { "epoch": 2.863485016648169, "grad_norm": 0.2691723704338074, "learning_rate": 6.224232807508667e-08, "loss": 0.3849, "step": 3440 }, { "epoch": 2.864317425083241, "grad_norm": 0.2858082056045532, "learning_rate": 6.148252542890198e-08, "loss": 0.4041, "step": 3441 }, { "epoch": 2.8651498335183128, "grad_norm": 0.27565300464630127, "learning_rate": 6.072736013709557e-08, "loss": 0.3919, "step": 3442 }, { "epoch": 2.865982241953385, "grad_norm": 0.2915458679199219, "learning_rate": 5.997683290878131e-08, "loss": 0.4133, "step": 3443 }, { "epoch": 2.8668146503884575, "grad_norm": 0.27250799536705017, "learning_rate": 5.923094444871713e-08, "loss": 0.3583, "step": 3444 }, { "epoch": 2.8676470588235294, "grad_norm": 0.28215593099594116, "learning_rate": 5.848969545730554e-08, "loss": 0.4154, "step": 3445 }, { "epoch": 2.8684794672586014, "grad_norm": 0.2830628752708435, "learning_rate": 5.775308663059309e-08, "loss": 0.385, "step": 3446 }, { "epoch": 2.8693118756936737, "grad_norm": 0.2738587558269501, "learning_rate": 5.702111866026705e-08, "loss": 0.373, "step": 3447 }, { "epoch": 2.870144284128746, "grad_norm": 0.28816235065460205, "learning_rate": 5.629379223365872e-08, "loss": 0.3963, "step": 3448 }, { "epoch": 2.870976692563818, "grad_norm": 0.3096372187137604, "learning_rate": 5.557110803374066e-08, "loss": 0.4023, "step": 3449 }, { "epoch": 2.87180910099889, "grad_norm": 0.29053497314453125, "learning_rate": 5.485306673912616e-08, "loss": 0.401, "step": 3450 }, { "epoch": 2.8726415094339623, "grad_norm": 0.29939737915992737, "learning_rate": 5.413966902406753e-08, "loss": 0.3722, "step": 3451 }, { "epoch": 2.8734739178690343, "grad_norm": 0.285515159368515, "learning_rate": 5.343091555845781e-08, "loss": 0.3878, "step": 3452 }, { "epoch": 2.8743063263041067, "grad_norm": 0.28648898005485535, "learning_rate": 5.272680700783073e-08, "loss": 0.3773, "step": 3453 }, { "epoch": 2.8751387347391786, "grad_norm": 0.275084525346756, "learning_rate": 5.2027344033354077e-08, "loss": 0.3646, "step": 3454 }, { "epoch": 2.875971143174251, "grad_norm": 0.2844296395778656, "learning_rate": 5.1332527291837465e-08, "loss": 0.3902, "step": 3455 }, { "epoch": 2.876803551609323, "grad_norm": 0.32090917229652405, "learning_rate": 5.06423574357251e-08, "loss": 0.3932, "step": 3456 }, { "epoch": 2.8776359600443953, "grad_norm": 0.29726335406303406, "learning_rate": 4.9956835113099676e-08, "loss": 0.3991, "step": 3457 }, { "epoch": 2.878468368479467, "grad_norm": 0.30311334133148193, "learning_rate": 4.927596096767795e-08, "loss": 0.385, "step": 3458 }, { "epoch": 2.8793007769145396, "grad_norm": 0.3128688633441925, "learning_rate": 4.8599735638812373e-08, "loss": 0.4243, "step": 3459 }, { "epoch": 2.8801331853496115, "grad_norm": 0.2947447896003723, "learning_rate": 4.7928159761490566e-08, "loss": 0.4102, "step": 3460 }, { "epoch": 2.8809655937846834, "grad_norm": 0.2862451374530792, "learning_rate": 4.7261233966334196e-08, "loss": 0.3962, "step": 3461 }, { "epoch": 2.881798002219756, "grad_norm": 0.2894607186317444, "learning_rate": 4.659895887959787e-08, "loss": 0.3894, "step": 3462 }, { "epoch": 2.882630410654828, "grad_norm": 0.2788543701171875, "learning_rate": 4.594133512317023e-08, "loss": 0.3739, "step": 3463 }, { "epoch": 2.8834628190899, "grad_norm": 0.289505273103714, "learning_rate": 4.528836331457065e-08, "loss": 0.4062, "step": 3464 }, { "epoch": 2.884295227524972, "grad_norm": 0.26619216799736023, "learning_rate": 4.4640044066951994e-08, "loss": 0.3453, "step": 3465 }, { "epoch": 2.8851276359600444, "grad_norm": 0.29299643635749817, "learning_rate": 4.399637798909673e-08, "loss": 0.4205, "step": 3466 }, { "epoch": 2.8859600443951168, "grad_norm": 0.2827985882759094, "learning_rate": 4.335736568541915e-08, "loss": 0.4005, "step": 3467 }, { "epoch": 2.8867924528301887, "grad_norm": 0.2879890203475952, "learning_rate": 4.272300775596205e-08, "loss": 0.3817, "step": 3468 }, { "epoch": 2.8876248612652606, "grad_norm": 0.30056655406951904, "learning_rate": 4.2093304796399504e-08, "loss": 0.372, "step": 3469 }, { "epoch": 2.888457269700333, "grad_norm": 0.285110205411911, "learning_rate": 4.146825739803295e-08, "loss": 0.3608, "step": 3470 }, { "epoch": 2.8892896781354054, "grad_norm": 0.29550155997276306, "learning_rate": 4.084786614779346e-08, "loss": 0.4042, "step": 3471 }, { "epoch": 2.8901220865704773, "grad_norm": 0.3029278516769409, "learning_rate": 4.023213162823947e-08, "loss": 0.4232, "step": 3472 }, { "epoch": 2.8909544950055492, "grad_norm": 0.2764630913734436, "learning_rate": 3.962105441755515e-08, "loss": 0.3862, "step": 3473 }, { "epoch": 2.8917869034406216, "grad_norm": 0.2890421748161316, "learning_rate": 3.9014635089554274e-08, "loss": 0.3944, "step": 3474 }, { "epoch": 2.8926193118756935, "grad_norm": 0.3060329258441925, "learning_rate": 3.841287421367412e-08, "loss": 0.3911, "step": 3475 }, { "epoch": 2.893451720310766, "grad_norm": 0.29384008049964905, "learning_rate": 3.781577235497935e-08, "loss": 0.3783, "step": 3476 }, { "epoch": 2.894284128745838, "grad_norm": 0.2772958278656006, "learning_rate": 3.7223330074158126e-08, "loss": 0.3298, "step": 3477 }, { "epoch": 2.89511653718091, "grad_norm": 0.32114148139953613, "learning_rate": 3.663554792752544e-08, "loss": 0.4557, "step": 3478 }, { "epoch": 2.895948945615982, "grad_norm": 0.27607429027557373, "learning_rate": 3.605242646701812e-08, "loss": 0.3632, "step": 3479 }, { "epoch": 2.8967813540510545, "grad_norm": 0.3078801929950714, "learning_rate": 3.547396624019817e-08, "loss": 0.4103, "step": 3480 }, { "epoch": 2.8976137624861265, "grad_norm": 0.2985548675060272, "learning_rate": 3.490016779024885e-08, "loss": 0.3895, "step": 3481 }, { "epoch": 2.898446170921199, "grad_norm": 0.3042149245738983, "learning_rate": 3.4331031655976955e-08, "loss": 0.3638, "step": 3482 }, { "epoch": 2.8992785793562708, "grad_norm": 0.28391167521476746, "learning_rate": 3.3766558371812754e-08, "loss": 0.4081, "step": 3483 }, { "epoch": 2.9001109877913427, "grad_norm": 0.29962408542633057, "learning_rate": 3.320674846780503e-08, "loss": 0.3843, "step": 3484 }, { "epoch": 2.900943396226415, "grad_norm": 0.27982330322265625, "learning_rate": 3.265160246962607e-08, "loss": 0.3801, "step": 3485 }, { "epoch": 2.9017758046614874, "grad_norm": 0.30797380208969116, "learning_rate": 3.210112089856721e-08, "loss": 0.4107, "step": 3486 }, { "epoch": 2.9026082130965594, "grad_norm": 0.30911773443222046, "learning_rate": 3.155530427153997e-08, "loss": 0.4328, "step": 3487 }, { "epoch": 2.9034406215316313, "grad_norm": 0.2736690044403076, "learning_rate": 3.1014153101076026e-08, "loss": 0.3458, "step": 3488 }, { "epoch": 2.9042730299667037, "grad_norm": 0.28448286652565, "learning_rate": 3.0477667895326133e-08, "loss": 0.3888, "step": 3489 }, { "epoch": 2.905105438401776, "grad_norm": 0.29954880475997925, "learning_rate": 2.994584915805898e-08, "loss": 0.3806, "step": 3490 }, { "epoch": 2.905937846836848, "grad_norm": 0.27898576855659485, "learning_rate": 2.9418697388661766e-08, "loss": 0.3819, "step": 3491 }, { "epoch": 2.90677025527192, "grad_norm": 0.305177241563797, "learning_rate": 2.889621308213908e-08, "loss": 0.3835, "step": 3492 }, { "epoch": 2.9076026637069923, "grad_norm": 0.29762381315231323, "learning_rate": 2.8378396729113466e-08, "loss": 0.3718, "step": 3493 }, { "epoch": 2.9084350721420646, "grad_norm": 0.29369136691093445, "learning_rate": 2.7865248815822087e-08, "loss": 0.3917, "step": 3494 }, { "epoch": 2.9092674805771366, "grad_norm": 0.29049572348594666, "learning_rate": 2.7356769824121166e-08, "loss": 0.4114, "step": 3495 }, { "epoch": 2.9100998890122085, "grad_norm": 0.27511221170425415, "learning_rate": 2.6852960231480985e-08, "loss": 0.3677, "step": 3496 }, { "epoch": 2.910932297447281, "grad_norm": 0.28010135889053345, "learning_rate": 2.635382051098756e-08, "loss": 0.406, "step": 3497 }, { "epoch": 2.911764705882353, "grad_norm": 0.28749653697013855, "learning_rate": 2.585935113134208e-08, "loss": 0.3838, "step": 3498 }, { "epoch": 2.912597114317425, "grad_norm": 0.2730650305747986, "learning_rate": 2.5369552556859243e-08, "loss": 0.3455, "step": 3499 }, { "epoch": 2.913429522752497, "grad_norm": 0.2961804270744324, "learning_rate": 2.4884425247468924e-08, "loss": 0.3869, "step": 3500 }, { "epoch": 2.9142619311875695, "grad_norm": 0.29125046730041504, "learning_rate": 2.44039696587145e-08, "loss": 0.383, "step": 3501 }, { "epoch": 2.9150943396226414, "grad_norm": 0.3115370273590088, "learning_rate": 2.392818624175175e-08, "loss": 0.3976, "step": 3502 }, { "epoch": 2.915926748057714, "grad_norm": 0.3151339590549469, "learning_rate": 2.345707544334941e-08, "loss": 0.3907, "step": 3503 }, { "epoch": 2.9167591564927857, "grad_norm": 0.29095959663391113, "learning_rate": 2.2990637705889717e-08, "loss": 0.3903, "step": 3504 }, { "epoch": 2.917591564927858, "grad_norm": 0.30376264452934265, "learning_rate": 2.2528873467365098e-08, "loss": 0.4254, "step": 3505 }, { "epoch": 2.91842397336293, "grad_norm": 0.28083696961402893, "learning_rate": 2.2071783161379812e-08, "loss": 0.3617, "step": 3506 }, { "epoch": 2.919256381798002, "grad_norm": 0.3202035129070282, "learning_rate": 2.1619367217150522e-08, "loss": 0.4465, "step": 3507 }, { "epoch": 2.9200887902330743, "grad_norm": 0.2700662612915039, "learning_rate": 2.1171626059503514e-08, "loss": 0.3814, "step": 3508 }, { "epoch": 2.9209211986681467, "grad_norm": 0.2775675058364868, "learning_rate": 2.0728560108875807e-08, "loss": 0.3732, "step": 3509 }, { "epoch": 2.9217536071032186, "grad_norm": 0.2907252311706543, "learning_rate": 2.0290169781313483e-08, "loss": 0.418, "step": 3510 }, { "epoch": 2.9225860155382906, "grad_norm": 0.27951502799987793, "learning_rate": 1.985645548847337e-08, "loss": 0.3932, "step": 3511 }, { "epoch": 2.923418423973363, "grad_norm": 0.2931711673736572, "learning_rate": 1.9427417637619685e-08, "loss": 0.3873, "step": 3512 }, { "epoch": 2.9242508324084353, "grad_norm": 0.30858153104782104, "learning_rate": 1.9003056631627935e-08, "loss": 0.4157, "step": 3513 }, { "epoch": 2.9250832408435072, "grad_norm": 0.2937333285808563, "learning_rate": 1.8583372868979933e-08, "loss": 0.3687, "step": 3514 }, { "epoch": 2.925915649278579, "grad_norm": 0.2885035574436188, "learning_rate": 1.8168366743765432e-08, "loss": 0.3595, "step": 3515 }, { "epoch": 2.9267480577136515, "grad_norm": 0.2732231914997101, "learning_rate": 1.775803864568326e-08, "loss": 0.3982, "step": 3516 }, { "epoch": 2.927580466148724, "grad_norm": 0.29774099588394165, "learning_rate": 1.7352388960038548e-08, "loss": 0.4037, "step": 3517 }, { "epoch": 2.928412874583796, "grad_norm": 0.304595410823822, "learning_rate": 1.695141806774325e-08, "loss": 0.3923, "step": 3518 }, { "epoch": 2.9292452830188678, "grad_norm": 0.27520307898521423, "learning_rate": 1.6555126345316197e-08, "loss": 0.3416, "step": 3519 }, { "epoch": 2.93007769145394, "grad_norm": 0.30991899967193604, "learning_rate": 1.6163514164882486e-08, "loss": 0.4287, "step": 3520 }, { "epoch": 2.930910099889012, "grad_norm": 0.29716676473617554, "learning_rate": 1.577658189417186e-08, "loss": 0.3619, "step": 3521 }, { "epoch": 2.9317425083240845, "grad_norm": 0.2918042540550232, "learning_rate": 1.539432989652201e-08, "loss": 0.3875, "step": 3522 }, { "epoch": 2.9325749167591564, "grad_norm": 0.27701038122177124, "learning_rate": 1.5016758530873033e-08, "loss": 0.3748, "step": 3523 }, { "epoch": 2.9334073251942288, "grad_norm": 0.31016066670417786, "learning_rate": 1.4643868151771323e-08, "loss": 0.394, "step": 3524 }, { "epoch": 2.9342397336293007, "grad_norm": 0.29906758666038513, "learning_rate": 1.4275659109367346e-08, "loss": 0.4065, "step": 3525 }, { "epoch": 2.935072142064373, "grad_norm": 0.31481271982192993, "learning_rate": 1.3912131749416746e-08, "loss": 0.3975, "step": 3526 }, { "epoch": 2.935904550499445, "grad_norm": 0.2872684597969055, "learning_rate": 1.3553286413277022e-08, "loss": 0.3697, "step": 3527 }, { "epoch": 2.9367369589345174, "grad_norm": 0.29366058111190796, "learning_rate": 1.3199123437910855e-08, "loss": 0.3647, "step": 3528 }, { "epoch": 2.9375693673695893, "grad_norm": 0.29561111330986023, "learning_rate": 1.2849643155882773e-08, "loss": 0.3693, "step": 3529 }, { "epoch": 2.938401775804661, "grad_norm": 0.2844444811344147, "learning_rate": 1.2504845895361384e-08, "loss": 0.3812, "step": 3530 }, { "epoch": 2.9392341842397336, "grad_norm": 0.26833999156951904, "learning_rate": 1.2164731980117694e-08, "loss": 0.4002, "step": 3531 }, { "epoch": 2.940066592674806, "grad_norm": 0.2657759487628937, "learning_rate": 1.1829301729524567e-08, "loss": 0.3429, "step": 3532 }, { "epoch": 2.940899001109878, "grad_norm": 0.31585893034935, "learning_rate": 1.1498555458555604e-08, "loss": 0.3983, "step": 3533 }, { "epoch": 2.94173140954495, "grad_norm": 0.3071114420890808, "learning_rate": 1.1172493477789037e-08, "loss": 0.4385, "step": 3534 }, { "epoch": 2.942563817980022, "grad_norm": 0.2719644010066986, "learning_rate": 1.085111609340217e-08, "loss": 0.3399, "step": 3535 }, { "epoch": 2.9433962264150946, "grad_norm": 0.2977891266345978, "learning_rate": 1.0534423607173604e-08, "loss": 0.38, "step": 3536 }, { "epoch": 2.9442286348501665, "grad_norm": 0.2826799154281616, "learning_rate": 1.022241631648324e-08, "loss": 0.3787, "step": 3537 }, { "epoch": 2.9450610432852384, "grad_norm": 0.2901372015476227, "learning_rate": 9.915094514311719e-09, "loss": 0.4123, "step": 3538 }, { "epoch": 2.945893451720311, "grad_norm": 0.2899583578109741, "learning_rate": 9.612458489239308e-09, "loss": 0.4093, "step": 3539 }, { "epoch": 2.946725860155383, "grad_norm": 0.27518314123153687, "learning_rate": 9.314508525446464e-09, "loss": 0.3605, "step": 3540 }, { "epoch": 2.947558268590455, "grad_norm": 0.29123416543006897, "learning_rate": 9.021244902713833e-09, "loss": 0.3942, "step": 3541 }, { "epoch": 2.948390677025527, "grad_norm": 0.2904079258441925, "learning_rate": 8.732667896421131e-09, "loss": 0.3999, "step": 3542 }, { "epoch": 2.9492230854605994, "grad_norm": 0.29001057147979736, "learning_rate": 8.448777777546601e-09, "loss": 0.3751, "step": 3543 }, { "epoch": 2.9500554938956713, "grad_norm": 0.2952643632888794, "learning_rate": 8.169574812668668e-09, "loss": 0.3758, "step": 3544 }, { "epoch": 2.9508879023307437, "grad_norm": 0.3037968873977661, "learning_rate": 7.895059263963168e-09, "loss": 0.4062, "step": 3545 }, { "epoch": 2.9517203107658156, "grad_norm": 0.29707083106040955, "learning_rate": 7.625231389205567e-09, "loss": 0.3736, "step": 3546 }, { "epoch": 2.952552719200888, "grad_norm": 0.3003142178058624, "learning_rate": 7.360091441768746e-09, "loss": 0.3837, "step": 3547 }, { "epoch": 2.95338512763596, "grad_norm": 0.2987581789493561, "learning_rate": 7.099639670623548e-09, "loss": 0.3834, "step": 3548 }, { "epoch": 2.9542175360710323, "grad_norm": 0.28041496872901917, "learning_rate": 6.8438763203393375e-09, "loss": 0.366, "step": 3549 }, { "epoch": 2.9550499445061043, "grad_norm": 0.28142842650413513, "learning_rate": 6.59280163108178e-09, "loss": 0.4162, "step": 3550 }, { "epoch": 2.9558823529411766, "grad_norm": 0.2837742865085602, "learning_rate": 6.346415838614506e-09, "loss": 0.3371, "step": 3551 }, { "epoch": 2.9567147613762486, "grad_norm": 0.29957345128059387, "learning_rate": 6.104719174298557e-09, "loss": 0.4113, "step": 3552 }, { "epoch": 2.9575471698113205, "grad_norm": 0.2940880358219147, "learning_rate": 5.867711865090719e-09, "loss": 0.3893, "step": 3553 }, { "epoch": 2.958379578246393, "grad_norm": 0.29609382152557373, "learning_rate": 5.635394133545191e-09, "loss": 0.3688, "step": 3554 }, { "epoch": 2.9592119866814652, "grad_norm": 0.29784825444221497, "learning_rate": 5.40776619781247e-09, "loss": 0.4102, "step": 3555 }, { "epoch": 2.960044395116537, "grad_norm": 0.2848165035247803, "learning_rate": 5.184828271639353e-09, "loss": 0.3718, "step": 3556 }, { "epoch": 2.960876803551609, "grad_norm": 0.30922722816467285, "learning_rate": 4.966580564368384e-09, "loss": 0.3542, "step": 3557 }, { "epoch": 2.9617092119866815, "grad_norm": 0.2894100546836853, "learning_rate": 4.7530232809378515e-09, "loss": 0.4281, "step": 3558 }, { "epoch": 2.962541620421754, "grad_norm": 0.2721484899520874, "learning_rate": 4.54415662188179e-09, "loss": 0.392, "step": 3559 }, { "epoch": 2.9633740288568258, "grad_norm": 0.30090397596359253, "learning_rate": 4.339980783329423e-09, "loss": 0.3977, "step": 3560 }, { "epoch": 2.9642064372918977, "grad_norm": 0.29261451959609985, "learning_rate": 4.140495957006274e-09, "loss": 0.3847, "step": 3561 }, { "epoch": 2.96503884572697, "grad_norm": 0.2850942015647888, "learning_rate": 3.945702330230839e-09, "loss": 0.4031, "step": 3562 }, { "epoch": 2.9658712541620424, "grad_norm": 0.2756821811199188, "learning_rate": 3.755600085918465e-09, "loss": 0.364, "step": 3563 }, { "epoch": 2.9667036625971144, "grad_norm": 0.31473076343536377, "learning_rate": 3.5701894025791383e-09, "loss": 0.446, "step": 3564 }, { "epoch": 2.9675360710321863, "grad_norm": 0.29244500398635864, "learning_rate": 3.3894704543152578e-09, "loss": 0.4213, "step": 3565 }, { "epoch": 2.9683684794672587, "grad_norm": 0.29834458231925964, "learning_rate": 3.213443410826078e-09, "loss": 0.3975, "step": 3566 }, { "epoch": 2.9692008879023306, "grad_norm": 0.29561495780944824, "learning_rate": 3.0421084374038234e-09, "loss": 0.3765, "step": 3567 }, { "epoch": 2.970033296337403, "grad_norm": 0.28357571363449097, "learning_rate": 2.875465694935353e-09, "loss": 0.4125, "step": 3568 }, { "epoch": 2.970865704772475, "grad_norm": 0.3278956413269043, "learning_rate": 2.7135153399004967e-09, "loss": 0.4449, "step": 3569 }, { "epoch": 2.9716981132075473, "grad_norm": 0.2880827784538269, "learning_rate": 2.5562575243737176e-09, "loss": 0.364, "step": 3570 }, { "epoch": 2.972530521642619, "grad_norm": 0.30118292570114136, "learning_rate": 2.4036923960230053e-09, "loss": 0.3864, "step": 3571 }, { "epoch": 2.9733629300776916, "grad_norm": 0.30025699734687805, "learning_rate": 2.255820098109873e-09, "loss": 0.3575, "step": 3572 }, { "epoch": 2.9741953385127635, "grad_norm": 0.31271275877952576, "learning_rate": 2.112640769488805e-09, "loss": 0.424, "step": 3573 }, { "epoch": 2.975027746947836, "grad_norm": 0.28177496790885925, "learning_rate": 1.974154544607254e-09, "loss": 0.3734, "step": 3574 }, { "epoch": 2.975860155382908, "grad_norm": 0.2948905825614929, "learning_rate": 1.8403615535067531e-09, "loss": 0.4235, "step": 3575 }, { "epoch": 2.9766925638179798, "grad_norm": 0.28703781962394714, "learning_rate": 1.71126192182125e-09, "loss": 0.3976, "step": 3576 }, { "epoch": 2.977524972253052, "grad_norm": 0.3083464205265045, "learning_rate": 1.586855770777107e-09, "loss": 0.4025, "step": 3577 }, { "epoch": 2.9783573806881245, "grad_norm": 0.30387553572654724, "learning_rate": 1.4671432171947663e-09, "loss": 0.4421, "step": 3578 }, { "epoch": 2.9791897891231964, "grad_norm": 0.2780333161354065, "learning_rate": 1.3521243734854195e-09, "loss": 0.3695, "step": 3579 }, { "epoch": 2.9800221975582684, "grad_norm": 0.31834548711776733, "learning_rate": 1.2417993476543377e-09, "loss": 0.4009, "step": 3580 }, { "epoch": 2.9808546059933407, "grad_norm": 0.27020248770713806, "learning_rate": 1.136168243298097e-09, "loss": 0.3783, "step": 3581 }, { "epoch": 2.981687014428413, "grad_norm": 0.29213130474090576, "learning_rate": 1.0352311596067976e-09, "loss": 0.398, "step": 3582 }, { "epoch": 2.982519422863485, "grad_norm": 0.307743102312088, "learning_rate": 9.389881913618448e-10, "loss": 0.4089, "step": 3583 }, { "epoch": 2.983351831298557, "grad_norm": 0.31198549270629883, "learning_rate": 8.474394289376131e-10, "loss": 0.4056, "step": 3584 }, { "epoch": 2.9841842397336293, "grad_norm": 0.30002060532569885, "learning_rate": 7.605849582986713e-10, "loss": 0.3865, "step": 3585 }, { "epoch": 2.9850166481687017, "grad_norm": 0.28173932433128357, "learning_rate": 6.784248610042232e-10, "loss": 0.3695, "step": 3586 }, { "epoch": 2.9858490566037736, "grad_norm": 0.29308822751045227, "learning_rate": 6.009592142036669e-10, "loss": 0.3928, "step": 3587 }, { "epoch": 2.9866814650388456, "grad_norm": 0.317353218793869, "learning_rate": 5.281880906382597e-10, "loss": 0.4158, "step": 3588 }, { "epoch": 2.987513873473918, "grad_norm": 0.30219030380249023, "learning_rate": 4.6011155864111865e-10, "loss": 0.3933, "step": 3589 }, { "epoch": 2.98834628190899, "grad_norm": 0.28345680236816406, "learning_rate": 3.967296821383304e-10, "loss": 0.3719, "step": 3590 }, { "epoch": 2.9891786903440623, "grad_norm": 0.2939116358757019, "learning_rate": 3.380425206461757e-10, "loss": 0.3956, "step": 3591 }, { "epoch": 2.990011098779134, "grad_norm": 0.30421409010887146, "learning_rate": 2.8405012927223975e-10, "loss": 0.384, "step": 3592 }, { "epoch": 2.9908435072142066, "grad_norm": 0.2770386338233948, "learning_rate": 2.3475255871707737e-10, "loss": 0.3904, "step": 3593 }, { "epoch": 2.9916759156492785, "grad_norm": 0.28260740637779236, "learning_rate": 1.901498552714376e-10, "loss": 0.3735, "step": 3594 }, { "epoch": 2.992508324084351, "grad_norm": 0.29308584332466125, "learning_rate": 1.5024206081848401e-10, "loss": 0.4103, "step": 3595 }, { "epoch": 2.993340732519423, "grad_norm": 0.3127232491970062, "learning_rate": 1.1502921283212953e-10, "loss": 0.442, "step": 3596 }, { "epoch": 2.994173140954495, "grad_norm": 0.3048544228076935, "learning_rate": 8.451134437814646e-11, "loss": 0.3834, "step": 3597 }, { "epoch": 2.995005549389567, "grad_norm": 0.2796577513217926, "learning_rate": 5.86884841130564e-11, "loss": 0.3338, "step": 3598 }, { "epoch": 2.995837957824639, "grad_norm": 0.29299935698509216, "learning_rate": 3.7560656284685305e-11, "loss": 0.418, "step": 3599 }, { "epoch": 2.9966703662597114, "grad_norm": 0.28302428126335144, "learning_rate": 2.1127880733273764e-11, "loss": 0.3776, "step": 3600 }, { "epoch": 2.9975027746947838, "grad_norm": 0.2750588357448578, "learning_rate": 9.390172888701366e-12, "loss": 0.3898, "step": 3601 }, { "epoch": 2.9983351831298557, "grad_norm": 0.30292966961860657, "learning_rate": 2.347543773262295e-12, "loss": 0.4144, "step": 3602 }, { "epoch": 2.9991675915649276, "grad_norm": 0.29896464943885803, "learning_rate": 0.0, "loss": 0.3686, "step": 3603 }, { "epoch": 2.9991675915649276, "step": 3603, "total_flos": 4646503280312320.0, "train_loss": 0.438159415813211, "train_runtime": 72439.8933, "train_samples_per_second": 4.776, "train_steps_per_second": 0.05 } ], "logging_steps": 1.0, "max_steps": 3603, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4646503280312320.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }